Edit

IABSD.fr/xenocara/lib/mesa/src/broadcom/vulkan/v3dv_queue.c

Branch :

  • Show log

    Commit

  • Author : jsg
    Date : 2025-06-05 11:23:11
    Hash : 67d6f117
    Message : Import Mesa 25.0.7

  • lib/mesa/src/broadcom/vulkan/v3dv_queue.c
  • /*
     * Copyright © 2019 Raspberry Pi Ltd
     *
     * Permission is hereby granted, free of charge, to any person obtaining a
     * copy of this software and associated documentation files (the "Software"),
     * to deal in the Software without restriction, including without limitation
     * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     * and/or sell copies of the Software, and to permit persons to whom the
     * Software is furnished to do so, subject to the following conditions:
     *
     * The above copyright notice and this permission notice (including the next
     * paragraph) shall be included in all copies or substantial portions of the
     * Software.
     *
     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     * IN THE SOFTWARE.
     */
    
    #include "v3dv_private.h"
    #include "drm-uapi/v3d_drm.h"
    
    #include "broadcom/clif/clif_dump.h"
    #include "util/libsync.h"
    #include "util/os_time.h"
    #include "util/perf/cpu_trace.h"
    #include "vk_drm_syncobj.h"
    
    #include <errno.h>
    #include <time.h>
    
    /* Dumps the command lists of a job to stderr when any of the CL, CL_NO_BIN
     * or CLIF debug options is enabled; does nothing otherwise.
     *
     * Every BO tracked in job->bos is mapped and registered with the dumper
     * under the label "<bo name>_0x<bo offset>" before the dump is produced. If
     * any BO fails to map, an error is logged and no dump is emitted.
     */
    static void
    v3dv_clif_dump(struct v3dv_device *device,
                   struct v3dv_job *job,
                   struct drm_v3d_submit_cl *submit)
    {
       const bool dump_requested =
          V3D_DBG(CL) || V3D_DBG(CL_NO_BIN) || V3D_DBG(CLIF);
       if (!dump_requested)
          return;

       struct clif_dump *dumper =
          clif_dump_init(&device->devinfo, stderr,
                         V3D_DBG(CL) || V3D_DBG(CL_NO_BIN),
                         V3D_DBG(CL_NO_BIN));

       set_foreach(job->bos, entry) {
          struct v3dv_bo *bo = (void *)entry->key;
          char *label = ralloc_asprintf(NULL, "%s_0x%x", bo->name, bo->offset);

          if (!v3dv_bo_map(device, bo, bo->size)) {
             mesa_loge("failed to map BO for clif_dump.\n");
             ralloc_free(label);
             goto destroy_dumper;
          }

          clif_dump_add_bo(dumper, label, bo->offset, bo->size, bo->map);
          ralloc_free(label);
       }

       clif_dump(dumper, submit);

     destroy_dumper:
       clif_dump_destroy(dumper);
    }
    
    /* Blocks until the four "last job" syncobjs tracked by the queue have
     * signaled. If every queue type is still flagged as "first" (no job has
     * been submitted to it in this batch), the wait semaphores from sync_info
     * are waited on as well. On success all "first" flags are cleared.
     *
     * Returns VK_ERROR_DEVICE_LOST if the syncobj wait fails, the error from
     * vk_sync_wait_many() if the semaphore wait fails, VK_SUCCESS otherwise.
     */
    static VkResult
    queue_wait_idle(struct v3dv_queue *queue,
                    struct v3dv_submit_sync_info *sync_info)
    {
       if (drmSyncobjWait(queue->device->pdevice->render_fd,
                          queue->last_job_syncs.syncs, 4,
                          INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
                          NULL) != 0) {
          return vk_errorf(queue, VK_ERROR_DEVICE_LOST, "syncobj wait failed: %m");
       }

       /* True only while every per-queue-type "first" flag is still set. */
       bool all_first = true;
       for (int q = 0; q < 4 && all_first; q++)
          all_first = queue->last_job_syncs.first[q];

       /* If some job was already submitted, the syncobj wait above covered a
        * per-queue-type syncobj which transitively waited on the semaphores,
        * so the explicit semaphore wait is only needed when nothing was.
        */
       if (all_first) {
          VkResult wait_result = vk_sync_wait_many(&queue->device->vk,
                                                   sync_info->wait_count,
                                                   sync_info->waits,
                                                   VK_SYNC_WAIT_COMPLETE,
                                                   UINT64_MAX);
          if (wait_result != VK_SUCCESS)
             return wait_result;
       }

       for (int q = 0; q < 4; q++)
          queue->last_job_syncs.first[q] = false;

       return VK_SUCCESS;
    }
    
    /* Releases the in/out syncobj arrays referenced by a multisync extension.
     * The extension stores both arrays as integer-encoded pointers, so they are
     * converted back before being handed to the device allocator.
     */
    static void
    multisync_free(struct v3dv_device *device,
                   struct drm_v3d_multi_sync *ms)
    {
       void *out_array = (void *)(uintptr_t)ms->out_syncs;
       void *in_array = (void *)(uintptr_t)ms->in_syncs;

       vk_free(&device->vk.alloc, out_array);
       vk_free(&device->vk.alloc, in_array);
    }
    
    /* Builds the array of syncobjs a job has to wait on before it runs.
     *
     * The array is filled, in this order, with:
     *  - the wait semaphores from sync_info, but only if this is the first job
     *    submitted to the queue_sync queue type in this batch,
     *  - the extra waits passed in waits/wait_count,
     *  - the "last job" syncobj of each queue type selected by job->serialize.
     *
     * *count always receives the number of entries that are required. The
     * return value is NULL both when nothing has to be waited on (*count == 0)
     * and when the allocation fails (*count != 0), so callers must check *count
     * to tell the two apart. The returned array is owned by the caller.
     */
    static struct drm_v3d_sem *
    set_in_syncs(struct v3dv_queue *queue,
                 struct v3dv_job *job,
                 enum v3dv_queue_type queue_sync,
                 uint32_t *count,
                 struct vk_sync_wait *waits,
                 unsigned wait_count,
                 struct v3dv_submit_sync_info *sync_info)
    {
       /* Only the first job submitted to a given GPU queue in this cmd buf
        * batch has to wait on the wait semaphores (if any).
        */
       uint32_t n_semaphores = 0;
       if (queue->last_job_syncs.first[queue_sync])
          n_semaphores = sync_info->wait_count;

       /* If the serialize flag is set the job needs to be serialized in the
        * corresponding queues. Transfer operations may be implemented as either
        * CL or TFU jobs, so a transfer barrier selects both.
        *
        * FIXME: maybe we could track more precisely if the source of a transfer
        * barrier is a CL and/or a TFU job.
        */
       const bool sync_cl = job->serialize & (V3DV_BARRIER_GRAPHICS_BIT |
                                              V3DV_BARRIER_TRANSFER_BIT);
       const bool sync_csd = job->serialize & V3DV_BARRIER_COMPUTE_BIT;
       const bool sync_tfu = job->serialize & V3DV_BARRIER_TRANSFER_BIT;
       const bool sync_cpu = job->serialize & V3DV_BARRIER_CPU_BIT;

       uint32_t total = n_semaphores;
       total += sync_cl ? 1 : 0;
       total += sync_tfu ? 1 : 0;
       total += sync_csd ? 1 : 0;
       total += sync_cpu ? 1 : 0;
       total += wait_count;

       *count = total;
       if (total == 0)
          return NULL;

       struct drm_v3d_sem *syncs =
          vk_zalloc(&queue->device->vk.alloc, total * sizeof(struct drm_v3d_sem),
                    8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
       if (syncs == NULL)
          return NULL;

       /* Index of the next free slot in the array. */
       uint32_t next = 0;

       for (uint32_t i = 0; i < n_semaphores; i++) {
          syncs[next++].handle =
             vk_sync_as_drm_syncobj(sync_info->waits[i].sync)->syncobj;
       }

       for (unsigned i = 0; i < wait_count; i++) {
          syncs[next++].handle =
             vk_sync_as_drm_syncobj(waits[i].sync)->syncobj;
       }

       if (sync_cl)
          syncs[next++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CL];

       if (sync_csd)
          syncs[next++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CSD];

       if (sync_tfu)
          syncs[next++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_TFU];

       if (sync_cpu)
          syncs[next++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CPU];

       assert(next == *count);
       return syncs;
    }
    
    /* Builds the array of syncobjs a job signals when it completes.
     *
     * When signal_syncs is set the array starts with the signal semaphores from
     * sync_info. Its final entry is always the queue's "last job" syncobj for
     * the queue_sync queue type, which tracks the last job submitted to it.
     *
     * *count receives the number of entries (always at least 1). Returns NULL
     * if the allocation fails. The returned array is owned by the caller.
     */
    static struct drm_v3d_sem *
    set_out_syncs(struct v3dv_queue *queue,
                  struct v3dv_job *job,
                  enum v3dv_queue_type queue_sync,
                  uint32_t *count,
                  struct v3dv_submit_sync_info *sync_info,
                  bool signal_syncs)
    {
       uint32_t n_signals = 0;
       if (signal_syncs)
          n_signals = sync_info->signal_count;

       /* One extra slot for the per-queue-type "last job" syncobj. */
       *count = n_signals + 1;

       struct drm_v3d_sem *syncs =
          vk_zalloc(&queue->device->vk.alloc,
                    *count * sizeof(struct drm_v3d_sem),
                    8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
       if (syncs == NULL)
          return NULL;

       for (unsigned i = 0; i < n_signals; i++) {
          syncs[i].handle =
             vk_sync_as_drm_syncobj(sync_info->signals[i].sync)->syncobj;
       }

       syncs[n_signals].handle = queue->last_job_syncs.syncs[queue_sync];

       return syncs;
    }
    
    /* Fills in the header of a kernel submit extension: its identifier, its
     * flags and the link to the next extension in the chain (NULL ends it).
     * The link is stored as an integer-encoded pointer.
     */
    static void
    set_ext(struct drm_v3d_extension *ext,
            struct drm_v3d_extension *next,
            uint32_t id,
            uintptr_t flags)
    {
       ext->id = id;
       ext->flags = flags;
       ext->next = (uintptr_t)next;
    }
    
    /* Fills a multisync extension with the in/out syncobjs for a job and links
     * it in front of the extension chain starting at `next`.
     *
     * The wait list combines the submit wait semaphores, the extra waits in
     * waits/wait_count and the serialization syncobjs selected by the job (see
     * set_in_syncs); the signal list comes from set_out_syncs.
     *
     * On success the extension id is DRM_V3D_EXT_ID_MULTI_SYNC and the caller
     * owns the two arrays, to be released with multisync_free(). On failure
     * (out of host memory) nothing is left allocated and the extension id is
     * set to 0, which is how callers detect the error.
     */
    static void
    set_multisync(struct drm_v3d_multi_sync *ms,
                  struct v3dv_submit_sync_info *sync_info,
                  struct vk_sync_wait *waits,
                  unsigned wait_count,
                  struct drm_v3d_extension *next,
                  struct v3dv_device *device,
                  struct v3dv_job *job,
                  enum v3dv_queue_type in_queue_sync,
                  enum v3dv_queue_type out_queue_sync,
                  enum v3d_queue wait_stage,
                  bool signal_syncs)
    {
       struct v3dv_queue *queue = &device->queue;
       uint32_t out_sync_count = 0, in_sync_count = 0;
       struct drm_v3d_sem *out_syncs = NULL, *in_syncs = NULL;

       in_syncs = set_in_syncs(queue, job, in_queue_sync,
                               &in_sync_count, waits, wait_count, sync_info);
       /* A NULL array is only an error if there was something to wait on. */
       if (!in_syncs && in_sync_count)
          goto fail;

       out_syncs = set_out_syncs(queue, job, out_queue_sync,
                                 &out_sync_count, sync_info, signal_syncs);

       assert(out_sync_count > 0);

       if (!out_syncs)
          goto fail;

       set_ext(&ms->base, next, DRM_V3D_EXT_ID_MULTI_SYNC, 0);
       ms->wait_stage = wait_stage;
       ms->out_sync_count = out_sync_count;
       ms->out_syncs = (uintptr_t)(void *)out_syncs;
       ms->in_sync_count = in_sync_count;
       ms->in_syncs = (uintptr_t)(void *)in_syncs;

       return;

    fail:
       if (in_syncs)
          vk_free(&device->vk.alloc, in_syncs);
       assert(!out_syncs);

       /* Report the failure explicitly instead of relying on the caller having
        * zero-initialized the extension.
        */
       ms->base.id = 0;

       return;
    }
    
    static VkResult
    handle_reset_query_cpu_job(struct v3dv_queue *queue,
                               struct v3dv_job *job,
                               struct v3dv_submit_sync_info *sync_info,
                               bool signal_syncs)
    {
       MESA_TRACE_FUNC();
       struct v3dv_device *device = queue->device;
       struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset;
       assert(info->pool);
    
       assert(info->pool->query_type != VK_QUERY_TYPE_OCCLUSION);
    
       if (device->pdevice->caps.cpu_queue) {
          assert(info->first + info->count <= info->pool->query_count);
    
          struct drm_v3d_submit_cpu submit = {0};
          struct drm_v3d_multi_sync ms = {0};
    
          uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
          uintptr_t *kperfmon_ids = NULL;
    
          if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
             submit.bo_handle_count = 1;
             submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle;
    
             struct drm_v3d_reset_timestamp_query reset = {0};
    
             set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_TIMESTAMP_QUERY, 0);
    
             reset.count = info->count;
             reset.offset = info->pool->queries[info->first].timestamp.offset;
    
             for (uint32_t i = 0; i < info->count; i++) {
                struct v3dv_query *query = &info->pool->queries[info->first + i];
                syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
             }
    
             reset.syncs = (uintptr_t)(void *)syncs;
    
             set_multisync(&ms, sync_info, NULL, 0, (void *)&reset, device, job,
                           V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
             if (!ms.base.id) {
                free(syncs);
                return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
             }
          } else {
             assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
             struct drm_v3d_reset_performance_query reset = {0};
    
             set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_PERFORMANCE_QUERY, 0);
    
             struct vk_sync_wait waits[info->count];
             unsigned wait_count = 0;
             for (int i = 0; i < info->count; i++) {
                struct v3dv_query *query = &info->pool->queries[info->first + i];
                /* Only wait for a query if we've used it otherwise we will be
                 * waiting forever for the fence to become signaled.
                 */
                if (query->maybe_available) {
                   waits[wait_count] = (struct vk_sync_wait){
                      .sync = query->perf.last_job_sync
                   };
                   wait_count++;
                };
             }
    
             reset.count = info->count;
             reset.nperfmons = info->pool->perfmon.nperfmons;
    
             kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count);
    
             for (uint32_t i = 0; i < info->count; i++) {
                struct v3dv_query *query = &info->pool->queries[info->first + i];
    
                syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
                kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids;
             }
    
             reset.syncs = (uintptr_t)(void *)syncs;
             reset.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids;
    
             set_multisync(&ms, sync_info, waits, wait_count, (void *)&reset, device, job,
                           V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
             if (!ms.base.id) {
                free(syncs);
                free(kperfmon_ids);
                return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
             }
          }
    
          submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
          submit.extensions = (uintptr_t)(void *)&ms;
    
          /* From the Vulkan spec for vkCmdResetQueryPool:
           *
           *    "This command defines an execution dependency between other query commands
           *     that reference the same query.
           *     ...
           *     The second synchronization scope includes all commands which reference the
           *     queries in queryPool indicated by firstQuery and queryCount that occur later
           *     in submission order."
           *
           * This means we should ensure that any timestamps after a reset don't execute before
           * the reset, however, for timestamps queries in particular we don't have to do
           * anything special because timestamp queries have to wait for all previously
           * submitted work to complete before executing (which we accomplish by using
           * V3DV_BARRIER_ALL on them) and that includes reset jobs submitted to the CPU queue.
           */
          int ret = v3d_ioctl(device->pdevice->render_fd,
                              DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
    
          free(syncs);
          free(kperfmon_ids);
          multisync_free(device, &ms);
    
          queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
    
          if (ret)
             return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
    
          return VK_SUCCESS;
       }
    
       /* We are about to reset query counters in user-space so we need to make
        * sure that the GPU is not using them.
        */
       if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
          VkResult result = queue_wait_idle(queue, sync_info);
          if (result != VK_SUCCESS)
             return result;
    
          v3dv_bo_wait(job->device, info->pool->timestamp.bo, OS_TIMEOUT_INFINITE);
       }
    
       if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
          struct vk_sync_wait waits[info->count];
          unsigned wait_count = 0;
          for (int i = 0; i < info->count; i++) {
             struct v3dv_query *query = &info->pool->queries[info->first + i];
             /* Only wait for a query if we've used it otherwise we will be
              * waiting forever for the fence to become signaled.
              */
             if (query->maybe_available) {
                waits[wait_count] = (struct vk_sync_wait){
                   .sync = query->perf.last_job_sync
                };
                wait_count++;
             };
          }
    
          VkResult result = vk_sync_wait_many(&job->device->vk, wait_count, waits,
                                              VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
    
          if (result != VK_SUCCESS)
             return result;
       }
    
       v3dv_reset_query_pool_cpu(job->device, info->pool, info->first, info->count);
    
       return VK_SUCCESS;
    }
    
    /* Exports the "last job" syncobjs of the CL and CSD queues as sync files
     * and accumulates them into a single sync file.
     *
     * *fd is the accumulator: callers pass it in holding -1 (or an existing
     * sync file to merge with). On success it holds the merged sync file, which
     * the caller owns and must close. On failure any descriptor held in *fd has
     * been closed, *fd is reset to -1 and VK_ERROR_UNKNOWN is returned.
     */
    static VkResult
    export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job, int *fd)
    {
       VkResult result;
       static const enum v3dv_queue_type queues_to_sync[] = {
          V3DV_QUEUE_CL,
          V3DV_QUEUE_CSD,
       };

       for (uint32_t i = 0; i < ARRAY_SIZE(queues_to_sync); i++) {
          enum v3dv_queue_type queue_type = queues_to_sync[i];
          int tmp_fd = -1;

          int err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
                                             queue->last_job_syncs.syncs[queue_type],
                                             &tmp_fd);

          if (err) {
             /* Build the error first: the message uses %m, so errno must not
              * be clobbered by close() before it is formatted.
              */
             result = vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
                                "sync file export failed: %m");
             goto fail;
          }

          err = sync_accumulate("v3dv", fd, tmp_fd);

          if (err) {
             result = vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
                                "failed to accumulate sync files: %m");
             close(tmp_fd);
             goto fail;
          }

          /* The accumulator keeps its own descriptor, so the exported one has
           * to be released here or it leaks on every call.
           */
          close(tmp_fd);
       }

       return VK_SUCCESS;

    fail:
       /* Drop whatever was accumulated so far and invalidate the descriptor so
        * the caller cannot close it a second time.
        */
       if (*fd >= 0) {
          close(*fd);
          *fd = -1;
       }

       return result;
    }
    
    /* Processes the CPU job that ends a range of performance queries.
     *
     * With the query mutex held, the last-job sync files of the relevant queues
     * are merged into one sync file, which is then imported into the
     * last_job_sync syncobj of every query in the range; each query is flagged
     * as maybe_available. Waiters on query_ended are woken on every exit path.
     *
     * counter_pass_idx is not used by this function.
     *
     * Returns VK_SUCCESS, or the error produced by the failed export/import.
     */
    static VkResult
    handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx)
    {
       MESA_TRACE_FUNC();
       VkResult result = VK_SUCCESS;

       mtx_lock(&job->device->query_mutex);

       struct v3dv_end_query_info *info = &job->cpu.query_end;
       struct v3dv_queue *queue = &job->device->queue;

       int err = 0;
       int fd = -1;

       assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);

       if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
          result = export_perfmon_last_job_sync(queue, job, &fd);

          if (result != VK_SUCCESS) {
             /* On failure the export helper has already closed the descriptor;
              * forget it so it is not closed again below.
              */
             fd = -1;
             goto fail;
          }

          assert(fd >= 0);
       }

       for (uint32_t i = 0; i < info->count; i++) {
          assert(info->query + i < info->pool->query_count);
          struct v3dv_query *query = &info->pool->queries[info->query + i];

          if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
             uint32_t syncobj = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
             err = drmSyncobjImportSyncFile(job->device->pdevice->render_fd,
                                            syncobj, fd);

             if (err) {
                result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                                   "sync file import failed: %m");
                goto fail;
             }
          }

          query->maybe_available = true;
       }

    fail:
       /* Only close a descriptor that this function still owns. */
       if (fd >= 0)
          close(fd);

       cnd_broadcast(&job->device->query_ended);
       mtx_unlock(&job->device->query_mutex);

       return result;
    }
    
    static VkResult
    handle_copy_query_results_cpu_job(struct v3dv_queue *queue,
                                      struct v3dv_job *job,
                                      struct v3dv_submit_sync_info *sync_info,
                                      bool signal_syncs)
    {
       MESA_TRACE_FUNC();
       struct v3dv_device *device = queue->device;
       struct v3dv_copy_query_results_cpu_job_info *info =
          &job->cpu.query_copy_results;
    
       assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
              info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
    
       assert(info->dst && info->dst->mem && info->dst->mem->bo);
       struct v3dv_bo *bo = info->dst->mem->bo;
    
       if (device->pdevice->caps.cpu_queue) {
          struct drm_v3d_submit_cpu submit = {0};
          struct drm_v3d_multi_sync ms = {0};
    
          uint32_t *offsets = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
          uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
          uint32_t *bo_handles = NULL;
          uintptr_t *kperfmon_ids = NULL;
    
          if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
             submit.bo_handle_count = 2;
    
             bo_handles = (uint32_t *)
                malloc(sizeof(uint32_t) * submit.bo_handle_count);
    
             bo_handles[0] = bo->handle;
             bo_handles[1] = info->pool->timestamp.bo->handle;
             submit.bo_handles = (uintptr_t)(void *)bo_handles;
    
             struct drm_v3d_copy_timestamp_query copy = {0};
    
             set_ext(&copy.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_TIMESTAMP_QUERY, 0);
    
             copy.do_64bit = info->flags & VK_QUERY_RESULT_64_BIT;
             copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT;
             copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
             copy.offset = info->offset + info->dst->mem_offset;
             copy.stride = info->stride;
             copy.count = info->count;
    
             for (uint32_t i = 0; i < info->count; i++) {
                assert(info->first < info->pool->query_count);
                assert(info->first + info->count <= info->pool->query_count);
                struct v3dv_query *query = &info->pool->queries[info->first + i];
    
                offsets[i] = query->timestamp.offset;
                syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
             }
    
             copy.offsets = (uintptr_t)(void *)offsets;
             copy.syncs = (uintptr_t)(void *)syncs;
    
             set_multisync(&ms, sync_info, NULL, 0, (void *)&copy, device, job,
                           V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
             if (!ms.base.id) {
                free(bo_handles);
                free(offsets);
                free(syncs);
                return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
             }
          } else {
             assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
    
             submit.bo_handle_count = 1;
             submit.bo_handles = (uintptr_t)(void *)&bo->handle;
    
             struct drm_v3d_copy_performance_query copy = {0};
    
             set_ext(&copy.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_PERFORMANCE_QUERY, 0);
    
    	 /* If the queryPool was created with VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR,
    	  * results for each query are written as an array of the type indicated
    	  * by VkPerformanceCounterKHR::storage for the counter being queried.
    	  * For v3dv, VkPerformanceCounterKHR::storage is
    	  * VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR.
    	  */
             copy.do_64bit = true;
             copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT;
             copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
             copy.offset = info->offset + info->dst->mem_offset;
             copy.stride = info->stride;
             copy.count = info->count;
             copy.nperfmons = info->pool->perfmon.nperfmons;
             copy.ncounters = info->pool->perfmon.ncounters;
    
             kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count);
    
             struct vk_sync_wait waits[info->count];
             unsigned wait_count = 0;
    
             for (uint32_t i = 0; i < info->count; i++) {
                assert(info->first < info->pool->query_count);
                assert(info->first + info->count <= info->pool->query_count);
                struct v3dv_query *query = &info->pool->queries[info->first + i];
    
                syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
                kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids;
    
                if (info->flags & VK_QUERY_RESULT_WAIT_BIT) {
                    waits[wait_count] = (struct vk_sync_wait){
                       .sync = query->perf.last_job_sync
                    };
                    wait_count++;
                }
             }
    
             copy.syncs = (uintptr_t)(void *)syncs;
             copy.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids;
    
             set_multisync(&ms, sync_info, waits, wait_count, (void *)&copy, device, job,
                           V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
             if (!ms.base.id) {
                free(kperfmon_ids);
                free(bo_handles);
                free(offsets);
                free(syncs);
                return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
             }
          }
    
          submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
          submit.extensions = (uintptr_t)(void *)&ms;
    
          int ret = v3d_ioctl(device->pdevice->render_fd,
                              DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
    
          free(kperfmon_ids);
          free(bo_handles);
          free(offsets);
          free(syncs);
          multisync_free(device, &ms);
    
          queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
    
          if (ret)
             return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
    
          return VK_SUCCESS;
       }
    
       /* Map the entire dst buffer for the CPU copy if needed */
       assert(!bo->map || bo->map_size == bo->size);
       if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
          return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
    
       uint8_t *offset = ((uint8_t *) bo->map) +
                         info->offset + info->dst->mem_offset;
       v3dv_get_query_pool_results_cpu(job->device,
                                       info->pool,
                                       info->first,
                                       info->count,
                                       offset,
                                       info->stride,
                                       info->flags);
    
       return VK_SUCCESS;
    }
    
    /* Processes a CPU job that writes a timestamp into a range of queries.
     *
     * Without a kernel CPU queue, the function waits for the queue to go idle,
     * reads CLOCK_MONOTONIC and, with the query mutex held, stores the value in
     * the first query of the range (0 in the remaining ones), signals each
     * query's sync and wakes waiters on query_ended.
     *
     * With a CPU queue, the timestamp is submitted with
     * DRM_IOCTL_V3D_SUBMIT_CPU as a job serialized against all previously
     * submitted work.
     *
     * Returns VK_SUCCESS, the first error hit while waiting or signaling,
     * VK_ERROR_OUT_OF_HOST_MEMORY if a host allocation fails, or the result of
     * marking the queue lost if the submit ioctl fails.
     */
    static VkResult
    handle_timestamp_query_cpu_job(struct v3dv_queue *queue,
                                   struct v3dv_job *job,
                                   struct v3dv_submit_sync_info *sync_info,
                                   bool signal_syncs)
    {
       MESA_TRACE_FUNC();
       struct v3dv_device *device = queue->device;

       assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY);
       struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp;

       if (!device->pdevice->caps.cpu_queue) {
          /* Wait for completion of all work queued before the timestamp query */
          VkResult result = queue_wait_idle(queue, sync_info);
          if (result != VK_SUCCESS)
             return result;

          mtx_lock(&job->device->query_mutex);

          /* Compute timestamp */
          struct timespec t;
          clock_gettime(CLOCK_MONOTONIC, &t);

          for (uint32_t i = 0; i < info->count; i++) {
             assert(info->query + i < info->pool->query_count);
             struct v3dv_query *query = &info->pool->queries[info->query + i];
             query->maybe_available = true;

             /* Value */
             uint8_t *value_addr =
                ((uint8_t *) info->pool->timestamp.bo->map) + query->timestamp.offset;
             *((uint64_t*)value_addr) = (i == 0) ? t.tv_sec * 1000000000ull + t.tv_nsec : 0ull;

             /* Availability. Every query is still signaled, but the first
              * failure is the one reported so a later success cannot hide it.
              */
             VkResult signal_result =
                vk_sync_signal(&job->device->vk, query->timestamp.sync, 0);
             if (result == VK_SUCCESS)
                result = signal_result;
          }

          cnd_broadcast(&job->device->query_ended);
          mtx_unlock(&job->device->query_mutex);

          return result;
       }

       struct drm_v3d_submit_cpu submit = {0};

       submit.bo_handle_count = 1;
       submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle;

       struct drm_v3d_timestamp_query timestamp = {0};

       set_ext(&timestamp.base, NULL, DRM_V3D_EXT_ID_CPU_TIMESTAMP_QUERY, 0);

       timestamp.count = info->count;

       uint32_t *offsets =
          (uint32_t *) malloc(sizeof(uint32_t) * info->count);
       uint32_t *syncs =
          (uint32_t *) malloc(sizeof(uint32_t) * info->count);

       /* malloc(0) may legitimately return NULL, so only treat NULL as a
        * failure when there is something to store.
        */
       if ((!offsets || !syncs) && info->count) {
          free(offsets);
          free(syncs);
          return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
       }

       for (uint32_t i = 0; i < info->count; i++) {
          assert(info->query + i < info->pool->query_count);
          struct v3dv_query *query = &info->pool->queries[info->query + i];
          query->maybe_available = true;

          offsets[i] = query->timestamp.offset;
          syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
       }

       timestamp.offsets = (uintptr_t)(void *)offsets;
       timestamp.syncs = (uintptr_t)(void *)syncs;

       struct drm_v3d_multi_sync ms = {0};

       /* The CPU job should be serialized so it only executes after all previously
        * submitted work has completed
        */
       job->serialize = V3DV_BARRIER_ALL;

       set_multisync(&ms, sync_info, NULL, 0, (void *)&timestamp, device, job,
                     V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
       if (!ms.base.id) {
          free(offsets);
          free(syncs);
          return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
       }

       submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
       submit.extensions = (uintptr_t)(void *)&ms;

       int ret = v3d_ioctl(device->pdevice->render_fd,
                           DRM_IOCTL_V3D_SUBMIT_CPU, &submit);

       free(offsets);
       free(syncs);
       multisync_free(device, &ms);

       queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;

       if (ret)
          return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");

       return VK_SUCCESS;
    }
    
    /* Handles a V3DV_JOB_TYPE_CPU_CSD_INDIRECT job, which updates the indirect
     * compute dispatch info->csd_job with the workgroup counts stored in the
     * indirect buffer.
     *
     * If the device does not expose a CPU queue (caps.cpu_queue is false), we
     * wait for the GPU, read the counts from the mapped indirect buffer on the
     * host and rewrite the CSD job when they differ from the recorded ones.
     * Otherwise we submit a DRM_IOCTL_V3D_SUBMIT_CPU carrying an indirect CSD
     * extension chained after the multisync extension.
     *
     * Returns VK_SUCCESS on success, VK_ERROR_OUT_OF_HOST_MEMORY if mapping the
     * buffer, allocating the BO handle array or setting up the multisync fails,
     * and the result of vk_queue_set_lost() if the ioctl fails.
     */
    static VkResult
    handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
                                struct v3dv_job *job,
                                struct v3dv_submit_sync_info *sync_info,
                                bool signal_syncs)
    {
       MESA_TRACE_FUNC();
       struct v3dv_device *device = queue->device;

       assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
       struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect;
       assert(info->csd_job);

       assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
       struct v3dv_bo *bo = info->buffer->mem->bo;

       if (!device->pdevice->caps.cpu_queue) {
          /* Make sure the GPU is no longer using the indirect buffer */
          v3dv_bo_wait(queue->device, bo, OS_TIMEOUT_INFINITE);

          /* Map the indirect buffer and read the dispatch parameters */
          if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
             return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
          assert(bo->map);

          const uint32_t offset = info->buffer->mem_offset + info->offset;
          const uint32_t *group_counts = (uint32_t *) (bo->map + offset);

          /* Nothing to rewrite if any dimension of the dispatch is empty */
          if (group_counts[0] == 0 || group_counts[1] == 0 || group_counts[2] == 0)
             return VK_SUCCESS;

          /* Only rewrite the CSD job if the counts in the buffer differ from the
           * ones it currently holds.
           */
          if (memcmp(group_counts, info->csd_job->csd.wg_count,
                     sizeof(info->csd_job->csd.wg_count)) != 0) {
             v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts);
          }

          return VK_SUCCESS;
       }

       struct v3dv_job *csd_job = info->csd_job;

       /* Collect the handles of all the BOs referenced by the CSD job. Check the
        * allocation before filling it in; a NULL result is only acceptable when
        * there is nothing to store.
        */
       uint32_t *bo_handles = malloc(sizeof(uint32_t) * csd_job->bo_count);
       if (!bo_handles && csd_job->bo_count > 0)
          return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

       uint32_t bo_idx = 0;
       set_foreach (csd_job->bos, entry) {
          struct v3dv_bo *csd_bo = (struct v3dv_bo *)entry->key;
          bo_handles[bo_idx++] = csd_bo->handle;
       }
       assert(bo_idx == csd_job->bo_count);
       csd_job->csd.submit.bo_handle_count = csd_job->bo_count;
       csd_job->csd.submit.bo_handles = (uintptr_t)(void *)bo_handles;

       /* The CPU job itself only references the indirect buffer */
       struct drm_v3d_submit_cpu submit = {0};
       submit.bo_handle_count = 1;
       submit.bo_handles = (uintptr_t)(void *)&bo->handle;

       struct drm_v3d_indirect_csd indirect = {0};

       set_ext(&indirect.base, NULL, DRM_V3D_EXT_ID_CPU_INDIRECT_CSD, 0);

       indirect.submit = csd_job->csd.submit;
       indirect.offset = info->buffer->mem_offset + info->offset;
       indirect.wg_size = info->wg_size;

       /* Translate the uniform pointers that need patching into offsets (in
        * 32-bit words) from the start of the CSD job's indirect CL.
        */
       for (int i = 0; i < 3; i++) {
          if (info->wg_uniform_offsets[i]) {
             assert(info->wg_uniform_offsets[i] >= (uint32_t *) csd_job->indirect.base);
             indirect.wg_uniform_offsets[i] =
                info->wg_uniform_offsets[i] - (uint32_t *) csd_job->indirect.base;
          } else {
             indirect.wg_uniform_offsets[i] = 0xffffffff; /* No rewrite */
          }
       }

       indirect.indirect = csd_job->indirect.bo->handle;

       struct drm_v3d_multi_sync ms = {0};

       /* We need to configure the semaphores of this job with the indirect
        * CSD job, as the CPU job must obey the CSD job's synchronization
        * demands, such as barriers.
        */
       set_multisync(&ms, sync_info, NULL, 0, (void *)&indirect, device, csd_job,
                     V3DV_QUEUE_CPU, V3DV_QUEUE_CSD, V3D_CPU, signal_syncs);
       if (!ms.base.id) {
          /* Don't leak the handle array on the error path */
          free(bo_handles);
          return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
       }

       submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
       submit.extensions = (uintptr_t)(void *)&ms;

       int ret = v3d_ioctl(device->pdevice->render_fd,
                           DRM_IOCTL_V3D_SUBMIT_CPU, &submit);

       free(bo_handles);
       multisync_free(device, &ms);

       queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
       queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;

       if (ret)
          return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");

       return VK_SUCCESS;
    }
    
    static VkResult
    handle_cl_job(struct v3dv_queue *queue,
                  struct v3dv_job *job,
                  uint32_t counter_pass_idx,
                  struct v3dv_submit_sync_info *sync_info,
                  bool signal_syncs)
    {
       MESA_TRACE_FUNC();
       struct v3dv_device *device = queue->device;
    
       struct drm_v3d_submit_cl submit = { 0 };
    
       /* Sanity check: we should only flag a bcl sync on a job that needs to be
        * serialized.
        */
       assert(job->serialize || !job->needs_bcl_sync);
    
       /* We expect to have just one RCL per job which should fit in just one BO.
        * Our BCL, could chain multiple BOS together though.
        */
       assert(list_length(&job->rcl.bo_list) == 1);
       assert(list_length(&job->bcl.bo_list) >= 1);
       struct v3dv_bo *bcl_fist_bo =
          list_first_entry(&job->bcl.bo_list, struct v3dv_bo, list_link);
       submit.bcl_start = bcl_fist_bo->offset;
       submit.bcl_end = job->suspending ? job->suspended_bcl_end :
                                          job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
       submit.rcl_start = job->rcl.bo->offset;
       submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);
    
       submit.qma = job->tile_alloc->offset;
       submit.qms = job->tile_alloc->size;
       submit.qts = job->tile_state->offset;
    
       submit.flags = 0;
       if (job->tmu_dirty_rcl)
          submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;
    
       /* If the job uses VK_KHR_buffer_device_address we need to ensure all
        * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
        * are included.
        */
       if (job->uses_buffer_device_address) {
          util_dynarray_foreach(&queue->device->device_address_bo_list,
                                struct v3dv_bo *, bo) {
             v3dv_job_add_bo(job, *bo);
          }
       }
    
       submit.bo_handle_count = job->bo_count;
       uint32_t *bo_handles =
          (uint32_t *) malloc(sizeof(uint32_t) * submit.bo_handle_count);
       uint32_t bo_idx = 0;
       set_foreach(job->bos, entry) {
          struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
          bo_handles[bo_idx++] = bo->handle;
       }
       assert(bo_idx == submit.bo_handle_count);
       submit.bo_handles = (uintptr_t)(void *)bo_handles;
    
       submit.perfmon_id = job->perf ?
          job->perf->kperfmon_ids[counter_pass_idx] : 0;
       const bool needs_perf_sync = queue->last_perfmon_id != submit.perfmon_id;
       queue->last_perfmon_id = submit.perfmon_id;
    
       /* We need a binning sync if we are the first CL job waiting on a semaphore
        * with a wait stage that involves the geometry pipeline, or if the job
        * comes after a pipeline barrier that involves geometry stages
        * (needs_bcl_sync) or when performance queries are in use.
        *
        * We need a render sync if the job doesn't need a binning sync but has
        * still been flagged for serialization. It should be noted that RCL jobs
        * don't start until the previous RCL job has finished so we don't really
        * need to add a fence for those, however, we might need to wait on a CSD or
        * TFU job, which are not automatically serialized with CL jobs.
        */
       bool needs_bcl_sync = job->needs_bcl_sync || needs_perf_sync;
       if (queue->last_job_syncs.first[V3DV_QUEUE_CL]) {
          for (int i = 0; !needs_bcl_sync && i < sync_info->wait_count; i++) {
             needs_bcl_sync = sync_info->waits[i].stage_mask &
                 (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
                  VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
                  VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
                  VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
                  VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
                  VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
                  VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
                  VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
                  VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
                  VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
                  VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
                  VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT);
          }
       }
    
       bool needs_rcl_sync = job->serialize && !needs_bcl_sync;
    
       /* Replace single semaphore settings whenever our kernel-driver supports
        * multiple semaphores extension.
        */
       struct drm_v3d_multi_sync ms = { 0 };
       enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN;
       set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
                     V3DV_QUEUE_CL, V3DV_QUEUE_CL, wait_stage, signal_syncs);
       if (!ms.base.id) {
          free(bo_handles);
          return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
       }
    
       submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
       submit.extensions = (uintptr_t)(void *)&ms;
    
       /* We are using multisync so disable legacy single-sync interface */
       submit.in_sync_rcl = 0;
       submit.in_sync_bcl = 0;
       submit.out_sync = 0;
    
       v3dv_clif_dump(device, job, &submit);
       int ret = v3d_ioctl(device->pdevice->render_fd,
                           DRM_IOCTL_V3D_SUBMIT_CL, &submit);
    
       static bool warned = false;
       if (ret && !warned) {
          mesa_loge("Draw call returned %s. Expect corruption.\n",
                    strerror(errno));
          warned = true;
       }
    
       free(bo_handles);
       multisync_free(device, &ms);
    
       queue->last_job_syncs.first[V3DV_QUEUE_CL] = false;
    
       if (ret)
          return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CL failed: %m");
    
       return VK_SUCCESS;
    }
    
    static VkResult
    handle_tfu_job(struct v3dv_queue *queue,
                   struct v3dv_job *job,
                   struct v3dv_submit_sync_info *sync_info,
                   bool signal_syncs)
    {
       MESA_TRACE_FUNC();
       assert(!V3D_DBG(DISABLE_TFU));
    
       struct v3dv_device *device = queue->device;
    
       /* Replace single semaphore settings whenever our kernel-driver supports
        * multiple semaphore extension.
        */
       struct drm_v3d_multi_sync ms = { 0 };
       set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
                     V3DV_QUEUE_TFU, V3DV_QUEUE_TFU, V3D_TFU, signal_syncs);
       if (!ms.base.id)
          return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
    
       job->tfu.flags |= DRM_V3D_SUBMIT_EXTENSION;
       job->tfu.extensions = (uintptr_t)(void *)&ms;
    
       /* We are using multisync so disable legacy single-sync interface */
       job->tfu.in_sync = 0;
       job->tfu.out_sync = 0;
    
       int ret = v3d_ioctl(device->pdevice->render_fd,
                           DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);
    
       multisync_free(device, &ms);
       queue->last_job_syncs.first[V3DV_QUEUE_TFU] = false;
    
       if (ret != 0)
          return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_TFU failed: %m");
    
       return VK_SUCCESS;
    }
    
    /* Submits a CSD (compute shader dispatch) job to the kernel through
     * DRM_IOCTL_V3D_SUBMIT_CSD using the multisync extension.
     *
     * counter_pass_idx selects which kernel perfmon id of job->perf (if any) is
     * attached to the submission. sync_info and signal_syncs are forwarded to
     * set_multisync().
     *
     * Returns VK_SUCCESS on success, VK_ERROR_OUT_OF_HOST_MEMORY if the BO
     * handle array or the multisync setup cannot be allocated, and the result of
     * vk_queue_set_lost() if the ioctl fails.
     */
    static VkResult
    handle_csd_job(struct v3dv_queue *queue,
                   struct v3dv_job *job,
                   uint32_t counter_pass_idx,
                   struct v3dv_submit_sync_info *sync_info,
                   bool signal_syncs)
    {
       MESA_TRACE_FUNC();
       struct v3dv_device *device = queue->device;

       struct drm_v3d_submit_csd *submit = &job->csd.submit;

       /* If the job uses VK_KHR_buffer_device_address we need to ensure all
        * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
        * are included.
        */
       if (job->uses_buffer_device_address) {
          util_dynarray_foreach(&queue->device->device_address_bo_list,
                                struct v3dv_bo *, bo) {
             v3dv_job_add_bo(job, *bo);
          }
       }

       submit->bo_handle_count = job->bo_count;
       /* The requested size is never zero (at least 4 entries), so a NULL result
        * always means the allocation failed.
        */
       uint32_t *bo_handles =
          malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2));
       if (!bo_handles)
          return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

       uint32_t bo_idx = 0;
       set_foreach(job->bos, entry) {
          struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
          bo_handles[bo_idx++] = bo->handle;
       }
       assert(bo_idx == submit->bo_handle_count);
       submit->bo_handles = (uintptr_t)(void *)bo_handles;

       /* Replace single semaphore settings whenever our kernel-driver supports
        * multiple semaphore extension.
        */
       struct drm_v3d_multi_sync ms = { 0 };
       set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
                     V3DV_QUEUE_CSD, V3DV_QUEUE_CSD, V3D_CSD, signal_syncs);
       if (!ms.base.id) {
          /* Don't leak the handle array on the error path */
          free(bo_handles);
          return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
       }

       submit->flags |= DRM_V3D_SUBMIT_EXTENSION;
       submit->extensions = (uintptr_t)(void *)&ms;

       /* We are using multisync so disable legacy single-sync interface */
       submit->in_sync = 0;
       submit->out_sync = 0;

       submit->perfmon_id = job->perf ?
          job->perf->kperfmon_ids[counter_pass_idx] : 0;
       queue->last_perfmon_id = submit->perfmon_id;

       int ret = v3d_ioctl(device->pdevice->render_fd,
                           DRM_IOCTL_V3D_SUBMIT_CSD, submit);

       /* Only log the first failure to avoid flooding the output */
       static bool warned = false;
       if (ret && !warned) {
          mesa_loge("Compute dispatch returned %s. Expect corruption.\n",
                    strerror(errno));
          warned = true;
       }

       free(bo_handles);

       multisync_free(device, &ms);
       queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;

       if (ret)
          return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CSD failed: %m");

       return VK_SUCCESS;
    }
    
    /* Routes a job to the handler that matches its type and returns whatever
     * that handler returns. Only the CL and CSD handlers and the end-query
     * handler receive counter_pass_idx; the end-query handler takes neither the
     * queue nor any sync information.
     */
    static VkResult
    queue_handle_job(struct v3dv_queue *queue,
                     struct v3dv_job *job,
                     uint32_t counter_pass_idx,
                     struct v3dv_submit_sync_info *sync_info,
                     bool signal_syncs)
    {
       VkResult result;

       switch (job->type) {
       /* GPU jobs */
       case V3DV_JOB_TYPE_GPU_CL:
          result = handle_cl_job(queue, job, counter_pass_idx,
                                 sync_info, signal_syncs);
          break;
       case V3DV_JOB_TYPE_GPU_TFU:
          result = handle_tfu_job(queue, job, sync_info, signal_syncs);
          break;
       case V3DV_JOB_TYPE_GPU_CSD:
          result = handle_csd_job(queue, job, counter_pass_idx,
                                  sync_info, signal_syncs);
          break;

       /* CPU jobs */
       case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
          result = handle_reset_query_cpu_job(queue, job, sync_info,
                                              signal_syncs);
          break;
       case V3DV_JOB_TYPE_CPU_END_QUERY:
          result = handle_end_query_cpu_job(job, counter_pass_idx);
          break;
       case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
          result = handle_copy_query_results_cpu_job(queue, job, sync_info,
                                                     signal_syncs);
          break;
       case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
          result = handle_csd_indirect_cpu_job(queue, job, sync_info,
                                               signal_syncs);
          break;
       case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY:
          result = handle_timestamp_query_cpu_job(queue, job, sync_info,
                                                  signal_syncs);
          break;

       default:
          unreachable("Unhandled job type");
       }

       return result;
    }
    
    static VkResult
    queue_create_noop_job(struct v3dv_queue *queue)
    {
       struct v3dv_device *device = queue->device;
       queue->noop_job = vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_job), 8,
                                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
       if (!queue->noop_job)
          return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
       v3dv_job_init(queue->noop_job, V3DV_JOB_TYPE_GPU_CL, device, NULL, -1);
    
       v3d_X((&device->devinfo), job_emit_noop)(queue->noop_job);
    
       /* We use no-op jobs to signal semaphores/fences. These jobs needs to be
        * serialized across all hw queues to comply with Vulkan's signal operation
        * order requirements, which basically require that signal operations occur
        * in submission order.
        */
       queue->noop_job->serialize = V3DV_BARRIER_ALL;
    
       return VK_SUCCESS;
    }
    
    /* Submits the queue's no-op job, creating it first if the queue doesn't
     * have one yet. Returns the creation error if creating it fails, otherwise
     * the result of queue_handle_job().
     */
    static VkResult
    queue_submit_noop_job(struct v3dv_queue *queue,
                          uint32_t counter_pass_idx,
                          struct v3dv_submit_sync_info *sync_info,
                          bool signal_syncs)
    {
       if (queue->noop_job == NULL) {
          const VkResult create_result = queue_create_noop_job(queue);
          if (create_result != VK_SUCCESS)
             return create_result;
       }

       struct v3dv_job *noop = queue->noop_job;
       assert(noop);

       return queue_handle_job(queue, noop, counter_pass_idx,
                               sync_info, signal_syncs);
    }
    
    /* Processes a queue submission: every job of every command buffer in
     * 'submit' is handed to queue_handle_job() in order, a no-op job is
     * submitted after any command buffer that ends with a pending barrier and,
     * if the submission has signal operations, a final no-op job is submitted
     * with signal_syncs set to true.
     *
     * Chains of suspending/resuming jobs are patched together and submitted
     * as a single job through the first job of the chain.
     *
     * Returns VK_SUCCESS, VK_ERROR_OUT_OF_DEVICE_MEMORY if preparing a
     * suspending job fails, or the first error returned while submitting a job.
     */
    VkResult
    v3dv_queue_driver_submit(struct vk_queue *vk_queue,
                             struct vk_queue_submit *submit)
    {
       MESA_TRACE_FUNC();
       struct v3dv_queue *queue = container_of(vk_queue, struct v3dv_queue, vk);
       VkResult result;

       struct v3dv_submit_sync_info sync_info = {
          .wait_count = submit->wait_count,
          .waits = submit->waits,
          .signal_count = submit->signal_count,
          .signals = submit->signals,
       };

       /* Reset the per-queue "first job" flags for this submission. The job
        * handlers clear the flag of a queue once they have submitted to it.
        */
       for (int i = 0; i < V3DV_QUEUE_COUNT; i++)
          queue->last_job_syncs.first[i] = true;

       /* first_suspend_job is the head of the suspend/resume chain currently
        * being built (the job that actually gets submitted), and
        * current_suspend_job is the last suspending job seen, which still needs
        * to be linked to the job that resumes it.
        */
       struct v3dv_job *first_suspend_job = NULL;
       struct v3dv_job *current_suspend_job = NULL;
       for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
          struct v3dv_cmd_buffer *cmd_buffer =
             container_of(submit->command_buffers[i], struct v3dv_cmd_buffer, vk);
          list_for_each_entry_safe(struct v3dv_job, job,
                                   &cmd_buffer->jobs, list_link) {
             if (job->suspending) {
                /* From here on we work with the job returned by the prepare
                 * helper instead of the one stored in the command buffer.
                 */
                job = v3d_X((&job->device->devinfo),
                             cmd_buffer_prepare_suspend_job_for_submit)(job);
                if (!job)
                   return VK_ERROR_OUT_OF_DEVICE_MEMORY;
             }

             /* A job that suspends without resuming starts a new chain */
             if (job->suspending && !job->resuming) {
                assert(!first_suspend_job);
                assert(!current_suspend_job);
                first_suspend_job = job;
             }

             /* A resuming job continues the chain: link the previous suspending
              * job to it.
              */
             if (job->resuming) {
                assert(first_suspend_job);
                assert(current_suspend_job);
                v3d_X((&job->device->devinfo), job_patch_resume_address)(first_suspend_job,
                                                              current_suspend_job,
                                                              job);
                current_suspend_job = NULL;
             }

             if (job->suspending) {
                /* The chain isn't complete yet, so don't submit anything */
                current_suspend_job = job;
             } else {
                /* Either a regular job or the last job of a chain. For a chain,
                 * submit its first job.
                 */
                assert(!current_suspend_job);
                struct v3dv_job *submit_job = first_suspend_job ?
                                              first_suspend_job : job;
                result =
                   queue_handle_job(queue, submit_job, submit->perf_pass_index,
                                    &sync_info, false);

                if (result != VK_SUCCESS)
                   return result;

                first_suspend_job = NULL;
             }
          }

          /* If the command buffer ends with a barrier we need to consume it now.
           *
           * FIXME: this will drain all hw queues. Instead, we could use the pending
           * barrier state to limit the queues we serialize against.
           */
          if (cmd_buffer->state.barrier.dst_mask) {
             result = queue_submit_noop_job(queue, submit->perf_pass_index,
                                            &sync_info, false);
             if (result != VK_SUCCESS)
                return result;
          }
       }

       /* Every suspend/resume chain must have been completed and submitted */
       assert(!first_suspend_job);
       assert(!current_suspend_job);

       /* Handle signaling now */
       if (submit->signal_count > 0) {
          /* Finish by submitting a no-op job that synchronizes across all queues.
           * This will ensure that the signal semaphores don't get triggered until
           * all work on any queue completes. See Vulkan's signal operation order
           * requirements.
           */
          return queue_submit_noop_job(queue, submit->perf_pass_index,
                                       &sync_info, true);
       }

       return VK_SUCCESS;
    }
    
    /* vkQueueBindSparse entry point. Every call is rejected with
     * VK_ERROR_FEATURE_NOT_PRESENT, whatever bind information is supplied.
     */
    VKAPI_ATTR VkResult VKAPI_CALL
    v3dv_QueueBindSparse(VkQueue _queue,
                         uint32_t bindInfoCount,
                         const VkBindSparseInfo *pBindInfo,
                         VkFence fence)
    {
       V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);

       const VkResult unsupported =
          vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT);
       return unsupported;
    }