Edit

IABSD.fr/xenocara/lib/mesa/src/virtio/vulkan/vn_feedback.c

Branch :

  • Show log

    Commit

  • Author : jsg
    Date : 2025-06-05 11:23:11
    Hash : 67d6f117
    Message : Import Mesa 25.0.7

  • lib/mesa/src/virtio/vulkan/vn_feedback.c
  • /*
     * Copyright 2022 Google LLC
     * SPDX-License-Identifier: MIT
     */
    
    #include "vn_feedback.h"
    
    #include "vn_command_buffer.h"
    #include "vn_device.h"
    #include "vn_physical_device.h"
    #include "vn_query_pool.h"
    #include "vn_queue.h"
    
    /* Pick a memory type that is allowed by mem_type_bits and exposes every
     * flag in required_mem_flags.
     *
     * Returns the index of the first qualifying type visited by u_foreach_bit,
     * or UINT32_MAX when none of the candidate types qualifies.
     */
    static uint32_t
    vn_get_memory_type_index(const VkPhysicalDeviceMemoryProperties *mem_props,
                             uint32_t mem_type_bits,
                             VkMemoryPropertyFlags required_mem_flags)
    {
       u_foreach_bit(type_idx, mem_type_bits)
       {
          assert(type_idx < mem_props->memoryTypeCount);

          const VkMemoryPropertyFlags available =
             mem_props->memoryTypes[type_idx].propertyFlags;

          /* a candidate qualifies when no required flag is missing */
          if (!(required_mem_flags & ~available))
             return type_idx;
       }

       return UINT32_MAX;
    }
    
    /* Create a host-mapped buffer that backs feedback slots.
     *
     * Steps, in order: allocate the zeroed tracking struct, create a transfer
     * src/dst VkBuffer of `size` bytes, allocate memory from a type that has
     * VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, bind it at offset 0 and map the
     * whole allocation into fb_buf->data.
     *
     * On success the new object is stored in *out_fb_buf and VK_SUCCESS is
     * returned. On failure everything created so far is released, *out_fb_buf
     * is left untouched and the failing VkResult is returned
     * (VK_ERROR_INITIALIZATION_FAILED when no suitable memory type exists).
     */
    VkResult
    vn_feedback_buffer_create(struct vn_device *dev,
                              uint32_t size,
                              const VkAllocationCallbacks *alloc,
                              struct vn_feedback_buffer **out_fb_buf)
    {
       /* more than one queue family means the buffer must be shared */
       const bool shared = dev->queue_family_count != 1;
       const VkPhysicalDeviceMemoryProperties *props =
          &dev->physical_device->memory_properties;
       VkDevice device = vn_device_to_handle(dev);
       VkResult ret;

       struct vn_feedback_buffer *fb =
          vk_zalloc(alloc, sizeof(*fb), VN_DEFAULT_ALIGN,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
       if (fb == NULL)
          return VK_ERROR_OUT_OF_HOST_MEMORY;

       /* Concurrent sharing avoids explicit queue family ownership transfers
        * for devices created with queues from multiple queue families.
        */
       const VkBufferCreateInfo buffer_info = {
          .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
          .size = size,
          /* Fence and timeline semaphore feedback write to this buffer as a
           * transfer DST when signalling. Timeline semaphore feedback also
           * reads it as a transfer SRC to fetch the counter value to signal.
           */
          .usage =
             VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
          .sharingMode =
             shared ? VK_SHARING_MODE_CONCURRENT : VK_SHARING_MODE_EXCLUSIVE,
          /* the exclusive case below favors the current venus protocol */
          .queueFamilyIndexCount = shared ? dev->queue_family_count : 0,
          .pQueueFamilyIndices = shared ? dev->queue_families : NULL,
       };
       ret = vn_CreateBuffer(device, &buffer_info, alloc, &fb->buf_handle);
       if (ret != VK_SUCCESS)
          goto fail_free_struct;

       struct vn_buffer *vn_buf = vn_buffer_from_handle(fb->buf_handle);
       const VkMemoryRequirements *reqs =
          &vn_buf->requirements.memory.memoryRequirements;
       const uint32_t type_index = vn_get_memory_type_index(
          props, reqs->memoryTypeBits, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
       if (type_index >= props->memoryTypeCount) {
          /* the helper reports "not found" with an out-of-range index */
          ret = VK_ERROR_INITIALIZATION_FAILED;
          goto fail_destroy_buffer;
       }

       const VkMemoryAllocateInfo memory_info = {
          .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
          .allocationSize = reqs->size,
          .memoryTypeIndex = type_index,
       };
       ret = vn_AllocateMemory(device, &memory_info, alloc, &fb->mem_handle);
       if (ret != VK_SUCCESS)
          goto fail_destroy_buffer;

       const VkBindBufferMemoryInfo binding = {
          .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
          .buffer = fb->buf_handle,
          .memory = fb->mem_handle,
          .memoryOffset = 0,
       };
       ret = vn_BindBufferMemory2(device, 1, &binding);
       if (ret != VK_SUCCESS)
          goto fail_free_memory;

       ret = vn_MapMemory(device, fb->mem_handle, 0, VK_WHOLE_SIZE, 0,
                          &fb->data);
       if (ret != VK_SUCCESS)
          goto fail_free_memory;

       *out_fb_buf = fb;

       return VK_SUCCESS;

    fail_free_memory:
       vn_FreeMemory(device, fb->mem_handle, alloc);

    fail_destroy_buffer:
       vn_DestroyBuffer(device, fb->buf_handle, alloc);

    fail_free_struct:
       vk_free(alloc, fb);

       return ret;
    }
    
    /* Tear down a feedback buffer: unmap and free its memory, destroy the
     * VkBuffer, then release the tracking struct itself using `alloc`.
     */
    void
    vn_feedback_buffer_destroy(struct vn_device *dev,
                               struct vn_feedback_buffer *fb_buf,
                               const VkAllocationCallbacks *alloc)
    {
       VkDevice device = vn_device_to_handle(dev);

       /* memory goes first: unmap, then free */
       vn_UnmapMemory(device, fb_buf->mem_handle);
       vn_FreeMemory(device, fb_buf->mem_handle, alloc);

       vn_DestroyBuffer(device, fb_buf->buf_handle, alloc);

       vk_free(alloc, fb_buf);
    }
    
    static inline uint32_t
    vn_get_feedback_buffer_alignment(struct vn_device *dev,
                                     struct vn_feedback_buffer *fb_buf)
    {
       struct vn_buffer *buf = vn_buffer_from_handle(fb_buf->buf_handle);
       return align(buf->requirements.memory.memoryRequirements.alignment,
                    dev->physical_device->wa_min_fb_align);
    }
    
    /* Add a fresh feedback buffer of pool->size bytes to the front of the pool
     * and reset the allocation cursor so subsequent slots come from it.
     *
     * The visible caller path (vn_feedback_pool_alloc) holds pool->mutex while
     * this runs. Returns the buffer creation result; the pool is unchanged on
     * failure.
     */
    static VkResult
    vn_feedback_pool_grow_locked(struct vn_feedback_pool *pool)
    {
       VN_TRACE_FUNC();
       struct vn_feedback_buffer *new_buf = NULL;

       const VkResult result =
          vn_feedback_buffer_create(pool->dev, pool->size, pool->alloc, &new_buf);
       if (result != VK_SUCCESS)
          return result;

       /* newest buffer sits at the head of the list */
       list_add(&new_buf->head, &pool->fb_bufs);

       pool->alignment = vn_get_feedback_buffer_alignment(pool->dev, new_buf);
       pool->used = 0;

       return VK_SUCCESS;
    }
    
    /* Initialize an empty feedback pool.
     *
     * No buffer is created here. `used` starts equal to `size` and `alignment`
     * starts at 1 so that the first slot allocation finds no free space and
     * grows the pool. Always returns VK_SUCCESS.
     */
    VkResult
    vn_feedback_pool_init(struct vn_device *dev,
                          struct vn_feedback_pool *pool,
                          uint32_t size,
                          const VkAllocationCallbacks *alloc)
    {
       pool->dev = dev;
       pool->alloc = alloc;

       /* "full" defaults force the initial grow on first allocation */
       pool->size = size;
       pool->used = size;
       pool->alignment = 1;

       list_inithead(&pool->fb_bufs);
       list_inithead(&pool->free_slots);

       simple_mtx_init(&pool->mutex, mtx_plain);

       return VK_SUCCESS;
    }
    
    void
    vn_feedback_pool_fini(struct vn_feedback_pool *pool)
    {
       list_for_each_entry_safe(struct vn_feedback_slot, slot, &pool->free_slots,
                                head)
          vk_free(pool->alloc, slot);
    
       list_for_each_entry_safe(struct vn_feedback_buffer, fb_buf, &pool->fb_bufs,
                                head)
          vn_feedback_buffer_destroy(pool->dev, fb_buf, pool->alloc);
    
       simple_mtx_destroy(&pool->mutex);
    }
    
    /* Reserve `size` bytes (rounded up to pool->alignment) from the pool's
     * current feedback buffer, growing the pool when the remaining space is
     * insufficient.
     *
     * Writes the reserved offset to *out_offset and returns the buffer at the
     * head of pool->fb_bufs, or NULL when growing the pool fails.
     */
    static struct vn_feedback_buffer *
    vn_feedback_pool_alloc_locked(struct vn_feedback_pool *pool,
                                  uint32_t size,
                                  uint32_t *out_offset)
    {
       /* The initial pool->used and pool->alignment values make this check
        * fail on first use, which triggers the initial grow; both are properly
        * set by the grow itself.
        */
       const bool fits =
          align(size, pool->alignment) <= pool->size - pool->used;
       if (unlikely(!fits)) {
          if (vn_feedback_pool_grow_locked(pool) != VK_SUCCESS)
             return NULL;

          /* re-evaluated on purpose: the grow updates used and alignment */
          assert(align(size, pool->alignment) <= pool->size - pool->used);
       }

       const uint32_t start = pool->used;
       pool->used += align(size, pool->alignment);
       *out_offset = start;

       return list_first_entry(&pool->fb_bufs, struct vn_feedback_buffer, head);
    }
    
    /* Hand out a feedback slot tagged with `type`.
     *
     * A slot from the pool's free list is reused when one exists; only its
     * type is updated. Otherwise a new slot struct is allocated and bound to 8
     * freshly reserved bytes of the pool's current feedback buffer.
     *
     * Returns NULL when the slot struct cannot be allocated or the pool cannot
     * provide backing storage.
     */
    struct vn_feedback_slot *
    vn_feedback_pool_alloc(struct vn_feedback_pool *pool,
                           enum vn_feedback_type type)
    {
       static const uint32_t slot_bytes = 8;
       struct vn_feedback_buffer *backing = NULL;
       struct vn_feedback_slot *slot;
       uint32_t slot_offset = 0;
       bool recycled = false;

       simple_mtx_lock(&pool->mutex);
       if (list_is_empty(&pool->free_slots)) {
          slot = vk_alloc(pool->alloc, sizeof(*slot), VN_DEFAULT_ALIGN,
                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
          if (slot) {
             backing =
                vn_feedback_pool_alloc_locked(pool, slot_bytes, &slot_offset);
          }
       } else {
          slot =
             list_first_entry(&pool->free_slots, struct vn_feedback_slot, head);
          list_del(&slot->head);
          recycled = true;
       }
       simple_mtx_unlock(&pool->mutex);

       if (!slot)
          return NULL;

       if (!recycled) {
          if (!backing) {
             /* no storage for the new slot; drop the struct again */
             vk_free(pool->alloc, slot);
             return NULL;
          }

          slot->offset = slot_offset;
          slot->buf_handle = backing->buf_handle;
          slot->data = backing->data + slot_offset;
       }

       slot->type = type;

       return slot;
    }
    
    /* Return a slot to the pool. The slot is parked on the pool's free list
     * under the pool mutex; nothing is released here.
     */
    void
    vn_feedback_pool_free(struct vn_feedback_pool *pool,
                          struct vn_feedback_slot *slot)
    {
       struct list_head *free_list = &pool->free_slots;

       simple_mtx_lock(&pool->mutex);
       list_add(&slot->head, free_list);
       simple_mtx_unlock(&pool->mutex);
    }
    
    /* True when no bit above bit 31 is set in x, i.e. the 64-bit mask can be
     * stored in a 32-bit flags type without losing information.
     */
    static inline bool
    mask_is_32bit(uint64_t x)
    {
       return (x >> 32) == 0;
    }
    
    /* Translate a VkDependencyInfo holding exactly one buffer memory barrier
     * into the pre-synchronization2 form: a VkBufferMemoryBarrier plus separate
     * source and destination stage masks.
     *
     * The asserts spell out the preconditions: no pNext chains, no other
     * barrier kinds, and stage/access masks that fit in 32 bits.
     */
    static void
    vn_build_buffer_memory_barrier(const VkDependencyInfo *dep_info,
                                   VkBufferMemoryBarrier *barrier1,
                                   VkPipelineStageFlags *src_stage_mask,
                                   VkPipelineStageFlags *dst_stage_mask)
    {
       assert(dep_info->pNext == NULL);
       assert(dep_info->memoryBarrierCount == 0);
       assert(dep_info->bufferMemoryBarrierCount == 1);
       assert(dep_info->imageMemoryBarrierCount == 0);

       const VkBufferMemoryBarrier2 *in = &dep_info->pBufferMemoryBarriers[0];

       assert(in->pNext == NULL);
       assert(mask_is_32bit(in->srcStageMask));
       assert(mask_is_32bit(in->srcAccessMask));
       assert(mask_is_32bit(in->dstStageMask));
       assert(mask_is_32bit(in->dstAccessMask));

       /* stage masks travel separately in the legacy API */
       *src_stage_mask = in->srcStageMask;
       *dst_stage_mask = in->dstStageMask;

       const VkBufferMemoryBarrier legacy = {
          .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
          .pNext = NULL,
          .srcAccessMask = in->srcAccessMask,
          .dstAccessMask = in->dstAccessMask,
          .srcQueueFamilyIndex = in->srcQueueFamilyIndex,
          .dstQueueFamilyIndex = in->dstQueueFamilyIndex,
          .buffer = in->buffer,
          .offset = in->offset,
          .size = in->size,
       };
       *barrier1 = legacy;
    }
    
    /* Record the single buffer memory barrier described by dep_info, using
     * vkCmdPipelineBarrier2 when sync2 is true and the legacy
     * vkCmdPipelineBarrier otherwise.
     */
    static void
    vn_cmd_buffer_memory_barrier(VkCommandBuffer cmd_handle,
                                 const VkDependencyInfo *dep_info,
                                 bool sync2)
    {
       if (sync2) {
          vn_CmdPipelineBarrier2(cmd_handle, dep_info);
          return;
       }

       /* legacy path: downgrade the barrier to its sync1 representation */
       VkBufferMemoryBarrier legacy_barrier;
       VkPipelineStageFlags src_stages;
       VkPipelineStageFlags dst_stages;

       vn_build_buffer_memory_barrier(dep_info, &legacy_barrier, &src_stages,
                                      &dst_stages);
       vn_CmdPipelineBarrier(cmd_handle, src_stages, dst_stages,
                             dep_info->dependencyFlags, 0, NULL, 1,
                             &legacy_barrier, 0, NULL);
    }
    
    /* Record commands into cmd_handle that write `status` (4 bytes) into the
     * feedback slot of the event ev_handle.
     *
     * The recorded sequence is: a buffer barrier covering the slot, a
     * vkCmdFillBuffer of the status value, and a second buffer barrier towards
     * host access. Nothing is recorded when the event has no feedback slot.
     *
     * sync2 selects between vkCmdPipelineBarrier2 and the legacy
     * vkCmdPipelineBarrier for both barriers.
     */
    void
    vn_event_feedback_cmd_record(VkCommandBuffer cmd_handle,
                                 VkEvent ev_handle,
                                 VkPipelineStageFlags2 src_stage_mask,
                                 VkResult status,
                                 bool sync2)
    {
       /* For vkCmdSetEvent and vkCmdResetEvent feedback interception.
        *
        * The injection point is after the event call to avoid introducing
        * unexpected src stage waiting for VK_PIPELINE_STAGE_HOST_BIT and
        * VK_PIPELINE_STAGE_TRANSFER_BIT if they are not already being waited by
        * vkCmdSetEvent or vkCmdResetEvent. On the other hand, the delay in the
        * feedback signal is acceptable for the nature of VkEvent, and the event
        * feedback cmds lifecycle is guarded by the intercepted command buffer.
        */
       struct vn_event *ev = vn_event_from_handle(ev_handle);
       struct vn_feedback_slot *slot = ev->feedback_slot;

       /* event without feedback: nothing to record */
       if (!slot)
          return;

       /* the barriers and the fill below hard-code a 4-byte status */
       STATIC_ASSERT(sizeof(*slot->status) == 4);

       /* Order the upcoming fill after the caller's stages as well as prior
        * host and transfer writes to the slot.
        */
       const VkDependencyInfo dep_before = {
          .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
          .dependencyFlags = 0,
          .bufferMemoryBarrierCount = 1,
          .pBufferMemoryBarriers =
             (VkBufferMemoryBarrier2[]){
                {
                   .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
                   .srcStageMask = src_stage_mask | VK_PIPELINE_STAGE_HOST_BIT |
                                   VK_PIPELINE_STAGE_TRANSFER_BIT,
                   .srcAccessMask =
                      VK_ACCESS_HOST_WRITE_BIT | VK_ACCESS_TRANSFER_WRITE_BIT,
                   .dstStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT,
                   .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
                   .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
                   .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
                   .buffer = slot->buf_handle,
                   .offset = slot->offset,
                   .size = 4,
                },
             },
       };
       vn_cmd_buffer_memory_barrier(cmd_handle, &dep_before, sync2);

       /* the actual feedback write: status stored as the 32-bit fill value */
       vn_CmdFillBuffer(cmd_handle, slot->buf_handle, slot->offset, 4, status);

       /* Make the transfer write available to subsequent host reads/writes. */
       const VkDependencyInfo dep_after = {
          .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
          .dependencyFlags = 0,
          .bufferMemoryBarrierCount = 1,
          .pBufferMemoryBarriers =
             (VkBufferMemoryBarrier2[]){
                {
                   .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
                   .srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT,
                   .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
                   .dstStageMask = VK_PIPELINE_STAGE_HOST_BIT,
                   .dstAccessMask =
                      VK_ACCESS_HOST_READ_BIT | VK_ACCESS_HOST_WRITE_BIT,
                   .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
                   .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
                   .buffer = slot->buf_handle,
                   .offset = slot->offset,
                   .size = 4,
                },
             },
       };
       vn_cmd_buffer_memory_barrier(cmd_handle, &dep_after, sync2);
    }
    
    /* Record a legacy buffer barrier on [offset, offset + size) of `buffer`
     * that goes from transfer writes (TRANSFER stage) to host reads and writes
     * (HOST stage), so a preceding transfer write to the feedback buffer can be
     * observed by the host.
     */
    static inline void
    vn_feedback_cmd_record_flush_barrier(VkCommandBuffer cmd_handle,
                                         VkBuffer buffer,
                                         VkDeviceSize offset,
                                         VkDeviceSize size)
    {
       const VkAccessFlags host_access =
          VK_ACCESS_HOST_READ_BIT | VK_ACCESS_HOST_WRITE_BIT;
       const VkBufferMemoryBarrier to_host = {
          .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
          .pNext = NULL,
          .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
          .dstAccessMask = host_access,
          .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
          .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
          .buffer = buffer,
          .offset = offset,
          .size = size,
       };

       vn_CmdPipelineBarrier(cmd_handle,
                             VK_PIPELINE_STAGE_TRANSFER_BIT, /* src stages */
                             VK_PIPELINE_STAGE_HOST_BIT,     /* dst stages */
                             0, 0, NULL, 1, &to_host, 0, NULL);
    }
    
    /* Record a complete feedback command buffer into cmd_handle.
     *
     * With a non-NULL src_slot (timeline semaphore feedback) the command buffer
     * copies the 8-byte counter from src_slot to dst_slot. With a NULL src_slot
     * (fence feedback) it fills the 4-byte dst_slot with VK_SUCCESS. In both
     * cases the write is preceded by a pipeline barrier and followed by a
     * flush barrier towards the host.
     *
     * Returns the result of vn_BeginCommandBuffer if that fails, otherwise the
     * result of vn_EndCommandBuffer.
     */
    static VkResult
    vn_feedback_cmd_record(VkCommandBuffer cmd_handle,
                           struct vn_feedback_slot *dst_slot,
                           struct vn_feedback_slot *src_slot)
    {
       STATIC_ASSERT(sizeof(*dst_slot->status) == 4);
       STATIC_ASSERT(sizeof(*dst_slot->counter) == 8);
       STATIC_ASSERT(sizeof(*src_slot->counter) == 8);

       /* slot size is 8 bytes for timeline semaphore and 4 bytes fence.
        * src slot is non-null for timeline semaphore.
        */
       const VkDeviceSize buf_size = src_slot ? 8 : 4;

       static const VkCommandBufferBeginInfo begin_info = {
          .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
          .pNext = NULL,
          .flags = 0,
          .pInheritanceInfo = NULL,
       };
       VkResult result = vn_BeginCommandBuffer(cmd_handle, &begin_info);
       if (result != VK_SUCCESS)
          return result;

       static const VkMemoryBarrier mem_barrier_before = {
          .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
          .pNext = NULL,
          /* make pending writes available to stay close to signal op */
          .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
          /* no need to make all memory visible for feedback update */
          .dstAccessMask = 0,
       };

       const VkBufferMemoryBarrier buf_barrier_before = {
          .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
          .pNext = NULL,
          /* slot memory has been made available via mem_barrier_before */
          .srcAccessMask = 0,
          .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
          .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
          .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
          .buffer = dst_slot->buf_handle,
          .offset = dst_slot->offset,
          .size = buf_size,
       };

       /* host writes for src_slots should implicitly be made visible upon
        * QueueSubmit call */
       /* one global memory barrier plus one buffer barrier on the dst slot,
        * from all commands to the transfer stage
        */
       vn_CmdPipelineBarrier(cmd_handle, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                             VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1,
                             &mem_barrier_before, 1, &buf_barrier_before, 0,
                             NULL);

       /* If passed a src_slot, timeline semaphore feedback records a
        * cmd to copy the counter value from the src slot to the dst slot.
        * If src_slot is NULL, then fence feedback records a cmd to fill
        * the dst slot with VK_SUCCESS.
        */
       if (src_slot) {
          assert(src_slot->type == VN_FEEDBACK_TYPE_SEMAPHORE);
          assert(dst_slot->type == VN_FEEDBACK_TYPE_SEMAPHORE);

          const VkBufferCopy buffer_copy = {
             .srcOffset = src_slot->offset,
             .dstOffset = dst_slot->offset,
             .size = buf_size,
          };
          vn_CmdCopyBuffer(cmd_handle, src_slot->buf_handle, dst_slot->buf_handle,
                           1, &buffer_copy);
       } else {
          assert(dst_slot->type == VN_FEEDBACK_TYPE_FENCE);

          vn_CmdFillBuffer(cmd_handle, dst_slot->buf_handle, dst_slot->offset,
                           buf_size, VK_SUCCESS);
       }

       /* let the host observe the transfer write to the dst slot */
       vn_feedback_cmd_record_flush_barrier(cmd_handle, dst_slot->buf_handle,
                                            dst_slot->offset, buf_size);

       return vn_EndCommandBuffer(cmd_handle);
    }
    
    /* Build the feedback commands for one timeline semaphore signal.
     *
     * Allocates the tracking struct together with an array of one command
     * buffer handle per device queue family, takes a source slot from the
     * device feedback pool, and records a copy from that source slot to
     * dst_slot into one command buffer per queue family.
     *
     * Returns NULL on any failure, after releasing whatever was acquired.
     */
    struct vn_semaphore_feedback_cmd *
    vn_semaphore_feedback_cmd_alloc(struct vn_device *dev,
                                    struct vn_feedback_slot *dst_slot)
    {
       const VkAllocationCallbacks *alloc = &dev->base.base.alloc;
       struct vn_semaphore_feedback_cmd *sfb_cmd;
       VkCommandBuffer *cmd_handles;

       /* struct and handle array share a single allocation */
       VK_MULTIALLOC(ma);
       vk_multialloc_add(&ma, &sfb_cmd, __typeof__(*sfb_cmd), 1);
       vk_multialloc_add(&ma, &cmd_handles, __typeof__(*cmd_handles),
                         dev->queue_family_count);
       if (!vk_multialloc_zalloc(&ma, alloc, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT))
          return NULL;

       struct vn_feedback_slot *src_slot =
          vn_feedback_pool_alloc(&dev->feedback_pool, VN_FEEDBACK_TYPE_SEMAPHORE);
       if (!src_slot) {
          vk_free(alloc, sfb_cmd);
          return NULL;
       }

       /* record one command buffer per queue family; stop at first failure */
       uint32_t recorded = 0;
       while (recorded < dev->queue_family_count) {
          VkDevice dev_handle = vn_device_to_handle(dev);
          const VkResult result = vn_feedback_cmd_alloc(
             dev_handle, &dev->fb_cmd_pools[recorded], dst_slot, src_slot,
             &cmd_handles[recorded]);
          if (result != VK_SUCCESS)
             break;

          recorded++;
       }

       if (recorded < dev->queue_family_count) {
          VkDevice dev_handle = vn_device_to_handle(dev);

          /* unwind the command buffers that were recorded before the failure */
          for (uint32_t j = 0; j < recorded; j++) {
             vn_feedback_cmd_free(dev_handle, &dev->fb_cmd_pools[j],
                                  cmd_handles[j]);
          }

          vn_feedback_pool_free(&dev->feedback_pool, src_slot);
          vk_free(alloc, sfb_cmd);
          return NULL;
       }

       sfb_cmd->src_slot = src_slot;
       sfb_cmd->cmd_handles = cmd_handles;

       return sfb_cmd;
    }
    
    /* Release a semaphore feedback command: free its command buffer for every
     * queue family, return the source slot to the device feedback pool and
     * free the tracking struct.
     */
    void
    vn_semaphore_feedback_cmd_free(struct vn_device *dev,
                                   struct vn_semaphore_feedback_cmd *sfb_cmd)
    {
       const VkAllocationCallbacks *alloc = &dev->base.base.alloc;
       const uint32_t family_count = dev->queue_family_count;

       for (uint32_t family = 0; family < family_count; family++) {
          vn_feedback_cmd_free(vn_device_to_handle(dev),
                               &dev->fb_cmd_pools[family],
                               sfb_cmd->cmd_handles[family]);
       }

       vn_feedback_pool_free(&dev->feedback_pool, sfb_cmd->src_slot);

       vk_free(alloc, sfb_cmd);
    }
    
    /* Record feedback commands for `count` queries starting at `query` of the
     * given query pool.
     *
     * When `copy` is true, the query results (64-bit, with availability, waited
     * on) are copied into the pool's feedback buffer. When `copy` is false, the
     * corresponding region of the feedback buffer is filled with zeros. Either
     * way the write is preceded by a buffer barrier and followed by a flush
     * barrier towards the host.
     *
     * The pool must have a feedback buffer (asserted).
     */
    static void
    vn_query_feedback_cmd_record_internal(VkCommandBuffer cmd_handle,
                                          VkQueryPool pool_handle,
                                          uint32_t query,
                                          uint32_t count,
                                          bool copy)
    {
       struct vn_query_pool *pool = vn_query_pool_from_handle(pool_handle);
       assert(pool->fb_buf);

       /* Results are always 64 bit and include availability bit (also 64 bit) */
       const VkDeviceSize slot_size = (pool->result_array_size * 8) + 8;
       /* byte range of the affected queries inside the feedback buffer */
       const VkDeviceSize offset = slot_size * query;
       const VkDeviceSize buf_size = slot_size * count;

       /* The first synchronization scope of vkCmdCopyQueryPoolResults does not
        * include the query feedback buffer. Insert a barrier to ensure ordering
        * against feedback buffer fill cmd injected in vkCmdResetQueryPool.
        *
        * The second synchronization scope of vkCmdResetQueryPool does not include
        * the query feedback buffer. Insert a barrier to ensure ordering against
        * prior cmds referencing the queries.
        *
        * For srcAccessMask, VK_ACCESS_TRANSFER_WRITE_BIT is sufficient since the
        * gpu cache invalidation for feedback buffer fill in vkResetQueryPool is
        * done implicitly via queue submission.
        */
       const VkPipelineStageFlags src_stage_mask =
          copy ? VK_PIPELINE_STAGE_TRANSFER_BIT
               : VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;

       const VkBufferMemoryBarrier buf_barrier_before = {
          .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
          .pNext = NULL,
          .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
          .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
          .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
          .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
          .buffer = pool->fb_buf->buf_handle,
          .offset = offset,
          .size = buf_size,
       };
       vn_CmdPipelineBarrier(cmd_handle, src_stage_mask,
                             VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, NULL, 1,
                             &buf_barrier_before, 0, NULL);

       if (copy) {
          /* Per spec: "The first synchronization scope includes all commands
           * which reference the queries in queryPool indicated by query that
           * occur earlier in submission order. If flags does not include
           * VK_QUERY_RESULT_WAIT_BIT, vkCmdEndQueryIndexedEXT,
           * vkCmdWriteTimestamp2, vkCmdEndQuery, and vkCmdWriteTimestamp are
           * excluded from this scope."
           *
           * Set VK_QUERY_RESULT_WAIT_BIT to ensure ordering after
           * vkCmdEndQuery or vkCmdWriteTimestamp makes the query available.
           *
           * Set VK_QUERY_RESULT_64_BIT as we can convert it to 32 bit if app
           * requested that.
           *
           * Per spec: "vkCmdCopyQueryPoolResults is considered to be a transfer
           * operation, and its writes to buffer memory must be synchronized using
           * VK_PIPELINE_STAGE_TRANSFER_BIT and VK_ACCESS_TRANSFER_WRITE_BIT
           * before using the results."
           *
           * So we can reuse the flush barrier after this copy cmd.
           */
          vn_CmdCopyQueryPoolResults(cmd_handle, pool_handle, query, count,
                                     pool->fb_buf->buf_handle, offset, slot_size,
                                     VK_QUERY_RESULT_WITH_AVAILABILITY_BIT |
                                        VK_QUERY_RESULT_64_BIT |
                                        VK_QUERY_RESULT_WAIT_BIT);
       } else {
          /* reset path: zero the results and availability of the queries */
          vn_CmdFillBuffer(cmd_handle, pool->fb_buf->buf_handle, offset, buf_size,
                           0);
       }

       vn_feedback_cmd_record_flush_barrier(cmd_handle, pool->fb_buf->buf_handle,
                                            offset, buf_size);
    }
    
    /* Record the feedback commands for every entry of query_records into the
     * command buffer owned by qfb_cmd.
     *
     * query_records must not be empty (asserted). dev_handle is not used by
     * the body. Returns the begin result on failure to begin, otherwise the
     * result of ending the command buffer.
     */
    static VkResult
    vn_query_feedback_cmd_record(VkDevice dev_handle,
                                 struct list_head *query_records,
                                 struct vn_query_feedback_cmd *qfb_cmd)
    {
       static const VkCommandBufferBeginInfo begin_info = {
          .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
       };
       VkCommandBuffer cmd_handle;
       VkResult result;

       assert(!list_is_empty(query_records));

       result = vn_BeginCommandBuffer(qfb_cmd->cmd_handle, &begin_info);
       if (result != VK_SUCCESS)
          return result;

       list_for_each_entry_safe(struct vn_cmd_query_record, rec, query_records,
                                head) {
          cmd_handle = qfb_cmd->cmd_handle;
          vn_query_feedback_cmd_record_internal(
             cmd_handle, vn_query_pool_to_handle(rec->query_pool), rec->query,
             rec->query_count, rec->copy);
       }

       result = vn_EndCommandBuffer(qfb_cmd->cmd_handle);

       return result;
    }
    
    /* Obtain a query feedback command from fb_cmd_pool and record the given
     * query records into it.
     *
     * A command parked on the pool's free list is reset and reused when one
     * exists; otherwise a new tracking struct and a primary command buffer are
     * allocated. The whole operation runs under the pool mutex.
     *
     * On success *out_qfb_cmd receives the command. If recording fails, the
     * command is put on the free list and the error is returned; allocation
     * failures are returned directly.
     */
    VkResult
    vn_query_feedback_cmd_alloc(VkDevice dev_handle,
                                struct vn_feedback_cmd_pool *fb_cmd_pool,
                                struct list_head *query_records,
                                struct vn_query_feedback_cmd **out_qfb_cmd)
    {
       struct vn_query_feedback_cmd *cmd;
       VkResult result;

       simple_mtx_lock(&fb_cmd_pool->mutex);

       if (!list_is_empty(&fb_cmd_pool->free_qfb_cmds)) {
          /* recycle a previously freed command */
          cmd = list_first_entry(&fb_cmd_pool->free_qfb_cmds,
                                 struct vn_query_feedback_cmd, head);
          list_del(&cmd->head);
          vn_ResetCommandBuffer(cmd->cmd_handle, 0);
       } else {
          struct vn_command_pool *cmd_pool =
             vn_command_pool_from_handle(fb_cmd_pool->pool_handle);

          cmd = vk_alloc(&cmd_pool->allocator, sizeof(*cmd), VN_DEFAULT_ALIGN,
                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
          if (!cmd) {
             result = VK_ERROR_OUT_OF_HOST_MEMORY;
             goto out_unlock;
          }

          const VkCommandBufferAllocateInfo alloc_info = {
             .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
             .commandPool = fb_cmd_pool->pool_handle,
             .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
             .commandBufferCount = 1,
          };
          VkCommandBuffer new_handle;
          result = vn_AllocateCommandBuffers(dev_handle, &alloc_info, &new_handle);
          if (result != VK_SUCCESS) {
             vk_free(&cmd_pool->allocator, cmd);
             goto out_unlock;
          }

          cmd->cmd_handle = new_handle;
          cmd->fb_cmd_pool = fb_cmd_pool;
       }

       result = vn_query_feedback_cmd_record(dev_handle, query_records, cmd);
       if (result == VK_SUCCESS) {
          *out_qfb_cmd = cmd;
       } else {
          /* keep the command around for a later attempt */
          list_add(&cmd->head, &fb_cmd_pool->free_qfb_cmds);
       }

    out_unlock:
       simple_mtx_unlock(&fb_cmd_pool->mutex);

       return result;
    }
    
    void
    vn_query_feedback_cmd_free(struct vn_query_feedback_cmd *qfb_cmd)
    {
       simple_mtx_lock(&qfb_cmd->fb_cmd_pool->mutex);
       list_add(&qfb_cmd->head, &qfb_cmd->fb_cmd_pool->free_qfb_cmds);
       simple_mtx_unlock(&qfb_cmd->fb_cmd_pool->mutex);
    }
    
    /* Allocate a primary command buffer from fb_cmd_pool and record the
     * fence/semaphore feedback commands for dst_slot (and src_slot, which may
     * be NULL) into it, all under the pool mutex.
     *
     * On success *out_cmd_handle receives the command buffer. If recording
     * fails the command buffer is freed again; *out_cmd_handle is not written
     * on any failure.
     */
    VkResult
    vn_feedback_cmd_alloc(VkDevice dev_handle,
                          struct vn_feedback_cmd_pool *fb_cmd_pool,
                          struct vn_feedback_slot *dst_slot,
                          struct vn_feedback_slot *src_slot,
                          VkCommandBuffer *out_cmd_handle)
    {
       VkCommandPool pool_handle = fb_cmd_pool->pool_handle;
       const VkCommandBufferAllocateInfo alloc_info = {
          .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
          .pNext = NULL,
          .commandPool = pool_handle,
          .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
          .commandBufferCount = 1,
       };
       VkCommandBuffer new_cmd;
       VkResult result;

       simple_mtx_lock(&fb_cmd_pool->mutex);

       result = vn_AllocateCommandBuffers(dev_handle, &alloc_info, &new_cmd);
       if (result == VK_SUCCESS) {
          result = vn_feedback_cmd_record(new_cmd, dst_slot, src_slot);
          if (result == VK_SUCCESS)
             *out_cmd_handle = new_cmd;
          else
             vn_FreeCommandBuffers(dev_handle, pool_handle, 1, &new_cmd);
       }

       simple_mtx_unlock(&fb_cmd_pool->mutex);

       return result;
    }
    
    /* Free a feedback command buffer back to the command pool wrapped by
     * fb_cmd_pool, holding the pool mutex for the duration of the call.
     */
    void
    vn_feedback_cmd_free(VkDevice dev_handle,
                         struct vn_feedback_cmd_pool *fb_cmd_pool,
                         VkCommandBuffer cmd_handle)
    {
       VkCommandBuffer *doomed = &cmd_handle;

       simple_mtx_lock(&fb_cmd_pool->mutex);
       vn_FreeCommandBuffers(dev_handle, fb_cmd_pool->pool_handle, 1, doomed);
       simple_mtx_unlock(&fb_cmd_pool->mutex);
    }
    
    VkResult
    vn_feedback_cmd_pools_init(struct vn_device *dev)
    {
       const VkAllocationCallbacks *alloc = &dev->base.base.alloc;
       VkDevice dev_handle = vn_device_to_handle(dev);
       struct vn_feedback_cmd_pool *fb_cmd_pools;
       VkCommandPoolCreateInfo info = {
          .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
          .pNext = NULL,
          .flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
       };
    
       if (VN_PERF(NO_FENCE_FEEDBACK) && VN_PERF(NO_SEMAPHORE_FEEDBACK) &&
           VN_PERF(NO_QUERY_FEEDBACK))
          return VK_SUCCESS;
    
       assert(dev->queue_family_count);
    
       fb_cmd_pools =
          vk_zalloc(alloc, sizeof(*fb_cmd_pools) * dev->queue_family_count,
                    VN_DEFAULT_ALIGN, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
       if (!fb_cmd_pools)
          return VK_ERROR_OUT_OF_HOST_MEMORY;
    
       for (uint32_t i = 0; i < dev->queue_family_count; i++) {
          VkResult result;
    
          info.queueFamilyIndex = dev->queue_families[i];
          result = vn_CreateCommandPool(dev_handle, &info, alloc,
                                        &fb_cmd_pools[i].pool_handle);
          if (result != VK_SUCCESS) {
             for (uint32_t j = 0; j < i; j++) {
                vn_DestroyCommandPool(dev_handle, fb_cmd_pools[j].pool_handle,
                                      alloc);
                simple_mtx_destroy(&fb_cmd_pools[j].mutex);
             }
    
             vk_free(alloc, fb_cmd_pools);
             return result;
          }
    
          simple_mtx_init(&fb_cmd_pools[i].mutex, mtx_plain);
          list_inithead(&fb_cmd_pools[i].free_qfb_cmds);
       }
    
       dev->fb_cmd_pools = fb_cmd_pools;
    
       return VK_SUCCESS;
    }
    
    void
    vn_feedback_cmd_pools_fini(struct vn_device *dev)
    {
       const VkAllocationCallbacks *alloc = &dev->base.base.alloc;
       VkDevice dev_handle = vn_device_to_handle(dev);
    
       if (!dev->fb_cmd_pools)
          return;
    
       for (uint32_t i = 0; i < dev->queue_family_count; i++) {
          list_for_each_entry_safe(struct vn_query_feedback_cmd, feedback_cmd,
                                   &dev->fb_cmd_pools[i].free_qfb_cmds, head)
             vk_free(alloc, feedback_cmd);
    
          vn_DestroyCommandPool(dev_handle, dev->fb_cmd_pools[i].pool_handle,
                                alloc);
          simple_mtx_destroy(&dev->fb_cmd_pools[i].mutex);
       }
    
       vk_free(alloc, dev->fb_cmd_pools);
    }