Module: Mesa
Branch: main
Commit: fda5163f34a1ca8f3c65633497450b7d6443c165
URL:    
http://cgit.freedesktop.org/mesa/mesa/commit/?id=fda5163f34a1ca8f3c65633497450b7d6443c165

Author: Paulo Zanoni <[email protected]>
Date:   Wed Oct 25 16:32:02 2023 -0700

anv/trtt: properly handle the lifetime of TR-TT batch BOs

We need to wait for the batches to complete before we return the BOs
to the pool. We were previously doing this completely synchronously,
which made the code unnecessarily wait. Now we have a timeline syncobj
that signals completion of the previous BOs, so sometimes we check
where we are in the timeline and then return the BOs that we know are
unused.

This, in addition to the previous patch that made us wait for the
other syncobjs through the execbuf ioctl instead of through the CPU,
makes TR-TT batches at least an order of magnitude faster. Still, I
don't think we'll notice any changes in games's FPS as they don't bind
sparse resources that often.

Reviewed-by: Lionel Landwerlin <[email protected]>
Signed-off-by: Paulo Zanoni <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25512>

---

 src/intel/vulkan/anv_batch_chain.c      | 20 ++++------
 src/intel/vulkan/anv_device.c           | 29 ++++++++++++++
 src/intel/vulkan/anv_private.h          | 28 +++++++++++++
 src/intel/vulkan/anv_sparse.c           | 71 +++++++++++++++++++++++++++++++++
 src/intel/vulkan/i915/anv_batch_chain.c | 16 ++++----
 src/intel/vulkan/xe/anv_batch_chain.c   | 40 +++++--------------
 6 files changed, 154 insertions(+), 50 deletions(-)

diff --git a/src/intel/vulkan/anv_batch_chain.c 
b/src/intel/vulkan/anv_batch_chain.c
index 4ecd3075b5e..0a8e7d46f8d 100644
--- a/src/intel/vulkan/anv_batch_chain.c
+++ b/src/intel/vulkan/anv_batch_chain.c
@@ -1705,28 +1705,24 @@ anv_queue_submit_trtt_batch(struct 
anv_sparse_submission *submit,
    struct anv_device *device = queue->device;
    VkResult result = VK_SUCCESS;
 
-   struct anv_trtt_batch_bo trtt_bbo;
-   trtt_bbo.size = align(batch->next - batch->start, 8);
-
-   result = anv_bo_pool_alloc(&device->batch_bo_pool, trtt_bbo.size,
-                              &trtt_bbo.bo);
+   uint32_t batch_size = align(batch->next - batch->start, 8);
+   struct anv_trtt_batch_bo *trtt_bbo;
+   result = anv_trtt_batch_bo_new(device, batch_size, &trtt_bbo);
    if (result != VK_SUCCESS)
       return result;
 
-   memcpy(trtt_bbo.bo->map, batch->start, trtt_bbo.size);
+   memcpy(trtt_bbo->bo->map, batch->start, trtt_bbo->size);
 #ifdef SUPPORT_INTEL_INTEGRATED_GPUS
    if (device->physical->memory.need_flush)
-      intel_flush_range(trtt_bbo.bo->map, trtt_bbo.size);
+      intel_flush_range(trtt_bbo->bo->map, trtt_bbo->size);
 #endif
 
    if (INTEL_DEBUG(DEBUG_BATCH)) {
-      intel_print_batch(queue->decoder, trtt_bbo.bo->map, trtt_bbo.bo->size,
-                        trtt_bbo.bo->offset, false);
+      intel_print_batch(queue->decoder, trtt_bbo->bo->map, trtt_bbo->bo->size,
+                        trtt_bbo->bo->offset, false);
    }
 
-   result = device->kmd_backend->execute_trtt_batch(submit, &trtt_bbo);
-
-   anv_bo_pool_free(&device->batch_bo_pool, trtt_bbo.bo);
+   result = device->kmd_backend->execute_trtt_batch(submit, trtt_bbo);
 
    return result;
 }
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 48b243d9c00..404d137e523 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -1452,6 +1452,7 @@ anv_physical_device_try_create(struct vk_instance 
*vk_instance,
    } else {
       device->has_sparse =
          device->info.ver >= 12 &&
+         device->has_exec_timeline &&
          debug_get_bool_option("ANV_SPARSE", false);
       device->sparse_uses_trtt = true;
    }
@@ -3105,6 +3106,8 @@ anv_device_init_trtt(struct anv_device *device)
    if (pthread_mutex_init(&trtt->mutex, NULL) != 0)
       return vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
 
+   list_inithead(&trtt->in_flight_batches);
+
    return VK_SUCCESS;
 }
 
@@ -3113,6 +3116,32 @@ anv_device_finish_trtt(struct anv_device *device)
 {
    struct anv_trtt *trtt = &device->trtt;
 
+   if (trtt->timeline_val > 0) {
+      struct drm_syncobj_timeline_wait wait = {
+         .handles = (uintptr_t)&trtt->timeline_handle,
+         .points = (uintptr_t)&trtt->timeline_val,
+         .timeout_nsec = INT64_MAX,
+         .count_handles = 1,
+         .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
+         .first_signaled = false,
+      };
+      if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, &wait))
+         fprintf(stderr, "TR-TT syncobj wait failed!\n");
+
+      list_for_each_entry_safe(struct anv_trtt_batch_bo, trtt_bbo,
+                               &trtt->in_flight_batches, link)
+         anv_trtt_batch_bo_free(device, trtt_bbo);
+
+   }
+
+   if (trtt->timeline_handle > 0) {
+      struct drm_syncobj_destroy destroy = {
+         .handle = trtt->timeline_handle,
+      };
+      if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_DESTROY, &destroy))
+         fprintf(stderr, "TR-TT syncobj destroy failed!\n");
+   }
+
    pthread_mutex_destroy(&trtt->mutex);
 
    vk_free(&device->vk.alloc, trtt->l3_mirror);
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 1db938569ab..8d07c6da5b5 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -1598,6 +1598,14 @@ struct anv_device_astc_emu {
 struct anv_trtt_batch_bo {
    struct anv_bo *bo;
    uint32_t size;
+
+   /* Once device->trtt.timeline_handle signals timeline_val as complete we
+    * can free this struct and its members.
+    */
+   uint64_t timeline_val;
+
+   /* Part of device->trtt.in_flight_batches. */
+   struct list_head link;
 };
 
 struct anv_device {
@@ -1807,6 +1815,15 @@ struct anv_device {
         */
        struct anv_bo *cur_page_table_bo;
        uint64_t next_page_table_bo_offset;
+
+       /* Timeline syncobj used to track completion of the TR-TT batch BOs. */
+       uint32_t timeline_handle;
+       uint64_t timeline_val;
+
+       /* List of struct anv_trtt_batch_bo batches that are in flight and can
+        * be freed once their timeline gets signaled.
+        */
+       struct list_head in_flight_batches;
     } trtt;
 
     /* This is true if the user ever bound a sparse resource to memory. This
@@ -1949,6 +1966,15 @@ VkResult anv_queue_submit_simple_batch(struct anv_queue 
*queue,
 VkResult anv_queue_submit_trtt_batch(struct anv_sparse_submission *submit,
                                      struct anv_batch *batch);
 
+static inline void
+anv_trtt_batch_bo_free(struct anv_device *device,
+                       struct anv_trtt_batch_bo *trtt_bbo)
+{
+   anv_bo_pool_free(&device->batch_bo_pool, trtt_bbo->bo);
+   list_del(&trtt_bbo->link);
+   vk_free(&device->vk.alloc, trtt_bbo);
+}
+
 void anv_queue_trace(struct anv_queue *queue, const char *label,
                      bool frame, bool begin);
 
@@ -2840,6 +2866,8 @@ VkResult anv_sparse_image_check_support(struct 
anv_physical_device *pdevice,
                                         VkSampleCountFlagBits samples,
                                         VkImageType type,
                                         VkFormat format);
+VkResult anv_trtt_batch_bo_new(struct anv_device *device, uint32_t batch_size,
+                               struct anv_trtt_batch_bo **out_trtt_bbo);
 
 struct anv_buffer {
    struct vk_buffer vk;
diff --git a/src/intel/vulkan/anv_sparse.c b/src/intel/vulkan/anv_sparse.c
index a3fe2c1ecae..381ab9c9fc3 100644
--- a/src/intel/vulkan/anv_sparse.c
+++ b/src/intel/vulkan/anv_sparse.c
@@ -391,6 +391,15 @@ anv_trtt_init_context_state(struct anv_queue *queue)
    struct anv_device *device = queue->device;
    struct anv_trtt *trtt = &device->trtt;
 
+   struct drm_syncobj_create create = {
+      .handle = 0,
+      .flags = 0,
+   };
+   if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_CREATE, &create))
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+   assert(create.handle != 0);
+   trtt->timeline_handle = create.handle;
+
    struct anv_bo *l3_bo;
    VkResult result = trtt_get_page_table_bo(device, &l3_bo, &trtt->l3_addr);
    if (result != VK_SUCCESS)
@@ -1203,3 +1212,65 @@ anv_sparse_image_check_support(struct 
anv_physical_device *pdevice,
 
    return VK_SUCCESS;
 }
+
+static VkResult
+anv_trtt_garbage_collect_batches(struct anv_device *device)
+{
+   struct anv_trtt *trtt = &device->trtt;
+
+   if (trtt->timeline_val % 8 != 7)
+      return VK_SUCCESS;
+
+   uint64_t cur_timeline_val = 0;
+   struct drm_syncobj_timeline_array array = {
+      .handles = (uintptr_t)&trtt->timeline_handle,
+      .points = (uintptr_t)&cur_timeline_val,
+      .count_handles = 1,
+      .flags = 0,
+   };
+   if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_QUERY, &array))
+      return vk_error(device, VK_ERROR_UNKNOWN);
+
+   list_for_each_entry_safe(struct anv_trtt_batch_bo, trtt_bbo,
+                            &trtt->in_flight_batches, link) {
+      if (trtt_bbo->timeline_val > cur_timeline_val)
+         return VK_SUCCESS;
+
+      anv_trtt_batch_bo_free(device, trtt_bbo);
+   }
+
+   return VK_SUCCESS;
+}
+
+VkResult
+anv_trtt_batch_bo_new(struct anv_device *device, uint32_t batch_size,
+                      struct anv_trtt_batch_bo **out_trtt_bbo)
+{
+   struct anv_trtt *trtt = &device->trtt;
+   VkResult result;
+
+   anv_trtt_garbage_collect_batches(device);
+
+   struct anv_trtt_batch_bo *trtt_bbo =
+      vk_alloc(&device->vk.alloc, sizeof(*trtt_bbo), 8,
+               VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+   if (!trtt_bbo)
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   result = anv_bo_pool_alloc(&device->batch_bo_pool, batch_size,
+                              &trtt_bbo->bo);
+   if (result != VK_SUCCESS)
+      goto out;
+
+   trtt_bbo->size = batch_size;
+   trtt_bbo->timeline_val = ++trtt->timeline_val;
+
+   list_addtail(&trtt_bbo->link, &trtt->in_flight_batches);
+
+   *out_trtt_bbo = trtt_bbo;
+
+   return VK_SUCCESS;
+out:
+   vk_free(&device->vk.alloc, trtt_bbo);
+   return result;
+}
diff --git a/src/intel/vulkan/i915/anv_batch_chain.c 
b/src/intel/vulkan/i915/anv_batch_chain.c
index 622ae0fddd1..640c1d7cbf4 100644
--- a/src/intel/vulkan/i915/anv_batch_chain.c
+++ b/src/intel/vulkan/i915/anv_batch_chain.c
@@ -1008,6 +1008,13 @@ i915_execute_trtt_batch(struct anv_sparse_submission 
*submit,
          goto out;
    }
 
+   result = anv_execbuf_add_syncobj(device, &execbuf, trtt->timeline_handle,
+                                    I915_EXEC_FENCE_SIGNAL,
+                                    trtt_bbo->timeline_val);
+   if (result != VK_SUCCESS)
+      goto out;
+
+
    result = anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL,
                                0);
    if (result != VK_SUCCESS)
@@ -1067,15 +1074,6 @@ i915_execute_trtt_batch(struct anv_sparse_submission 
*submit,
       }
    }
 
-   /* TODO: we can get rid of this wait once we can properly handle the buffer
-    * lifetimes.
-    */
-   result = anv_device_wait(device, trtt_bbo->bo, INT64_MAX);
-   if (result != VK_SUCCESS) {
-      result = vk_device_set_lost(&device->vk,
-                                  "trtt anv_device_wait failed: %m");
-   }
-
 out:
    anv_execbuf_finish(&execbuf);
    return result;
diff --git a/src/intel/vulkan/xe/anv_batch_chain.c 
b/src/intel/vulkan/xe/anv_batch_chain.c
index d251b743016..970b25f439d 100644
--- a/src/intel/vulkan/xe/anv_batch_chain.c
+++ b/src/intel/vulkan/xe/anv_batch_chain.c
@@ -189,15 +189,13 @@ xe_execute_trtt_batch(struct anv_sparse_submission 
*submit,
 {
    struct anv_queue *queue = submit->queue;
    struct anv_device *device = queue->device;
-   VkResult result = VK_SUCCESS;
-
-   uint32_t syncobj_handle;
-   if (drmSyncobjCreate(device->fd, 0, &syncobj_handle))
-      return vk_errorf(device, VK_ERROR_UNKNOWN, "Unable to create sync obj");
+   struct anv_trtt *trtt = &device->trtt;
+   VkResult result;
 
    struct drm_xe_sync extra_sync = {
-      .flags = DRM_XE_SYNC_SYNCOBJ | DRM_XE_SYNC_SIGNAL,
-      .handle = syncobj_handle,
+      .flags = DRM_XE_SYNC_TIMELINE_SYNCOBJ | DRM_XE_SYNC_SIGNAL,
+      .handle = trtt->timeline_handle,
+      .timeline_value = trtt_bbo->timeline_val,
    };
 
    struct drm_xe_sync *xe_syncs = NULL;
@@ -209,7 +207,7 @@ xe_execute_trtt_batch(struct anv_sparse_submission *submit,
                                   false, /* is_companion_rcs_queue */
                                   &xe_syncs, &xe_syncs_count);
    if (result != VK_SUCCESS)
-      goto exec_error;
+      return result;
 
    struct drm_xe_exec exec = {
       .exec_queue_id = queue->exec_queue_id,
@@ -220,34 +218,18 @@ xe_execute_trtt_batch(struct anv_sparse_submission 
*submit,
    };
 
    if (!device->info->no_hw) {
-      if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec)) {
-         result = vk_device_set_lost(&device->vk, "XE_EXEC failed: %m");
-         goto exec_error;
-      }
+      if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec))
+         return vk_device_set_lost(&device->vk, "XE_EXEC failed: %m");
    }
 
    if (queue->sync) {
       result = vk_sync_wait(&device->vk, queue->sync, 0,
                             VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
-      if (result != VK_SUCCESS) {
-         result = vk_queue_set_lost(&queue->vk, "trtt sync wait failed");
-         goto exec_error;
-      }
+      if (result != VK_SUCCESS)
+         return vk_queue_set_lost(&queue->vk, "trtt sync wait failed");
    }
 
-   /* FIXME: we shouldn't need this wait, figure out a way to remove it. */
-   struct drm_syncobj_wait wait = {
-      .handles = (uintptr_t)&syncobj_handle,
-      .timeout_nsec = INT64_MAX,
-      .count_handles = 1,
-   };
-   if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_WAIT, &wait))
-      result = vk_device_set_lost(&device->vk, "DRM_IOCTL_SYNCOBJ_WAIT failed: 
%m");
-
-exec_error:
-   drmSyncobjDestroy(device->fd, syncobj_handle);
-
-   return result;
+   return VK_SUCCESS;
 }
 
 VkResult

Reply via email to