Running the Cyberpunk 2077 benchmark we can observe that waiting on DRM
syncobjs is relatively hot, but 96% of the calls are for a single
object (~4% are for two points, and never more than three points), while
a more trivial workload like vkmark under Plasma is even more skewed
to single point waits.

Therefore let's add a fast path to bypass the kcalloc/kfree and use a
pre-allocated stack array for those cases.

Signed-off-by: Tvrtko Ursulin <[email protected]>
Reviewed-by: Maíra Canal <[email protected]> # v3
---
v2:
 * Document rationale for stack array in a comment.

v3:
 * Added DRM_SYNCOBJ_FAST_PATH_ENTRIES to avoid hardcoding fast path array
   size.

v4:
 * Rebased to be standalone.
---
 drivers/gpu/drm/drm_syncobj.c | 44 ++++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index 1333ef0ea03b..99aada85865d 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -237,6 +237,14 @@ static void
 syncobj_eventfd_entry_func(struct drm_syncobj *syncobj,
                           struct syncobj_eventfd_entry *entry);
 
+/*
+ * Empirically the vast majority of ioctls pass in a single syncobj (96%) and
+ * never more than three points. Therefore implement a fast path with a small
+ * stack array to avoid going into the allocator, sometimes several times per
+ * userspace rendered frame.
+ */
+#define DRM_SYNCOBJ_FAST_PATH_ENTRIES 4
+
 /**
  * drm_syncobj_find - lookup and reference a sync object.
  * @file_private: drm file private pointer
@@ -1063,10 +1071,12 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
                                                  uint32_t *idx,
                                                  ktime_t *deadline)
 {
+       struct syncobj_wait_entry stack_entries[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
+       u64 stack_points[DRM_SYNCOBJ_FAST_PATH_ENTRIES];
        struct syncobj_wait_entry *entries;
        struct dma_fence *fence;
-       uint64_t *points;
        uint32_t signaled_count, i;
+       uint64_t *points;
 
        if (flags & (DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT |
                     DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE)) {
@@ -1074,24 +1084,33 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
                lockdep_assert_none_held_once();
        }
 
-       points = kmalloc_array(count, sizeof(*points), GFP_KERNEL);
-       if (points == NULL)
-               return -ENOMEM;
+       if (count > ARRAY_SIZE(stack_points)) {
+               points = kmalloc_array(count, sizeof(*points), GFP_KERNEL);
+               if (!points)
+                       return -ENOMEM;
+       } else {
+               points = stack_points;
+       }
 
        if (!user_points) {
                memset(points, 0, count * sizeof(uint64_t));
-
        } else if (copy_from_user(points, user_points,
                                  sizeof(uint64_t) * count)) {
                timeout = -EFAULT;
                goto err_free_points;
        }
 
-       entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
-       if (!entries) {
-               timeout = -ENOMEM;
-               goto err_free_points;
+       if (count > ARRAY_SIZE(stack_entries)) {
+               entries = kcalloc(count, sizeof(*entries), GFP_KERNEL);
+               if (!entries) {
+                       timeout = -ENOMEM;
+                       goto err_free_points;
+               }
+       } else {
+               memset(stack_entries, 0, sizeof(stack_entries));
+               entries = stack_entries;
        }
+
        /* Walk the list of sync objects and initialize entries.  We do
         * this up-front so that we can properly return -EINVAL if there is
         * a syncobj with a missing fence and then never have the chance of
@@ -1208,10 +1227,13 @@ static signed long drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
                                                  &entries[i].fence_cb);
                dma_fence_put(entries[i].fence);
        }
-       kfree(entries);
+
+       if (entries != stack_entries)
+               kfree(entries);
 
 err_free_points:
-       kfree(points);
+       if (points != stack_points)
+               kfree(points);
 
        return timeout;
 }
-- 
2.48.0

Reply via email to