On 6/11/25 17:29, Tvrtko Ursulin wrote: > > On 11/06/2025 15:21, Christian König wrote: >> On 6/11/25 16:00, Tvrtko Ursulin wrote: >>> Running the Cyberpunk 2077 benchmark we can observe that the lookup helper >>> is relatively hot, but 97% of the calls are for a single object. (~3% >>> for two points, and never more than three points. While a more trivial >>> workload like vkmark under Plasma is even more skewed to single point >>> lookups.) >>> >>> Therefore let's add a fast path to bypass the kmalloc_array/kfree and use a >>> pre-allocated stack array for those cases. >> >> Have you considered using memdup_user()? That's using a separate bucket IIRC >> and might give similar performance. > > I haven't but I can try it. I would be surprised if it made a (positive) > difference though.
Yeah, it's mostly for extra security I think. > > And I realised I need to repeat the benchmarks anyway, since in v4 I had to > stop doing access_ok+__get_user, after kernel test robot let me know 64-bit > get_user is not a thing on all platforms. I thought the gains are from > avoiding allocations but, as you say, now I need to see if copy_from_user > doesn't nullify them.. > >> If that is still not sufficient I'm really wondering if we shouldn't have a >> macro for doing this. It's a really common use case as far as I can see. > > Hmm macro for what exactly? Like a macro which uses an array on the stack for a small (<4) number of values and k(v)malloc() for large ones. IIRC there is also relatively new functionality which allows releasing the memory automatically when we leave the function. Regards, Christian. > > Regards, > > Tvrtko > >>> Signed-off-by: Tvrtko Ursulin <[email protected]> >>> Reviewed-by: Maíra Canal <[email protected]> >>> --- >>> v2: >>> * Added comments describing how the fast path arrays were sized. >>> * Make container freeing criteria clearer by using a boolean. 
>>> --- >>> drivers/gpu/drm/drm_syncobj.c | 56 +++++++++++++++++++++++++++-------- >>> 1 file changed, 44 insertions(+), 12 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c >>> index be5905dca87f..65c301852f0d 100644 >>> --- a/drivers/gpu/drm/drm_syncobj.c >>> +++ b/drivers/gpu/drm/drm_syncobj.c >>> @@ -1259,6 +1259,8 @@ EXPORT_SYMBOL(drm_timeout_abs_to_jiffies); >>> static int drm_syncobj_array_find(struct drm_file *file_private, >>> u32 __user *handles, >>> uint32_t count, >>> + struct drm_syncobj **stack_syncobjs, >>> + u32 stack_count, >>> struct drm_syncobj ***syncobjs_out) >>> { >>> struct drm_syncobj **syncobjs; >>> @@ -1268,9 +1270,13 @@ static int drm_syncobj_array_find(struct drm_file >>> *file_private, >>> if (!access_ok(handles, count * sizeof(*handles))) >>> return -EFAULT; >>> - syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL); >>> - if (!syncobjs) >>> - return -ENOMEM; >>> + if (count > stack_count) { >>> + syncobjs = kmalloc_array(count, sizeof(*syncobjs), GFP_KERNEL); >>> + if (!syncobjs) >>> + return -ENOMEM; >>> + } else { >>> + syncobjs = stack_syncobjs; >>> + } >>> for (i = 0; i < count; i++) { >>> u32 handle; >>> @@ -1292,25 +1298,31 @@ static int drm_syncobj_array_find(struct drm_file >>> *file_private, >>> err_put_syncobjs: >>> while (i-- > 0) >>> drm_syncobj_put(syncobjs[i]); >>> - kfree(syncobjs); >>> + >>> + if (syncobjs != stack_syncobjs) >>> + kfree(syncobjs); >>> return ret; >>> } >>> static void drm_syncobj_array_free(struct drm_syncobj **syncobjs, >>> - uint32_t count) >>> + uint32_t count, >>> + bool free_container) >>> { >>> uint32_t i; >>> for (i = 0; i < count; i++) >>> drm_syncobj_put(syncobjs[i]); >>> - kfree(syncobjs); >>> + >>> + if (free_container) >>> + kfree(syncobjs); >>> } >>> int >>> drm_syncobj_wait_ioctl(struct drm_device *dev, void *data, >>> struct drm_file *file_private) >>> { >>> + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; >>> 
struct drm_syncobj_wait *args = data; >>> ktime_t deadline, *pdeadline = NULL; >>> u32 count = args->count_handles; >>> @@ -1336,6 +1348,8 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void >>> *data, >>> ret = drm_syncobj_array_find(file_private, >>> u64_to_user_ptr(args->handles), >>> count, >>> + stack_syncobjs, >>> + ARRAY_SIZE(stack_syncobjs), >>> &syncobjs); >>> if (ret < 0) >>> return ret; >>> @@ -1354,7 +1368,7 @@ drm_syncobj_wait_ioctl(struct drm_device *dev, void >>> *data, >>> &first, >>> pdeadline); >>> - drm_syncobj_array_free(syncobjs, count); >>> + drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs); >>> if (timeout < 0) >>> return timeout; >>> @@ -1368,6 +1382,7 @@ int >>> drm_syncobj_timeline_wait_ioctl(struct drm_device *dev, void *data, >>> struct drm_file *file_private) >>> { >>> + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; >>> struct drm_syncobj_timeline_wait *args = data; >>> ktime_t deadline, *pdeadline = NULL; >>> u32 count = args->count_handles; >>> @@ -1394,6 +1409,8 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device >>> *dev, void *data, >>> ret = drm_syncobj_array_find(file_private, >>> u64_to_user_ptr(args->handles), >>> count, >>> + stack_syncobjs, >>> + ARRAY_SIZE(stack_syncobjs), >>> &syncobjs); >>> if (ret < 0) >>> return ret; >>> @@ -1412,7 +1429,7 @@ drm_syncobj_timeline_wait_ioctl(struct drm_device >>> *dev, void *data, >>> &first, >>> pdeadline); >>> - drm_syncobj_array_free(syncobjs, count); >>> + drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs); >>> if (timeout < 0) >>> return timeout; >>> @@ -1529,6 +1546,7 @@ int >>> drm_syncobj_reset_ioctl(struct drm_device *dev, void *data, >>> struct drm_file *file_private) >>> { >>> + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; >>> struct drm_syncobj_array *args = data; >>> struct drm_syncobj **syncobjs; >>> uint32_t i; >>> @@ -1546,6 +1564,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void >>> 
*data, >>> ret = drm_syncobj_array_find(file_private, >>> u64_to_user_ptr(args->handles), >>> args->count_handles, >>> + stack_syncobjs, >>> + ARRAY_SIZE(stack_syncobjs), >>> &syncobjs); >>> if (ret < 0) >>> return ret; >>> @@ -1553,7 +1573,8 @@ drm_syncobj_reset_ioctl(struct drm_device *dev, void >>> *data, >>> for (i = 0; i < args->count_handles; i++) >>> drm_syncobj_replace_fence(syncobjs[i], NULL); >>> - drm_syncobj_array_free(syncobjs, args->count_handles); >>> + drm_syncobj_array_free(syncobjs, args->count_handles, >>> + syncobjs != stack_syncobjs); >>> return 0; >>> } >>> @@ -1562,6 +1583,7 @@ int >>> drm_syncobj_signal_ioctl(struct drm_device *dev, void *data, >>> struct drm_file *file_private) >>> { >>> + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; >>> struct drm_syncobj_array *args = data; >>> struct drm_syncobj **syncobjs; >>> uint32_t i; >>> @@ -1579,6 +1601,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void >>> *data, >>> ret = drm_syncobj_array_find(file_private, >>> u64_to_user_ptr(args->handles), >>> args->count_handles, >>> + stack_syncobjs, >>> + ARRAY_SIZE(stack_syncobjs), >>> &syncobjs); >>> if (ret < 0) >>> return ret; >>> @@ -1589,7 +1613,8 @@ drm_syncobj_signal_ioctl(struct drm_device *dev, void >>> *data, >>> break; >>> } >>> - drm_syncobj_array_free(syncobjs, args->count_handles); >>> + drm_syncobj_array_free(syncobjs, args->count_handles, >>> + syncobjs != stack_syncobjs); >>> return ret; >>> } >>> @@ -1598,6 +1623,7 @@ int >>> drm_syncobj_timeline_signal_ioctl(struct drm_device *dev, void *data, >>> struct drm_file *file_private) >>> { >>> + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; >>> struct drm_syncobj_timeline_array *args = data; >>> uint64_t __user *points = u64_to_user_ptr(args->points); >>> uint32_t i, j, count = args->count_handles; >>> @@ -1617,6 +1643,8 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device >>> *dev, void *data, >>> ret = 
drm_syncobj_array_find(file_private, >>> u64_to_user_ptr(args->handles), >>> count, >>> + stack_syncobjs, >>> + ARRAY_SIZE(stack_syncobjs), >>> &syncobjs); >>> if (ret < 0) >>> return ret; >>> @@ -1653,7 +1681,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device >>> *dev, void *data, >>> err_chains: >>> kfree(chains); >>> out: >>> - drm_syncobj_array_free(syncobjs, count); >>> + drm_syncobj_array_free(syncobjs, count, syncobjs != stack_syncobjs); >>> return ret; >>> } >>> @@ -1661,6 +1689,7 @@ drm_syncobj_timeline_signal_ioctl(struct drm_device >>> *dev, void *data, >>> int drm_syncobj_query_ioctl(struct drm_device *dev, void *data, >>> struct drm_file *file_private) >>> { >>> + struct drm_syncobj *stack_syncobjs[DRM_SYNCOBJ_FAST_PATH_ENTRIES]; >>> struct drm_syncobj_timeline_array *args = data; >>> struct drm_syncobj **syncobjs; >>> uint64_t __user *points = u64_to_user_ptr(args->points); >>> @@ -1679,6 +1708,8 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, >>> void *data, >>> ret = drm_syncobj_array_find(file_private, >>> u64_to_user_ptr(args->handles), >>> args->count_handles, >>> + stack_syncobjs, >>> + ARRAY_SIZE(stack_syncobjs), >>> &syncobjs); >>> if (ret < 0) >>> return ret; >>> @@ -1722,7 +1753,8 @@ int drm_syncobj_query_ioctl(struct drm_device *dev, >>> void *data, >>> if (ret) >>> break; >>> } >>> - drm_syncobj_array_free(syncobjs, args->count_handles); >>> + drm_syncobj_array_free(syncobjs, args->count_handles, >>> + syncobjs != stack_syncobjs); >>> return ret; >>> } >> >
