On 03/12/2025 09:01, Boris Brezillon wrote: > From: Loïc Molinari <[email protected]> > > Will be used by the UMD to optimize CPU accesses to buffers > that are frequently read by the CPU, or on which the access > pattern makes non-cacheable mappings inefficient. > > Mapping buffers CPU-cached implies taking care of the CPU > cache maintenance in the UMD, unless the GPU is IO coherent. > > v2: > - Add more to the commit message > - Tweak the doc > - Make sure we sync the section of the BO pointing to the CS > syncobj before we read its seqno > > v3: > - Fix formatting/spelling issues > > v4: > - Add Steve's R-b > > v5: > - Drop Steve's R-b (changes in the ioctl semantics requiring > new review) > > v6: > - Fix the uAPI doc > - Fix inverted logic in some comment > > v7: > - No changes > > Signed-off-by: Loïc Molinari <[email protected]> > Signed-off-by: Boris Brezillon <[email protected]>
Re-adding my r-b that I gave on v6:
do uncached mappings if the device is coherent, > + * because the zeroing done by the shmem layer at page allocation > + * time happens on a cached mapping which isn't CPU-flushed (at least > + * not on Arm64 where the flush is deferred to PTE setup time, and > + * only done conditionally based on the mapping permissions). We can't > + * rely on dma_map_sgtable()/dma_sync_sgtable_for_xxx() either to flush > + * those, because they are NOPed if dma_dev_coherent() returns true. > + * > + * FIXME: Note that this problem is going to pop up again when we > + * decide to support mapping buffers with the NO_MMAP flag as > + * non-shareable (AKA buffers accessed only by the GPU), because we > + * need the same CPU flush to happen after page allocation, otherwise > + * there's a risk of data leak or late corruption caused by a dirty > + * cacheline being evicted. At this point we'll need a way to force > + * CPU cache maintenance regardless of whether the device is coherent > + * or not. > + */ > + if (ptdev->coherent) > + return false; > + > + /* Cached mappings are explicitly requested, so no write-combine. */ > + if (bo->flags & DRM_PANTHOR_BO_WB_MMAP) > + return false; > + > + /* The default is write-combine. 
*/ > + return true; > +} > + > static void panthor_gem_free_object(struct drm_gem_object *obj) > { > struct panthor_gem_object *bo = to_panthor_bo(obj); > @@ -145,6 +178,7 @@ panthor_kernel_bo_create(struct panthor_device *ptdev, > struct panthor_vm *vm, > bo = to_panthor_bo(&obj->base); > kbo->obj = &obj->base; > bo->flags = bo_flags; > + bo->base.map_wc = should_map_wc(bo, vm); > bo->exclusive_vm_root_gem = panthor_vm_root_gem(vm); > drm_gem_object_get(bo->exclusive_vm_root_gem); > bo->base.base.resv = bo->exclusive_vm_root_gem->resv; > @@ -345,7 +379,6 @@ static const struct drm_gem_object_funcs > panthor_gem_funcs = { > */ > struct drm_gem_object *panthor_gem_create_object(struct drm_device *ddev, > size_t size) > { > - struct panthor_device *ptdev = container_of(ddev, struct > panthor_device, base); > struct panthor_gem_object *obj; > > obj = kzalloc(sizeof(*obj), GFP_KERNEL); > @@ -353,7 +386,6 @@ struct drm_gem_object *panthor_gem_create_object(struct > drm_device *ddev, size_t > return ERR_PTR(-ENOMEM); > > obj->base.base.funcs = &panthor_gem_funcs; > - obj->base.map_wc = !ptdev->coherent; > mutex_init(&obj->label.lock); > > panthor_gem_debugfs_bo_init(obj); > @@ -388,6 +420,7 @@ panthor_gem_create_with_handle(struct drm_file *file, > > bo = to_panthor_bo(&shmem->base); > bo->flags = flags; > + bo->base.map_wc = should_map_wc(bo, exclusive_vm); > > if (exclusive_vm) { > bo->exclusive_vm_root_gem = panthor_vm_root_gem(exclusive_vm); > diff --git a/drivers/gpu/drm/panthor/panthor_sched.c > b/drivers/gpu/drm/panthor/panthor_sched.c > index 389d508b3848..ae69a5704756 100644 > --- a/drivers/gpu/drm/panthor/panthor_sched.c > +++ b/drivers/gpu/drm/panthor/panthor_sched.c > @@ -863,8 +863,11 @@ panthor_queue_get_syncwait_obj(struct panthor_group > *group, struct panthor_queue > struct iosys_map map; > int ret; > > - if (queue->syncwait.kmap) > - return queue->syncwait.kmap + queue->syncwait.offset; > + if (queue->syncwait.kmap) { > + bo = 
container_of(queue->syncwait.obj, > + struct panthor_gem_object, base.base); > + goto out_sync; > + } > > bo = panthor_vm_get_bo_for_va(group->vm, > queue->syncwait.gpu_va, > @@ -881,6 +884,17 @@ panthor_queue_get_syncwait_obj(struct panthor_group > *group, struct panthor_queue > if (drm_WARN_ON(&ptdev->base, !queue->syncwait.kmap)) > goto err_put_syncwait_obj; > > +out_sync: > + /* Make sure the CPU caches are invalidated before the seqno is read. > + * drm_gem_shmem_sync() is a NOP if map_wc=true, so no need to check > + * it here. > + */ > + panthor_gem_sync(&bo->base.base, queue->syncwait.offset, > + queue->syncwait.sync64 ? > + sizeof(struct panthor_syncobj_64b) : > + sizeof(struct panthor_syncobj_32b), > + DRM_PANTHOR_BO_SYNC_CPU_CACHE_FLUSH_AND_INVALIDATE); > + > return queue->syncwait.kmap + queue->syncwait.offset; > > err_put_syncwait_obj: > diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h > index 39d5ce815742..e238c6264fa1 100644 > --- a/include/uapi/drm/panthor_drm.h > +++ b/include/uapi/drm/panthor_drm.h > @@ -681,6 +681,15 @@ struct drm_panthor_vm_get_state { > enum drm_panthor_bo_flags { > /** @DRM_PANTHOR_BO_NO_MMAP: The buffer object will never be CPU-mapped > in userspace. */ > DRM_PANTHOR_BO_NO_MMAP = (1 << 0), > + > + /** > + * @DRM_PANTHOR_BO_WB_MMAP: Force "Write-Back Cacheable" CPU mapping. > + * > + * CPU map the buffer object in userspace by forcing the "Write-Back > + * Cacheable" cacheability attribute. The mapping otherwise uses the > + * "Non-Cacheable" attribute if the GPU is not IO coherent. > + */ > + DRM_PANTHOR_BO_WB_MMAP = (1 << 1), > }; > > /**
