From: Loïc Molinari <[email protected]>

Will be used by the UMD to optimize CPU accesses to buffers that are
frequently read by the CPU, or for which the access pattern makes
non-cacheable mappings inefficient.

Mapping buffers CPU-cached implies taking care of the CPU cache
maintenance in the UMD, unless the GPU is IO coherent.
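For illustration, a minimal UMD-side sketch of the intended usage
(create_wb_bo() is a hypothetical helper, not part of this series;
assumes installed kernel UAPI headers, error handling elided):

  #include <string.h>
  #include <sys/ioctl.h>
  #include <drm/panthor_drm.h>

  /* Hypothetical helper: create a private BO whose CPU mapping is
   * write-back cacheable. WB_MMAP requires exclusive_vm_id != 0 and
   * can't be combined with NO_MMAP. On a non-IO-coherent GPU the UMD
   * then owns the CPU cache maintenance for this BO.
   */
  static int create_wb_bo(int fd, __u32 exclusive_vm_id, __u64 size,
                          __u32 *handle)
  {
          struct drm_panthor_bo_create req;

          memset(&req, 0, sizeof(req));
          req.size = size;
          req.flags = DRM_PANTHOR_BO_WB_MMAP;
          req.exclusive_vm_id = exclusive_vm_id;

          if (ioctl(fd, DRM_IOCTL_PANTHOR_BO_CREATE, &req))
                  return -1;

          *handle = req.handle;
          return 0;
  }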
v2:
- Add more to the commit message
- Tweak the doc
- Make sure we sync the section of the BO pointing to the CS syncobj
  before we read its seqno

v3:
- Fix formatting/spelling issues

v4:
- Add Steve's R-b

v5:
- Drop Steve's R-b (changes in the ioctl semantics requiring new review)

Signed-off-by: Loïc Molinari <[email protected]>
Signed-off-by: Boris Brezillon <[email protected]>
---
 drivers/gpu/drm/panthor/panthor_drv.c   |  7 ++++-
 drivers/gpu/drm/panthor/panthor_gem.c   | 37 +++++++++++++++++++++++--
 drivers/gpu/drm/panthor/panthor_sched.c | 18 ++++++++++--
 include/uapi/drm/panthor_drm.h          | 12 ++++++++
 4 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
index c07fc5dcd4a1..4e915f5ef3fa 100644
--- a/drivers/gpu/drm/panthor/panthor_drv.c
+++ b/drivers/gpu/drm/panthor/panthor_drv.c
@@ -900,7 +900,8 @@ static int panthor_ioctl_vm_destroy(struct drm_device *ddev, void *data,
 	return panthor_vm_pool_destroy_vm(pfile->vms, args->id);
 }
 
-#define PANTHOR_BO_FLAGS		DRM_PANTHOR_BO_NO_MMAP
+#define PANTHOR_BO_FLAGS		(DRM_PANTHOR_BO_NO_MMAP | \
+					 DRM_PANTHOR_BO_WB_MMAP)
 
 static int panthor_ioctl_bo_create(struct drm_device *ddev, void *data,
 				   struct drm_file *file)
@@ -919,6 +920,10 @@ static int panthor_ioctl_bo_create(struct drm_device *ddev, void *data,
 		goto out_dev_exit;
 	}
 
+	if ((args->flags & DRM_PANTHOR_BO_NO_MMAP) &&
+	    (args->flags & DRM_PANTHOR_BO_WB_MMAP))
+		return -EINVAL;
+
 	if (args->exclusive_vm_id) {
 		vm = panthor_vm_pool_get_vm(pfile->vms, args->exclusive_vm_id);
 		if (!vm) {
diff --git a/drivers/gpu/drm/panthor/panthor_gem.c b/drivers/gpu/drm/panthor/panthor_gem.c
index 1b1e98c61b8c..479a779ee59d 100644
--- a/drivers/gpu/drm/panthor/panthor_gem.c
+++ b/drivers/gpu/drm/panthor/panthor_gem.c
@@ -58,6 +58,39 @@ static void panthor_gem_debugfs_set_usage_flags(struct panthor_gem_object *bo, u
 static void panthor_gem_debugfs_bo_init(struct panthor_gem_object *bo) {}
 #endif
 
+static bool
+should_map_wc(struct panthor_gem_object *bo, struct panthor_vm *exclusive_vm)
+{
+	struct panthor_device *ptdev = container_of(bo->base.base.dev, struct panthor_device, base);
+
+	/* We can't do uncached mappings if the device is coherent,
+	 * because the zeroing done by the shmem layer at page allocation
+	 * time happens on a cached mapping which isn't CPU-flushed (at least
+	 * not on Arm64 where the flush is deferred to PTE setup time, and
+	 * only done conditionally based on the mapping permissions). We can't
+	 * rely on dma_map_sgtable()/dma_sync_sgtable_for_xxx() either to flush
+	 * those, because they are NOPed if dma_dev_coherent() returns true.
+	 *
+	 * FIXME: Note that this problem is going to pop up again when we
+	 * decide to support mapping buffers with the NO_MMAP flag as
+	 * non-shareable (AKA buffers accessed only by the GPU), because we
+	 * need the same CPU flush to happen after page allocation, otherwise
+	 * there's a risk of data leak or late corruption caused by a dirty
+	 * cacheline being evicted. At this point we'll need a way to force
+	 * CPU cache maintenance regardless of whether the device is coherent
+	 * or not.
+	 */
+	if (ptdev->coherent)
+		return false;
+
+	/* Cached mappings are explicitly requested, so no write-combine. */
+	if (bo->flags & DRM_PANTHOR_BO_WB_MMAP)
+		return false;
+
+	/* The default is write-combine. */
+	return true;
+}
+
 static void panthor_gem_free_object(struct drm_gem_object *obj)
 {
 	struct panthor_gem_object *bo = to_panthor_bo(obj);
@@ -152,6 +185,7 @@ panthor_kernel_bo_create(struct panthor_device *ptdev, struct panthor_vm *vm,
 	bo = to_panthor_bo(&obj->base);
 	kbo->obj = &obj->base;
 	bo->flags = bo_flags;
+	bo->base.map_wc = should_map_wc(bo, vm);
 
 	if (vm == panthor_fw_vm(ptdev))
 		debug_flags |= PANTHOR_DEBUGFS_GEM_USAGE_FLAG_FW_MAPPED;
@@ -255,7 +289,6 @@ static const struct drm_gem_object_funcs panthor_gem_funcs = {
  */
 struct drm_gem_object *panthor_gem_create_object(struct drm_device *ddev, size_t size)
 {
-	struct panthor_device *ptdev = container_of(ddev, struct panthor_device, base);
 	struct panthor_gem_object *obj;
 
 	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
@@ -263,7 +296,6 @@ struct drm_gem_object *panthor_gem_create_object(struct drm_device *ddev, size_t
 		return ERR_PTR(-ENOMEM);
 
 	obj->base.base.funcs = &panthor_gem_funcs;
-	obj->base.map_wc = !ptdev->coherent;
 	mutex_init(&obj->label.lock);
 
 	panthor_gem_debugfs_bo_init(obj);
@@ -298,6 +330,7 @@ panthor_gem_create_with_handle(struct drm_file *file,
 
 	bo = to_panthor_bo(&shmem->base);
 	bo->flags = flags;
+	bo->base.map_wc = should_map_wc(bo, exclusive_vm);
 
 	if (exclusive_vm) {
 		bo->exclusive_vm_root_gem = panthor_vm_root_gem(exclusive_vm);
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index f5e01cb16cfc..7d5da5717de2 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -868,8 +868,11 @@ panthor_queue_get_syncwait_obj(struct panthor_group *group, struct panthor_queue
 	struct iosys_map map;
 	int ret;
 
-	if (queue->syncwait.kmap)
-		return queue->syncwait.kmap + queue->syncwait.offset;
+	if (queue->syncwait.kmap) {
+		bo = container_of(queue->syncwait.obj,
+				  struct panthor_gem_object, base.base);
+		goto out_sync;
+	}
 
 	bo = panthor_vm_get_bo_for_va(group->vm,
 				      queue->syncwait.gpu_va,
@@ -886,6 +889,17 @@ panthor_queue_get_syncwait_obj(struct panthor_group *group, struct panthor_queue
 	if (drm_WARN_ON(&ptdev->base, !queue->syncwait.kmap))
 		goto err_put_syncwait_obj;
 
+out_sync:
+	/* Make sure the CPU caches are invalidated before the seqno is read.
+	 * drm_gem_shmem_sync() is a NOP if map_wc=false, so no need to check
+	 * it here.
+	 */
+	drm_gem_shmem_sync(&bo->base, queue->syncwait.offset,
+			   queue->syncwait.sync64 ?
+			   sizeof(struct panthor_syncobj_64b) :
+			   sizeof(struct panthor_syncobj_32b),
+			   DRM_GEM_SHMEM_SYNC_CPU_CACHE_FLUSH_AND_INVALIDATE);
+
 	return queue->syncwait.kmap + queue->syncwait.offset;
 
 err_put_syncwait_obj:
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index 7eec9f922183..57e2f5ffa03c 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -681,6 +681,18 @@ struct drm_panthor_vm_get_state {
 enum drm_panthor_bo_flags {
 	/** @DRM_PANTHOR_BO_NO_MMAP: The buffer object will never be CPU-mapped in userspace. */
 	DRM_PANTHOR_BO_NO_MMAP = (1 << 0),
+
+	/**
+	 * @DRM_PANTHOR_BO_WB_MMAP: Force "Write-Back Cacheable" CPU mapping.
+	 *
+	 * CPU map the buffer object in userspace by forcing the "Write-Back
+	 * Cacheable" cacheability attribute. The mapping otherwise uses the
+	 * "Non-Cacheable" attribute if the GPU is not IO coherent.
+	 *
+	 * Can't be set if exclusive_vm_id=0 (only private BOs can be mapped
+	 * cacheable).
+	 */
+	DRM_PANTHOR_BO_WB_MMAP = (1 << 1),
 };
 
 /**
-- 
2.51.0
