On 10/30/25 14:05, Boris Brezillon wrote:
From: Loïc Molinari <[email protected]>
This will be used by the UMD to optimize CPU accesses to buffers
that are frequently read by the CPU, or whose access pattern makes
non-cacheable mappings inefficient.

Mapping buffers CPU-cached implies taking care of the CPU cache
maintenance in the UMD, unless the GPU is IO coherent.
v2:
- Add more to the commit message
- Tweak the doc
- Make sure we sync the section of the BO pointing to the CS
syncobj before we read its seqno
v3:
- Fix formatting/spelling issues
v4:
- Add Steve's R-b
v5:
- Drop Steve's R-b (changes in the ioctl semantics requiring
new review)
Signed-off-by: Loïc Molinari <[email protected]>
Signed-off-by: Boris Brezillon <[email protected]>
---
drivers/gpu/drm/panthor/panthor_drv.c | 7 ++++-
drivers/gpu/drm/panthor/panthor_gem.c | 37 +++++++++++++++++++++++--
drivers/gpu/drm/panthor/panthor_sched.c | 18 ++++++++++--
include/uapi/drm/panthor_drm.h | 12 ++++++++
4 files changed, 69 insertions(+), 5 deletions(-)
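
For context (not part of the patch itself), a minimal userspace sketch of how
a UMD might request such a cacheable CPU mapping. The ioctl and struct names
below come from the existing panthor uAPI; the helper itself is hypothetical
and skips error handling and cleanup. On a non-IO-coherent GPU, any CPU cache
maintenance around GPU accesses is then the UMD's responsibility, as the
commit message notes.

#include <sys/mman.h>
#include <xf86drm.h>
#include <drm/panthor_drm.h>

/* Hypothetical helper: create a VM-private BO with DRM_PANTHOR_BO_WB_MMAP
 * and map it write-back cacheable on the CPU side. 'fd' is an open panthor
 * render node, 'vm_id' a VM previously created with
 * DRM_IOCTL_PANTHOR_VM_CREATE.
 */
static void *bo_create_and_map_cached(int fd, __u32 vm_id, __u64 size,
				      __u32 *handle)
{
	struct drm_panthor_bo_create create = {
		.size = size,
		.flags = DRM_PANTHOR_BO_WB_MMAP,
		.exclusive_vm_id = vm_id,
	};
	struct drm_panthor_bo_mmap_offset mmap_offset = {};

	if (drmIoctl(fd, DRM_IOCTL_PANTHOR_BO_CREATE, &create))
		return MAP_FAILED;

	mmap_offset.handle = create.handle;
	if (drmIoctl(fd, DRM_IOCTL_PANTHOR_BO_MMAP_OFFSET, &mmap_offset))
		return MAP_FAILED;

	*handle = create.handle;
	return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
		    mmap_offset.offset);
}
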
diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c
index c07fc5dcd4a1..4e915f5ef3fa 100644
--- a/drivers/gpu/drm/panthor/panthor_drv.c
+++ b/drivers/gpu/drm/panthor/panthor_drv.c
@@ -900,7 +900,8 @@ static int panthor_ioctl_vm_destroy(struct drm_device *ddev, void *data,
return panthor_vm_pool_destroy_vm(pfile->vms, args->id);
}
-#define PANTHOR_BO_FLAGS DRM_PANTHOR_BO_NO_MMAP
+#define PANTHOR_BO_FLAGS (DRM_PANTHOR_BO_NO_MMAP | \
+ DRM_PANTHOR_BO_WB_MMAP)
static int panthor_ioctl_bo_create(struct drm_device *ddev, void *data,
struct drm_file *file)
@@ -919,6 +920,10 @@ static int panthor_ioctl_bo_create(struct drm_device *ddev, void *data,
goto out_dev_exit;
}
+ if ((args->flags & DRM_PANTHOR_BO_NO_MMAP) &&
+ (args->flags & DRM_PANTHOR_BO_WB_MMAP))
+ return -EINVAL;
+
if (args->exclusive_vm_id) {
vm = panthor_vm_pool_get_vm(pfile->vms, args->exclusive_vm_id);
if (!vm) {
diff --git a/drivers/gpu/drm/panthor/panthor_gem.c b/drivers/gpu/drm/panthor/panthor_gem.c
index 1b1e98c61b8c..479a779ee59d 100644
--- a/drivers/gpu/drm/panthor/panthor_gem.c
+++ b/drivers/gpu/drm/panthor/panthor_gem.c
@@ -58,6 +58,39 @@ static void panthor_gem_debugfs_set_usage_flags(struct panthor_gem_object *bo, u
static void panthor_gem_debugfs_bo_init(struct panthor_gem_object *bo) {}
#endif
+static bool
+should_map_wc(struct panthor_gem_object *bo, struct panthor_vm *exclusive_vm)
+{
+ struct panthor_device *ptdev = container_of(bo->base.base.dev, struct panthor_device, base);
+
+ /* We can't do uncached mappings if the device is coherent,
+ * because the zeroing done by the shmem layer at page allocation
+ * time happens on a cached mapping which isn't CPU-flushed (at least
+ * not on Arm64 where the flush is deferred to PTE setup time, and
+ * only done conditionally based on the mapping permissions). We can't
+ * rely on dma_map_sgtable()/dma_sync_sgtable_for_xxx() either to flush
+ * those, because they are NOPed if dma_dev_coherent() returns true.
+ *
+ * FIXME: Note that this problem is going to pop up again when we
+ * decide to support mapping buffers with the NO_MMAP flag as
+ * non-shareable (AKA buffers accessed only by the GPU), because we
+ * need the same CPU flush to happen after page allocation, otherwise
+ * there's a risk of data leak or late corruption caused by a dirty
+ * cacheline being evicted. At this point we'll need a way to force
+ * CPU cache maintenance regardless of whether the device is coherent
+ * or not.
+ */
+ if (ptdev->coherent)
+ return false;
+
+ /* Cached mappings are explicitly requested, so no write-combine. */
+ if (bo->flags & DRM_PANTHOR_BO_WB_MMAP)
+ return false;
+
+ /* The default is write-combine. */
+ return true;
+}
+
static void panthor_gem_free_object(struct drm_gem_object *obj)
{
struct panthor_gem_object *bo = to_panthor_bo(obj);
@@ -152,6 +185,7 @@ panthor_kernel_bo_create(struct panthor_device *ptdev, struct panthor_vm *vm,
bo = to_panthor_bo(&obj->base);
kbo->obj = &obj->base;
bo->flags = bo_flags;
+ bo->base.map_wc = should_map_wc(bo, vm);
if (vm == panthor_fw_vm(ptdev))
debug_flags |= PANTHOR_DEBUGFS_GEM_USAGE_FLAG_FW_MAPPED;
@@ -255,7 +289,6 @@ static const struct drm_gem_object_funcs panthor_gem_funcs = {
*/
struct drm_gem_object *panthor_gem_create_object(struct drm_device *ddev, size_t size)
{
- struct panthor_device *ptdev = container_of(ddev, struct panthor_device, base);
struct panthor_gem_object *obj;
obj = kzalloc(sizeof(*obj), GFP_KERNEL);
@@ -263,7 +296,6 @@ struct drm_gem_object *panthor_gem_create_object(struct drm_device *ddev, size_t
return ERR_PTR(-ENOMEM);
obj->base.base.funcs = &panthor_gem_funcs;
- obj->base.map_wc = !ptdev->coherent;
mutex_init(&obj->label.lock);
panthor_gem_debugfs_bo_init(obj);
@@ -298,6 +330,7 @@ panthor_gem_create_with_handle(struct drm_file *file,
bo = to_panthor_bo(&shmem->base);
bo->flags = flags;
+ bo->base.map_wc = should_map_wc(bo, exclusive_vm);
if (exclusive_vm) {
bo->exclusive_vm_root_gem = panthor_vm_root_gem(exclusive_vm);
diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c
index f5e01cb16cfc..7d5da5717de2 100644
--- a/drivers/gpu/drm/panthor/panthor_sched.c
+++ b/drivers/gpu/drm/panthor/panthor_sched.c
@@ -868,8 +868,11 @@ panthor_queue_get_syncwait_obj(struct panthor_group *group, struct panthor_queue
struct iosys_map map;
int ret;
- if (queue->syncwait.kmap)
- return queue->syncwait.kmap + queue->syncwait.offset;
+ if (queue->syncwait.kmap) {
+ bo = container_of(queue->syncwait.obj,
+ struct panthor_gem_object, base.base);
+ goto out_sync;
+ }
bo = panthor_vm_get_bo_for_va(group->vm,
queue->syncwait.gpu_va,
@@ -886,6 +889,17 @@ panthor_queue_get_syncwait_obj(struct panthor_group *group, struct panthor_queue
if (drm_WARN_ON(&ptdev->base, !queue->syncwait.kmap))
goto err_put_syncwait_obj;
+out_sync:
+ /* Make sure the CPU caches are invalidated before the seqno is read.
+ * drm_gem_shmem_sync() is a NOP if map_wc=false, so no need to check
Sorry, nitpick: IIUC, drm_gem_shmem_sync() would be a NOP if 'map_wc' is true.
+ * it here.
+ */
+ drm_gem_shmem_sync(&bo->base, queue->syncwait.offset,
+ queue->syncwait.sync64 ?
+ sizeof(struct panthor_syncobj_64b) :
+ sizeof(struct panthor_syncobj_32b),
+ DRM_GEM_SHMEM_SYNC_CPU_CACHE_FLUSH_AND_INVALIDATE);
+
return queue->syncwait.kmap + queue->syncwait.offset;
err_put_syncwait_obj:
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index 7eec9f922183..57e2f5ffa03c 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -681,6 +681,18 @@ struct drm_panthor_vm_get_state {
enum drm_panthor_bo_flags {
/** @DRM_PANTHOR_BO_NO_MMAP: The buffer object will never be CPU-mapped in userspace. */
DRM_PANTHOR_BO_NO_MMAP = (1 << 0),
+
+ /**
+ * @DRM_PANTHOR_BO_WB_MMAP: Force "Write-Back Cacheable" CPU mapping.
+ *
+ * CPU map the buffer object in userspace by forcing the "Write-Back
+ * Cacheable" cacheability attribute. The mapping otherwise uses the
+ * "Non-Cacheable" attribute if the GPU is not IO coherent.
+ *
+ * Can't be set if exclusive_vm_id=0 (only private BOs can be mapped
+ * cacheable).
Sorry Boris, I may have misinterpreted the code.
As per the comment, the DRM_PANTHOR_BO_WB_MMAP flag should be rejected if
'exclusive_vm' is NULL, but I don't see any check of the 'exclusive_vm'
pointer inside should_map_wc().
Could you please clarify? (A rough sketch of the kind of check I'd expect
follows the quoted hunk below.)

Best regards,
Akash
+ */
+ DRM_PANTHOR_BO_WB_MMAP = (1 << 1),
};
/**
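
Coming back to the question above: if the intent really is to reject the flag
for shared BOs, I would have expected something along these lines next to the
new NO_MMAP/WB_MMAP exclusivity check in panthor_ioctl_bo_create()
(illustrative only, untested):

	/* The doc comment says WB_MMAP can't be used with exclusive_vm_id=0,
	 * so shared BOs would be rejected at creation time rather than
	 * silently falling back to write-combine.
	 */
	if ((args->flags & DRM_PANTHOR_BO_WB_MMAP) &&
	    !args->exclusive_vm_id)
		return -EINVAL;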