Am 12.07.22 um 07:39 schrieb Luben Tuikov:
Protect the struct amdgpu_bo_list with a mutex. This is used during command
submission in order to avoid buffer object corruption as recorded in
the link below.

Suggested-by: Christian König <[email protected]>
Cc: Alex Deucher <[email protected]>
Cc: Andrey Grodzovsky <[email protected]>
Cc: Vitaly Prosyak <[email protected]>
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2048
Signed-off-by: Luben Tuikov <[email protected]>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c |  3 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h |  4 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c      | 31 +++++++++++++++++++--
  3 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
index 714178f1b6c6ed..2168163aad2d38 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
@@ -40,7 +40,7 @@ static void amdgpu_bo_list_free_rcu(struct rcu_head *rcu)
  {
        struct amdgpu_bo_list *list = container_of(rcu, struct amdgpu_bo_list,
                                                   rhead);
-
+       mutex_destroy(&list->bo_list_mutex);
        kvfree(list);
  }
@@ -136,6 +136,7 @@ int amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp, trace_amdgpu_cs_bo_status(list->num_entries, total_size); + mutex_init(&list->bo_list_mutex);
        *result = list;
        return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
index 044b41f0bfd9ce..717984d4de6858 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
@@ -48,6 +48,10 @@ struct amdgpu_bo_list {
        struct amdgpu_bo *oa_obj;
        unsigned first_userptr;
        unsigned num_entries;
+
+       /* Protect access during command submission.
+        */
+       struct mutex bo_list_mutex;
  };
int amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, int id,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 36ac1f1d11e6b4..0b2932c20ec777 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -517,6 +517,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
                        return r;
        }
+ mutex_lock(&p->bo_list->bo_list_mutex);

That lock/unlock placement is not correct and probably the reason why you still run into trouble with this patch.

You need to grab the lock before the call to amdgpu_bo_list_get_list() and drop it either after a call to ttm_eu_backoff_reservation() or ttm_eu_fence_buffer_objects().

If the lock is dropped anywhere in between we would have list corruption again.

Regards,
Christian.

+
        /* One for TTM and one for the CS job */
        amdgpu_bo_list_for_each_entry(e, p->bo_list)
                e->tv.num_shared = 2;
@@ -544,6 +546,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
                if (!e->user_pages) {
                        DRM_ERROR("kvmalloc_array failure\n");
                        r = -ENOMEM;
+                       mutex_unlock(&p->bo_list->bo_list_mutex);
                        goto out_free_user_pages;
                }
@@ -551,6 +554,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
                if (r) {
                        kvfree(e->user_pages);
                        e->user_pages = NULL;
+                       mutex_unlock(&p->bo_list->bo_list_mutex);
                        goto out_free_user_pages;
                }
@@ -568,6 +572,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
        if (unlikely(r != 0)) {
                if (r != -ERESTARTSYS)
                        DRM_ERROR("ttm_eu_reserve_buffers failed.\n");
+               mutex_unlock(&p->bo_list->bo_list_mutex);
                goto out_free_user_pages;
        }
@@ -580,11 +585,14 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
                        e->chain = dma_fence_chain_alloc();
                        if (!e->chain) {
                                r = -ENOMEM;
+                               mutex_unlock(&p->bo_list->bo_list_mutex);
                                goto error_validate;
                        }
                }
        }
+ mutex_unlock(&p->bo_list->bo_list_mutex);






+
        /* Move fence waiting after getting reservation lock of
         * PD root. Then there is no need on a ctx mutex lock.
         */
@@ -607,6 +615,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
                goto error_validate;
        }
+ mutex_lock(&p->bo_list->bo_list_mutex);
        r = amdgpu_cs_list_validate(p, &duplicates);
        if (r)
                goto error_validate;
@@ -614,6 +623,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
        r = amdgpu_cs_list_validate(p, &p->validated);
        if (r)
                goto error_validate;
+       mutex_unlock(&p->bo_list->bo_list_mutex);
amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
                                     p->bytes_moved_vis);
@@ -644,15 +654,18 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser 
*p,
error_validate:
        if (r) {
+               mutex_lock(&p->bo_list->bo_list_mutex);
                amdgpu_bo_list_for_each_entry(e, p->bo_list) {
                        dma_fence_chain_free(e->chain);
                        e->chain = NULL;
                }
                ttm_eu_backoff_reservation(&p->ticket, &p->validated);
+               mutex_unlock(&p->bo_list->bo_list_mutex);
        }
out_free_user_pages:
        if (r) {
+               mutex_lock(&p->bo_list->bo_list_mutex);
                amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
                        struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
@@ -662,6 +675,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
                        kvfree(e->user_pages);
                        e->user_pages = NULL;
                }
+               mutex_unlock(&p->bo_list->bo_list_mutex);
        }
        return r;
  }
@@ -704,6 +718,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser 
*parser, int error,
        if (error && backoff) {
                struct amdgpu_bo_list_entry *e;
+ mutex_lock(&parser->bo_list->bo_list_mutex);
                amdgpu_bo_list_for_each_entry(e, parser->bo_list) {
                        dma_fence_chain_free(e->chain);
                        e->chain = NULL;
@@ -711,6 +726,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser 
*parser, int error,
ttm_eu_backoff_reservation(&parser->ticket,
                                           &parser->validated);
+               mutex_unlock(&parser->bo_list->bo_list_mutex);
        }
for (i = 0; i < parser->num_post_deps; i++) {
@@ -839,6 +855,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
                        return r;
        }
+ mutex_lock(&p->bo_list->bo_list_mutex);
        amdgpu_bo_list_for_each_entry(e, p->bo_list) {
                /* ignore duplicates */
                bo = ttm_to_amdgpu_bo(e->tv.bo);
@@ -850,13 +867,18 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser 
*p)
                        continue;
r = amdgpu_vm_bo_update(adev, bo_va, false);
-               if (r)
+               if (r) {
+                       mutex_unlock(&p->bo_list->bo_list_mutex);
                        return r;
+               }
r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
-               if (r)
+               if (r) {
+                       mutex_unlock(&p->bo_list->bo_list_mutex);
                        return r;
+               }
        }
+       mutex_unlock(&p->bo_list->bo_list_mutex);
r = amdgpu_vm_handle_moved(adev, vm);
        if (r)
@@ -874,6 +896,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
if (amdgpu_vm_debug) {
                /* Invalidate all BOs to test for userspace bugs */
+               mutex_lock(&p->bo_list->bo_list_mutex);
                amdgpu_bo_list_for_each_entry(e, p->bo_list) {
                        struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
@@ -883,6 +906,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) amdgpu_vm_bo_invalidate(adev, bo, false);
                }
+               mutex_unlock(&p->bo_list->bo_list_mutex);
        }
return amdgpu_cs_sync_rings(p);
@@ -1249,6 +1273,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
         * added to BOs.
         */
        mutex_lock(&p->adev->notifier_lock);
+       mutex_lock(&p->bo_list->bo_list_mutex);
/* If userptr are invalidated after amdgpu_cs_parser_bos(), return
         * -EAGAIN, drmIoctl in libdrm will restart the amdgpu_cs_ioctl.
@@ -1308,12 +1333,14 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
        }
ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence);
+       mutex_unlock(&p->bo_list->bo_list_mutex);
        mutex_unlock(&p->adev->notifier_lock);
return 0; error_abort:
        drm_sched_job_cleanup(&job->base);
+       mutex_unlock(&p->bo_list->bo_list_mutex);
        mutex_unlock(&p->adev->notifier_lock);
error_unlock:

Reply via email to