amdgpu: fix userq VM validation v4

Khatri, Sunil Thu, 11 Sep 2025 23:51:01 -0700

Functionally this now looks good to me, but it would be good to add more comments for the changes, as Alex pointed out.

Reviewed-by: Sunil Khatri <sunil.kha...@amd.com>


On 9/5/2025 4:56 PM, Christian König wrote:

That was actually complete nonsense and not validating the BOs
at all. The code just cleared all VM areas where it couldn't grab the
lock for a BO.

Try to fix this. Only compile tested at the moment.

v2: fix fence slot reservation as well, as pointed out by Sunil.
     also validate PDs, PTs, per VM BOs and update PDEs
v3: grab the status_lock while working with the done list.
v4: rename functions, add some comments, fix waiting for updates to
     complete.

Signed-off-by: Christian König <christian.koe...@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 146 +++++++++++-----------
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  35 ++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h    |   2 +
  3 files changed, 108 insertions(+), 75 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index e228c1e6800d..cb3432b12221 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -601,108 +601,104 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr 
*uq_mgr)
        return ret;
  }

+static int amdgpu_userq_validate_vm(void *param, struct amdgpu_bo *bo)

+{
+       struct ttm_operation_ctx ctx = { false, false };
+
+       amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
+       return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+}
+
+/* Handle all BOs on the invalidated list, validate them and update the PTs */
  static int
-amdgpu_userq_validate_vm_bo(void *_unused, struct amdgpu_bo *bo)
+amdgpu_userq_bo_validate(struct amdgpu_device *adev, struct drm_exec *exec,
+                        struct amdgpu_vm *vm)
  {
        struct ttm_operation_ctx ctx = { false, false };
+       struct amdgpu_bo_va *bo_va;
+       struct amdgpu_bo *bo;
        int ret;

- amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);

+       spin_lock(&vm->status_lock);
+       while (!list_empty(&vm->invalidated)) {
+               bo_va = list_first_entry(&vm->invalidated,
+                                        struct amdgpu_bo_va,
+                                        base.vm_status);
+               spin_unlock(&vm->status_lock);

- ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);

-       if (ret)
-               DRM_ERROR("Fail to validate\n");
+               bo = bo_va->base.bo;
+               ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 2);
+               if (unlikely(ret))
+                       return ret;

- return ret;

+               amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
+               ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+               if (ret)
+                       return ret;
+
+               /* This moves the bo_va to the done list */
+               ret = amdgpu_vm_bo_update(adev, bo_va, false);
+               if (ret)
+                       return ret;
+
+               spin_lock(&vm->status_lock);
+       }
+       spin_unlock(&vm->status_lock);
+
+       return 0;
  }

+/* Make sure the whole VM is ready to be used */

  static int
-amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr)
+amdgpu_userq_vm_validate(struct amdgpu_userq_mgr *uq_mgr)
  {
        struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
-       struct amdgpu_vm *vm = &fpriv->vm;
        struct amdgpu_device *adev = uq_mgr->adev;
+       struct amdgpu_vm *vm = &fpriv->vm;
        struct amdgpu_bo_va *bo_va;
-       struct ww_acquire_ctx *ticket;
        struct drm_exec exec;
-       struct amdgpu_bo *bo;
-       struct dma_resv *resv;
-       bool clear, unlock;
-       int ret = 0;
+       int ret;

drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);

        drm_exec_until_all_locked(&exec) {
-               ret = amdgpu_vm_lock_pd(vm, &exec, 2);
+               ret = amdgpu_vm_lock_pd(vm, &exec, 1);
                drm_exec_retry_on_contention(&exec);
-               if (unlikely(ret)) {
-                       drm_file_err(uq_mgr->file, "Failed to lock PD\n");
+               if (unlikely(ret))
                        goto unlock_all;
-               }
-
-               /* Lock the done list */
-               list_for_each_entry(bo_va, &vm->done, base.vm_status) {
-                       bo = bo_va->base.bo;
-                       if (!bo)
-                               continue;

- ret = drm_exec_lock_obj(&exec, &bo->tbo.base);

-                       drm_exec_retry_on_contention(&exec);
-                       if (unlikely(ret))
-                               goto unlock_all;
-               }
-       }
-
-       spin_lock(&vm->status_lock);
-       while (!list_empty(&vm->moved)) {
-               bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va,
-                                        base.vm_status);
-               spin_unlock(&vm->status_lock);
-
-               /* Per VM BOs never need to bo cleared in the page tables */
-               ret = amdgpu_vm_bo_update(adev, bo_va, false);
-               if (ret)
+               ret = amdgpu_vm_lock_done(vm, &exec, 1);
+               drm_exec_retry_on_contention(&exec);
+               if (unlikely(ret))
                        goto unlock_all;
-               spin_lock(&vm->status_lock);
-       }
-
-       ticket = &exec.ticket;
-       while (!list_empty(&vm->invalidated)) {
-               bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va,
-                                        base.vm_status);
-               resv = bo_va->base.bo->tbo.base.resv;
-               spin_unlock(&vm->status_lock);

- bo = bo_va->base.bo;

-               ret = amdgpu_userq_validate_vm_bo(NULL, bo);
-               if (ret) {
-                       drm_file_err(uq_mgr->file, "Failed to validate BO\n");
+               ret = amdgpu_vm_validate(adev, vm, NULL,
+                                        amdgpu_userq_validate_vm,
+                                        NULL);
+               if (unlikely(ret))
                        goto unlock_all;
-               }

- /* Try to reserve the BO to avoid clearing its ptes */

-               if (!adev->debug_vm && dma_resv_trylock(resv)) {
-                       clear = false;
-                       unlock = true;
-               /* The caller is already holding the reservation lock */
-               } else if (dma_resv_locking_ctx(resv) == ticket) {
-                       clear = false;
-                       unlock = false;
-               /* Somebody else is using the BO right now */
-               } else {
-                       clear = true;
-                       unlock = false;
-               }
+               ret = amdgpu_userq_bo_validate(adev, &exec, vm);
+               drm_exec_retry_on_contention(&exec);
+               if (unlikely(ret))
+                       goto unlock_all;
+       }

- ret = amdgpu_vm_bo_update(adev, bo_va, clear);

+       ret = amdgpu_vm_handle_moved(adev, vm, NULL);
+       if (ret)
+               goto unlock_all;

- if (unlock)

-                       dma_resv_unlock(resv);
-               if (ret)
-                       goto unlock_all;
+       ret = amdgpu_vm_update_pdes(adev, vm, false);
+       if (ret)
+               goto unlock_all;

- spin_lock(&vm->status_lock);

-       }
-       spin_unlock(&vm->status_lock);
+       /*
+        * We need to wait for all VM updates to finish before restarting the
+        * queues. Using the done list like that is now ok since everything is
+        * locked in place.
+        */
+       list_for_each_entry(bo_va, &vm->done, base.vm_status)
+               dma_fence_wait(bo_va->last_pt_update, false);
+       dma_fence_wait(vm->last_update, false);

ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec);

        if (ret)
@@ -723,7 +719,7 @@ static void amdgpu_userq_restore_worker(struct work_struct 
*work)

mutex_lock(&uq_mgr->userq_mutex);- ret = amdgpu_userq_validate_bos(uq_mgr);

+       ret = amdgpu_userq_vm_validate(uq_mgr);
        if (ret) {
                drm_file_err(uq_mgr->file, "Failed to validate BOs to 
restore\n");
                goto unlock;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index bd12d8ff15a4..ec29fec1739e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -484,6 +484,41 @@ int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct 
drm_exec *exec,
                                    2 + num_fences);
  }

+/**

+ * amdgpu_vm_lock_done - lock all BOs on the done list
+ * @exec: drm execution context
+ * @num_fences: number of extra fences to reserve
+ *
+ * Lock the BOs on the done list in the DRM execution context.
+ */
+int amdgpu_vm_lock_done(struct amdgpu_vm *vm, struct drm_exec *exec,
+                       unsigned int num_fences)
+{
+       struct list_head *prev = &vm->done;
+       struct amdgpu_bo_va *bo_va;
+       struct amdgpu_bo *bo;
+       int ret;
+
+       /* We can only trust prev->next while holding the lock */
+       spin_lock(&vm->status_lock);
+       while (!list_is_head(prev->next, &vm->done)) {
+               bo_va = list_entry(prev->next, typeof(*bo_va), base.vm_status);
+               spin_unlock(&vm->status_lock);
+
+               bo = bo_va->base.bo;
+               if (bo) {
+                       ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 1);
+                       if (unlikely(ret))
+                               return ret;
+               }
+               spin_lock(&vm->status_lock);
+               prev = prev->next;
+       }
+       spin_unlock(&vm->status_lock);
+
+       return 0;
+}
+
  /**
   * amdgpu_vm_move_to_lru_tail - move all BOs to the end of LRU
   *
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index e045c1590d78..f86b1a6afb0c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -491,6 +491,8 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, 
struct amdgpu_vm *vm);
  void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm);
  int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct drm_exec *exec,
                      unsigned int num_fences);
+int amdgpu_vm_lock_done(struct amdgpu_vm *vm, struct drm_exec *exec,
+                       unsigned int num_fences);
  bool amdgpu_vm_ready(struct amdgpu_vm *vm);
  uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm 
*vm);
  int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,

Re: [PATCH 1/3] drm/amdgpu: fix userq VM validation v4

Reply via email to