DMABuf imports in compute VMs are not wrapped in a kgd_mem object on the
process_info->kfd_bo_list. There is no explicit KFD API call to validate
them or add eviction fences to them.

This patch automatically validates and fences dynamic DMABuf imports when
they are added to a compute VM. Revalidation after evictions is handled
in the VM code.

Signed-off-by: Felix Kuehling <[email protected]>
---
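
Notes for reviewers: a rough sketch of the call flow this patch introduces,
simplified for illustration (the hunks below are authoritative):

    /* At import time, when the GEM object is opened in a compute VM
     * (amdgpu_gem_object_open): validate the import and attach the
     * process eviction fence.
     */
    amdgpu_amdkfd_bo_validate_and_fence(abo, AMDGPU_GEM_DOMAIN_GTT,
                                        &vm->process_info->eviction_fence->base);

    /* After an eviction, in amdgpu_amdkfd_gpuvm_restore_process_bos:
     * revalidate imports while updating the page tables, then attach the
     * new eviction fence to the now-valid imports.
     */
    amdgpu_vm_handle_moved(adev, peer_vm, &ctx.ticket, true);
    amdgpu_vm_fence_imports(peer_vm, &ctx.ticket,
                            &process_info->eviction_fence->base);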
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |   3 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  15 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c   |   6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c       |  26 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        | 117 +++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h        |   6 +-
 7 files changed, 164 insertions(+), 11 deletions(-)
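
amdgpu_vm_fence_imports() reserves each import either by trylock or by
recognizing that the caller's ww_acquire_ctx already holds the reservation;
a sketch of that pattern (see the amdgpu_vm.c hunk for the real code):

    if (dma_resv_trylock(resv)) {
            unlock = true;          /* we took the lock ourselves */
    } else if (ticket && dma_resv_locking_ctx(resv) == ticket) {
            unlock = false;         /* caller already holds it */
    } else {
            ret = -EBUSY;           /* unexpected contention */
    }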

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index fcf8a98ad15e..68d534a89942 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -178,6 +178,9 @@ int amdgpu_queue_mask_bit_to_set_resource_bit(struct amdgpu_device *adev,
 struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context,
                                struct mm_struct *mm,
                                struct svm_range_bo *svm_bo);
+int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
+                                       uint32_t domain,
+                                       struct dma_fence *fence);
 #if defined(CONFIG_DEBUG_FS)
 int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void *data);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 2e302956a279..0c1cb6048259 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -423,9 +423,9 @@ static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
        return ret;
 }
 
-static int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
-                                              uint32_t domain,
-                                              struct dma_fence *fence)
+int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
+                                       uint32_t domain,
+                                       struct dma_fence *fence)
 {
        int ret = amdgpu_bo_reserve(bo, false);
 
@@ -2948,7 +2948,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
                struct amdgpu_device *adev = amdgpu_ttm_adev(
                        peer_vm->root.bo->tbo.bdev);
 
-               ret = amdgpu_vm_handle_moved(adev, peer_vm, &ctx.ticket);
+               ret = amdgpu_vm_handle_moved(adev, peer_vm, &ctx.ticket, true);
                if (ret) {
                        pr_debug("Memory eviction: handle moved failed. Try again\n");
                        goto validate_map_fail;
@@ -3001,7 +3001,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
                                   &process_info->eviction_fence->base,
                                   DMA_RESV_USAGE_BOOKKEEP);
        }
-       /* Attach eviction fence to PD / PT BOs */
+       /* Attach eviction fence to PD / PT BOs and DMABuf imports */
        list_for_each_entry(peer_vm, &process_info->vm_list_head,
                            vm_list_node) {
                struct amdgpu_bo *bo = peer_vm->root.bo;
@@ -3009,6 +3009,11 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
                dma_resv_add_fence(bo->tbo.base.resv,
                                   &process_info->eviction_fence->base,
                                   DMA_RESV_USAGE_BOOKKEEP);
+
+               ret = amdgpu_vm_fence_imports(peer_vm, &ctx.ticket,
+                                             &process_info->eviction_fence->base);
+               if (ret)
+                       break;
        }
 
 validate_map_fail:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index aafedb344c1b..20f4be8cd635 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1165,7 +1165,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
                        return r;
        }
 
-       r = amdgpu_vm_handle_moved(adev, vm, &p->ticket);
+       r = amdgpu_vm_handle_moved(adev, vm, &p->ticket, false);
        if (r)
                return r;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
index e7e87a3b2601..234244704f27 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -373,6 +373,10 @@ amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach)
        struct amdgpu_vm_bo_base *bo_base;
        int r;
 
+       /* FIXME: This should be after the "if", but needs a fix to make sure
+        * DMABuf imports are initialized in the right VM list.
+        */
+       amdgpu_vm_bo_invalidate(adev, bo, false);
        if (!bo->tbo.resource || bo->tbo.resource->mem_type == TTM_PL_SYSTEM)
                return;
 
@@ -409,7 +413,7 @@ amdgpu_dma_buf_move_notify(struct dma_buf_attachment *attach)
                if (!r)
                        r = amdgpu_vm_clear_freed(adev, vm, NULL);
                if (!r)
-                       r = amdgpu_vm_handle_moved(adev, vm, ticket);
+                       r = amdgpu_vm_handle_moved(adev, vm, ticket, false);
 
                if (r && r != -EBUSY)
                        DRM_ERROR("Failed to invalidate VM page tables (%d))\n",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 849fffbb367d..755cc3c559f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -186,6 +186,32 @@ static int amdgpu_gem_object_open(struct drm_gem_object *obj,
        else
                ++bo_va->ref_count;
        amdgpu_bo_unreserve(abo);
+
+       /* Validate and add eviction fence to DMABuf imports with dynamic
+        * attachment in compute VMs. Re-validation will be done by
+        * amdgpu_vm_handle_moved and the fence will be updated by
+        * amdgpu_vm_fence_imports in amdgpu_amdkfd_gpuvm_restore_process_bos.
+        *
+        * Nested locking below for the case that a GEM object is opened in
+        * kfd_mem_export_dmabuf. Since the lock below is only taken for imports,
+        * but not for export, this is a different lock class that cannot lead to
+        * circular lock dependencies.
+        */
+       if (!vm->is_compute_context || !vm->process_info)
+               return 0;
+       if (!obj->import_attach ||
+           !dma_buf_is_dynamic(obj->import_attach->dmabuf))
+               return 0;
+       mutex_lock_nested(&vm->process_info->lock, 1);
+       if (!WARN_ON(!vm->process_info->eviction_fence)) {
+               r = amdgpu_amdkfd_bo_validate_and_fence(abo, AMDGPU_GEM_DOMAIN_GTT,
+                                                       &vm->process_info->eviction_fence->base);
+               if (r)
+                       dev_warn(adev->dev, "%d: validate_and_fence failed: %d\n",
+                                vm->task_info.pid, r);
+       }
+       mutex_unlock(&vm->process_info->lock);
+
        return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 0d685577243c..b2c7449ab561 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1307,6 +1307,7 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
  * @adev: amdgpu_device pointer
  * @vm: requested vm
  * @ticket: optional reservation ticket used to reserve the VM
+ * @validate: whether to auto-validate invalid DMABuf imports
  *
  * Make sure all BOs which are moved are updated in the PTs.
  *
@@ -1317,7 +1318,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
  */
 int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
                           struct amdgpu_vm *vm,
-                          struct ww_acquire_ctx *ticket)
+                          struct ww_acquire_ctx *ticket,
+                          bool validate)
 {
        struct amdgpu_bo_va *bo_va;
        struct dma_resv *resv;
@@ -1337,6 +1339,12 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
                spin_lock(&vm->status_lock);
        }
 
+       /* If we're validating user BOs, splice all evicted user BOs into
+        * the list of invalid BOs for revalidation
+        */
+       if (validate)
+               list_splice_init(&vm->evicted_user, &vm->invalidated);
+
        while (!list_empty(&vm->invalidated)) {
                bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va,
                                         base.vm_status);
@@ -1357,17 +1365,120 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
                        unlock = false;
                }
 
+               /* Automatically validate DMABuf imports in compute VMs, if we
+                * have a reservation, or remember them for later validation.
+                */
+               if (vm->is_compute_context && bo_va->base.bo &&
+                   bo_va->base.bo->tbo.base.import_attach &&
+                   (!bo_va->base.bo->tbo.resource ||
+                    bo_va->base.bo->tbo.resource->mem_type == TTM_PL_SYSTEM)) {
+                       struct ttm_operation_ctx ctx = { true, false };
+                       struct amdgpu_bo *bo = bo_va->base.bo;
+
+                       if (!validate) {
+                               r = amdgpu_vm_bo_update(adev, bo_va, clear);
+                               if (!r)
+                                       amdgpu_vm_bo_evicted_user(&bo_va->base);
+                               goto unlock;
+                       }
+
+                       if (clear) {
+                               pr_warn_ratelimited("Invalid DMABuf import is busy in pid %d\n", vm->task_info.pid);
+                               r = -EBUSY;
+                               goto unlock;
+                       }
+
+                       amdgpu_bo_placement_from_domain(bo,
+                                                       bo->preferred_domains);
+                       r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+                       if (r)
+                               goto unlock;
+                       r = amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD,
+                                               true);
+                       if (r)
+                               goto unlock;
+               }
+
                r = amdgpu_vm_bo_update(adev, bo_va, clear);
+unlock:
+               if (unlock)
+                       dma_resv_unlock(resv);
                if (r)
                        return r;
+               spin_lock(&vm->status_lock);
+       }
+       spin_unlock(&vm->status_lock);
+
+       return 0;
+}
+
+/**
+ * amdgpu_vm_fence_imports - add fence to valid DMABuf imports
+ *
+ * @vm: requested vm
+ * @ticket: optional reservation ticket used to reserve the VM
+ * @fence: fence to add
+ *
+ * Add the specified fence to all dynamic DMABuf imports that are valid.
+ *
+ * Returns:
+ * 0 for success.
+ */
+int amdgpu_vm_fence_imports(struct amdgpu_vm *vm,
+                           struct ww_acquire_ctx *ticket,
+                           struct dma_fence *fence)
+{
+       struct amdgpu_bo_va *bo_va, *tmp;
+       struct dma_resv *resv;
+       LIST_HEAD(imports);
+       bool unlock;
+       int ret = 0;
+
+       if (!vm->is_compute_context)
+               return 0;
+
+       /* Move all the DMABuf imports to a private list so we can reserve
+        * them while not holding the status_lock.
+        */
+       spin_lock(&vm->status_lock);
+       list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status) {
+               if (bo_va->base.bo && bo_va->base.bo->tbo.base.import_attach &&
+                   dma_buf_is_dynamic(bo_va->base.bo->tbo.base.import_attach->dmabuf))
+                       list_move(&bo_va->base.vm_status, &imports);
+       }
+       spin_unlock(&vm->status_lock);
+
+       list_for_each_entry(bo_va, &imports, base.vm_status) {
+               resv = bo_va->base.bo->tbo.base.resv;
+
+               /* Try to reserve the BO */
+               if (dma_resv_trylock(resv)) {
+                       unlock = true;
+               /* The caller is already holding the reservation lock */
+               } else if (ticket && dma_resv_locking_ctx(resv) == ticket) {
+                       unlock = false;
+               } else {
+                       WARN_ONCE(1, "Failed to reserve DMABuf import");
+                       ret = -EBUSY;
+                       break;
+               }
+
+               ret = dma_resv_reserve_fences(resv, 1);
+               if (!ret)
+                       dma_resv_add_fence(resv, fence,
+                                          DMA_RESV_USAGE_BOOKKEEP);
 
                if (unlock)
                        dma_resv_unlock(resv);
-               spin_lock(&vm->status_lock);
+               if (ret)
+                       break;
        }
+
+       spin_lock(&vm->status_lock);
+       list_splice(&imports, &vm->idle);
        spin_unlock(&vm->status_lock);
 
-       return 0;
+       return ret;
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 939d0c2219c0..2db04b8fef97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -443,7 +443,11 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
                          struct dma_fence **fence);
 int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
                           struct amdgpu_vm *vm,
-                          struct ww_acquire_ctx *ticket);
+                          struct ww_acquire_ctx *ticket,
+                          bool validate);
+int amdgpu_vm_fence_imports(struct amdgpu_vm *vm,
+                           struct ww_acquire_ctx *ticket,
+                           struct dma_fence *fence);
 int amdgpu_vm_flush_compute_tlb(struct amdgpu_device *adev,
                                struct amdgpu_vm *vm,
                                uint32_t flush_type,
-- 
2.34.1
