Am 31.01.24 um 18:14 schrieb Shashank Sharma:
This patch:
- Attaches the TLB flush fence to the PT objects being freed
- Adds a new ptr in VM to save this last TLB flush fence
- Adds a new lock in VM to prevent out-of-context update of saved
   TLB flush fence
- Adds a new ptr in tlb_flush structure to save vm

The idea is to delay freeing of page table objects until we have the
respective TLB entries flushed.

V2: rebase

Cc: Christian König <[email protected]>
Cc: Alex Deucher <[email protected]>
Cc: Felix Kuehling <[email protected]>
Cc: Rajneesh Bhardwaj <[email protected]>
Signed-off-by: Shashank Sharma <[email protected]>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c        |  3 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h        |  4 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c     | 27 +++++++++++++++++++
  .../gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c  | 13 +++++++--
  4 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 67c690044b97..b0e81c249e3a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2245,6 +2245,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
        vm->generation = 0;
mutex_init(&vm->eviction_lock);
+       mutex_init(&vm->tlb_flush_lock);
        vm->evicting = false;
        vm->tlb_fence_context = dma_fence_context_alloc(1);
@@ -2360,7 +2361,9 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm)
        }
dma_fence_put(vm->last_update);
+       dma_fence_put(vm->tlb_fence_last);
        vm->last_update = dma_fence_get_stub();
+       vm->tlb_fence_last = dma_fence_get_stub();
        vm->is_compute_context = true;
/* Free the shadow bo for compute VM */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 8e6fd25d07b7..b05bc586237f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -334,6 +334,10 @@ struct amdgpu_vm {
        uint64_t                *tlb_seq_cpu_addr;
        uint64_t                tlb_fence_context;
+ /* Ptr and lock to maintain tlb flush sync */
+       struct mutex            tlb_flush_lock;
+       struct dma_fence        *tlb_fence_last;
+
        atomic64_t              kfd_last_flushed_seq;
/* How many times we had to re-generate the page tables */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index 95dc0afdaffb..f1c4418c4d63 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -631,6 +631,18 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev,
        return r;
  }
+static inline
+void amdgpu_vm_attach_tlb_fence(struct amdgpu_bo *bo, struct dma_fence *fence)
+{
+       if (!bo || !fence)
+               return;
+
+       if (!dma_resv_reserve_fences(bo->tbo.base.resv, 1)) {
+               dma_resv_add_fence(bo->tbo.base.resv, fence,
+                                  DMA_RESV_USAGE_BOOKKEEP);
+       }
+}
+
  /**
   * amdgpu_vm_pt_free - free one PD/PT
   *
@@ -638,6 +650,7 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev,
   */
  static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
  {
+       struct amdgpu_vm *vm;
        struct amdgpu_bo *shadow;
if (!entry->bo)
@@ -646,9 +659,23 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base 
*entry)
        entry->bo->vm_bo = NULL;
        shadow = amdgpu_bo_shadowed(entry->bo);
        if (shadow) {
+               vm = shadow->vm_bo->vm;
+
+               mutex_lock(&vm->tlb_flush_lock);
+               if (vm->tlb_fence_last)
+                       amdgpu_vm_attach_tlb_fence(shadow, vm->tlb_fence_last);
+               mutex_unlock(&vm->tlb_flush_lock);
+
                ttm_bo_set_bulk_move(&shadow->tbo, NULL);
                amdgpu_bo_unref(&shadow);
        }
+
+       vm = entry->vm;
+       mutex_lock(&vm->tlb_flush_lock);
+       if (vm->tlb_fence_last)
+               amdgpu_vm_attach_tlb_fence(entry->bo, vm->tlb_fence_last);
+       mutex_unlock(&vm->tlb_flush_lock);
+

That approach doesn't make sense. Instead add the freed PT/PDs to a linked list in the parameters structure and only really free them after adding the fence to the root PD.


        ttm_bo_set_bulk_move(&entry->bo->tbo, NULL);
spin_lock(&entry->vm->status_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
index 569681badd7c..54ec81d30034 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c
@@ -31,6 +31,7 @@
  struct amdgpu_tlb_fence {
        struct dma_fence        base;
        struct amdgpu_device    *adev;
+       struct amdgpu_vm        *vm;

Big NAK to that. The VM might not live long enough to see the end of the TLB flush.

Regards,
Christian.

        struct dma_fence        *dependency;
        struct work_struct      work;
        spinlock_t              lock;
@@ -51,6 +52,7 @@ static const char *amdgpu_tlb_fence_get_timeline_name(struct 
dma_fence *f)
  static void amdgpu_tlb_fence_work(struct work_struct *work)
  {
        struct amdgpu_tlb_fence *f = container_of(work, typeof(*f), work);
+       struct amdgpu_vm *vm = f->vm;
        int r;
r = amdgpu_gmc_flush_gpu_tlb_pasid(f->adev, f->pasid, 2, true, 0);
@@ -62,6 +64,10 @@ static void amdgpu_tlb_fence_work(struct work_struct *work)
dma_fence_signal(&f->base);
        dma_fence_put(&f->base);
+
+       mutex_lock(&vm->tlb_flush_lock);
+       vm->tlb_fence_last = NULL;
+       mutex_unlock(&vm->tlb_flush_lock);
  }
static const struct dma_fence_ops amdgpu_tlb_fence_ops = {
@@ -92,6 +98,7 @@ void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, 
struct amdgpu_vm *vm
        f->adev = adev;
        f->dependency = *fence;
        f->pasid = vm->pasid;
+       f->vm = vm;
        INIT_WORK(&f->work, amdgpu_tlb_fence_work);
        spin_lock_init(&f->lock);
@@ -99,8 +106,10 @@ void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, struct amdgpu_vm *vm
                       vm->tlb_fence_context, atomic64_read(&vm->tlb_seq));
/* TODO: We probably need a separate wq here */
-       dma_fence_get(&f->base);
-       schedule_work(&f->work);
+       mutex_lock(&vm->tlb_flush_lock);
+       vm->tlb_fence_last = dma_fence_get(&f->base);
+       mutex_unlock(&vm->tlb_flush_lock);
+ schedule_work(&f->work);
        *fence = &f->base;
  }

Reply via email to