On Thu, 2025-11-13 at 15:51 +0100, Christian König wrote:
> This should allow amdkfd_fences to outlive the amdgpu module.
> 
> v2: implement Felix suggestion to lock the fence while signaling it.
> 
> Signed-off-by: Christian König <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  6 +++
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c  | 39 ++++++++-----------
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c      |  7 ++--
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c          |  4 +-
>  4 files changed, 27 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 8bdfcde2029b..6254cef04213 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -196,6 +196,7 @@ int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void 
> *data);
>  #endif
>  #if IS_ENABLED(CONFIG_HSA_AMD)
>  bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm);
> +void amdkfd_fence_signal(struct dma_fence *f);
>  struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f);
>  void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo);
>  int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
> @@ -210,6 +211,11 @@ bool amdkfd_fence_check_mm(struct dma_fence *f, struct 
> mm_struct *mm)
>       return false;
>  }
>  
> +static inline
> +void amdkfd_fence_signal(struct dma_fence *f)
> +{

I would add a short comment here: "Empty function because …"

> +}
> +
>  static inline
>  struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f)
>  {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
> index 09c919f72b6c..f76c3c52a2a1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c
> @@ -127,29 +127,9 @@ static bool amdkfd_fence_enable_signaling(struct 
> dma_fence *f)
>               if (!svm_range_schedule_evict_svm_bo(fence))
>                       return true;
>       }
> -     return false;
> -}
> -
> -/**
> - * amdkfd_fence_release - callback that fence can be freed
> - *
> - * @f: dma_fence
> - *
> - * This function is called when the reference count becomes zero.
> - * Drops the mm_struct reference and RCU schedules freeing up the fence.
> - */
> -static void amdkfd_fence_release(struct dma_fence *f)
> -{
> -     struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
> -
> -     /* Unconditionally signal the fence. The process is getting
> -      * terminated.
> -      */
> -     if (WARN_ON(!fence))
> -             return; /* Not an amdgpu_amdkfd_fence */
> -
>       mmdrop(fence->mm);
> -     kfree_rcu(f, rcu);
> +     fence->mm = NULL;

Is the store to fence->mm guaranteed to actually take place because of
the lock that is held when the fence ops are called?

> +     return false;
>  }
>  
>  /**
> @@ -174,9 +154,22 @@ bool amdkfd_fence_check_mm(struct dma_fence *f, struct 
> mm_struct *mm)
>       return false;
>  }
>  
> +void amdkfd_fence_signal(struct dma_fence *f)
> +{
> +     struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f);
> +     long flags;
> +
> +     dma_fence_lock_irqsafe(f, flags)
> +     if (fence->mm) {
> +             mmdrop(fence->mm);
> +             fence->mm = NULL;
> +     }
> +     dma_fence_signal_locked(f);
> +     dma_fence_unlock_irqrestore(f, flags)
> +}
> +
>  static const struct dma_fence_ops amdkfd_fence_ops = {
>       .get_driver_name = amdkfd_fence_get_driver_name,
>       .get_timeline_name = amdkfd_fence_get_timeline_name,
>       .enable_signaling = amdkfd_fence_enable_signaling,
> -     .release = amdkfd_fence_release,
>  };
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index a085faac9fe1..8fac70b839ed 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1173,7 +1173,7 @@ static void kfd_process_wq_release(struct work_struct 
> *work)
>       synchronize_rcu();
>       ef = rcu_access_pointer(p->ef);
>       if (ef)
> -             dma_fence_signal(ef);
> +             amdkfd_fence_signal(ef);
>  
>       kfd_process_remove_sysfs(p);
>       kfd_debugfs_remove_process(p);
> @@ -1990,7 +1990,6 @@ kfd_process_gpuid_from_node(struct kfd_process *p, 
> struct kfd_node *node,
>  static int signal_eviction_fence(struct kfd_process *p)
>  {
>       struct dma_fence *ef;
> -     int ret;
>  
>       rcu_read_lock();
>       ef = dma_fence_get_rcu_safe(&p->ef);
> @@ -1998,10 +1997,10 @@ static int signal_eviction_fence(struct kfd_process 
> *p)
>       if (!ef)
>               return -EINVAL;
>  
> -     ret = dma_fence_signal(ef);
> +     amdkfd_fence_signal(ef);
>       dma_fence_put(ef);
>  
> -     return ret;
> +     return 0;

Oh wait, that's the code I'm also touching in my return code series!

https://lore.kernel.org/dri-devel/[email protected]/


Does this series then solve the problem Felix pointed out in
evict_process_worker()?


P.


>  }
>  
>  static void evict_process_worker(struct work_struct *work)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index c30dfb8ec236..566950702b7d 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -428,7 +428,7 @@ static void svm_range_bo_release(struct kref *kref)
>  
>       if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base))
>               /* We're not in the eviction worker. Signal the fence. */
> -             dma_fence_signal(&svm_bo->eviction_fence->base);
> +             amdkfd_fence_signal(&svm_bo->eviction_fence->base);
>       dma_fence_put(&svm_bo->eviction_fence->base);
>       amdgpu_bo_unref(&svm_bo->bo);
>       kfree(svm_bo);
> @@ -3628,7 +3628,7 @@ static void svm_range_evict_svm_bo_worker(struct 
> work_struct *work)
>       mmap_read_unlock(mm);
>       mmput(mm);
>  
> -     dma_fence_signal(&svm_bo->eviction_fence->base);
> +     amdkfd_fence_signal(&svm_bo->eviction_fence->base);
>  
>       /* This is the last reference to svm_bo, after svm_range_vram_node_free
>        * has been called in svm_migrate_vram_to_ram

Reply via email to