On 12/11/25 16:53, Tvrtko Ursulin wrote:
> 
> On 11/12/2025 13:16, Christian König wrote:
>> This allows amdgpu_fences to outlive the amdgpu module.
>>
>> v2: use dma_fence_get_rcu_safe to be NULL safe here.
>>
>> Signed-off-by: Christian König <[email protected]>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 63 +++++++----------------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |  1 -
>>   2 files changed, 20 insertions(+), 44 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> index c7843e336310..c636347801c1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> @@ -112,8 +112,7 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct 
>> amdgpu_fence *af,
>>       af->ring = ring;
>>         seq = ++ring->fence_drv.sync_seq;
>> -    dma_fence_init(fence, &amdgpu_fence_ops,
>> -               &ring->fence_drv.lock,
>> +    dma_fence_init(fence, &amdgpu_fence_ops, NULL,
>>                  adev->fence_context + ring->idx, seq);
>>         amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
>> @@ -467,7 +466,6 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring 
>> *ring)
>>       timer_setup(&ring->fence_drv.fallback_timer, amdgpu_fence_fallback, 0);
>>         ring->fence_drv.num_fences_mask = ring->num_hw_submission * 2 - 1;
>> -    spin_lock_init(&ring->fence_drv.lock);
>>       ring->fence_drv.fences = kcalloc(ring->num_hw_submission * 2, 
>> sizeof(void *),
>>                        GFP_KERNEL);
>>   @@ -654,16 +652,20 @@ void amdgpu_fence_driver_set_error(struct 
>> amdgpu_ring *ring, int error)
>>       struct amdgpu_fence_driver *drv = &ring->fence_drv;
>>       unsigned long flags;
>>   -    spin_lock_irqsave(&drv->lock, flags);
>> +    rcu_read_lock();
>>       for (unsigned int i = 0; i <= drv->num_fences_mask; ++i) {
>>           struct dma_fence *fence;
>>   -        fence = rcu_dereference_protected(drv->fences[i],
>> -                          lockdep_is_held(&drv->lock));
>> -        if (fence && !dma_fence_is_signaled_locked(fence))
>> +        fence = dma_fence_get_rcu(drv->fences[i]);
> 
> dma_fence_get_rcu is not safe against passing a NULL fence in, while the 
> existing code makes it look like drv->fence[] slot can contain NULL at this 
> point?
> 
> amdgpu_fence_process() is the place which can NULL the slots? Irq context? 
> Why is that safe with no reference held from clearing the slot to operating 
> on the fence?

The slots are never cleared. It can only be that they are NULL when the driver 
is loaded.

I've switched over to dma_fence_get_rcu_safe() where appropriated.

Regards,
Christian.

> 
>> +        if (!fence)
>> +            continue;
>> +
>> +        dma_fence_lock_irqsave(fence, flags);
>> +        if (!dma_fence_is_signaled_locked(fence))
>>               dma_fence_set_error(fence, error);
>> +        dma_fence_unlock_irqrestore(fence, flags);
>>       }
>> -    spin_unlock_irqrestore(&drv->lock, flags);
>> +    rcu_read_unlock();
>>   }
>>     /**
>> @@ -714,16 +716,19 @@ void 
>> amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
>>       seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask;
>>         /* mark all fences from the guilty context with an error */
>> -    spin_lock_irqsave(&ring->fence_drv.lock, flags);
>> +    rcu_read_lock();
>>       do {
>>           last_seq++;
>>           last_seq &= ring->fence_drv.num_fences_mask;
>>             ptr = &ring->fence_drv.fences[last_seq];
>> -        rcu_read_lock();
>> -        unprocessed = rcu_dereference(*ptr);
>> +        unprocessed = dma_fence_get_rcu_safe(ptr);
> 
> Similar concern like the above.
> 
> Regards,
> 
> Tvrtko
>> +
>> +        if (!unprocessed)
>> +            continue;
>>   -        if (unprocessed && !dma_fence_is_signaled_locked(unprocessed)) {
>> +        dma_fence_lock_irqsave(unprocessed, flags);
>> +        if (dma_fence_is_signaled_locked(unprocessed)) {
>>               fence = container_of(unprocessed, struct amdgpu_fence, base);
>>                 if (fence == af)
>> @@ -731,9 +736,10 @@ void amdgpu_fence_driver_guilty_force_completion(struct 
>> amdgpu_fence *af)
>>               else if (fence->context == af->context)
>>                   dma_fence_set_error(&fence->base, -ECANCELED);
>>           }
>> -        rcu_read_unlock();
>> +        dma_fence_unlock_irqrestore(unprocessed, flags);
>> +        dma_fence_put(unprocessed);
>>       } while (last_seq != seq);
>> -    spin_unlock_irqrestore(&ring->fence_drv.lock, flags);
>> +    rcu_read_unlock();
>>       /* signal the guilty fence */
>>       amdgpu_fence_write(ring, (u32)af->base.seqno);
>>       amdgpu_fence_process(ring);
>> @@ -823,39 +829,10 @@ static bool amdgpu_fence_enable_signaling(struct 
>> dma_fence *f)
>>       return true;
>>   }
>>   -/**
>> - * amdgpu_fence_free - free up the fence memory
>> - *
>> - * @rcu: RCU callback head
>> - *
>> - * Free up the fence memory after the RCU grace period.
>> - */
>> -static void amdgpu_fence_free(struct rcu_head *rcu)
>> -{
>> -    struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);
>> -
>> -    /* free fence_slab if it's separated fence*/
>> -    kfree(to_amdgpu_fence(f));
>> -}
>> -
>> -/**
>> - * amdgpu_fence_release - callback that fence can be freed
>> - *
>> - * @f: fence
>> - *
>> - * This function is called when the reference count becomes zero.
>> - * It just RCU schedules freeing up the fence.
>> - */
>> -static void amdgpu_fence_release(struct dma_fence *f)
>> -{
>> -    call_rcu(&f->rcu, amdgpu_fence_free);
>> -}
>> -
>>   static const struct dma_fence_ops amdgpu_fence_ops = {
>>       .get_driver_name = amdgpu_fence_get_driver_name,
>>       .get_timeline_name = amdgpu_fence_get_timeline_name,
>>       .enable_signaling = amdgpu_fence_enable_signaling,
>> -    .release = amdgpu_fence_release,
>>   };
>>     /*
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index 7a27c6c4bb44..9cbf63454004 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -125,7 +125,6 @@ struct amdgpu_fence_driver {
>>       unsigned            irq_type;
>>       struct timer_list        fallback_timer;
>>       unsigned            num_fences_mask;
>> -    spinlock_t            lock;
>>       struct dma_fence        **fences;
>>   };
>>   
> 

Reply via email to