On 13/11/2025 10:39, Boris Brezillon wrote:
> We have seen a few cases where the whole memory subsystem is blocked
> and flush operations never complete. When that happens, we want to:
> 
> - schedule a reset, so we can recover from this situation
> - in the reset path, we need to reset the pending_reqs so we can send
>   new commands after the reset
> - if more panthor_gpu_flush_caches() operations are queued after
>   the timeout, we skip them and return -EIO directly to avoid needless
>   waits (the memory block won't miraculously work again)

You've removed the WARN from this last case. Is this intentional? I
agree the recovery is better, but I don't think we expect this to happen
— so if it does, it points to something else being broken.

> 
> v2:
> - New patch
> 
> Fixes: 5cd894e258c4 ("drm/panthor: Add the GPU logical block")
> Signed-off-by: Boris Brezillon <[email protected]>
> ---
>  drivers/gpu/drm/panthor/panthor_gpu.c | 19 ++++++++++++-------
>  1 file changed, 12 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panthor/panthor_gpu.c 
> b/drivers/gpu/drm/panthor/panthor_gpu.c
> index eda670229184..abd2fde04da9 100644
> --- a/drivers/gpu/drm/panthor/panthor_gpu.c
> +++ b/drivers/gpu/drm/panthor/panthor_gpu.c
> @@ -283,38 +283,42 @@ int panthor_gpu_l2_power_on(struct panthor_device 
> *ptdev)
>  int panthor_gpu_flush_caches(struct panthor_device *ptdev,
>                            u32 l2, u32 lsc, u32 other)
>  {
> -     bool timedout = false;
>       unsigned long flags;
> +     int ret = 0;
>  
>       /* Serialize cache flush operations. */
>       guard(mutex)(&ptdev->gpu->cache_flush_lock);
>  
>       spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
> -     if (!drm_WARN_ON(&ptdev->base,
> -                      ptdev->gpu->pending_reqs & 
> GPU_IRQ_CLEAN_CACHES_COMPLETED)) {
> +     if (!(ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED)) {
>               ptdev->gpu->pending_reqs |= GPU_IRQ_CLEAN_CACHES_COMPLETED;
>               gpu_write(ptdev, GPU_CMD, GPU_FLUSH_CACHES(l2, lsc, other));
> +     } else {
> +             ret = -EIO;
>       }
>       spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
>  
> +     if (ret)
> +             return ret;
> +
>       if (!wait_event_timeout(ptdev->gpu->reqs_acked,
>                               !(ptdev->gpu->pending_reqs & 
> GPU_IRQ_CLEAN_CACHES_COMPLETED),
>                               msecs_to_jiffies(100))) {
>               spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
>               if ((ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED) 
> != 0 &&
>                   !(gpu_read(ptdev, GPU_INT_RAWSTAT) & 
> GPU_IRQ_CLEAN_CACHES_COMPLETED))
> -                     timedout = true;
> +                     ret = -ETIMEDOUT;
>               else
>                       ptdev->gpu->pending_reqs &= 
> ~GPU_IRQ_CLEAN_CACHES_COMPLETED;
>               spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
>       }
>  
> -     if (timedout) {
> +     if (ret) {
> +             panthor_device_schedule_reset(ptdev);
>               drm_err(&ptdev->base, "Flush caches timeout");
> -             return -ETIMEDOUT;
>       }
>  
> -     return 0;
> +     return ret;
>  }
>  
>  /**
> @@ -354,6 +358,7 @@ int panthor_gpu_soft_reset(struct panthor_device *ptdev)
>               return -ETIMEDOUT;
>       }
>  
> +     ptdev->gpu->pending_reqs = 0;
>       return 0;
>  }
>  

Reply via email to