When we back up ring contents to re-emit after a queue reset,
we don't back up ring contents from the bad context.  When
we signal the fences, we should set an error on the fences
from the bad context as well.

Fixes: 77cc0da39c7c ("drm/amdgpu: track ring state associated with a fence")
Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 34 ++++++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c  |  4 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |  3 +-
 3 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index fd8cca241da62..1b689a4226291 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -752,17 +752,31 @@ void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring)
  * which data needs to be saved out of the queue's ring buffer.
  */
 
-/**
- * amdgpu_fence_driver_guilty_force_completion - force signal of specified sequence
- *
- * @fence: fence of the ring to signal
- *
- */
-void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *fence)
+void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_ring *ring,
+                                                struct amdgpu_fence *guilty_fence)
 {
-       dma_fence_set_error(&fence->base, -ETIME);
-       amdgpu_fence_write(fence->ring, fence->seq);
-       amdgpu_fence_process(fence->ring);
+       struct dma_fence *unprocessed;
+       struct dma_fence __rcu **ptr;
+       struct amdgpu_fence *fence;
+       u64 i, seqno;
+
+       seqno = amdgpu_fence_read(ring);
+
+       for (i = seqno + 1; i <= ring->fence_drv.sync_seq; ++i) {
+               ptr = &ring->fence_drv.fences[i & ring->fence_drv.num_fences_mask];
+               rcu_read_lock();
+               unprocessed = rcu_dereference(*ptr);
+
+               if (unprocessed && !dma_fence_is_signaled(unprocessed)) {
+                       fence = container_of(unprocessed, struct amdgpu_fence, base);
+
+                       if (fence->context == guilty_fence->context)
+                               dma_fence_set_error(&fence->base, -ETIME);
+               }
+               rcu_read_unlock();
+       }
+       amdgpu_fence_write(ring, guilty_fence->seq);
+       amdgpu_fence_process(ring);
 }
 
 void amdgpu_fence_save_wptr(struct dma_fence *fence)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 6379bb25bf5ce..725d6437fe8e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -812,9 +812,9 @@ int amdgpu_ring_reset_helper_end(struct amdgpu_ring *ring,
        if (r)
                return r;
 
-       /* signal the fence of the bad job */
+       /* signal the fences of the bad job */
        if (guilty_fence)
-               amdgpu_fence_driver_guilty_force_completion(guilty_fence);
+               amdgpu_fence_driver_guilty_force_completion(ring, guilty_fence);
        /* Re-emit the non-guilty commands */
        if (ring->ring_backup_entries_to_copy) {
                amdgpu_ring_alloc_reemit(ring, ring->ring_backup_entries_to_copy);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 7670f5d82b9e4..7a362ce8435fc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -155,7 +155,8 @@ extern const struct drm_sched_backend_ops amdgpu_sched_ops;
 void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring);
 void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error);
 void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
-void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *fence);
+void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_ring *ring,
+                                                struct amdgpu_fence *fence);
 void amdgpu_fence_save_wptr(struct dma_fence *fence);
 
 int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring);
-- 
2.51.0

Reply via email to