On 6/17/25 05:07, Alex Deucher wrote: > Move guilty logic into the ring reset callbacks. This > allows each ring reset callback to better handle fence > errors and force completions in line with the reset > behavior for each IP. It also allows us to remove > the ring guilty callback since that logic now lives > in the reset callback. > > Signed-off-by: Alex Deucher <[email protected]>
Acked-by: Christian König <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 23 ++---------------- > drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 - > drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 30 +----------------------- > 3 files changed, 3 insertions(+), 51 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > index 177f04491a11b..3b7d3844a74bc 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > @@ -91,7 +91,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct > drm_sched_job *s_job) > struct amdgpu_job *job = to_amdgpu_job(s_job); > struct amdgpu_task_info *ti; > struct amdgpu_device *adev = ring->adev; > - bool set_error = false; > int idx, r; > > if (!drm_dev_enter(adev_to_drm(adev), &idx)) { > @@ -134,8 +133,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct > drm_sched_job *s_job) > if (unlikely(adev->debug_disable_gpu_ring_reset)) { > dev_err(adev->dev, "Ring reset disabled by debug mask\n"); > } else if (amdgpu_gpu_recovery && ring->funcs->reset) { > - bool is_guilty; > - > dev_err(adev->dev, "Starting %s ring reset\n", > s_job->sched->name); > > @@ -145,24 +142,9 @@ static enum drm_gpu_sched_stat > amdgpu_job_timedout(struct drm_sched_job *s_job) > */ > drm_sched_wqueue_stop(&ring->sched); > > - /* for engine resets, we need to reset the engine, > - * but individual queues may be unaffected. > - * check here to make sure the accounting is correct. > - */ > - if (ring->funcs->is_guilty) > - is_guilty = ring->funcs->is_guilty(ring); > - else > - is_guilty = true; > - > - if (is_guilty) { > - dma_fence_set_error(&s_job->s_fence->finished, -ETIME); > - set_error = true; > - } > - > r = amdgpu_ring_reset(ring, job->vmid, NULL); > if (!r) { > - if (is_guilty) > - atomic_inc(&ring->adev->gpu_reset_counter); > + atomic_inc(&ring->adev->gpu_reset_counter); > drm_sched_wqueue_start(&ring->sched); > dev_err(adev->dev, "Ring %s reset succeeded\n", > ring->sched.name); > @@ -173,8 +155,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct > drm_sched_job *s_job) > dev_err(adev->dev, "Ring %s reset failed\n", ring->sched.name); > } > > - if (!set_error) > - dma_fence_set_error(&s_job->s_fence->finished, -ETIME); > + dma_fence_set_error(&s_job->s_fence->finished, -ETIME); > > if (amdgpu_device_should_recover_gpu(ring->adev)) { > struct amdgpu_reset_context reset_context; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > index fc36b86c6dcf8..6aaa9d0c1f25c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > @@ -271,7 +271,6 @@ struct amdgpu_ring_funcs { > int (*reset)(struct amdgpu_ring *ring, unsigned int vmid, > struct amdgpu_fence *guilty_fence); > void (*emit_cleaner_shader)(struct amdgpu_ring *ring); > - bool (*is_guilty)(struct amdgpu_ring *ring); > }; > > /** > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c > b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c > index d3cb4dbae790b..61274579b3452 100644 > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c > @@ -1655,30 +1655,10 @@ static bool sdma_v4_4_2_is_queue_selected(struct > amdgpu_device *adev, uint32_t i > return (context_status & SDMA_GFX_CONTEXT_STATUS__SELECTED_MASK) != 0; > } > > -static bool sdma_v4_4_2_ring_is_guilty(struct amdgpu_ring *ring) > -{ > - struct amdgpu_device *adev = ring->adev; > - uint32_t instance_id = ring->me; > - > - return sdma_v4_4_2_is_queue_selected(adev, instance_id, false); > -} > - > -static bool sdma_v4_4_2_page_ring_is_guilty(struct amdgpu_ring *ring) > -{ > - struct amdgpu_device *adev = ring->adev; > - uint32_t instance_id = ring->me; > - > - if (!adev->sdma.has_page_queue) > - return false; > - > - return sdma_v4_4_2_is_queue_selected(adev, instance_id, true); > -} > - > static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, > unsigned int vmid, > struct amdgpu_fence *guilty_fence) > { > - bool is_guilty = ring->funcs->is_guilty(ring); > struct amdgpu_device *adev = ring->adev; > u32 id = ring->me; > int r; > @@ -1689,13 +1669,7 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring > *ring, > amdgpu_amdkfd_suspend(adev, true); > r = amdgpu_sdma_reset_engine(adev, id); > amdgpu_amdkfd_resume(adev, true); > - if (r) > - return r; > - > - if (is_guilty) > - amdgpu_fence_driver_force_completion(ring); > - > - return 0; > + return r; > } > > static int sdma_v4_4_2_stop_queue(struct amdgpu_ring *ring) > @@ -2180,7 +2154,6 @@ static const struct amdgpu_ring_funcs > sdma_v4_4_2_ring_funcs = { > .emit_reg_wait = sdma_v4_4_2_ring_emit_reg_wait, > .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper, > .reset = sdma_v4_4_2_reset_queue, > - .is_guilty = sdma_v4_4_2_ring_is_guilty, > }; > > static const struct amdgpu_ring_funcs sdma_v4_4_2_page_ring_funcs = { > @@ -2213,7 +2186,6 @@ static const struct amdgpu_ring_funcs > sdma_v4_4_2_page_ring_funcs = { > .emit_reg_wait = sdma_v4_4_2_ring_emit_reg_wait, > .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper, > .reset = sdma_v4_4_2_reset_queue, > - .is_guilty = sdma_v4_4_2_page_ring_is_guilty, > }; > > static void sdma_v4_4_2_set_ring_funcs(struct amdgpu_device *adev)
