Stopping the scheduler for queue reset is generally a good idea because it prevents any worker from touching the ring buffer.
But using amdgpu_fence_driver_force_completion() before restarting it was a really bad idea because it marked fences as failed while the work was potentially still running. Stop doing that and cleanup the comment a bit. Signed-off-by: Christian König <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 685c61a05af8..400e27ec14d4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -133,25 +133,24 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) dma_fence_set_error(&s_job->s_fence->finished, -ETIME); /* attempt a per ring reset */ - if (amdgpu_gpu_recovery && - ring->funcs->reset) { - dev_err(adev->dev, "Starting %s ring reset\n", s_job->sched->name); - /* stop the scheduler, but don't mess with the - * bad job yet because if ring reset fails - * we'll fall back to full GPU reset. + if (amdgpu_gpu_recovery && ring->funcs->reset) { + dev_err(adev->dev, "Starting %s ring reset\n", + s_job->sched->name); + + /* + * Stop the scheduler to prevent anybody else from touching the + * ring buffer. */ drm_sched_wqueue_stop(&ring->sched); r = amdgpu_ring_reset(ring, job->vmid); if (!r) { - if (amdgpu_ring_sched_ready(ring)) - drm_sched_stop(&ring->sched, s_job); atomic_inc(&ring->adev->gpu_reset_counter); - amdgpu_fence_driver_force_completion(ring); - if (amdgpu_ring_sched_ready(ring)) - drm_sched_start(&ring->sched, 0); + drm_sched_wqueue_start(&ring->sched); + dev_err(adev->dev, "Ring %s reset worked\n", + ring->sched.name); goto exit; } - dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name); + dev_err(adev->dev, "Ring %s reset failed\n", ring->sched.name); } if (amdgpu_device_should_recover_gpu(ring->adev)) { -- 2.34.1
