This commit enhances compute queue reset reliability by:

1. Adding hang detection verification before compute queue resets
   - Checks for an HQD address match to confirm an actual hang
   - Returns early if no hang is detected
   - Prevents unnecessary resets of healthy queues
2. Implementing MES suspend/resume during compute queue reset - Suspends MES before resetting compute queues - Resumes MES after reset completes - Prevents potential race conditions during reset 3. Enabling MMIO path for compute queue resets - Sets use_mmio flag for compute queue resets Signed-off-by: Jesse Zhang <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 18 ++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 3 ++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 028989e1538c..6d3597244ac4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -142,11 +142,25 @@ amdgpu_userq_queue_reset_helper(struct amdgpu_userq_mgr *uq_mgr, const struct amdgpu_userq_funcs *userq_funcs = adev->userq_funcs[queue->queue_type]; bool gpu_reset = false; + bool gpu_suspend = false; int r; if (unlikely(adev->debug_disable_gpu_ring_reset)) { dev_err(adev->dev, "userq reset disabled by debug mask\n"); } else if (amdgpu_gpu_recovery && userq_funcs->reset) { + if (queue->queue_type == AMDGPU_RING_TYPE_COMPUTE) { + if (!amdgpu_userqueue_detect_hang(uq_mgr, queue)) { + dev_err(adev->dev, "userq not detected hang\n"); + return true; + } + + r = amdgpu_mes_suspend(adev); + if (!r) { + dev_err(adev->dev, "userq suspend gangs from MES succeeded\n"); + gpu_suspend = true; + } + } + r = userq_funcs->reset(uq_mgr, queue); if (r) { dev_err(adev->dev, "userq reset failed\n"); @@ -157,6 +171,10 @@ amdgpu_userq_queue_reset_helper(struct amdgpu_userq_mgr *uq_mgr, amdgpu_userq_fence_driver_force_completion(queue); drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); } + + if (gpu_suspend) + amdgpu_mes_resume(adev); + } else if (amdgpu_gpu_recovery && !userq_funcs->reset) { gpu_reset = true; } diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c index 
2b5bd3691766..997b25f9fe45 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c @@ -360,7 +360,8 @@ static int mes_userq_reset(struct amdgpu_userq_mgr *uq_mgr, queue_input.queue_type = queue->queue_type; if (queue->queue_type == AMDGPU_RING_TYPE_GFX) queue_input.hang_detect_then_reset = true; - + else if (queue->queue_type == AMDGPU_RING_TYPE_COMPUTE) + queue_input.use_mmio = true; amdgpu_mes_lock(&adev->mes); r = adev->mes.funcs->reset_hw_queue(&adev->mes, &queue_input); amdgpu_mes_unlock(&adev->mes); -- 2.49.0
