This commit enhances compute queue reset reliability by:

1. Adding hang detection verification before compute queue resets
   - Checks for an HQD address match to confirm that the queue is actually hung
   - Returns early if no hang is detected
   - Prevents unnecessary resets of healthy queues

2. Implementing MES suspend/resume during compute queue reset
   - Suspends MES before resetting compute queues
   - Resumes MES after reset completes
   - Prevents potential race conditions during reset

3. Enabling MMIO path for compute queue resets
   - Sets use_mmio flag for compute queue resets

Signed-off-by: Jesse Zhang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 18 ++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/mes_userqueue.c |  3 ++-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 028989e1538c..6d3597244ac4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -142,11 +142,25 @@ amdgpu_userq_queue_reset_helper(struct amdgpu_userq_mgr *uq_mgr,
        const struct amdgpu_userq_funcs *userq_funcs =
                adev->userq_funcs[queue->queue_type];
        bool gpu_reset = false;
+       bool gpu_suspend = false;
        int r;
 
        if (unlikely(adev->debug_disable_gpu_ring_reset)) {
                dev_err(adev->dev, "userq reset disabled by debug mask\n");
        } else if (amdgpu_gpu_recovery && userq_funcs->reset) {
+               if (queue->queue_type == AMDGPU_RING_TYPE_COMPUTE) {
+                       if (!amdgpu_userqueue_detect_hang(uq_mgr, queue)) {
+                               dev_err(adev->dev, "userq not detected hang\n");
+                               return true;
+                       }
+
+                       r = amdgpu_mes_suspend(adev);
+                       if (!r) {
+                               dev_err(adev->dev, "userq suspend gangs from MES succeeded\n");
+                               gpu_suspend = true;
+                       }
+               }
+
                r = userq_funcs->reset(uq_mgr, queue);
                if (r) {
                        dev_err(adev->dev, "userq reset failed\n");
@@ -157,6 +171,10 @@ amdgpu_userq_queue_reset_helper(struct amdgpu_userq_mgr *uq_mgr,
                        amdgpu_userq_fence_driver_force_completion(queue);
                        drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
                }
+
+               if (gpu_suspend)
+                       amdgpu_mes_resume(adev);
+
        } else if (amdgpu_gpu_recovery && !userq_funcs->reset) {
                gpu_reset = true;
        }
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
index 2b5bd3691766..997b25f9fe45 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
@@ -360,7 +360,8 @@ static int mes_userq_reset(struct amdgpu_userq_mgr *uq_mgr,
        queue_input.queue_type = queue->queue_type;
        if (queue->queue_type == AMDGPU_RING_TYPE_GFX)
                queue_input.hang_detect_then_reset = true;
-
+       else if (queue->queue_type == AMDGPU_RING_TYPE_COMPUTE)
+               queue_input.use_mmio = true;
        amdgpu_mes_lock(&adev->mes);
        r = adev->mes.funcs->reset_hw_queue(&adev->mes, &queue_input);
        amdgpu_mes_unlock(&adev->mes);
-- 
2.49.0

Reply via email to