[Why]
When the host detects an FLR earlier than the guest, it performs a HW reset.
In a multi-process scenario, MES could still use hardware resources and fail
if the host has already completed the FLR work.

[How]
- Lock reset domain in *mailbox_flr_work
- Check the AMDGPU_HOST_FLR flag in GPU recovery to avoid double locking
- Clear AMDGPU_HOST_FLR bit after recovery completes

Signed-off-by: Yifan Zha <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 ++++---
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      | 4 ++++
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      | 4 ++++
 drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c      | 4 ++++
 4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e64969d576a6..d59053a2a7e7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5413,7 +5413,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
                if (!amdgpu_ras_get_fed_status(adev))
                        amdgpu_virt_ready_to_reset(adev);
                amdgpu_virt_wait_reset(adev);
-               clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
                r = amdgpu_virt_request_full_gpu(adev, true);
        } else {
                r = amdgpu_virt_reset_gpu(adev);
@@ -6098,7 +6097,8 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
        /* We need to lock reset domain only once both for XGMI and single device */
        tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
                                    reset_list);
-       amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
+       if (!test_bit(AMDGPU_HOST_FLR, &reset_context->flags))
+               amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
 
        /* block all schedulers and reset given job's ring */
        list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
@@ -6293,7 +6293,8 @@ static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
 
        tmp_adev = list_first_entry(device_list, struct amdgpu_device,
                                            reset_list);
-       amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
+       if (!test_bit(AMDGPU_HOST_FLR, &reset_context->flags))
+               amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
 
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 48101a34e049..f16449fbbc5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -287,8 +287,12 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
                reset_context.reset_req_dev = adev;
                clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
                set_bit(AMDGPU_HOST_FLR, &reset_context.flags);
+               amdgpu_device_lock_reset_domain(adev->reset_domain);
 
                amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+
+               amdgpu_device_unlock_reset_domain(adev->reset_domain);
+               clear_bit(AMDGPU_HOST_FLR, &reset_context.flags);
        }
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index f6d8597452ed..15e6e7cdd1da 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -354,8 +354,12 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
                reset_context.reset_req_dev = adev;
                clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
                set_bit(AMDGPU_HOST_FLR, &reset_context.flags);
+               amdgpu_device_lock_reset_domain(adev->reset_domain);
 
                amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+
+               amdgpu_device_unlock_reset_domain(adev->reset_domain);
+               clear_bit(AMDGPU_HOST_FLR, &reset_context.flags);
        }
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
index e1d63bed84bf..c1b32081e7ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
@@ -524,8 +524,12 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
                reset_context.reset_req_dev = adev;
                clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
                set_bit(AMDGPU_HOST_FLR, &reset_context.flags);
+               amdgpu_device_lock_reset_domain(adev->reset_domain);
 
                amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+
+               amdgpu_device_unlock_reset_domain(adev->reset_domain);
+               clear_bit(AMDGPU_HOST_FLR, &reset_context.flags);
        }
 }
 
-- 
2.25.1

Reply via email to