[Why] When the host detects an FLR earlier than the guest, it performs a HW reset. In a multi-process scenario, MES could still access hardware resources and fail if the host completes its FLR work in the meantime.
[How] - Lock reset domain in *mailbox_flr_work - Use AMDGPU_HOST_FLR flag checking in gpu recover to avoid double locking - Clear AMDGPU_HOST_FLR bit after recovery completes Signed-off-by: Yifan Zha <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 ++++--- drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 4 ++++ 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e64969d576a6..d59053a2a7e7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5413,7 +5413,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (!amdgpu_ras_get_fed_status(adev)) amdgpu_virt_ready_to_reset(adev); amdgpu_virt_wait_reset(adev); - clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); r = amdgpu_virt_request_full_gpu(adev, true); } else { r = amdgpu_virt_reset_gpu(adev); @@ -6098,7 +6097,8 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev, /* We need to lock reset domain only once both for XGMI and single device */ tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, reset_list); - amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); + if (!test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) + amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); /* block all schedulers and reset given job's ring */ list_for_each_entry(tmp_adev, device_list_handle, reset_list) { @@ -6293,7 +6293,8 @@ static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, tmp_adev = list_first_entry(device_list, struct amdgpu_device, reset_list); - amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); + if (!test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) + amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 48101a34e049..f16449fbbc5c 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -287,8 +287,12 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) reset_context.reset_req_dev = adev; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); set_bit(AMDGPU_HOST_FLR, &reset_context.flags); + amdgpu_device_lock_reset_domain(adev->reset_domain); amdgpu_device_gpu_recover(adev, NULL, &reset_context); + + amdgpu_device_unlock_reset_domain(adev->reset_domain); + clear_bit(AMDGPU_HOST_FLR, &reset_context.flags); } } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index f6d8597452ed..15e6e7cdd1da 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -354,8 +354,12 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) reset_context.reset_req_dev = adev; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); set_bit(AMDGPU_HOST_FLR, &reset_context.flags); + amdgpu_device_lock_reset_domain(adev->reset_domain); amdgpu_device_gpu_recover(adev, NULL, &reset_context); + + amdgpu_device_unlock_reset_domain(adev->reset_domain); + clear_bit(AMDGPU_HOST_FLR, &reset_context.flags); } } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c index e1d63bed84bf..c1b32081e7ab 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c @@ -524,8 +524,12 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work) reset_context.reset_req_dev = adev; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); set_bit(AMDGPU_HOST_FLR, &reset_context.flags); + amdgpu_device_lock_reset_domain(adev->reset_domain); amdgpu_device_gpu_recover(adev, NULL, &reset_context); + + amdgpu_device_unlock_reset_domain(adev->reset_domain); + clear_bit(AMDGPU_HOST_FLR, &reset_context.flags); } } -- 2.25.1
