On 10/9/2024 9:15 AM, [email protected] wrote:
> From: Jiadong Zhu <[email protected]>
> 
> Implement sdma soft reset by sending MSG_ResetSDMA on smu 13.0.6.
> 
> v2: add firmware version for the reset message.
> 
> Signed-off-by: Jiadong Zhu <[email protected]>
> ---
>  drivers/gpu/drm/amd/pm/amdgpu_dpm.c           | 15 +++++++++++++
>  drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h       |  1 +
>  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     | 10 +++++++++
>  drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  6 ++++++
>  drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  |  3 ++-
>  .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  | 21 +++++++++++++++++++
>  6 files changed, 55 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c 
> b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> index 9dc82f4d7c93..9e7a652d119b 100644
> --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> @@ -700,6 +700,21 @@ int amdgpu_dpm_send_rma_reason(struct amdgpu_device 
> *adev)
>       return ret;
>  }
>  
> +int amdgpu_dpm_reset_sdma(struct amdgpu_device *adev, uint32_t inst_mask)
> +{
> +     struct smu_context *smu = adev->powerplay.pp_handle;
> +     int ret;
> +
> +     if (!is_support_sw_smu(adev))
> +             return -EOPNOTSUPP;
> +
> +     mutex_lock(&adev->pm.mutex);
> +     ret = smu_reset_sdma(smu, inst_mask);
> +     mutex_unlock(&adev->pm.mutex);
> +
> +     return ret;
> +}
> +
>  int amdgpu_dpm_get_dpm_freq_range(struct amdgpu_device *adev,
>                                 enum pp_clock_type type,
>                                 uint32_t *min,
> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h 
> b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> index f5bf41f21c41..41fb6ef984bf 100644
> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> @@ -597,5 +597,6 @@ int amdgpu_dpm_set_pm_policy(struct amdgpu_device *adev, 
> int policy_type,
>                            int policy_level);
>  ssize_t amdgpu_dpm_get_pm_policy_info(struct amdgpu_device *adev,
>                                     enum pp_pm_policy p_type, char *buf);
> +int amdgpu_dpm_reset_sdma(struct amdgpu_device *adev, uint32_t inst_mask);
>  
>  #endif
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index 4a6b4ad97f06..590d004046ef 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -3820,3 +3820,13 @@ int smu_send_rma_reason(struct smu_context *smu)
>  
>       return ret;
>  }
> +
> +int smu_reset_sdma(struct smu_context *smu, uint32_t inst_mask)
> +{
> +     int ret = 0;
> +
> +     if (smu->ppt_funcs && smu->ppt_funcs->reset_sdma)
> +             ret = smu->ppt_funcs->reset_sdma(smu, inst_mask);
> +
> +     return ret;
> +}
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h 
> b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> index b44a185d07e8..5487d9d84a4d 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> @@ -1371,6 +1371,11 @@ struct pptable_funcs {
>        */
>       int (*send_rma_reason)(struct smu_context *smu);
>  
> +     /**
> +      * @reset_sdma: message SMU to soft reset sdma instance.
> +      */
> +     int (*reset_sdma)(struct smu_context *smu, uint32_t inst_mask);
> +
>       /**
>        * @get_ecc_table:  message SMU to get ECC INFO table.
>        */
> @@ -1630,6 +1635,7 @@ void amdgpu_smu_stb_debug_fs_init(struct amdgpu_device 
> *adev);
>  int smu_send_hbm_bad_pages_num(struct smu_context *smu, uint32_t size);
>  int smu_send_hbm_bad_channel_flag(struct smu_context *smu, uint32_t size);
>  int smu_send_rma_reason(struct smu_context *smu);
> +int smu_reset_sdma(struct smu_context *smu, uint32_t inst_mask);
>  int smu_set_pm_policy(struct smu_context *smu, enum pp_pm_policy p_type,
>                     int level);
>  ssize_t smu_get_pm_policy_info(struct smu_context *smu,
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h 
> b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> index e71a721c12b9..855eb57c734d 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> @@ -275,7 +275,8 @@
>       __SMU_DUMMY_MAP(RmaDueToBadPageThreshold), \
>       __SMU_DUMMY_MAP(SelectPstatePolicy), \
>       __SMU_DUMMY_MAP(MALLPowerController), \
> -     __SMU_DUMMY_MAP(MALLPowerState),
> +     __SMU_DUMMY_MAP(MALLPowerState), \
> +     __SMU_DUMMY_MAP(ResetSDMA),
>  
>  #undef __SMU_DUMMY_MAP
>  #define __SMU_DUMMY_MAP(type)        SMU_MSG_##type
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
> b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> index 52f3c537bb3f..42c38ced209c 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
> @@ -182,6 +182,7 @@ static const struct cmn2asic_msg_mapping 
> smu_v13_0_6_message_map[SMU_MSG_MAX_COU
>       MSG_MAP(SelectPLPDMode,                      PPSMC_MSG_SelectPLPDMode,  
>                 0),
>       MSG_MAP(RmaDueToBadPageThreshold,            
> PPSMC_MSG_RmaDueToBadPageThreshold,        0),
>       MSG_MAP(SelectPstatePolicy,                  
> PPSMC_MSG_SelectPstatePolicy,              0),
> +     MSG_MAP(ResetSDMA,                           PPSMC_MSG_ResetSDMA,       
>                 0),
>  };
>  
>  // clang-format on
> @@ -2697,6 +2698,25 @@ static int smu_v13_0_6_send_rma_reason(struct 
> smu_context *smu)
>       return ret;
>  }
>  
> +static int smu_v13_0_6_reset_sdma(struct smu_context *smu, uint32_t 
> inst_mask)
> +{
> +     struct amdgpu_device *adev = smu->adev;
> +     int ret = 0;
> +
> +     /* the message is only valid on dGPU with pmfw 85.116.110 and above */
> +     if ((adev->flags & AMD_IS_APU) || smu->smc_fw_version < 0x0055746E)

This will need an IP version check, as this file also supports 13.0.14,
which has a different FW version.

> +             return 0;
> +
> +     ret = smu_cmn_send_smc_msg_with_param(smu,
> +                                           SMU_MSG_ResetSDMA, inst_mask, 
> NULL);
> +     if (ret)
> +             dev_err(smu->adev->dev,
> +                     "[%s] failed to send ResetSDMA event to SMU\n",
> +                     __func__);

Mostly, we will be interested in seeing the SDMA instance mask for which
the failure happened rather than the function name. The function name is
not necessary.

Thanks,
Lijo

> +
> +     return ret;
> +}
> +
>  static int mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
>  {
>       struct smu_context *smu = adev->powerplay.pp_handle;
> @@ -3342,6 +3362,7 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs 
> = {
>       .i2c_fini = smu_v13_0_6_i2c_control_fini,
>       .send_hbm_bad_pages_num = smu_v13_0_6_smu_send_hbm_bad_page_num,
>       .send_rma_reason = smu_v13_0_6_send_rma_reason,
> +     .reset_sdma = smu_v13_0_6_reset_sdma,
>  };
>  
>  void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu)

Reply via email to