Add aid, xcd & hbm temperatures to gpu metrics for smu_v13_0_12 v2: Use correct umc control per stack (Lijo)
Signed-off-by: Asad Kamal <[email protected]> --- .../gpu/drm/amd/include/kgd_pp_interface.h | 3 ++ .../drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c | 37 ++++++++++++++++++- .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h | 13 ++++++- 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h b/drivers/gpu/drm/amd/include/kgd_pp_interface.h index bdf8e6ff556c..a9b73f4fd466 100644 --- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h @@ -584,6 +584,9 @@ enum amdgpu_metrics_attr_id { AMDGPU_METRICS_ATTR_ID_GFX_BELOW_HOST_LIMIT_THM_ACC, AMDGPU_METRICS_ATTR_ID_GFX_LOW_UTILIZATION_ACC, AMDGPU_METRICS_ATTR_ID_GFX_BELOW_HOST_LIMIT_TOTAL_ACC, + AMDGPU_METRICS_ATTR_ID_TEMPERATURE_HBM, + AMDGPU_METRICS_ATTR_ID_TEMPERATURE_AID, + AMDGPU_METRICS_ATTR_ID_TEMPERATURE_XCD, AMDGPU_METRICS_ATTR_ID_MAX, }; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c index f2a6ecb64c03..96a58d43db53 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_12_ppt.c @@ -49,6 +49,13 @@ #undef pr_info #undef pr_debug +#define hbm_stack_mask_valid(umc_mask) \ + (((umc_mask) & 0x3) == 0x3) + +#define for_each_hbm_stack(stack_idx, umc_mask) \ + for ((stack_idx) = 0; (umc_mask); \ + (umc_mask) >>= 2, (stack_idx)++) \ + #define SMU_13_0_12_FEA_MAP(smu_feature, smu_13_0_12_feature) \ [smu_feature] = { 1, (smu_13_0_12_feature) } @@ -834,7 +841,7 @@ void smu_v13_0_12_get_gpu_metrics(struct smu_context *smu, void **table, struct smu_v13_0_6_gpu_metrics *gpu_metrics) { struct amdgpu_device *adev = smu->adev; - int ret = 0, xcc_id, inst, i, j; + int ret = 0, xcc_id, inst, i, j, idx; u8 num_jpeg_rings_gpu_metrics; MetricsTable_t *metrics; @@ -849,6 +856,31 @@ void smu_v13_0_12_get_gpu_metrics(struct smu_context *smu, void **table, gpu_metrics->temperature_vrsoc = SMUQ10_ROUND(metrics->MaxVrTemperature); + if (smu_v13_0_6_cap_supported(smu, + SMU_CAP(TEMP_AID_XCD_HBM))) { + if (adev->umc.active_mask) { + u64 mask = adev->umc.active_mask; + int out_idx = 0; + int stack_idx; + + if (unlikely(hweight64(mask) / 2 > SMU_13_0_6_MAX_HBM_STACKS)) { + dev_warn(adev->dev, "Invalid umc mask %lld\n", mask); + } else { + for_each_hbm_stack(stack_idx, mask) { + if (!hbm_stack_mask_valid(mask)) + continue; + gpu_metrics->temperature_hbm[out_idx++] = + metrics->HbmTemperature[stack_idx]; + } + } + } + idx = 0; + for_each_inst(i, adev->aid_mask) { + gpu_metrics->temperature_aid[idx] = metrics->AidTemperature[i]; + idx++; + } + } + gpu_metrics->average_gfx_activity = SMUQ10_ROUND(metrics->SocketGfxBusy); gpu_metrics->average_umc_activity = @@ -964,6 +996,9 @@ void smu_v13_0_12_get_gpu_metrics(struct smu_context *smu, void **table, [i] = SMUQ10_ROUND( metrics->GfxclkBelowHostLimitTotalAcc[inst]); } + if (smu_v13_0_6_cap_supported(smu, + SMU_CAP(TEMP_AID_XCD_HBM))) + gpu_metrics->temperature_xcd[i] = metrics->XcdTemperature[inst]; } gpu_metrics->xgmi_link_width = metrics->XgmiWidth; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h index ffb06564f830..a150fc88902c 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.h @@ -78,6 +78,7 @@ enum smu_v13_0_6_caps { SMU_CAP(RAS_EEPROM), SMU_CAP(FAST_PPT), SMU_CAP(SYSTEM_POWER_METRICS), + SMU_CAP(TEMP_AID_XCD_HBM), SMU_CAP(ALL), }; @@ -87,6 +88,8 @@ enum smu_v13_0_6_caps { #define SMU_13_0_6_MAX_XCC 8 #define SMU_13_0_6_MAX_VCN 4 #define SMU_13_0_6_MAX_JPEG 40 +#define SMU_13_0_6_MAX_AID 4 +#define SMU_13_0_6_MAX_HBM_STACKS 8 extern void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu); bool smu_v13_0_6_cap_supported(struct smu_context *smu, enum smu_v13_0_6_caps cap); @@ -222,7 +225,15 @@ extern const struct ras_smu_drv smu_v13_0_12_ras_smu_drv; SMU_13_0_6_MAX_XCC); \ SMU_ARRAY(SMU_MATTR(GFX_BELOW_HOST_LIMIT_TOTAL_ACC), SMU_MUNIT(NONE), \ SMU_MTYPE(U64), gfx_below_host_limit_total_acc, \ - SMU_13_0_6_MAX_XCC); + SMU_13_0_6_MAX_XCC); \ + SMU_ARRAY(SMU_MATTR(TEMPERATURE_HBM), SMU_MUNIT(TEMP_1), \ + SMU_MTYPE(U16), temperature_hbm, \ + SMU_13_0_6_MAX_HBM_STACKS); \ + SMU_ARRAY(SMU_MATTR(TEMPERATURE_AID), SMU_MUNIT(TEMP_1), \ + SMU_MTYPE(U16), temperature_aid, SMU_13_0_6_MAX_AID); \ + SMU_ARRAY(SMU_MATTR(TEMPERATURE_XCD), SMU_MUNIT(TEMP_1), \ + SMU_MTYPE(U16), temperature_xcd, SMU_13_0_6_MAX_XCC); \ + DECLARE_SMU_METRICS_CLASS(smu_v13_0_6_gpu_metrics, SMU_13_0_6_METRICS_FIELDS); void smu_v13_0_12_get_gpu_metrics(struct smu_context *smu, void **table, -- 2.46.0
