On 5/19/25 10:20, Samuel Zhang wrote:
> For SRIOV VM env with XGMI enabled systems, XGMI physical node id may
> change when hibernate and resume with different VF.
>
> Update XGMI info and vram_base_offset on resume for gfx444 SRIOV env.
> Add amdgpu_virt_xgmi_migrate_enabled() as the feature flag.
>
> Signed-off-by: Jiang Liu <[email protected]>
> Signed-off-by: Samuel Zhang <[email protected]>

Reviewed-by: Christian König <[email protected]>

> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 32 ++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 7 +++++
> 2 files changed, 39 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index d477a901af84..e5bb46effb6c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2732,6 +2732,12 @@ static int amdgpu_device_ip_early_init(struct
> amdgpu_device *adev)
> if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
> adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
>
> + adev->virt.is_xgmi_node_migrate_enabled = false;
> + if (amdgpu_sriov_vf(adev)) {
> + adev->virt.is_xgmi_node_migrate_enabled =
> + amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9,
> 4, 4);
> + }
> +
> total = true;
> for (i = 0; i < adev->num_ip_blocks; i++) {
> ip_block = &adev->ip_blocks[i];
> @@ -5040,6 +5046,28 @@ int amdgpu_device_suspend(struct drm_device *dev, bool
> notify_clients)
> return 0;
> }
>
> +static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
> +{
> + int r;
> + unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id;
> +
> + if (!amdgpu_virt_xgmi_migrate_enabled(adev))
> + return 0;
> +
> + r = adev->gfxhub.funcs->get_xgmi_info(adev);
> + if (r)
> + return r;
> +
> + dev_info(adev->dev, "xgmi node, old id %d, new id %d\n",
> + prev_physical_node_id, adev->gmc.xgmi.physical_node_id);
> +
> + adev->vm_manager.vram_base_offset =
> adev->gfxhub.funcs->get_mc_fb_offset(adev);
> + adev->vm_manager.vram_base_offset +=
> + adev->gmc.xgmi.physical_node_id *
> adev->gmc.xgmi.node_segment_size;
> +
> + return 0;
> +}
> +
> /**
> * amdgpu_device_resume - initiate device resume
> *
> @@ -5061,6 +5089,10 @@ int amdgpu_device_resume(struct drm_device *dev, bool
> notify_clients)
> return r;
> }
>
> + r = amdgpu_virt_resume(adev);
> + if (r)
> + goto exit;
> +
> if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
> return 0;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index df03dba67ab8..532b92628171 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -295,6 +295,9 @@ struct amdgpu_virt {
> union amd_sriov_ras_caps ras_telemetry_en_caps;
> struct amdgpu_virt_ras ras;
> struct amd_sriov_ras_telemetry_error_count count_cache;
> +
> + /* hibernate and resume with different VF feature for xgmi enabled
> system */
> + bool is_xgmi_node_migrate_enabled;
> };
>
> struct amdgpu_video_codec_info;
> @@ -376,6 +379,10 @@ static inline bool is_virtual_machine(void)
> ((adev)->virt.gim_feature & AMDGIM_FEATURE_VCN_RB_DECOUPLE)
> #define amdgpu_sriov_is_mes_info_enable(adev) \
> ((adev)->virt.gim_feature & AMDGIM_FEATURE_MES_INFO_ENABLE)
> +
> +#define amdgpu_virt_xgmi_migrate_enabled(adev) \
> + ((adev)->virt.is_xgmi_node_migrate_enabled)
> +
> bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev);
> void amdgpu_virt_init_setting(struct amdgpu_device *adev);
> int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
