>> On the other hand wouldn't it be simpler to just increment vram_lost_counter?
In where ? if you mean in amdgpu_device_recover routine I won't do that since 
the reset() can do any kind of reset like:
1) PF FLR
2) mode1/2 reset
3) magic reset through config space
4) BACO reset

PF FLR won't cause VRAM lost, mode_1/2 is not clear to me, only BACO reset is 
definitely a vram lost reset 

If you increase the counter in general function that will be not accurate 
_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Christian König <[email protected]> 
Sent: Friday, August 23, 2019 4:27 PM
To: Liu, Monk <[email protected]>; [email protected]
Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for reset function

Am 23.08.19 um 05:34 schrieb Monk Liu:
> for SOC15/vega10 the BACO reset would introduce vram lost in the high 
> end address range and current kmd's vram lost checking cannot catch it 
> since it only check visible frame buffer
>
> TODO:
> to confirm if mode1/2 reset would introduce vram lost

Looks good in general, but I would make the value mandatory or maybe use a 
special return code instead.

On the other hand wouldn't it be simpler to just increment vram_lost_counter?

Christian.

>
> Signed-off-by: Monk Liu <[email protected]>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++-----
>   drivers/gpu/drm/amd/amdgpu/cik.c           |  2 +-
>   drivers/gpu/drm/amd/amdgpu/nv.c            | 10 +++++++---
>   drivers/gpu/drm/amd/amdgpu/si.c            |  2 +-
>   drivers/gpu/drm/amd/amdgpu/soc15.c         |  4 +++-
>   drivers/gpu/drm/amd/amdgpu/vi.c            |  2 +-
>   7 files changed, 22 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index f6ae565..1fe3756 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -552,7 +552,7 @@ struct amdgpu_asic_funcs {
>       int (*read_register)(struct amdgpu_device *adev, u32 se_num,
>                            u32 sh_num, u32 reg_offset, u32 *value);
>       void (*set_vga_state)(struct amdgpu_device *adev, bool state);
> -     int (*reset)(struct amdgpu_device *adev);
> +     int (*reset)(struct amdgpu_device *adev, bool *lost);
>       enum amd_reset_method (*reset_method)(struct amdgpu_device *adev);
>       /* get the reference clock */
>       u32 (*get_xclk)(struct amdgpu_device *adev); @@ -1136,7 +1136,7 @@ 
> int emu_soc_asic_init(struct amdgpu_device *adev);
>    * ASICs macro.
>    */
>   #define amdgpu_asic_set_vga_state(adev, state) 
> (adev)->asic_funcs->set_vga_state((adev), (state)) -#define 
> amdgpu_asic_reset(adev) (adev)->asic_funcs->reset((adev))
> +#define amdgpu_asic_reset(adev, lost) 
> +(adev)->asic_funcs->reset((adev), (lost))
>   #define amdgpu_asic_reset_method(adev) 
> (adev)->asic_funcs->reset_method((adev))
>   #define amdgpu_asic_get_xclk(adev) (adev)->asic_funcs->get_xclk((adev))
>   #define amdgpu_asic_set_uvd_clocks(adev, v, d) 
> (adev)->asic_funcs->set_uvd_clocks((adev), (v), (d)) diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 02b3e7d..8668cb8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2546,7 +2546,7 @@ static void amdgpu_device_xgmi_reset_func(struct 
> work_struct *__work)
>       struct amdgpu_device *adev =
>               container_of(__work, struct amdgpu_device, xgmi_reset_work);
>   
> -     adev->asic_reset_res =  amdgpu_asic_reset(adev);
> +     adev->asic_reset_res =  amdgpu_asic_reset(adev, NULL);
>       if (adev->asic_reset_res)
>               DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
>                        adev->asic_reset_res, adev->ddev->unique); @@ -2751,7 
> +2751,7 @@ 
> int amdgpu_device_init(struct amdgpu_device *adev,
>        *  E.g., driver was not cleanly unloaded previously, etc.
>        */
>       if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
> -             r = amdgpu_asic_reset(adev);
> +             r = amdgpu_asic_reset(adev, NULL);
>               if (r) {
>                       dev_err(adev->dev, "asic reset on init failed\n");
>                       goto failed;
> @@ -3084,7 +3084,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
> suspend, bool fbcon)
>               pci_disable_device(dev->pdev);
>               pci_set_power_state(dev->pdev, PCI_D3hot);
>       } else {
> -             r = amdgpu_asic_reset(adev);
> +             r = amdgpu_asic_reset(adev, NULL);
>               if (r)
>                       DRM_ERROR("amdgpu asic reset failed\n");
>       }
> @@ -3604,7 +3604,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
> *hive,
>                               if (!queue_work(system_highpri_wq, 
> &tmp_adev->xgmi_reset_work))
>                                       r = -EALREADY;
>                       } else
> -                             r = amdgpu_asic_reset(tmp_adev);
> +                             r = amdgpu_asic_reset(tmp_adev, &vram_lost);
>   
>                       if (r) {
>                               DRM_ERROR("ASIC reset failed with error, %d for 
> drm dev, %s", @@ 
> -3645,7 +3645,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
> *hive,
>                               if (r)
>                                       goto out;
>   
> -                             vram_lost = 
> amdgpu_device_check_vram_lost(tmp_adev);
> +                             if (!vram_lost)
> +                                     vram_lost = 
> amdgpu_device_check_vram_lost(tmp_adev);
> +
>                               if (vram_lost) {
>                                       DRM_INFO("VRAM is lost due to GPU 
> reset!\n");
>                                       
> atomic_inc(&tmp_adev->vram_lost_counter);
> diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c 
> b/drivers/gpu/drm/amd/amdgpu/cik.c
> index 7b63d7a..0f25b82 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cik.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cik.c
> @@ -1277,7 +1277,7 @@ static int cik_gpu_pci_config_reset(struct 
> amdgpu_device *adev)
>    * to reset them.
>    * Returns 0 for success.
>    */
> -static int cik_asic_reset(struct amdgpu_device *adev)
> +static int cik_asic_reset(struct amdgpu_device *adev, bool *vramlost)
>   {
>       int r;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c 
> b/drivers/gpu/drm/amd/amdgpu/nv.c index a3d99f2..53de7a6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c
> @@ -301,7 +301,7 @@ nv_asic_reset_method(struct amdgpu_device *adev)
>               return AMD_RESET_METHOD_MODE1;
>   }
>   
> -static int nv_asic_reset(struct amdgpu_device *adev)
> +static int nv_asic_reset(struct amdgpu_device *adev, bool *vramlost)
>   {
>   
>       /* FIXME: it doesn't work since vega10 */ @@ -315,10 +315,14 @@ 
> static int nv_asic_reset(struct amdgpu_device *adev)
>       int ret = 0;
>       struct smu_context *smu = &adev->smu;
>   
> -     if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
> +     if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
> +             if (vramlost)
> +                     *vramlost = true;
>               ret = smu_baco_reset(smu);
> -     else
> +     }
> +     else {
>               ret = nv_asic_mode1_reset(adev);
> +     }
>   
>       return ret;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/si.c 
> b/drivers/gpu/drm/amd/amdgpu/si.c index 9043614..f324099 100644
> --- a/drivers/gpu/drm/amd/amdgpu/si.c
> +++ b/drivers/gpu/drm/amd/amdgpu/si.c
> @@ -1180,7 +1180,7 @@ static bool si_read_bios_from_rom(struct amdgpu_device 
> *adev,
>   }
>   
>   //xxx: not implemented
> -static int si_asic_reset(struct amdgpu_device *adev)
> +static int si_asic_reset(struct amdgpu_device *adev, bool *vramlost)
>   {
>       return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
> b/drivers/gpu/drm/amd/amdgpu/soc15.c
> index fe2212df..12b2966 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
> @@ -553,10 +553,12 @@ soc15_asic_reset_method(struct amdgpu_device *adev)
>               return AMD_RESET_METHOD_MODE1;
>   }
>   
> -static int soc15_asic_reset(struct amdgpu_device *adev)
> +static int soc15_asic_reset(struct amdgpu_device *adev, bool 
> +*vramlost)
>   {
>       switch (soc15_asic_reset_method(adev)) {
>               case AMD_RESET_METHOD_BACO:
> +                     if (vramlost)
> +                             *vramlost = true;
>                       return soc15_asic_baco_reset(adev);
>               case AMD_RESET_METHOD_MODE2:
>                       return soc15_mode2_reset(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c 
> b/drivers/gpu/drm/amd/amdgpu/vi.c index 56c882b..8eceb00 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vi.c
> @@ -696,7 +696,7 @@ static int vi_gpu_pci_config_reset(struct amdgpu_device 
> *adev)
>    * to reset them.
>    * Returns 0 for success.
>    */
> -static int vi_asic_reset(struct amdgpu_device *adev)
> +static int vi_asic_reset(struct amdgpu_device *adev, bool *vramlost)
>   {
>       int r;
>   

_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to