On 8/29/19 3:56 AM, Zhou1, Tao wrote:
>
>> -----Original Message-----
>> From: amd-gfx <[email protected]> On Behalf Of
>> Andrey Grodzovsky
>> Sent: 2019年8月29日 4:00
>> To: [email protected]
>> Cc: [email protected]; [email protected];
>> Grodzovsky, Andrey <[email protected]>; Zhang, Hawking
>> <[email protected]>
>> Subject: [PATCH 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.
>>
>> Problem:
>> Under certain conditions, when some IP bocks take a RAS error, we can get
> [Tao] typo: "dmr/amdgpu" -> "drm/amdgpu", "IP bocks" -> "IP blocks"
>
>> into a situation where a GPU reset is not possible due to issues in RAS in
>> SMU/PSP.
>>
>> Temporary fix until proper solution in PSP/SMU is ready:
>> When an uncorrectable error happens, the DF will unconditionally broadcast
>> error event packets to all its clients/slaves upon receiving the fatal error
>> event and
>> freeze all its outbound queues; the err_event_athub interrupt will be triggered.
>> In such a case we use this interrupt
>> to issue GPU reset. The GPU reset code is modified for such case to avoid HW
>> reset, only stops schedulers, detaches all in-progress and not yet scheduled
>> job's fences, sets error code on them and signals them.
>> Also reject any new incoming job submissions from user space.
>> All this is done to notify the applications of the problem.
>>
>> Signed-off-by: Andrey Grodzovsky <[email protected]>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     |  4 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 98
>> ++++++++++++++++++++++--------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  5 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |  6 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 30 +++++++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    | 12 +++-
>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c      | 10 +--
>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      | 24 ++++----
>>   drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c     |  5 ++
>>   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c     | 32 +++++-----
>>   10 files changed, 164 insertions(+), 62 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 9da681e..300adb8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -38,6 +38,7 @@
>>   #include "amdgpu_gmc.h"
>>   #include "amdgpu_gem.h"
>>   #include "amdgpu_display.h"
>> +#include "amdgpu_ras.h"
>>
>>   #if defined(HAVE_DRM_FREE_LARGE)
>>   #define kvfree drm_free_large
>> @@ -1461,6 +1462,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void
>> *data, struct drm_file *filp)
>>      bool reserved_buffers = false;
>>      int i, r;
>>
>> +    if (amdgpu_ras_intr_triggered())
>> +            return -EHWPOISON;
>> +
>>      if (!adev->accel_working)
>>              return -EBUSY;
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 07a4ba0..3ecee10 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3734,10 +3734,10 @@ static bool amdgpu_device_lock_adev(struct
>> amdgpu_device *adev, bool trylock)
>>      return true;
>>   }
>>
>> -static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>> +static void amdgpu_device_unlock_adev(struct amdgpu_device *adev, bool
>> +skip_kfd)
>>   {
>>      /*unlock kfd: SRIOV would do it separately */
>> -    if (!amdgpu_sriov_vf(adev))
>> +    if (!amdgpu_sriov_vf(adev) && !skip_kfd)
>>                   amdgpu_amdkfd_post_reset(adev);
>>      amdgpu_vf_error_trans_all(adev);
>>      adev->mp1_state = PP_MP1_STATE_NONE;
>> @@ -3746,6 +3746,44 @@ static void amdgpu_device_unlock_adev(struct
>> amdgpu_device *adev)  }
>>
>>
>> +#define to_drm_sched_job(sched_job)         \
>> +            container_of((sched_job), struct drm_sched_job,
>> queue_node)
>> +
>> +static void amdgpu_stop_all_jobs_on_sched(struct drm_gpu_scheduler
>> +*sched) {
>> +    struct drm_sched_job *s_job;
>> +    struct drm_sched_entity *s_entity = NULL;
>> +    int i;
>> +
>> +    /* Signal all jobs not yet scheduled */
>> +    for (i = DRM_SCHED_PRIORITY_MAX - 1; i >=
>> DRM_SCHED_PRIORITY_MIN; i--) {
>> +            struct drm_sched_rq *rq = &sched->sched_rq[i];
>> +
>> +            if (!rq)
>> +                    continue;
>> +
>> +            spin_lock(&rq->lock);
>> +            list_for_each_entry(s_entity, &rq->entities, list) {
>> +                    while ((s_job =
>> to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
>> +                            struct drm_sched_fence *s_fence = s_job-
>>> s_fence;
>> +
>> +                            dma_fence_signal(&s_fence->scheduled);
>> +                            dma_fence_set_error(&s_fence->finished, -
>> EHWPOISON);
>> +                            dma_fence_signal(&s_fence->finished);
>> +                    }
>> +            }
>> +            spin_unlock(&rq->lock);
>> +    }
>> +
>> +    /* Signal all jobs already scheduled to HW */
>> +    list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
>> +            struct drm_sched_fence *s_fence = s_job->s_fence;
>> +
>> +            dma_fence_set_error(&s_fence->finished, -EHWPOISON);
>> +            dma_fence_signal(&s_fence->finished);
>> +    }
>> +}
>> +
>>   /**
>>    * amdgpu_device_gpu_recover - reset the asic and recover scheduler
>>    *
>> @@ -3765,11 +3803,12 @@ int amdgpu_device_gpu_recover(struct
>> amdgpu_device *adev,
>>      struct amdgpu_hive_info *hive = NULL;
>>      struct amdgpu_device *tmp_adev = NULL;
>>      int i, r = 0;
>> +    bool in_ras_intr = amdgpu_ras_intr_triggered();
>>
>>      need_full_reset = job_signaled = false;
>>      INIT_LIST_HEAD(&device_list);
>>
>> -    dev_info(adev->dev, "GPU reset begin!\n");
>> +    dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs
>> +stop":"reset");
>>
>>      cancel_delayed_work_sync(&adev->delayed_init_work);
>>
>> @@ -3799,7 +3838,7 @@ int amdgpu_device_gpu_recover(struct
>> amdgpu_device *adev,
>>      /* Build list of devices to reset */
>>      if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>>              if (!hive) {
>> -                    amdgpu_device_unlock_adev(adev);
>> +                    amdgpu_device_unlock_adev(adev, false);
>>                      return -ENODEV;
>>              }
>>
>> @@ -3824,7 +3863,7 @@ int amdgpu_device_gpu_recover(struct
>> amdgpu_device *adev,
>>      /* block all schedulers and reset given job's ring */
>>      list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>              /* disable ras on ALL IPs */
>> -            if (amdgpu_device_ip_need_full_reset(tmp_adev))
>> +            if (!in_ras_intr &&
>> amdgpu_device_ip_need_full_reset(tmp_adev))
>>                      amdgpu_ras_suspend(tmp_adev);
>>
>>              for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { @@ -3834,10
>> +3873,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>                              continue;
>>
>>                      drm_sched_stop(&ring->sched, job ? &job->base :
>> NULL);
>> +
>> +                    if (in_ras_intr)
>> +                            amdgpu_stop_all_jobs_on_sched(&ring-
>>> sched);
>>              }
>>      }
>>
>>
>> +    if (in_ras_intr)
>> +            goto skip_hw_reset;
>> +
>>      /*
>>       * Must check guilty signal here since after this point all old
>>       * HW fences are force signaled.
>> @@ -3902,34 +3947,37 @@ int amdgpu_device_gpu_recover(struct
>> amdgpu_device *adev,
>>
>>      /* Post ASIC reset for all devs .*/
>>      list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> -            for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> -                    struct amdgpu_ring *ring = tmp_adev->rings[i];
>>
>> -                    if (!ring || !ring->sched.thread)
>> -                            continue;
>> +            if (!in_ras_intr) {
>> +                    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>> +                            struct amdgpu_ring *ring = tmp_adev-
>>> rings[i];
>> -                    /* No point to resubmit jobs if we didn't HW reset*/
>> -                    if (!tmp_adev->asic_reset_res && !job_signaled)
>> -                            drm_sched_resubmit_jobs(&ring->sched);
>> +                            if (!ring || !ring->sched.thread)
>> +                                    continue;
>>
>> -                    drm_sched_start(&ring->sched, !tmp_adev-
>>> asic_reset_res);
>> -            }
>> +                            /* No point to resubmit jobs if we didn't HW
>> reset*/
>> +                            if (!tmp_adev->asic_reset_res
>> && !job_signaled)
>> +                                    drm_sched_resubmit_jobs(&ring-
>>> sched);
>> -            if (!amdgpu_device_has_dc_support(tmp_adev)
>> && !job_signaled) {
>> -                    drm_helper_resume_force_mode(tmp_adev->ddev);
>> -            }
>> +                            drm_sched_start(&ring->sched, !tmp_adev-
>>> asic_reset_res);
>> +                    }
>>
>> -            tmp_adev->asic_reset_res = 0;
>> +                    if (!amdgpu_device_has_dc_support(tmp_adev)
>> && !job_signaled) {
>> +                            drm_helper_resume_force_mode(tmp_adev-
>>> ddev);
>> +                    }
>>
>> -            if (r) {
>> -                    /* bad news, how to tell it to userspace ? */
>> -                    dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
>> atomic_read(&adev->gpu_reset_counter));
>> -                    amdgpu_vf_error_put(tmp_adev,
>> AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
>> -            } else {
>> -                    dev_info(tmp_adev->dev, "GPU reset(%d)
>> succeeded!\n", atomic_read(&adev->gpu_reset_counter));
>> +                    tmp_adev->asic_reset_res = 0;
>> +
>> +                    if (r) {
>> +                            /* bad news, how to tell it to userspace ? */
>> +                            dev_info(tmp_adev->dev, "GPU reset(%d)
>> failed\n", atomic_read(&adev->gpu_reset_counter));
>> +                            amdgpu_vf_error_put(tmp_adev,
>> AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
>> +                    } else {
>> +                            dev_info(tmp_adev->dev, "GPU reset(%d)
>> succeeded!\n", atomic_read(&adev->gpu_reset_counter));
>> +                    }
>>              }
>>
>> -            amdgpu_device_unlock_adev(tmp_adev);
>> +            amdgpu_device_unlock_adev(tmp_adev, in_ras_intr);
>>      }
>>
>>      if (hive)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index 151d7f2..757fd6d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -40,6 +40,8 @@
>>
>>   #include "amdgpu_amdkfd.h"
>>
>> +#include "amdgpu_ras.h"
>> +
>>   /*
>>    * KMS wrapper.
>>    * - 3.0.0 - initial driver
>> @@ -1179,6 +1181,9 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)
>>      struct drm_device *dev = pci_get_drvdata(pdev);
>>      struct amdgpu_device *adev = dev->dev_private;
>>
>> +    if (amdgpu_ras_intr_triggered())
>> +            return;
>> +
>>      /* if we are running in a VM, make sure the device
>>       * torn down properly on reboot/shutdown.
>>       * unfortunately we can't detect certain diff --git
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> index da2143d..ced766c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> @@ -1046,6 +1046,12 @@ int amdgpu_driver_open_kms(struct drm_device
>> *dev, struct drm_file *file_priv)
>>      /* Ensure IB tests are run on ring */
>>      flush_delayed_work(&adev->delayed_init_work);
>>
>> +
>> +    if (amdgpu_ras_intr_triggered()) {
>> +            DRM_ERROR("RAS Intr triggered, device disabled!!");
>> +            return -EHWPOISON;
>> +    }
>> +
>>      file_priv->driver_priv = NULL;
>>
>>      r = pm_runtime_get_sync(dev->dev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> index 2d5897a..086e6df 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> @@ -24,6 +24,8 @@
>>   #include <linux/debugfs.h>
>>   #include <linux/list.h>
>>   #include <linux/module.h>
>> +#include <linux/reboot.h>
>> +#include <linux/syscalls.h>
>>   #include "amdgpu.h"
>>   #include "amdgpu_ras.h"
>>   #include "amdgpu_atomfirmware.h"
>> @@ -64,6 +66,9 @@ const char *ras_block_string[] = {
>>   /* inject address is 52 bits */
>>   #define    RAS_UMC_INJECT_ADDR_LIMIT       (0x1ULL << 52)
>>
>> +
>> +atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
>> +
>>   static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
>>              uint64_t offset, uint64_t size,
>>              struct amdgpu_bo **bo_ptr);
>> @@ -80,7 +85,7 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f,
>> char __user *buf,
>>      ssize_t s;
>>      char val[128];
>>
>> -    if (amdgpu_ras_error_query(obj->adev, &info))
>> +    if (amdgpu_ras_error_query(obj->adev, &info, false))
>>              return -EINVAL;
>>
>>      s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n", @@ -188,6
>> +193,10 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
>>
>>      return 0;
>>   }
>> +
>> +static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device
>> *adev,
>> +            struct ras_common_if *head);
>> +
>>   /**
>>    * DOC: AMDGPU RAS debugfs control interface
>>    *
>> @@ -304,7 +313,7 @@ static ssize_t amdgpu_ras_sysfs_read(struct device
>> *dev,
>>              .head = obj->head,
>>      };
>>
>> -    if (amdgpu_ras_error_query(obj->adev, &info))
>> +    if (amdgpu_ras_error_query(obj->adev, &info, false))
>>              return -EINVAL;
>>
>>      return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n", @@ -591,7
>> +600,7 @@ static int amdgpu_ras_enable_all_features(struct
>> amdgpu_device *adev,
>>
>>   /* query/inject/cure begin */
>>   int amdgpu_ras_error_query(struct amdgpu_device *adev,
>> -            struct ras_query_if *info)
>> +            struct ras_query_if *info, bool print)
>>   {
>>      struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
>>      struct ras_err_data err_data = {0, 0, 0, NULL}; @@ -627,12 +636,14
>> @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
>>      info->ue_count = obj->err_data.ue_count;
>>      info->ce_count = obj->err_data.ce_count;
>>
>> -    if (err_data.ce_count)
>> +    if (err_data.ce_count || print) {
>>              dev_info(adev->dev, "%ld correctable errors detected in %s
>> block\n",
>>                       obj->err_data.ce_count, ras_block_str(info-
>>> head.block));
> [Tao] Could you explain why print is needed even ce/ue_count == 0? And I 
> think these codes can be split into a single patch.


I will just remove it; at first we planned to also dump all CE/UE 
counters, but in the end I didn't do that.


>
>> -    if (err_data.ue_count)
>> +    }
>> +    if (err_data.ue_count || print) {
>>              dev_info(adev->dev, "%ld uncorrectable errors detected
>> in %s block\n",
>>                       obj->err_data.ue_count, ras_block_str(info-
>>> head.block));
>> +    }
>>
>>      return 0;
>>   }
>> @@ -702,7 +713,7 @@ int amdgpu_ras_query_error_count(struct
>> amdgpu_device *adev,
>>                      .head = obj->head,
>>              };
>>
>> -            if (amdgpu_ras_error_query(adev, &info))
>> +            if (amdgpu_ras_error_query(adev, &info, true))
>>                      return -EINVAL;
>>
>>              data.ce_count += info.ce_count;
>> @@ -1718,3 +1729,10 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
>>
>>      return 0;
>>   }
>> +
>> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) {
>> +    if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
>> +            DRM_WARN("RAS event of type
>> ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n");
>> +    }
>> +}
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> index 5a0df73..c0e22af 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> @@ -587,7 +587,7 @@ void amdgpu_ras_debugfs_remove(struct
>> amdgpu_device *adev,
>>              struct ras_common_if *head);
>>
>>   int amdgpu_ras_error_query(struct amdgpu_device *adev,
>> -            struct ras_query_if *info);
>> +            struct ras_query_if *info, bool print);
>>
>>   int amdgpu_ras_error_inject(struct amdgpu_device *adev,
>>              struct ras_inject_if *info);
>> @@ -600,4 +600,14 @@ int amdgpu_ras_interrupt_remove_handler(struct
>> amdgpu_device *adev,
>>
>>   int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
>>              struct ras_dispatch_if *info);
>> +
>> +extern atomic_t amdgpu_ras_in_intr;
>> +
>> +static inline bool amdgpu_ras_intr_triggered(void) {
>> +    return !!atomic_read(&amdgpu_ras_in_intr);
>> +}
>> +
>> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
>> +
>>   #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index b2c86a0..e7a83f6 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -5669,10 +5669,12 @@ static int gfx_v9_0_process_ras_data_cb(struct
>> amdgpu_device *adev,
>>              struct amdgpu_iv_entry *entry)
>>   {
>>      /* TODO ue will trigger an interrupt. */
>> -    kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>> -    if (adev->gfx.funcs->query_ras_error_count)
>> -            adev->gfx.funcs->query_ras_error_count(adev, err_data);
>> -    amdgpu_ras_reset_gpu(adev, 0);
>> +    if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
> [Tao] Have you encountered any error without the check? ras_data_cb would not 
> be registered if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))


We have a requirement not to handle block-specific interrupts when 
expecting a sync flood, which will trigger the err_event_athub 
interrupt — and that is exactly the case when RAS GFX is enabled.


>
>> +            kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>> +            if (adev->gfx.funcs->query_ras_error_count)
>> +                    adev->gfx.funcs->query_ras_error_count(adev,
>> err_data);
>> +            amdgpu_ras_reset_gpu(adev, 0);
>> +    }
>>      return AMDGPU_RAS_SUCCESS;
>>   }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index 43b4fbc..87a66c2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -243,18 +243,20 @@ static int gmc_v9_0_process_ras_data_cb(struct
>> amdgpu_device *adev,
>>              struct ras_err_data *err_data,
>>              struct amdgpu_iv_entry *entry)
>>   {
>> -    kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>> -    if (adev->umc.funcs->query_ras_error_count)
>> -            adev->umc.funcs->query_ras_error_count(adev, err_data);
>> -    /* umc query_ras_error_address is also responsible for clearing
>> -     * error status
>> -     */
>> -    if (adev->umc.funcs->query_ras_error_address)
>> -            adev->umc.funcs->query_ras_error_address(adev, err_data);
>> +    if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
> [Tao] AMDGPU_RAS_BLOCK__UMC


See above explanation


>
>> +            kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>> +            if (adev->umc.funcs->query_ras_error_count)
>> +                    adev->umc.funcs->query_ras_error_count(adev,
>> err_data);
>> +            /* umc query_ras_error_address is also responsible for
>> clearing
>> +             * error status
>> +             */
>> +            if (adev->umc.funcs->query_ras_error_address)
>> +                    adev->umc.funcs->query_ras_error_address(adev,
>> err_data);
>>
>> -    /* only uncorrectable error needs gpu reset */
>> -    if (err_data->ue_count)
>> -            amdgpu_ras_reset_gpu(adev, 0);
>> +            /* only uncorrectable error needs gpu reset */
>> +            if (err_data->ue_count)
>> +                    amdgpu_ras_reset_gpu(adev, 0);
>> +    }
>>
>>      return AMDGPU_RAS_SUCCESS;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
>> b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
>> index 367f9d6..545990c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
>> @@ -30,6 +30,7 @@
>>   #include "nbio/nbio_7_4_0_smn.h"
>>   #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
>>   #include <uapi/linux/kfd_ioctl.h>
>> +#include "amdgpu_ras.h"
>>
>>   #define smnNBIF_MGCG_CTRL_LCLK     0x1013a21c
>>
>> @@ -329,6 +330,8 @@ static void
>> nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
>>                                              BIF_DOORBELL_INT_CNTL,
>>
>>      RAS_CNTLR_INTERRUPT_CLEAR, 1);
>>              WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL,
>> bif_doorbell_intr_cntl);
>> +
>> +            amdgpu_ras_global_ras_isr(adev);
>>      }
>>   }
>>
>> @@ -344,6 +347,8 @@ static void
>> nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_d
>>                                              BIF_DOORBELL_INT_CNTL,
>>
>>      RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);
>>              WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL,
>> bif_doorbell_intr_cntl);
>> +
>> +            amdgpu_ras_global_ras_isr(adev);
>>      }
>>   }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> index 956432f..438e504 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> @@ -1972,24 +1972,26 @@ static int
>> sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,
>>      uint32_t err_source;
>>      int instance;
>>
>> -    instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
>> -    if (instance < 0)
>> -            return 0;
>> +    if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
> [Tao] AMDGPU_RAS_BLOCK__SDMA


See above explanation

Andrey


>
>> +            instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
>> +            if (instance < 0)
>> +                    return 0;
>>
>> -    switch (entry->src_id) {
>> -    case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
>> -            err_source = 0;
>> -            break;
>> -    case SDMA0_4_0__SRCID__SDMA_ECC:
>> -            err_source = 1;
>> -            break;
>> -    default:
>> -            return 0;
>> -    }
>> +            switch (entry->src_id) {
>> +            case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
>> +                    err_source = 0;
>> +                    break;
>> +            case SDMA0_4_0__SRCID__SDMA_ECC:
>> +                    err_source = 1;
>> +                    break;
>> +            default:
>> +                    return 0;
>> +            }
>>
>> -    kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>> +            kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
>>
>> -    amdgpu_ras_reset_gpu(adev, 0);
>> +            amdgpu_ras_reset_gpu(adev, 0);
>> +    }
>>
>>      return AMDGPU_RAS_SUCCESS;
>>   }
>> --
>> 2.7.4
>>
>> _______________________________________________
>> amd-gfx mailing list
>> [email protected]
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to