All MTL+ devices support these correctable and non-fatal error
notifications over the IRQ. None of the currently supported platforms
expose an error counter directly in the HW.
But since the driver already handles the interrupt for these errors,
let's maintain the counters inside the driver itself and start using
the drm_ras generic netlink interface to report them.

Keep the csc_hw_error_work only for discrete devices.

Cc: Riana Tauro <riana.ta...@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.v...@intel.com>
---
 drivers/gpu/drm/xe/regs/xe_hw_error_regs.h |  22 +++
 drivers/gpu/drm/xe/xe_hw_error.c           | 155 ++++++++++++++++++++-
 2 files changed, 175 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
index c146b9ef44eb..860fc3b8a3c4 100644
--- a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
@@ -16,5 +16,27 @@
 #define DEV_ERR_STAT_REG(x)			XE_REG(_PICK_EVEN((x), \
 							DEV_ERR_STAT_CORRECTABLE, \
 							DEV_ERR_STAT_NONFATAL))
+#define XE_SGMI_DATA_PARITY_ERROR		BIT(25)
+#define XE_MERT_ERROR				BIT(20)
 #define XE_CSC_ERROR				BIT(17)
+#define XE_SOC_ERROR				BIT(16)
+#define XE_SGCI_DATA_PARITY_ERROR		BIT(13)
+#define XE_SGUNIT_ERROR				BIT(12)
+#define XE_SGLI_DATA_PARITY_ERROR		BIT(9)
+#define XE_GSC_ERROR				BIT(8)
+#define XE_SGDI_DATA_PARITY_ERROR		BIT(5)
+#define XE_DISPLAY_ERROR			BIT(4)
+#define XE_SGGI_DATA_PARITY_ERROR		BIT(1)
+#define XE_GT_ERROR				BIT(0)
+
+#define DEV_ERR_STAT_NONFATAL_VALID_MASK \
+	(XE_SGMI_DATA_PARITY_ERROR | XE_MERT_ERROR | XE_CSC_ERROR | XE_SOC_ERROR | \
+	 XE_SGCI_DATA_PARITY_ERROR | XE_SGUNIT_ERROR | XE_SGLI_DATA_PARITY_ERROR | \
+	 XE_GSC_ERROR | XE_SGDI_DATA_PARITY_ERROR | XE_DISPLAY_ERROR | \
+	 XE_SGGI_DATA_PARITY_ERROR | XE_GT_ERROR)
+
+#define DEV_ERR_STAT_CORRECTABLE_VALID_MASK \
+	(XE_CSC_ERROR | XE_SOC_ERROR | XE_SGUNIT_ERROR | XE_GSC_ERROR | \
+	 XE_DISPLAY_ERROR | XE_GT_ERROR)
+
 #endif
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index 8c65291f36fc..615d10cd83f0 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -3,7 +3,13 @@
  * Copyright © 2025 Intel Corporation
  */
 
+#include <linux/atomic.h>
 #include <linux/fault-inject.h>
+#include <linux/find.h>
+#include <linux/types.h>
+
+#include <drm/drm_managed.h>
+#include <drm/drm_ras.h>
 
 #include "regs/xe_gsc_regs.h"
 #include "regs/xe_hw_error_regs.h"
@@ -46,6 +52,93 @@ static const char *hw_error_to_str(const enum hardware_error hw_err)
 	}
 }
 
+struct error_info {
+	const char *name;
+	atomic64_t counter;
+};
+
+#define ERR_INFO(_bit, _name) \
+	[__ffs(_bit)] = { .name = _name, .counter = ATOMIC64_INIT(0) }
+
+static struct error_info dev_err_stat_nonfatal_reg[] = {
+	ERR_INFO(XE_GT_ERROR, "GT Error"),
+	ERR_INFO(XE_SGGI_DATA_PARITY_ERROR, "SGGI Data Parity Error"),
+	ERR_INFO(XE_DISPLAY_ERROR, "Display Error"),
+	ERR_INFO(XE_SGDI_DATA_PARITY_ERROR, "SGDI Data Parity Error"),
+	ERR_INFO(XE_GSC_ERROR, "GSC Error"),
+	ERR_INFO(XE_SGLI_DATA_PARITY_ERROR, "SGLI Data Parity Error"),
+	ERR_INFO(XE_SGUNIT_ERROR, "SG Unit Error"),
+	ERR_INFO(XE_SGCI_DATA_PARITY_ERROR, "SGCI Data Parity Error"),
+	ERR_INFO(XE_SOC_ERROR, "SoC Error"),
+	ERR_INFO(XE_CSC_ERROR, "CSC Error"),
+	ERR_INFO(XE_MERT_ERROR, "MERT Error"),
+	ERR_INFO(XE_SGMI_DATA_PARITY_ERROR, "SGMI Data Parity Error"),
+};
+
+static struct error_info dev_err_stat_correctable_reg[] = {
+	ERR_INFO(XE_GT_ERROR, "GT Error"),
+	ERR_INFO(XE_DISPLAY_ERROR, "Display Error"),
+	ERR_INFO(XE_GSC_ERROR, "GSC Error"),
+	ERR_INFO(XE_SGUNIT_ERROR, "SG Unit Error"),
+	ERR_INFO(XE_SOC_ERROR, "SoC Error"),
+	ERR_INFO(XE_CSC_ERROR, "CSC Error"),
+};
+
+static int hw_query_error_counter(struct error_info *error_list,
+				  u32 error_id, const char **name, u32 *val)
+{
+	*name = error_list[error_id].name;
+	*val = atomic64_read(&error_list[error_id].counter);
+
+	return 0;
+}
+
+static int query_error_counter_non_fatal(struct drm_ras_node *ep,
+					 u32 error_id,
+					 const char **name,
+					 u32 *val)
+{
+	if (error_id >= ARRAY_SIZE(dev_err_stat_nonfatal_reg))
+		return -EINVAL;
+
+	if (!(DEV_ERR_STAT_NONFATAL_VALID_MASK & BIT(error_id)) ||
+	    !dev_err_stat_nonfatal_reg[error_id].name)
+		return -ENOENT;
+
+	return hw_query_error_counter(dev_err_stat_nonfatal_reg,
+				      error_id, name, val);
+}
+
+static int query_error_counter_correctable(struct drm_ras_node *ep,
+					   u32 error_id,
+					   const char **name,
+					   u32 *val)
+{
+	if (error_id >= ARRAY_SIZE(dev_err_stat_correctable_reg))
+		return -EINVAL;
+
+	if (!(DEV_ERR_STAT_CORRECTABLE_VALID_MASK & BIT(error_id)) ||
+	    !dev_err_stat_correctable_reg[error_id].name)
+		return -ENOENT;
+
+	return hw_query_error_counter(dev_err_stat_correctable_reg,
+				      error_id, name, val);
+}
+
+static struct drm_ras_node node_non_fatal = {
+	.node_name = "non-fatal",
+	.type = DRM_RAS_NODE_TYPE_ERROR_COUNTER,
+	.error_counter_range.last = __ffs(XE_SGMI_DATA_PARITY_ERROR),
+	.query_error_counter = query_error_counter_non_fatal,
+};
+
+static struct drm_ras_node node_correctable = {
+	.node_name = "correctable",
+	.type = DRM_RAS_NODE_TYPE_ERROR_COUNTER,
+	.error_counter_range.last = __ffs(XE_CSC_ERROR),
+	.query_error_counter = query_error_counter_correctable,
+};
+
 static bool fault_inject_csc_hw_error(void)
 {
 	return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1);
@@ -97,6 +190,29 @@ static void csc_hw_error_handler(struct xe_tile *tile, const enum hardware_error
 	xe_mmio_write32(mmio, HEC_UNCORR_ERR_STATUS(base), err_src);
 }
 
+static void hw_error_counter(struct xe_device *xe,
+			     const enum hardware_error hw_err, const u32 err_src)
+{
+	struct error_info *err_info;
+	unsigned long err_bits = err_src;
+	unsigned long error;
+
+	if (hw_err == HARDWARE_ERROR_NONFATAL) {
+		err_info = dev_err_stat_nonfatal_reg;
+	} else if (hw_err == HARDWARE_ERROR_CORRECTABLE) {
+		err_info = dev_err_stat_correctable_reg;
+	} else {
+		drm_err_ratelimited(&xe->drm, HW_ERR
+				    "Error from non-supported source, err_src=0x%x\n",
+				    err_src);
+		return;
+	}
+
+	for_each_set_bit(error, &err_bits, 32) {
+		atomic64_inc(&err_info[error].counter);
+	}
+}
+
 static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
 {
 	const char *hw_err_str = hw_error_to_str(hw_err);
@@ -118,6 +234,8 @@ static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_er
 	if (err_src & XE_CSC_ERROR)
 		csc_hw_error_handler(tile, hw_err);
 
+	hw_error_counter(xe, hw_err, err_src);
+
 	xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), err_src);
 
 unlock:
@@ -162,6 +280,36 @@ static void process_hw_errors(struct xe_device *xe)
 	}
 }
 
+static void hw_error_counter_fini(struct drm_device *dev, void *res)
+{
+	drm_ras_node_unregister(&node_non_fatal);
+	drm_ras_node_unregister(&node_correctable);
+}
+
+static void hw_error_counter_init(struct xe_device *xe)
+{
+	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
+	char *name;
+
+	name = kasprintf(GFP_KERNEL, "%02x:%02x.%d",
+			 pdev->bus->number,
+			 PCI_SLOT(pdev->devfn),
+			 PCI_FUNC(pdev->devfn));
+	if (!name) {
+		drm_err(&xe->drm, "Failed to allocate memory for device name for drm_ras\n");
+		return;
+	}
+
+	node_non_fatal.device_name = name;
+	drm_ras_node_register(&node_non_fatal);
+
+	node_correctable.device_name = name;
+	drm_ras_node_register(&node_correctable);
+
+	if (drmm_add_action_or_reset(&xe->drm, hw_error_counter_fini, xe))
+		drm_err(&xe->drm, "Failed to add action for hw error counter fini\n");
+}
+
 /**
  * xe_hw_error_init - Initialize hw errors
  * @xe: xe device instance
@@ -173,10 +321,13 @@ void xe_hw_error_init(struct xe_device *xe)
 {
 	struct xe_tile *tile = xe_device_get_root_tile(xe);
 
-	if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
+	if (IS_SRIOV_VF(xe))
 		return;
 
-	INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work);
+	if (IS_DGFX(xe))
+		INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work);
 
 	process_hw_errors(xe);
+
+	hw_error_counter_init(xe);
 }
-- 
2.51.0
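
For reviewers unfamiliar with the indexing scheme above: ERR_INFO places each
counter at the array slot equal to its error bit position (__ffs),
hw_error_counter() walks the DEV_ERR_STAT value with for_each_set_bit() and
bumps the matching slot, and the drm_ras query callbacks treat error_id as
that same bit index. Below is a minimal, standalone userspace sketch of that
mapping, assuming nothing beyond standard C; the bit values are copied from
xe_hw_error_regs.h, while the helper names and the demo main() are purely
illustrative and not part of the patch or of drm_ras.

/*
 * Standalone illustration of the bit-index counter scheme used in
 * xe_hw_error.c above.  Plain C, not driver code: build with `cc demo.c`.
 */
#include <stdint.h>
#include <stdio.h>

#define BIT(n)			(1u << (n))
#define XE_GT_ERROR		BIT(0)
#define XE_DISPLAY_ERROR	BIT(4)
#define XE_CSC_ERROR		BIT(17)

/* One slot per bit position, mirroring ERR_INFO's [__ffs(_bit)] = { ... } */
static uint64_t counters[32];
static const char *names[32] = {
	[0]  = "GT Error",
	[4]  = "Display Error",
	[17] = "CSC Error",
};

/* Rough equivalent of the for_each_set_bit() loop in hw_error_counter() */
static void count_errors(uint32_t err_src)
{
	for (unsigned int bit = 0; bit < 32; bit++)
		if (err_src & BIT(bit))
			counters[bit]++;
}

int main(void)
{
	/* Pretend two interrupts arrived: CSC + GT first, then CSC alone */
	count_errors(XE_CSC_ERROR | XE_GT_ERROR);
	count_errors(XE_CSC_ERROR);

	/* Query side: error_id is simply the bit index of the error */
	for (unsigned int id = 0; id < 32; id++)
		if (names[id] && counters[id])
			printf("error_id %2u (%s): %llu\n", id, names[id],
			       (unsigned long long)counters[id]);
	return 0;
}

Running it prints the non-zero counters keyed by bit index, which is the same
(error_id, name, counter) triple the query_error_counter_*() callbacks hand
back to the drm_ras layer.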