Allocate correctable, nonfatal and fatal DRM RAS nodes per xe device.
Each node contains its error classes, counters and the corresponding
counter query functions.
Add basic functionality to create and register the drm nodes.
Signed-off-by: Riana Tauro <[email protected]>
---
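For reviewers, a minimal sketch of how one of the per-device nodes ends up
wired (uses only symbols introduced in the diff below; struct drm_ras_node,
DRM_RAS_NODE_TYPE_ERROR_COUNTER and drm_ras_node_register() come from the
drm_ras core this series builds on, and the BDF string is illustrative):

	/* one counter node per hardware error class, e.g. the fatal one */
	struct drm_ras_node *node = &xe->ras.node[HARDWARE_ERROR_FATAL];
	int ret;

	node->device_name = "0000:03:00.0";	/* made-up PCI BDF of the xe device */
	node->node_name = "fatal-errors";	/* "<class>-errors" */
	node->type = DRM_RAS_NODE_TYPE_ERROR_COUNTER;
	node->query_error_counter = query_fatal_error_counters;
	node->priv = xe;

	ret = drm_ras_node_register(node);
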
drivers/gpu/drm/xe/Makefile | 1 +
drivers/gpu/drm/xe/xe_device_types.h | 4 +
drivers/gpu/drm/xe/xe_drm_ras.c | 221 ++++++++++++++++++++++++++
drivers/gpu/drm/xe/xe_drm_ras.h | 12 ++
drivers/gpu/drm/xe/xe_drm_ras_types.h | 54 +++++++
drivers/gpu/drm/xe/xe_hw_error.c | 38 ++---
include/uapi/drm/xe_drm.h | 5 +
7 files changed, 313 insertions(+), 22 deletions(-)
create mode 100644 drivers/gpu/drm/xe/xe_drm_ras.c
create mode 100644 drivers/gpu/drm/xe/xe_drm_ras.h
create mode 100644 drivers/gpu/drm/xe/xe_drm_ras_types.h
diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index b848da79a4e1..7bc805b33e12 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -41,6 +41,7 @@ xe-y += xe_bb.o \
xe_device_sysfs.o \
xe_dma_buf.o \
xe_drm_client.o \
+ xe_drm_ras.o \
xe_eu_stall.o \
xe_exec.o \
xe_exec_queue.o \
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index 6ce3247d1bd8..69097e3b3995 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -13,6 +13,7 @@
#include <drm/ttm/ttm_device.h>
#include "xe_devcoredump_types.h"
+#include "xe_drm_ras_types.h"
#include "xe_heci_gsc.h"
#include "xe_late_bind_fw_types.h"
#include "xe_lmtt_types.h"
@@ -353,6 +354,9 @@ struct xe_device {
bool oob_initialized;
} wa_active;
+ /** @ras: ras structure for device */
+ struct xe_drm_ras ras;
+
/** @survivability: survivability information for device */
struct xe_survivability survivability;
diff --git a/drivers/gpu/drm/xe/xe_drm_ras.c b/drivers/gpu/drm/xe/xe_drm_ras.c
new file mode 100644
index 000000000000..5320e845e9d5
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_drm_ras.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#include <drm/drm_managed.h>
+#include <drm/drm_ras.h>
+#include <drm/xe_drm.h>
+
+#include "xe_device.h"
+#include "xe_drm_ras.h"
+
+#define ERR_INFO(index, _name) \
+ [index] = { .name = _name, .counter = 0 }
+
+static struct xe_drm_ras_counter error_info[] = {
+ ERR_INFO(DRM_XE_GENL_CORE_COMPUTE, "GT Error"),
+};
+
+static int hw_query_error_counter(struct xe_drm_ras_counter *info,
+ u32 error_id, const char **name, u32 *val)
+{
+ *name = info[error_id].name;
+ *val = info[error_id].counter;
+
+ return 0;
+}
+
+static int query_non_fatal_error_counters(struct drm_ras_node *ep,
+ u32 error_id, const char **name,
+ u32 *val)
+{
+ struct xe_device *xe = ep->priv;
+ struct xe_drm_ras *ras = &xe->ras;
+ struct xe_drm_ras_counter *info = ras->info[HARDWARE_ERROR_NONFATAL];
+
+ if (error_id >= ARRAY_SIZE(error_info))
+ return -EINVAL;
+
+ if (!error_info[error_id].name)
+ return -ENOENT;
+
+ return hw_query_error_counter(info, error_id, name, val);
+}
+
+static int query_fatal_error_counters(struct drm_ras_node *ep,
+ u32 error_id, const char **name,
+ u32 *val)
+{
+ struct xe_device *xe = ep->priv;
+ struct xe_drm_ras *ras = &xe->ras;
+ struct xe_drm_ras_counter *info = ras->info[HARDWARE_ERROR_FATAL];
+
+ if (error_id >= ARRAY_SIZE(error_info))
+ return -EINVAL;
+
+ if (!error_info[error_id].name)
+ return -ENOENT;
+
+ return hw_query_error_counter(info, error_id, name, val);
+}
+
+static int query_correctable_error_counters(struct drm_ras_node *ep,
+ u32 error_id, const char **name,
+ u32 *val)
+{
+ struct xe_device *xe = ep->priv;
+ struct xe_drm_ras *ras = &xe->ras;
+ struct xe_drm_ras_counter *info = ras->info[HARDWARE_ERROR_CORRECTABLE];
+
+ if (error_id >= ARRAY_SIZE(error_info))
+ return -EINVAL;
+
+ if (!error_info[error_id].name)
+ return -ENOENT;
+
+ return hw_query_error_counter(info, error_id, name, val);
+}
+
+static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *xe,
+ int count,
+ struct xe_drm_ras_counter *src)
+{
+ struct xe_drm_ras_counter *counter;
+
+ counter = drmm_kzalloc(&xe->drm, count * sizeof(struct xe_drm_ras_counter), GFP_KERNEL);
+ if (!counter)
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(counter, src, count * sizeof(struct xe_drm_ras_counter));
+
+ return counter;
+}
+
+static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node,
+ enum hardware_error hw_err)
+{
+ struct xe_drm_ras *ras = &xe->ras;
+ int count = 0, ret = 0;
+
+ count = ARRAY_SIZE(error_info);
+ node->error_counter_range.first = DRM_XE_GENL_CORE_COMPUTE;
+ node->error_counter_range.last = count - 1;
+
+ switch (hw_err) {
+ case HARDWARE_ERROR_CORRECTABLE:
+ ras->info[hw_err] = allocate_and_copy_counters(xe, count, error_info);
+ if (IS_ERR(ras->info[hw_err]))
+ return PTR_ERR(ras->info[hw_err]);
+ node->query_error_counter = query_correctable_error_counters;
+ break;
+ case HARDWARE_ERROR_NONFATAL:
+ ras->info[hw_err] = allocate_and_copy_counters(xe, count, error_info);
+ if (IS_ERR(ras->info[hw_err]))
+ return PTR_ERR(ras->info[hw_err]);
+ node->query_error_counter = query_non_fatal_error_counters;
+ break;
+ case HARDWARE_ERROR_FATAL:
+ ras->info[hw_err] = allocate_and_copy_counters(xe, count, error_info);
+ if (IS_ERR(ras->info[hw_err]))
+ return PTR_ERR(ras->info[hw_err]);
+ node->query_error_counter = query_fatal_error_counters;
+ break;
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+static int register_nodes(struct xe_device *xe)
+{
+ struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
+ struct xe_drm_ras *ras = &xe->ras;
+ const char *device_name;
+ int i = 0, ret;
+
+ device_name = kasprintf(GFP_KERNEL, "%04x:%02x:%02x.%d",
+ pci_domain_nr(pdev->bus), pdev->bus->number,
+ PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+
+ for (i = 0; i < HARDWARE_ERROR_MAX; i++) {
+ struct drm_ras_node *node = &ras->node[i];
+ const char *hw_err_str = hw_error_to_str(i);
+ const char *node_name;
+
+ node_name = kasprintf(GFP_KERNEL, "%s-errors", hw_err_str);
+
+ node->device_name = device_name;
+ node->node_name = node_name;
+ node->type = DRM_RAS_NODE_TYPE_ERROR_COUNTER;
+
+ ret = assign_node_params(xe, node, i);
+ if (ret) {
+ kfree(node->node_name);
+ return ret;
+ }
+
+ node->priv = xe;
+
+ ret = drm_ras_node_register(node);
+ if (ret) {
+ drm_err(&xe->drm, "Failed to register drm ras tile
node\n");
+ kfree(node->node_name);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void xe_drm_ras_unregister_nodes(void *arg)
+{
+ struct xe_device *xe = arg;
+ struct xe_drm_ras *ras = &xe->ras;
+ int i = 0;
+
+ for (i = 0; i < HARDWARE_ERROR_MAX; i++) {
+ struct drm_ras_node *node = &ras->node[i];
+
+ drm_ras_node_unregister(node);
+
+ kfree(node->node_name);
+ if (i == 0)
+ kfree(node->device_name);
+ }
+}
+
+/**
+ * xe_drm_ras_allocate_nodes - Allocate drm ras nodes
+ * @xe: xe device instance
+ *
+ * Allocate xe drm ras nodes for all hardware error classes of the device
+ *
+ * Return: 0 on success, error code on failure
+ */
+int xe_drm_ras_allocate_nodes(struct xe_device *xe)
+{
+ struct drm_ras_node *node;
+ int err;
+
+ node = drmm_kzalloc(&xe->drm, HARDWARE_ERROR_MAX * sizeof(struct drm_ras_node), GFP_KERNEL);
+ if (!node)
+ return -ENOMEM;
+
+ xe->ras.node = node;
+
+ err = register_nodes(xe);
+ if (err) {
+ drm_err(&xe->drm, "Failed to register drm ras node\n");
+ return err;
+ }
+
+ err = devm_add_action_or_reset(xe->drm.dev, xe_drm_ras_unregister_nodes, xe);
+ if (err) {
+ drm_err(&xe->drm, "Failed to add action for xe drm_ras\n");
+ return err;
+ }
+
+ return 0;
+}
diff --git a/drivers/gpu/drm/xe/xe_drm_ras.h b/drivers/gpu/drm/xe/xe_drm_ras.h
new file mode 100644
index 000000000000..6272b5da4e6d
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_drm_ras.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+#ifndef XE_DRM_RAS_H_
+#define XE_DRM_RAS_H_
+
+struct xe_device;
+
+int xe_drm_ras_allocate_nodes(struct xe_device *xe);
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_drm_ras_types.h b/drivers/gpu/drm/xe/xe_drm_ras_types.h
new file mode 100644
index 000000000000..452ff9a91510
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_drm_ras_types.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#ifndef _XE_DRM_RAS_TYPES_H_
+#define _XE_DRM_RAS_TYPES_H_
+
+#include <linux/limits.h>
+
+struct drm_ras_node;
+
+/* Error categories reported by hardware */
+enum hardware_error {
+ HARDWARE_ERROR_CORRECTABLE = 0,
+ HARDWARE_ERROR_NONFATAL = 1,
+ HARDWARE_ERROR_FATAL = 2,
+ HARDWARE_ERROR_MAX,
+};
+
+static inline const char *hw_error_to_str(const enum hardware_error hw_err)
+{
+ switch (hw_err) {
+ case HARDWARE_ERROR_CORRECTABLE:
+ return "correctable";
+ case HARDWARE_ERROR_NONFATAL:
+ return "nonfatal";
+ case HARDWARE_ERROR_FATAL:
+ return "fatal";
+ default:
+ return "UNKNOWN";
+ }
+}
+
+struct xe_drm_ras_counter {
+ const char *name;
+ int counter;
+};
+
+/**
+ * struct xe_drm_ras - xe drm ras structure
+ *
+ * This structure holds the per-device DRM RAS nodes and their error counters
+ */
+struct xe_drm_ras {
+ /** @node: DRM RAS node */
+ struct drm_ras_node *node;
+
+ /** @info: info array for all types of errors */
+ struct xe_drm_ras_counter *info[HARDWARE_ERROR_MAX];
+
+};
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index 8c65291f36fc..2adc2e6540f6 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -10,6 +10,7 @@
#include "regs/xe_irq_regs.h"
#include "xe_device.h"
+#include "xe_drm_ras.h"
#include "xe_hw_error.h"
#include "xe_mmio.h"
#include "xe_survivability_mode.h"
@@ -17,14 +18,6 @@
#define HEC_UNCORR_FW_ERR_BITS 4
extern struct fault_attr inject_csc_hw_error;
-/* Error categories reported by hardware */
-enum hardware_error {
- HARDWARE_ERROR_CORRECTABLE = 0,
- HARDWARE_ERROR_NONFATAL = 1,
- HARDWARE_ERROR_FATAL = 2,
- HARDWARE_ERROR_MAX,
-};
-
static const char * const hec_uncorrected_fw_errors[] = {
"Fatal",
"CSE Disabled",
@@ -32,20 +25,6 @@ static const char * const hec_uncorrected_fw_errors[] = {
"Data Corruption"
};
-static const char *hw_error_to_str(const enum hardware_error hw_err)
-{
- switch (hw_err) {
- case HARDWARE_ERROR_CORRECTABLE:
- return "CORRECTABLE";
- case HARDWARE_ERROR_NONFATAL:
- return "NONFATAL";
- case HARDWARE_ERROR_FATAL:
- return "FATAL";
- default:
- return "UNKNOWN";
- }
-}
-
static bool fault_inject_csc_hw_error(void)
{
return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1);
@@ -146,6 +125,20 @@ void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
hw_error_source_handler(tile, hw_err);
}
+static int hw_error_info_init(struct xe_device *xe)
+{
+ int ret;
+
+ if (xe->info.platform != XE_PVC)
+ return 0;
+
+ ret = xe_drm_ras_allocate_nodes(xe);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
/*
* Process hardware errors during boot
*/
@@ -178,5 +171,6 @@ void xe_hw_error_init(struct xe_device *xe)
INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work);
+ hw_error_info_init(xe);
process_hw_errors(xe);
}
diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index 47853659a705..053cbe1aafbb 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -2273,6 +2273,11 @@ struct drm_xe_vm_query_mem_range_attr {
};
+/**
+ * DRM_XE_GENL_CORE_COMPUTE - RAS error counter identifier for GT errors
+ */
+#define DRM_XE_GENL_CORE_COMPUTE (1)