Allocate correctable, nonfatal and fatal DRM RAS nodes per xe device.
Each node contains its error classes, counters and the corresponding
counter query functions.
Add basic functionality to create and register the drm nodes.
Signed-off-by: Riana Tauro <[email protected]>
---
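For reviewers, a minimal sketch of how one of the per-device nodes ends up
wired (uses only symbols introduced in the diff below; struct drm_ras_node,
DRM_RAS_NODE_TYPE_ERROR_COUNTER and drm_ras_node_register() come from the
drm_ras core this series builds on, and the BDF string is illustrative):

	/* one counter node per hardware error class, e.g. the fatal one */
	struct drm_ras_node *node = &xe->ras.node[HARDWARE_ERROR_FATAL];
	int ret;

	node->device_name = "0000:03:00.0";	/* made-up PCI BDF of the xe device */
	node->node_name = "fatal-errors";	/* "<class>-errors" */
	node->type = DRM_RAS_NODE_TYPE_ERROR_COUNTER;
	node->query_error_counter = query_fatal_error_counters;
	node->priv = xe;

	ret = drm_ras_node_register(node);
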
drivers/gpu/drm/xe/Makefile | 1 +
drivers/gpu/drm/xe/xe_device_types.h | 4 +
drivers/gpu/drm/xe/xe_drm_ras.c | 221 ++++++++++++++++++++++++++
drivers/gpu/drm/xe/xe_drm_ras.h | 12 ++
drivers/gpu/drm/xe/xe_drm_ras_types.h | 54 +++++++
drivers/gpu/drm/xe/xe_hw_error.c | 38 ++---
include/uapi/drm/xe_drm.h | 5 +
7 files changed, 313 insertions(+), 22 deletions(-)
create mode 100644 drivers/gpu/drm/xe/xe_drm_ras.c
create mode 100644 drivers/gpu/drm/xe/xe_drm_ras.h
create mode 100644 drivers/gpu/drm/xe/xe_drm_ras_types.h
diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index b848da79a4e1..7bc805b33e12 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -41,6 +41,7 @@ xe-y += xe_bb.o \
xe_device_sysfs.o \
xe_dma_buf.o \
xe_drm_client.o \
+ xe_drm_ras.o \
xe_eu_stall.o \
xe_exec.o \
xe_exec_queue.o \
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index 6ce3247d1bd8..69097e3b3995 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -13,6 +13,7 @@
#include <drm/ttm/ttm_device.h>
#include "xe_devcoredump_types.h"
+#include "xe_drm_ras_types.h"
#include "xe_heci_gsc.h"
#include "xe_late_bind_fw_types.h"
#include "xe_lmtt_types.h"
@@ -353,6 +354,9 @@ struct xe_device {
bool oob_initialized;
} wa_active;
+ /** @ras: ras structure for device */
+ struct xe_drm_ras ras;
+
/** @survivability: survivability information for device */
struct xe_survivability survivability;
diff --git a/drivers/gpu/drm/xe/xe_drm_ras.c b/drivers/gpu/drm/xe/xe_drm_ras.c
new file mode 100644
index 000000000000..5320e845e9d5
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_drm_ras.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#include <drm/drm_managed.h>
+#include <drm/drm_ras.h>
+#include <drm/xe_drm.h>
+
+#include "xe_device.h"
+#include "xe_drm_ras.h"
+
+#define ERR_INFO(index, _name) \
+ [index] = { .name = _name, .counter = 0 }
+
+static struct xe_drm_ras_counter error_info[] = {
+ ERR_INFO(DRM_XE_GENL_CORE_COMPUTE, "GT Error"),
+};
+
+static int hw_query_error_counter(struct xe_drm_ras_counter *info,
+ u32 error_id, const char **name, u32 *val)
+{
+ *name = info[error_id].name;
+ *val = info[error_id].counter;
+
+ return 0;
+}
+
+static int query_non_fatal_error_counters(struct drm_ras_node *ep,
+ u32 error_id, const char **name,
+ u32 *val)
+{
+ struct xe_device *xe = ep->priv;
+ struct xe_drm_ras *ras = &xe->ras;
+ struct xe_drm_ras_counter *info = ras->info[HARDWARE_ERROR_NONFATAL];
+
+ if (error_id >= ARRAY_SIZE(error_info))
+ return -EINVAL;
+
+ if (!error_info[error_id].name)
+ return -ENOENT;
+
+ return hw_query_error_counter(info, error_id, name, val);
+}
+
+static int query_fatal_error_counters(struct drm_ras_node *ep,
+ u32 error_id, const char **name,
+ u32 *val)
+{
+ struct xe_device *xe = ep->priv;
+ struct xe_drm_ras *ras = &xe->ras;
+ struct xe_drm_ras_counter *info = ras->info[HARDWARE_ERROR_FATAL];
+
+ if (error_id >= ARRAY_SIZE(error_info))
+ return -EINVAL;
+
+ if (!error_info[error_id].name)
+ return -ENOENT;
+
+ return hw_query_error_counter(info, error_id, name, val);
+}
+
+static int query_correctable_error_counters(struct drm_ras_node *ep,
+ u32 error_id, const char **name,
+ u32 *val)
+{
+ struct xe_device *xe = ep->priv;
+ struct xe_drm_ras *ras = &xe->ras;
+ struct xe_drm_ras_counter *info = ras->info[HARDWARE_ERROR_CORRECTABLE];
+
+ if (error_id >= ARRAY_SIZE(error_info))
+ return -EINVAL;
+
+ if (!error_info[error_id].name)
+ return -ENOENT;
+
+ return hw_query_error_counter(info, error_id, name, val);
+}
+
+static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *xe,
+ int count,
+ struct xe_drm_ras_counter *src)
+{
+ struct xe_drm_ras_counter *counter;
+
+ counter = drmm_kzalloc(&xe->drm, count * sizeof(struct xe_drm_ras_counter), GFP_KERNEL);
+ if (!counter)
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(counter, src, count * sizeof(struct xe_drm_ras_counter));
+
+ return counter;
+}
+
+static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node,
+ enum hardware_error hw_err)
+{
+ struct xe_drm_ras *ras = &xe->ras;
+ int count = 0, ret = 0;
+
+ count = ARRAY_SIZE(error_info);
+ node->error_counter_range.first = DRM_XE_GENL_CORE_COMPUTE;
+ node->error_counter_range.last = count - 1;
+
+ switch (hw_err) {
+ case HARDWARE_ERROR_CORRECTABLE:
+ ras->info[hw_err] = allocate_and_copy_counters(xe, count, error_info);
+ if (IS_ERR(ras->info[hw_err]))
+ return PTR_ERR(ras->info[hw_err]);
+ node->query_error_counter = query_correctable_error_counters;
+ break;
+ case HARDWARE_ERROR_NONFATAL:
+ ras->info[hw_err] = allocate_and_copy_counters(xe, count, error_info);
+ if (IS_ERR(ras->info[hw_err]))
+ return PTR_ERR(ras->info[hw_err]);
+ node->query_error_counter = query_non_fatal_error_counters;
+ break;
+ case HARDWARE_ERROR_FATAL:
+ ras->info[hw_err] = allocate_and_copy_counters(xe, count, error_info);
+ if (IS_ERR(ras->info[hw_err]))
+ return PTR_ERR(ras->info[hw_err]);
+ node->query_error_counter = query_fatal_error_counters;
+ break;
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+static int register_nodes(struct xe_device *xe)
+{
+ struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
+ struct xe_drm_ras *ras = &xe->ras;
+ const char *device_name;
+ int i = 0, ret;
+
+ device_name = kasprintf(GFP_KERNEL, "%04x:%02x:%02x.%d",
+ pci_domain_nr(pdev->bus), pdev->bus->number,
+ PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+
+ for (i = 0; i < HARDWARE_ERROR_MAX; i++) {
+ struct drm_ras_node *node = &ras->node[i];
+ const char *hw_err_str = hw_error_to_str(i);
+ const char *node_name;
+
+ node_name = kasprintf(GFP_KERNEL, "%s-errors", hw_err_str);
+
+ node->device_name = device_name;
+ node->node_name = node_name;
+ node->type = DRM_RAS_NODE_TYPE_ERROR_COUNTER;
+
+ ret = assign_node_params(xe, node, i);
+ if (ret) {
+ kfree(node->node_name);
+ return ret;
+ }
+
+ node->priv = xe;
+
+ ret = drm_ras_node_register(node);
+ if (ret) {
+ drm_err(&xe->drm, "Failed to register drm ras tile
node\n");
+ kfree(node->node_name);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void xe_drm_ras_unregister_nodes(void *arg)
+{
+ struct xe_device *xe = arg;
+ struct xe_drm_ras *ras = &xe->ras;
+ int i = 0;
+
+ for (i = 0; i < HARDWARE_ERROR_MAX; i++) {
+ struct drm_ras_node *node = &ras->node[i];
+
+ drm_ras_node_unregister(node);
+
+ kfree(node->node_name);
+ if (i == 0)
+ kfree(node->device_name);
+ }
+}
+
+/**
+ * xe_drm_ras_allocate_nodes - Allocate drm ras nodes
+ * @xe: xe device instance
+ *
+ * Allocate xe drm ras nodes for all hardware error classes of the device
+ *
+ * Return: 0 on success, error code on failure
+ */
+int xe_drm_ras_allocate_nodes(struct xe_device *xe)
+{
+ struct drm_ras_node *node;
+ int err;
+
+ node = drmm_kzalloc(&xe->drm, HARDWARE_ERROR_MAX * sizeof(struct drm_ras_node), GFP_KERNEL);
+ if (!node)
+ return -ENOMEM;
+
+ xe->ras.node = node;
+
+ err = register_nodes(xe);
+ if (err) {
+ drm_err(&xe->drm, "Failed to register drm ras node\n");
+ return err;
+ }
+
+ err = devm_add_action_or_reset(xe->drm.dev, xe_drm_ras_unregister_nodes, xe);
+ if (err) {
+ drm_err(&xe->drm, "Failed to add action for xe drm_ras\n");
+ return err;
+ }
+
+ return 0;
+}
diff --git a/drivers/gpu/drm/xe/xe_drm_ras.h b/drivers/gpu/drm/xe/xe_drm_ras.h
new file mode 100644
index 000000000000..6272b5da4e6d
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_drm_ras.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+#ifndef XE_DRM_RAS_H_
+#define XE_DRM_RAS_H_
+
+struct xe_device;
+
+int xe_drm_ras_allocate_nodes(struct xe_device *xe);
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_drm_ras_types.h b/drivers/gpu/drm/xe/xe_drm_ras_types.h
new file mode 100644
index 000000000000..452ff9a91510
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_drm_ras_types.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#ifndef _XE_DRM_RAS_TYPES_H_
+#define _XE_DRM_RAS_TYPES_H_
+
+#include <linux/limits.h>
+
+struct drm_ras_node;
+
+/* Error categories reported by hardware */
+enum hardware_error {
+ HARDWARE_ERROR_CORRECTABLE = 0,
+ HARDWARE_ERROR_NONFATAL = 1,
+ HARDWARE_ERROR_FATAL = 2,
+ HARDWARE_ERROR_MAX,
+};
+
+static inline const char *hw_error_to_str(const enum hardware_error hw_err)
+{
+ switch (hw_err) {
+ case HARDWARE_ERROR_CORRECTABLE:
+ return "correctable";
+ case HARDWARE_ERROR_NONFATAL:
+ return "nonfatal";
+ case HARDWARE_ERROR_FATAL:
+ return "fatal";
+ default:
+ return "UNKNOWN";
+ }
+}
+
+struct xe_drm_ras_counter {
+ const char *name;
+ int counter;
+};
+
+/**
+ * struct xe_drm_ras - xe drm ras structure
+ *
+ * This structure holds the per-device DRM RAS nodes and their error counters
+ */
+struct xe_drm_ras {
+ /** @node: DRM RAS node */
+ struct drm_ras_node *node;
+
+ /** @info: info array for all types of errors */
+ struct xe_drm_ras_counter *info[HARDWARE_ERROR_MAX];
+
+};
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index 8c65291f36fc..2adc2e6540f6 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -10,6 +10,7 @@
#include "regs/xe_irq_regs.h"
#include "xe_device.h"
+#include "xe_drm_ras.h"
#include "xe_hw_error.h"
#include "xe_mmio.h"
#include "xe_survivability_mode.h"
@@ -17,14 +18,6 @@
#define HEC_UNCORR_FW_ERR_BITS 4
extern struct fault_attr inject_csc_hw_error;
-/* Error categories reported by hardware */
-enum hardware_error {
- HARDWARE_ERROR_CORRECTABLE = 0,
- HARDWARE_ERROR_NONFATAL = 1,
- HARDWARE_ERROR_FATAL = 2,
- HARDWARE_ERROR_MAX,
-};
-
static const char * const hec_uncorrected_fw_errors[] = {
"Fatal",
"CSE Disabled",
@@ -32,20 +25,6 @@ static const char * const hec_uncorrected_fw_errors[] = {
"Data Corruption"
};
-static const char *hw_error_to_str(const enum hardware_error hw_err)
-{
- switch (hw_err) {
- case HARDWARE_ERROR_CORRECTABLE:
- return "CORRECTABLE";
- case HARDWARE_ERROR_NONFATAL:
- return "NONFATAL";
- case HARDWARE_ERROR_FATAL:
- return "FATAL";
- default:
- return "UNKNOWN";
- }
-}
-
static bool fault_inject_csc_hw_error(void)
{
return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1);
@@ -146,6 +125,20 @@ void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
hw_error_source_handler(tile, hw_err);
}
+static int hw_error_info_init(struct xe_device *xe)
+{
+ int ret;
+
+ if (xe->info.platform != XE_PVC)
+ return 0;
+
+ ret = xe_drm_ras_allocate_nodes(xe);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
/*
* Process hardware errors during boot
*/
@@ -178,5 +171,6 @@ void xe_hw_error_init(struct xe_device *xe)
INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work);
+ hw_error_info_init(xe);
process_hw_errors(xe);
}
diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index 47853659a705..053cbe1aafbb 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -2273,6 +2273,11 @@ struct drm_xe_vm_query_mem_range_attr {
};
+/**
+ * DRM_XE_GENL_CORE_COMPUTE - RAS error counter identifier for GT errors
+ */
+#define DRM_XE_GENL_CORE_COMPUTE (1)