Allocate correctable, nonfatal, and fatal DRM RAS nodes per xe device. Each node covers a set of error classes, their counters, and the corresponding counter-query function.
Add basic functionality to create and register drm nodes. Signed-off-by: Riana Tauro <[email protected]> --- drivers/gpu/drm/xe/Makefile | 1 + drivers/gpu/drm/xe/xe_device_types.h | 4 + drivers/gpu/drm/xe/xe_drm_ras.c | 221 ++++++++++++++++++++++++++ drivers/gpu/drm/xe/xe_drm_ras.h | 12 ++ drivers/gpu/drm/xe/xe_drm_ras_types.h | 54 +++++++ drivers/gpu/drm/xe/xe_hw_error.c | 38 ++--- include/uapi/drm/xe_drm.h | 5 + 7 files changed, 313 insertions(+), 22 deletions(-) create mode 100644 drivers/gpu/drm/xe/xe_drm_ras.c create mode 100644 drivers/gpu/drm/xe/xe_drm_ras.h create mode 100644 drivers/gpu/drm/xe/xe_drm_ras_types.h diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile index b848da79a4e1..7bc805b33e12 100644 --- a/drivers/gpu/drm/xe/Makefile +++ b/drivers/gpu/drm/xe/Makefile @@ -41,6 +41,7 @@ xe-y += xe_bb.o \ xe_device_sysfs.o \ xe_dma_buf.o \ xe_drm_client.o \ + xe_drm_ras.o \ xe_eu_stall.o \ xe_exec.o \ xe_exec_queue.o \ diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h index 6ce3247d1bd8..69097e3b3995 100644 --- a/drivers/gpu/drm/xe/xe_device_types.h +++ b/drivers/gpu/drm/xe/xe_device_types.h @@ -13,6 +13,7 @@ #include <drm/ttm/ttm_device.h> #include "xe_devcoredump_types.h" +#include "xe_drm_ras_types.h" #include "xe_heci_gsc.h" #include "xe_late_bind_fw_types.h" #include "xe_lmtt_types.h" @@ -353,6 +354,9 @@ struct xe_device { bool oob_initialized; } wa_active; + /** @ras: ras structure for device */ + struct xe_drm_ras ras; + /** @survivability: survivability information for device */ struct xe_survivability survivability; diff --git a/drivers/gpu/drm/xe/xe_drm_ras.c b/drivers/gpu/drm/xe/xe_drm_ras.c new file mode 100644 index 000000000000..5320e845e9d5 --- /dev/null +++ b/drivers/gpu/drm/xe/xe_drm_ras.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2025 Intel Corporation + */ + +#include <drm/drm_managed.h> +#include <drm/drm_ras.h> +#include <drm/xe_drm.h> + +#include 
"xe_device.h" +#include "xe_drm_ras.h" + +#define ERR_INFO(index, _name) \ + [index] = { .name = _name, .counter = 0 } + +static struct xe_drm_ras_counter error_info[] = { + ERR_INFO(DRM_XE_GENL_CORE_COMPUTE, "GT Error"), +}; + +static int hw_query_error_counter(struct xe_drm_ras_counter *info, + u32 error_id, const char **name, u32 *val) +{ + *name = info[error_id].name; + *val = info[error_id].counter; + + return 0; +} + +static int query_non_fatal_error_counters(struct drm_ras_node *ep, + u32 error_id, const char **name, + u32 *val) +{ + struct xe_device *xe = ep->priv; + struct xe_drm_ras *ras = &xe->ras; + struct xe_drm_ras_counter *info = ras->info[HARDWARE_ERROR_NONFATAL]; + + if (error_id >= ARRAY_SIZE(error_info)) + return -EINVAL; + + if (!error_info[error_id].name) + return -ENOENT; + + return hw_query_error_counter(info, error_id, name, val); +} + +static int query_fatal_error_counters(struct drm_ras_node *ep, + u32 error_id, const char **name, + u32 *val) +{ + struct xe_device *xe = ep->priv; + struct xe_drm_ras *ras = &xe->ras; + struct xe_drm_ras_counter *info = ras->info[HARDWARE_ERROR_FATAL]; + + if (error_id >= ARRAY_SIZE(error_info)) + return -EINVAL; + + if (!error_info[error_id].name) + return -ENOENT; + + return hw_query_error_counter(info, error_id, name, val); +} + +static int query_correctable_error_counters(struct drm_ras_node *ep, + u32 error_id, const char **name, + u32 *val) +{ + struct xe_device *xe = ep->priv; + struct xe_drm_ras *ras = &xe->ras; + struct xe_drm_ras_counter *info = ras->info[HARDWARE_ERROR_CORRECTABLE]; + + if (error_id >= ARRAY_SIZE(error_info)) + return -EINVAL; + + if (!error_info[error_id].name) + return -ENOENT; + + return hw_query_error_counter(info, error_id, name, val); +} + +static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *xe, + int count, + struct xe_drm_ras_counter *src) +{ + struct xe_drm_ras_counter *counter; + + counter = drmm_kzalloc(&xe->drm, count * sizeof(struct 
xe_drm_ras_counter), GFP_KERNEL); + if (!counter) + return ERR_PTR(-ENOMEM); + + memcpy(counter, src, count * sizeof(struct xe_drm_ras_counter)); + + return counter; +} + +static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node, + enum hardware_error hw_err) +{ + struct xe_drm_ras *ras = &xe->ras; + int count = 0, ret = 0; + + count = ARRAY_SIZE(error_info); + node->error_counter_range.first = DRM_XE_GENL_CORE_COMPUTE; + node->error_counter_range.last = count - 1; + + switch (hw_err) { + case HARDWARE_ERROR_CORRECTABLE: + ras->info[hw_err] = allocate_and_copy_counters(xe, count, error_info); + if (IS_ERR(ras->info[hw_err])) + return PTR_ERR(ras->info[hw_err]); + node->query_error_counter = query_correctable_error_counters; + break; + case HARDWARE_ERROR_NONFATAL: + ras->info[hw_err] = allocate_and_copy_counters(xe, count, error_info); + if (IS_ERR(ras->info[hw_err])) + return PTR_ERR(ras->info[hw_err]); + node->query_error_counter = query_non_fatal_error_counters; + break; + case HARDWARE_ERROR_FATAL: + ras->info[hw_err] = allocate_and_copy_counters(xe, count, error_info); + if (IS_ERR(ras->info[hw_err])) + return PTR_ERR(ras->info[hw_err]); + node->query_error_counter = query_fatal_error_counters; + break; + default: + break; + } + + return ret; +} + +static int register_nodes(struct xe_device *xe) +{ + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + struct xe_drm_ras *ras = &xe->ras; + const char *device_name; + int i = 0, ret; + + device_name = kasprintf(GFP_KERNEL, "%04x:%02x:%02x.%d", + pci_domain_nr(pdev->bus), pdev->bus->number, + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); + + for (i = 0; i < HARDWARE_ERROR_MAX; i++) { + struct drm_ras_node *node = &ras->node[i]; + const char *hw_err_str = hw_error_to_str(i); + const char *node_name; + + node_name = kasprintf(GFP_KERNEL, "%s-errors", hw_err_str); + + node->device_name = device_name; + node->node_name = node_name; + node->type = DRM_RAS_NODE_TYPE_ERROR_COUNTER; + + ret = 
assign_node_params(xe, node, i); + if (ret) { + kfree(node->node_name); + return ret; + } + + node->priv = xe; + + ret = drm_ras_node_register(node); + if (ret) { + drm_err(&xe->drm, "Failed to register drm ras tile node\n"); + kfree(node->node_name); + return ret; + } + } + + return 0; +} + +static void xe_drm_ras_unregister_nodes(void *arg) +{ + struct xe_device *xe = arg; + struct xe_drm_ras *ras = &xe->ras; + int i = 0; + + for (i = 0; i < HARDWARE_ERROR_MAX; i++) { + struct drm_ras_node *node = &ras->node[i]; + + drm_ras_node_unregister(node); + + kfree(node->node_name); + if (i == 0) + kfree(node->device_name); + } +} + +/** + * xe_drm_ras_allocate_nodes - Allocate drm ras nodes + * @xe: xe device instance + * + * Allocate xe drm ras nodes for all errors in a tile + * + * Return: 0 on success, error code on failure + */ +int xe_drm_ras_allocate_nodes(struct xe_device *xe) +{ + struct drm_ras_node *node; + int err; + + node = drmm_kzalloc(&xe->drm, HARDWARE_ERROR_MAX * sizeof(struct drm_ras_node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + xe->ras.node = node; + + err = register_nodes(xe); + if (err) { + drm_err(&xe->drm, "Failed to register drm ras node\n"); + return err; + } + + err = devm_add_action_or_reset(xe->drm.dev, xe_drm_ras_unregister_nodes, xe); + if (err) { + drm_err(&xe->drm, "Failed to add action for xe drm_ras\n"); + return err; + } + + return 0; +} diff --git a/drivers/gpu/drm/xe/xe_drm_ras.h b/drivers/gpu/drm/xe/xe_drm_ras.h new file mode 100644 index 000000000000..6272b5da4e6d --- /dev/null +++ b/drivers/gpu/drm/xe/xe_drm_ras.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2025 Intel Corporation + */ +#ifndef XE_DRM_RAS_H_ +#define XE_DRM_RAS_H_ + +struct xe_device; + +int xe_drm_ras_allocate_nodes(struct xe_device *xe); + +#endif diff --git a/drivers/gpu/drm/xe/xe_drm_ras_types.h b/drivers/gpu/drm/xe/xe_drm_ras_types.h new file mode 100644 index 000000000000..452ff9a91510 --- /dev/null +++ 
b/drivers/gpu/drm/xe/xe_drm_ras_types.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2025 Intel Corporation + */ + +#ifndef _XE_DRM_RAS_TYPES_H_ +#define _XE_DRM_RAS_TYPES_H_ + +#include <linux/limits.h> + +struct drm_ras_node; + +/* Error categories reported by hardware */ +enum hardware_error { + HARDWARE_ERROR_CORRECTABLE = 0, + HARDWARE_ERROR_NONFATAL = 1, + HARDWARE_ERROR_FATAL = 2, + HARDWARE_ERROR_MAX, +}; + +static inline const char *hw_error_to_str(const enum hardware_error hw_err) +{ + switch (hw_err) { + case HARDWARE_ERROR_CORRECTABLE: + return "correctable"; + case HARDWARE_ERROR_NONFATAL: + return "nonfatal"; + case HARDWARE_ERROR_FATAL: + return "fatal"; + default: + return "UNKNOWN"; + } +} + +struct xe_drm_ras_counter { + const char *name; + int counter; +}; + +/** + * struct xe_drm_ras - xe drm ras structure + * + * This structure has details of error counters + */ +struct xe_drm_ras { + /** @node: DRM RAS node */ + struct drm_ras_node *node; + + /** @info: info array for all types of errors */ + struct xe_drm_ras_counter *info[HARDWARE_ERROR_MAX]; + +}; + +#endif diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c index 8c65291f36fc..2adc2e6540f6 100644 --- a/drivers/gpu/drm/xe/xe_hw_error.c +++ b/drivers/gpu/drm/xe/xe_hw_error.c @@ -10,6 +10,7 @@ #include "regs/xe_irq_regs.h" #include "xe_device.h" +#include "xe_drm_ras.h" #include "xe_hw_error.h" #include "xe_mmio.h" #include "xe_survivability_mode.h" @@ -17,14 +18,6 @@ #define HEC_UNCORR_FW_ERR_BITS 4 extern struct fault_attr inject_csc_hw_error; -/* Error categories reported by hardware */ -enum hardware_error { - HARDWARE_ERROR_CORRECTABLE = 0, - HARDWARE_ERROR_NONFATAL = 1, - HARDWARE_ERROR_FATAL = 2, - HARDWARE_ERROR_MAX, -}; - static const char * const hec_uncorrected_fw_errors[] = { "Fatal", "CSE Disabled", @@ -32,20 +25,6 @@ static const char * const hec_uncorrected_fw_errors[] = { "Data Corruption" }; -static const char 
*hw_error_to_str(const enum hardware_error hw_err) -{ - switch (hw_err) { - case HARDWARE_ERROR_CORRECTABLE: - return "CORRECTABLE"; - case HARDWARE_ERROR_NONFATAL: - return "NONFATAL"; - case HARDWARE_ERROR_FATAL: - return "FATAL"; - default: - return "UNKNOWN"; - } -} - static bool fault_inject_csc_hw_error(void) { return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1); @@ -146,6 +125,20 @@ void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl) hw_error_source_handler(tile, hw_err); } +static int hw_error_info_init(struct xe_device *xe) +{ + int ret; + + if (xe->info.platform != XE_PVC) + return 0; + + ret = xe_drm_ras_allocate_nodes(xe); + if (ret) + return ret; + + return 0; +} + /* * Process hardware errors during boot */ @@ -178,5 +171,6 @@ void xe_hw_error_init(struct xe_device *xe) INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work); + hw_error_info_init(xe); process_hw_errors(xe); } diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index 47853659a705..053cbe1aafbb 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -2273,6 +2273,11 @@ struct drm_xe_vm_query_mem_range_attr { }; +/** + * RAS Counters + */ +#define DRM_XE_GENL_CORE_COMPUTE (1) + #if defined(__cplusplus) } #endif -- 2.47.1
