All MTL+ devices support these correctable and non-fatal error
notifications over the IRQ. None of the currently supported platforms
expose an error counter directly in the HW.
But since the driver already handles the interrupt for these errors,
let's maintain the counters inside the driver itself and start using
the drm_ras generic netlink interface to report them.

Keep the csc_hw_error_work only for discrete devices.

Cc: Riana Tauro <riana.ta...@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.v...@intel.com>
---
 drivers/gpu/drm/xe/regs/xe_hw_error_regs.h |  22 +++
 drivers/gpu/drm/xe/xe_hw_error.c           | 155 ++++++++++++++++++++-
 2 files changed, 175 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
index c146b9ef44eb..860fc3b8a3c4 100644
--- a/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_hw_error_regs.h
@@ -16,5 +16,27 @@
 #define DEV_ERR_STAT_REG(x)			XE_REG(_PICK_EVEN((x), \
 							DEV_ERR_STAT_CORRECTABLE, \
 							DEV_ERR_STAT_NONFATAL))
+#define XE_SGMI_DATA_PARITY_ERROR		BIT(25)
+#define XE_MERT_ERROR				BIT(20)
 #define XE_CSC_ERROR				BIT(17)
+#define XE_SOC_ERROR				BIT(16)
+#define XE_SGCI_DATA_PARITY_ERROR		BIT(13)
+#define XE_SGUNIT_ERROR				BIT(12)
+#define XE_SGLI_DATA_PARITY_ERROR		BIT(9)
+#define XE_GSC_ERROR				BIT(8)
+#define XE_SGDI_DATA_PARITY_ERROR		BIT(5)
+#define XE_DISPLAY_ERROR			BIT(4)
+#define XE_SGGI_DATA_PARITY_ERROR		BIT(1)
+#define XE_GT_ERROR				BIT(0)
+
+#define DEV_ERR_STAT_NONFATAL_VALID_MASK \
+	(XE_SGMI_DATA_PARITY_ERROR | XE_MERT_ERROR | XE_CSC_ERROR | XE_SOC_ERROR | \
+	 XE_SGCI_DATA_PARITY_ERROR | XE_SGUNIT_ERROR | XE_SGLI_DATA_PARITY_ERROR | \
+	 XE_GSC_ERROR | XE_SGDI_DATA_PARITY_ERROR | XE_DISPLAY_ERROR | \
+	 XE_SGGI_DATA_PARITY_ERROR | XE_GT_ERROR)
+
+#define DEV_ERR_STAT_CORRECTABLE_VALID_MASK \
+	(XE_CSC_ERROR | XE_SOC_ERROR | XE_SGUNIT_ERROR | XE_GSC_ERROR | \
+	 XE_DISPLAY_ERROR | XE_GT_ERROR)
+
 #endif
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index 8c65291f36fc..615d10cd83f0 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -3,7 +3,13 @@
  * Copyright © 2025 Intel Corporation
  */
 
+#include <linux/atomic.h>
 #include <linux/fault-inject.h>
+#include <linux/find.h>
+#include <linux/types.h>
+
+#include <drm/drm_managed.h>
+#include <drm/drm_ras.h>
 
 #include "regs/xe_gsc_regs.h"
 #include "regs/xe_hw_error_regs.h"
@@ -46,6 +52,93 @@ static const char *hw_error_to_str(const enum hardware_error hw_err)
 	}
 }
 
+struct error_info {
+	const char *name;
+	atomic64_t counter;
+};
+
+#define ERR_INFO(_bit, _name) \
+	[__ffs(_bit)] = { .name = _name, .counter = ATOMIC64_INIT(0) }
+
+static struct error_info dev_err_stat_nonfatal_reg[] = {
+	ERR_INFO(XE_GT_ERROR, "GT Error"),
+	ERR_INFO(XE_SGGI_DATA_PARITY_ERROR, "SGGI Data Parity Error"),
+	ERR_INFO(XE_DISPLAY_ERROR, "Display Error"),
+	ERR_INFO(XE_SGDI_DATA_PARITY_ERROR, "SGDI Data Parity Error"),
+	ERR_INFO(XE_GSC_ERROR, "GSC Error"),
+	ERR_INFO(XE_SGLI_DATA_PARITY_ERROR, "SGLI Data Parity Error"),
+	ERR_INFO(XE_SGUNIT_ERROR, "SG Unit Error"),
+	ERR_INFO(XE_SGCI_DATA_PARITY_ERROR, "SGCI Data Parity Error"),
+	ERR_INFO(XE_SOC_ERROR, "SoC Error"),
+	ERR_INFO(XE_CSC_ERROR, "CSC Error"),
+	ERR_INFO(XE_MERT_ERROR, "MERT Error"),
+	ERR_INFO(XE_SGMI_DATA_PARITY_ERROR, "SGMI Data Parity Error"),
+};
+
+static struct error_info dev_err_stat_correctable_reg[] = {
+	ERR_INFO(XE_GT_ERROR, "GT Error"),
+	ERR_INFO(XE_DISPLAY_ERROR, "Display Error"),
+	ERR_INFO(XE_GSC_ERROR, "GSC Error"),
+	ERR_INFO(XE_SGUNIT_ERROR, "SG Unit Error"),
+	ERR_INFO(XE_SOC_ERROR, "SoC Error"),
+	ERR_INFO(XE_CSC_ERROR, "CSC Error"),
+};
+
+static int hw_query_error_counter(struct error_info *error_list,
+				  u32 error_id, const char **name, u32 *val)
+{
+	*name = error_list[error_id].name;
+	*val = atomic64_read(&error_list[error_id].counter);
+
+	return 0;
+}
+
+static int query_error_counter_non_fatal(struct drm_ras_node *ep,
+					 u32 error_id,
+					 const char **name,
+					 u32 *val)
+{
+	if (error_id >= ARRAY_SIZE(dev_err_stat_nonfatal_reg))
+		return -EINVAL;
+
+	if (!(DEV_ERR_STAT_NONFATAL_VALID_MASK & BIT(error_id)) ||
+	    !dev_err_stat_nonfatal_reg[error_id].name)
+		return -ENOENT;
+
+	return hw_query_error_counter(dev_err_stat_nonfatal_reg,
+				      error_id, name, val);
+}
+
+static int query_error_counter_correctable(struct drm_ras_node *ep,
+					   u32 error_id,
+					   const char **name,
+					   u32 *val)
+{
+	if (error_id >= ARRAY_SIZE(dev_err_stat_correctable_reg))
+		return -EINVAL;
+
+	if (!(DEV_ERR_STAT_CORRECTABLE_VALID_MASK & BIT(error_id)) ||
+	    !dev_err_stat_correctable_reg[error_id].name)
+		return -ENOENT;
+
+	return hw_query_error_counter(dev_err_stat_correctable_reg,
+				      error_id, name, val);
+}
+
+static struct drm_ras_node node_non_fatal = {
+	.node_name = "non-fatal",
+	.type = DRM_RAS_NODE_TYPE_ERROR_COUNTER,
+	.error_counter_range.last = __ffs(XE_SGMI_DATA_PARITY_ERROR),
+	.query_error_counter = query_error_counter_non_fatal,
+};
+
+static struct drm_ras_node node_correctable = {
+	.node_name = "correctable",
+	.type = DRM_RAS_NODE_TYPE_ERROR_COUNTER,
+	.error_counter_range.last = __ffs(XE_CSC_ERROR),
+	.query_error_counter = query_error_counter_correctable,
+};
+
 static bool fault_inject_csc_hw_error(void)
 {
 	return IS_ENABLED(CONFIG_DEBUG_FS) && should_fail(&inject_csc_hw_error, 1);
@@ -97,6 +190,29 @@ static void csc_hw_error_handler(struct xe_tile *tile, const enum hardware_error
 	xe_mmio_write32(mmio, HEC_UNCORR_ERR_STATUS(base), err_src);
 }
 
+static void hw_error_counter(struct xe_device *xe,
+			     const enum hardware_error hw_err, const u32 err_src)
+{
+	struct error_info *err_info;
+	unsigned long err_bits = err_src;
+	unsigned long error;
+
+	if (hw_err == HARDWARE_ERROR_NONFATAL) {
+		err_info = dev_err_stat_nonfatal_reg;
+	} else if (hw_err == HARDWARE_ERROR_CORRECTABLE) {
+		err_info = dev_err_stat_correctable_reg;
+	} else {
+		drm_err_ratelimited(&xe->drm, HW_ERR
+				    "Error from non-supported source, err_src=0x%x\n",
+				    err_src);
+		return;
+	}
+
+	for_each_set_bit(error, &err_bits, 32) {
+		atomic64_inc(&err_info[error].counter);
+	}
+}
+
 static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_error hw_err)
 {
 	const char *hw_err_str = hw_error_to_str(hw_err);
@@ -118,6 +234,8 @@ static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_er
 	if (err_src & XE_CSC_ERROR)
 		csc_hw_error_handler(tile, hw_err);
 
+	hw_error_counter(xe, hw_err, err_src);
+
 	xe_mmio_write32(&tile->mmio, DEV_ERR_STAT_REG(hw_err), err_src);
 
 unlock:
@@ -162,6 +280,36 @@ static void process_hw_errors(struct xe_device *xe)
 	}
 }
 
+static void hw_error_counter_fini(struct drm_device *dev, void *res)
+{
+	drm_ras_node_unregister(&node_non_fatal);
+	drm_ras_node_unregister(&node_correctable);
+}
+
+static void hw_error_counter_init(struct xe_device *xe)
+{
+	struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
+	char *name;
+
+	name = kasprintf(GFP_KERNEL, "%02x:%02x.%d",
+			 pdev->bus->number,
+			 PCI_SLOT(pdev->devfn),
+			 PCI_FUNC(pdev->devfn));
+	if (!name) {
+		drm_err(&xe->drm, "Failed to allocate memory for device name for drm_ras\n");
+		return;
+	}
+
+	node_non_fatal.device_name = name;
+	drm_ras_node_register(&node_non_fatal);
+
+	node_correctable.device_name = name;
+	drm_ras_node_register(&node_correctable);
+
+	if (drmm_add_action_or_reset(&xe->drm, hw_error_counter_fini, xe))
+		drm_err(&xe->drm, "Failed to add action for hw error counter fini\n");
+}
+
 /**
  * xe_hw_error_init - Initialize hw errors
  * @xe: xe device instance
@@ -173,10 +321,13 @@ void xe_hw_error_init(struct xe_device *xe)
 {
 	struct xe_tile *tile = xe_device_get_root_tile(xe);
 
-	if (!IS_DGFX(xe) || IS_SRIOV_VF(xe))
+	if (IS_SRIOV_VF(xe))
 		return;
 
-	INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work);
+	if (IS_DGFX(xe))
+		INIT_WORK(&tile->csc_hw_error_work, csc_hw_error_work);
 
 	process_hw_errors(xe);
+
+	hw_error_counter_init(xe);
 }
-- 
2.51.0
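
For reviewers unfamiliar with the indexing scheme above: ERR_INFO places each
counter at the array slot equal to its error bit position (__ffs),
hw_error_counter() walks the DEV_ERR_STAT value with for_each_set_bit() and
bumps the matching slot, and the drm_ras query callbacks treat error_id as
that same bit index. Below is a minimal, standalone userspace sketch of that
mapping, assuming nothing beyond standard C; the bit values are copied from
xe_hw_error_regs.h, while the helper names and the demo main() are purely
illustrative and not part of the patch or of drm_ras.

/*
 * Standalone illustration of the bit-index counter scheme used in
 * xe_hw_error.c above.  Plain C, not driver code: build with `cc demo.c`.
 */
#include <stdint.h>
#include <stdio.h>

#define BIT(n)			(1u << (n))
#define XE_GT_ERROR		BIT(0)
#define XE_DISPLAY_ERROR	BIT(4)
#define XE_CSC_ERROR		BIT(17)

/* One slot per bit position, mirroring ERR_INFO's [__ffs(_bit)] = { ... } */
static uint64_t counters[32];
static const char *names[32] = {
	[0]  = "GT Error",
	[4]  = "Display Error",
	[17] = "CSC Error",
};

/* Rough equivalent of the for_each_set_bit() loop in hw_error_counter() */
static void count_errors(uint32_t err_src)
{
	for (unsigned int bit = 0; bit < 32; bit++)
		if (err_src & BIT(bit))
			counters[bit]++;
}

int main(void)
{
	/* Pretend two interrupts arrived: CSC + GT first, then CSC alone */
	count_errors(XE_CSC_ERROR | XE_GT_ERROR);
	count_errors(XE_CSC_ERROR);

	/* Query side: error_id is simply the bit index of the error */
	for (unsigned int id = 0; id < 32; id++)
		if (names[id] && counters[id])
			printf("error_id %2u (%s): %llu\n", id, names[id],
			       (unsigned long long)counters[id]);
	return 0;
}

Running it prints the non-zero counters keyed by bit index, which is the same
(error_id, name, counter) triple the query_error_counter_*() callbacks hand
back to the drm_ras layer.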