Add error-event support for correctable errors in CRI. On receiving an interrupt, report an error event to userspace for each component that has crossed its threshold.
Cc: Michal Wajdeczko <[email protected]> Signed-off-by: Riana Tauro <[email protected]> --- v2: add warns for unexpected values from system controller (Michal) send an event at most once per component for each interrupt (Raag) use correct parameters for get_counter (Sashiko) --- drivers/gpu/drm/xe/xe_ras.c | 75 +++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c index 44f4e1a3455b..b71d51285954 100644 --- a/drivers/gpu/drm/xe/xe_ras.c +++ b/drivers/gpu/drm/xe/xe_ras.c @@ -77,6 +77,18 @@ static u8 drm_to_xe_ras_severity(u8 severity) } } +static u8 xe_to_drm_ras_severity(u8 severity) +{ + switch (severity) { + case XE_RAS_SEV_CORRECTABLE: + return DRM_XE_RAS_ERR_SEV_CORRECTABLE; + case XE_RAS_SEV_UNCORRECTABLE: + return DRM_XE_RAS_ERR_SEV_UNCORRECTABLE; + default: + return DRM_XE_RAS_ERR_SEV_MAX; + } +} + static u8 drm_to_xe_ras_component(u8 component) { switch (component) { @@ -95,6 +107,24 @@ static u8 drm_to_xe_ras_component(u8 component) } } +static u8 xe_to_drm_ras_component(u8 component) +{ + switch (component) { + case XE_RAS_COMP_DEVICE_MEMORY: + return DRM_XE_RAS_ERR_COMP_DEVICE_MEMORY; + case XE_RAS_COMP_CORE_COMPUTE: + return DRM_XE_RAS_ERR_COMP_CORE_COMPUTE; + case XE_RAS_COMP_PCIE: + return DRM_XE_RAS_ERR_COMP_PCIE; + case XE_RAS_COMP_FABRIC: + return DRM_XE_RAS_ERR_COMP_FABRIC; + case XE_RAS_COMP_SOC_INTERNAL: + return DRM_XE_RAS_ERR_COMP_SOC_INTERNAL; + default: + return DRM_XE_RAS_ERR_COMP_MAX; + } +} + static int ras_status_to_errno(u32 status) { switch (status) { @@ -131,14 +161,41 @@ static inline const char *comp_to_str(u8 component) return xe_ras_components[component]; } +static void ras_send_error_event(struct xe_device *xe, u8 severity, u8 component) +{ + u8 drm_severity, drm_component; + u32 value; + int ret; + + drm_severity = xe_to_drm_ras_severity(severity); + if (drm_severity == DRM_XE_RAS_ERR_SEV_MAX) { + xe_warn(xe, "sysctrl: unexpected severity 
%u\n", severity); + return; + } + + drm_component = xe_to_drm_ras_component(component); + if (drm_component == DRM_XE_RAS_ERR_COMP_MAX) { + xe_warn(xe, "sysctrl: unexpected component %u\n", component); + return; + } + + ret = xe_ras_get_counter(xe, drm_severity, drm_component, &value); + if (ret) + return; + + xe_drm_ras_event(xe, drm_component, drm_severity, value, GFP_KERNEL); +} + void xe_ras_counter_threshold_crossed(struct xe_device *xe, struct xe_sysctrl_event_response *response) { struct xe_ras_threshold_crossed *pending = (void *)&response->data; struct xe_ras_error_class *errors = pending->counters; u32 id, ncounters = pending->ncounters; + u8 sent = 0; BUILD_BUG_ON(sizeof(response->data) < sizeof(*pending)); + BUILD_BUG_ON(XE_RAS_COMP_MAX > (BITS_PER_BYTE * sizeof(sent))); xe_device_assert_mem_access(xe); if (!ncounters || ncounters > XE_RAS_NUM_COUNTERS) @@ -154,6 +211,24 @@ void xe_ras_counter_threshold_crossed(struct xe_device *xe, xe_warn(xe, "[RAS]: %s %s detected\n", comp_to_str(component), sev_to_str(severity)); + + if (severity != XE_RAS_SEV_CORRECTABLE) { + xe_warn(xe, "sysctrl: unexpected severity %s (%u)\n", sev_to_str(severity), + severity); + continue; + } + + if (component >= XE_RAS_COMP_MAX) { + xe_warn(xe, "sysctrl: unexpected component %u\n", component); + continue; + } + + /* Send event once per component */ + if (sent & BIT(component)) + continue; + sent |= BIT(component); + + ras_send_error_event(xe, severity, component); } } -- 2.47.1
