Add a write-only debugfs file 'inject_punit_error' under the DRI device root that allows triggering punit_error_handler() manually for testing. Writing 1 to the file invokes the handler, which suppresses spurious Surprise Link Down AER events on slots and declares the device wedged for a cold reset recovery.
Usage: echo 1 > /sys/kernel/debug/dri/<N>/inject_punit_error

Signed-off-by: Mallesh Koujalagi <[email protected]>
---
 drivers/gpu/drm/xe/xe_debugfs.c |  3 +++
 drivers/gpu/drm/xe/xe_ras.c     | 41 +++++++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_ras.h     |  8 ++++++++
 3 files changed, 52 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c
index 22b471303984..bf22985a9fcf 100644
--- a/drivers/gpu/drm/xe/xe_debugfs.c
+++ b/drivers/gpu/drm/xe/xe_debugfs.c
@@ -28,6 +28,7 @@
 #include "xe_sriov_pf_debugfs.h"
 #include "xe_sriov_vf.h"
 #include "xe_step.h"
+#include "xe_ras.h"
 #include "xe_tile_debugfs.h"
 #include "xe_vsec.h"
 #include "xe_wa.h"
@@ -610,6 +611,8 @@ void xe_debugfs_register(struct xe_device *xe)
 
 	xe_psmi_debugfs_register(xe);
 
+	xe_ras_debugfs_register(xe, root);
+
 	fault_create_debugfs_attr("fail_gt_reset", root, &gt_reset_failure);
 
 	if (IS_SRIOV_PF(xe))
diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c
index acdedf403649..687ac6b3a2fe 100644
--- a/drivers/gpu/drm/xe/xe_ras.c
+++ b/drivers/gpu/drm/xe/xe_ras.c
@@ -3,6 +3,8 @@
  * Copyright © 2026 Intel Corporation
  */
 
+#include <linux/debugfs.h>
+
 #include "xe_bo.h"
 #include "xe_assert.h"
 #include "xe_device_types.h"
@@ -522,6 +524,45 @@ enum xe_ras_recovery_action xe_ras_process_errors(struct xe_device *xe)
 	return XE_RAS_RECOVERY_ACTION_RESET;
 }
 
+#ifdef CONFIG_DRM_XE_DEBUG
+/* Writing 1 runs punit_error_handler(); any other value is rejected. */
+static ssize_t inject_punit_error_write(struct file *f, const char __user *ubuf,
+					size_t size, loff_t *pos)
+{
+	struct xe_device *xe = f->private_data;
+	u32 val;
+	int ret;
+
+	ret = kstrtouint_from_user(ubuf, size, 0, &val);
+	if (ret)
+		return ret;
+
+	if (val != 1)
+		return -EINVAL;
+
+	punit_error_handler(xe);
+
+	return size;
+}
+
+static const struct file_operations inject_punit_error_fops = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.write = inject_punit_error_write,
+};
+
+/**
+ * xe_ras_debugfs_register() - Register RAS debugfs entries
+ * @xe: xe device instance
+ * @root: debugfs directory in which to create the entries
+ */
+void xe_ras_debugfs_register(struct xe_device *xe, struct dentry *root)
+{
+	debugfs_create_file("inject_punit_error", 0200, root, xe,
+			    &inject_punit_error_fops);
+}
+#endif /* CONFIG_DRM_XE_DEBUG */
+
 static struct pci_dev *find_usp_dev(struct pci_dev *pdev)
 {
 	struct pci_dev *vsp;
diff --git a/drivers/gpu/drm/xe/xe_ras.h b/drivers/gpu/drm/xe/xe_ras.h
index 8d106c708ff1..c9f84ef238c4 100644
--- a/drivers/gpu/drm/xe/xe_ras.h
+++ b/drivers/gpu/drm/xe/xe_ras.h
@@ -16,4 +16,12 @@ void xe_ras_counter_threshold_crossed(struct xe_device *xe,
 void xe_ras_init(struct xe_device *xe);
 enum xe_ras_recovery_action xe_ras_process_errors(struct xe_device *xe);
 
+struct dentry;
+
+#ifdef CONFIG_DRM_XE_DEBUG
+void xe_ras_debugfs_register(struct xe_device *xe, struct dentry *root);
+#else
+static inline void xe_ras_debugfs_register(struct xe_device *xe, struct dentry *root) {}
+#endif
+
 #endif
-- 
2.34.1
