Add a write-only debugfs file 'inject_punit_error' under the DRI device
root that allows triggering punit_error_handler() manually for testing.
Writing 1 to the file invokes the handler, which suppresses spurious
Surprise Link Down AER events on slots and declares the device wedged
so that it can be recovered with a cold reset. Any other numeric value
is rejected with -EINVAL. The file is only created when
CONFIG_DRM_XE_DEBUG is enabled.

Usage:
  echo 1 > /sys/kernel/debug/dri/<N>/inject_punit_error

Signed-off-by: Mallesh Koujalagi <[email protected]>
---
 drivers/gpu/drm/xe/xe_debugfs.c |  3 +++
 drivers/gpu/drm/xe/xe_ras.c     | 35 +++++++++++++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_ras.h     |  7 +++++++
 3 files changed, 45 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c
index 22b471303984..bf22985a9fcf 100644
--- a/drivers/gpu/drm/xe/xe_debugfs.c
+++ b/drivers/gpu/drm/xe/xe_debugfs.c
@@ -28,6 +28,7 @@
 #include "xe_sriov_pf_debugfs.h"
 #include "xe_sriov_vf.h"
 #include "xe_step.h"
+#include "xe_ras.h"
 #include "xe_tile_debugfs.h"
 #include "xe_vsec.h"
 #include "xe_wa.h"
@@ -610,6 +611,8 @@ void xe_debugfs_register(struct xe_device *xe)
 
        xe_psmi_debugfs_register(xe);
 
+       xe_ras_debugfs_register(xe, root);
+
        fault_create_debugfs_attr("fail_gt_reset", root, &gt_reset_failure);
 
        if (IS_SRIOV_PF(xe))
diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c
index acdedf403649..687ac6b3a2fe 100644
--- a/drivers/gpu/drm/xe/xe_ras.c
+++ b/drivers/gpu/drm/xe/xe_ras.c
@@ -3,6 +3,8 @@
  * Copyright © 2026 Intel Corporation
  */
 
+#include <linux/debugfs.h>
+
 #include "xe_bo.h"
 #include "xe_assert.h"
 #include "xe_device_types.h"
@@ -522,6 +524,39 @@ enum xe_ras_recovery_action xe_ras_process_errors(struct xe_device *xe)
        return XE_RAS_RECOVERY_ACTION_RESET;
 }
 
+#ifdef CONFIG_DRM_XE_DEBUG
+static ssize_t inject_punit_error_write(struct file *f, const char __user *ubuf,
+                                       size_t size, loff_t *pos)
+{
+       struct xe_device *xe = f->private_data;
+       u32 val;
+       int ret;
+
+       ret = kstrtouint_from_user(ubuf, size, 0, &val);
+       if (ret)
+               return ret;
+
+       if (val != 1)
+               return -EINVAL;
+
+       punit_error_handler(xe);
+
+       return size;
+}
+
+static const struct file_operations inject_punit_error_fops = {
+       .owner = THIS_MODULE,
+       .open = simple_open,
+       .write = inject_punit_error_write,
+};
+
+void xe_ras_debugfs_register(struct xe_device *xe, struct dentry *root)
+{
+       debugfs_create_file("inject_punit_error", 0200, root, xe,
+                           &inject_punit_error_fops);
+}
+#endif /* CONFIG_DRM_XE_DEBUG */
+
 static struct pci_dev *find_usp_dev(struct pci_dev *pdev)
 {
        struct pci_dev *vsp;
diff --git a/drivers/gpu/drm/xe/xe_ras.h b/drivers/gpu/drm/xe/xe_ras.h
index 8d106c708ff1..c9f84ef238c4 100644
--- a/drivers/gpu/drm/xe/xe_ras.h
+++ b/drivers/gpu/drm/xe/xe_ras.h
@@ -16,4 +16,11 @@ void xe_ras_counter_threshold_crossed(struct xe_device *xe,
 void xe_ras_init(struct xe_device *xe);
 enum xe_ras_recovery_action xe_ras_process_errors(struct xe_device *xe);
 
+#ifdef CONFIG_DRM_XE_DEBUG
+struct dentry;
+void xe_ras_debugfs_register(struct xe_device *xe, struct dentry *root);
+#else
+static inline void xe_ras_debugfs_register(struct xe_device *xe, struct dentry *root) {}
+#endif
+
 #endif
-- 
2.34.1

Reply via email to