We have a problem distinguishing intended hangs submitted
by igt during CI/BAT from unintended hangs that happen
in close proximity.

As we know how igt constructs a batch intended to hang
the GPU, we can use this to our advantage when the error
state is constructed. The signature of an intended hang can
be parsed out at this stage.

Batches that are expected to hang can be watermarked by:

batch[i++] = MI_BATCH_BUFFER_END;
batch[i++] = MI_NOOP;
batch[i++] = IGT_HANG_SIGNATURE; /* 0xc5c5c5c5 */

Note that we do the parsing only with gem debug builds.

Later, when we print the notification about the hang to
dmesg, we can use DRM_INFO for intended hangs and
DRM_WARN for unintended ones. This way auxiliary tools
can make a better judgement on what to consider important
for reporting.

Cc: Chris Wilson <[email protected]>
Cc: Daniel Vetter <[email protected]>
Cc: Joonas Lahtinen <[email protected]>
Signed-off-by: Mika Kuoppala <[email protected]>
---
 drivers/gpu/drm/i915/i915_debugfs.c              |   2 +-
 drivers/gpu/drm/i915/i915_drv.h                  |   6 +-
 drivers/gpu/drm/i915/i915_gpu_error.c            | 100 +++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_irq.c                  |   4 +-
 drivers/gpu/drm/i915/intel_hangcheck.c           |   6 +-
 drivers/gpu/drm/i915/selftests/intel_hangcheck.c |   2 +-
 6 files changed, 104 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index 39883cd915db..678f472a8956 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4092,7 +4092,7 @@ i915_wedged_set(void *data, u64 val)
                engine->hangcheck.stalled = true;
        }
 
-       i915_handle_error(i915, val, "Manually setting wedged to %llu", val);
+       i915_handle_error(i915, true, val, "Manually setting wedged to %llu", val);
 
        wait_on_bit(&i915->gpu_error.flags,
                    I915_RESET_HANDOFF,
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index fe93115c4caa..91e0ab8509d2 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1004,6 +1004,8 @@ struct i915_gpu_state {
                        u32 *pages[0];
                } *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
 
+               bool batch_had_hang_signature;
+
                struct drm_i915_error_object **user_bo;
                long user_bo_count;
 
@@ -3372,8 +3374,9 @@ static inline void i915_queue_hangcheck(struct 
drm_i915_private *dev_priv)
                           &dev_priv->gpu_error.hangcheck_work, delay);
 }
 
-__printf(3, 4)
+__printf(4, 5)
 void i915_handle_error(struct drm_i915_private *dev_priv,
+                      bool intentional,
                       u32 engine_mask,
                       const char *fmt, ...);
 
@@ -3960,6 +3963,7 @@ static inline void i915_error_state_buf_release(
 
 struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
 void i915_capture_error_state(struct drm_i915_private *dev_priv,
+                             bool intentional,
                              u32 engine_mask,
                              const char *error_msg);
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index 5c2d83a838d8..a5bfefb94d6b 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -918,15 +918,52 @@ void __i915_gpu_state_free(struct kref *error_ref)
        kfree(error);
 }
 
+#ifdef CONFIG_DRM_I915_DEBUG_GEM
+
+#define HANG_SIGNATURE_MAGIC 0xc5c5c5c5
+
+static bool batch_has_hang_signature(const u32 * const batch, u64 len)
+{
+       struct marker { u32 val; u64 offset; } m[2] = {
+               { MI_BATCH_BUFFER_END, -1 },
+               { HANG_SIGNATURE_MAGIC, -1 }
+       };
+       u64 i, j;
+
+       len = min_t(u64, len, PAGE_SIZE);
+       len = DIV_ROUND_UP(len, 4);
+
+       for (i = 0, j = 0; i < len && j < ARRAY_SIZE(m); i++) {
+               if (m[j].offset == -1 && batch[i] == m[j].val)
+                       m[j++].offset = i;
+
+               if (m[0].offset != -1 && i - m[0].offset > 2)
+                       break;
+       }
+
+       if (m[0].offset == -1 || m[1].offset == -1)
+               return false;
+
+       return m[1].offset - m[0].offset == 2;
+}
+#else
+static bool batch_has_hang_signature(const u32 * const batch, u64 len)
+{
+       return false; /* no parsing without CONFIG_DRM_I915_DEBUG_GEM */
+}
+#endif
+
 static struct drm_i915_error_object *
-i915_error_object_create(struct drm_i915_private *i915,
-                        struct i915_vma *vma)
+__i915_error_object_create(struct drm_i915_private *i915,
+                          struct i915_vma *vma,
+                          bool *hangsig)
 {
        struct i915_ggtt *ggtt = &i915->ggtt;
        const u64 slot = ggtt->error_capture.start;
        struct drm_i915_error_object *dst;
        struct compress compress;
-       unsigned long num_pages;
+       unsigned long num_pages, src_pages;
+       unsigned long page_count = 0;
        struct sgt_iter iter;
        dma_addr_t dma;
 
@@ -934,6 +971,7 @@ i915_error_object_create(struct drm_i915_private *i915,
                return NULL;
 
        num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
+       src_pages = num_pages;
        num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
        dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
                      GFP_ATOMIC | __GFP_NOWARN);
@@ -958,9 +996,16 @@ i915_error_object_create(struct drm_i915_private *i915,
                                       I915_CACHE_NONE, 0);
 
                s = io_mapping_map_atomic_wc(&ggtt->mappable, slot);
+
+               /* Check last page only for intentional hang signature */
+               if (hangsig && page_count == src_pages - 1)
+                       *hangsig = batch_has_hang_signature((void __force *)s,
+                                                           vma->size);
+
                ret = compress_page(&compress, (void  __force *)s, dst);
                io_mapping_unmap_atomic(s);
 
+               page_count++;
                if (ret)
                        goto unwind;
        }
@@ -978,6 +1023,21 @@ i915_error_object_create(struct drm_i915_private *i915,
        return dst;
 }
 
+static struct drm_i915_error_object *
+i915_error_object_create(struct drm_i915_private *i915,
+                        struct i915_vma *vma)
+{
+       return __i915_error_object_create(i915, vma, NULL);
+}
+
+static struct drm_i915_error_object *
+i915_error_batch_object_create(struct drm_i915_private *i915,
+                              struct i915_vma *vma,
+                              bool *signature)
+{
+       return __i915_error_object_create(i915, vma, signature);
+}
+
 /* The error capture is special as tries to run underneath the normal
  * locking rules - so we use the raw version of the i915_gem_active lookup.
  */
@@ -1484,8 +1544,9 @@ static void i915_gem_record_rings(struct drm_i915_private 
*dev_priv,
                         * by userspace.
                         */
                        ee->batchbuffer =
-                               i915_error_object_create(dev_priv,
-                                                        request->batch);
+                               i915_error_batch_object_create(dev_priv,
+                                                              request->batch,
+                                                              &ee->batch_had_hang_signature);
 
                        if (HAS_BROKEN_CS_TLB(dev_priv))
                                ee->wa_batchbuffer =
@@ -1701,7 +1762,8 @@ static void i915_capture_reg_state(struct 
drm_i915_private *dev_priv,
 static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
                                   struct i915_gpu_state *error,
                                   u32 engine_mask,
-                                  const char *error_msg)
+                                  const char *error_msg,
+                                  bool simulated)
 {
        u32 ecode;
        int engine_id = -1, len;
@@ -1709,7 +1771,8 @@ static void i915_error_capture_msg(struct 
drm_i915_private *dev_priv,
        ecode = i915_error_generate_code(dev_priv, error, &engine_id);
 
        len = scnprintf(error->error_msg, sizeof(error->error_msg),
-                       "GPU HANG: ecode %d:%d:0x%08x",
+                       "GPU HANG%s: ecode %d:%d:0x%08x",
+                       simulated ? "(simulated)" : "",
                        INTEL_GEN(dev_priv), engine_id, ecode);
 
        if (engine_id != -1 && error->engine[engine_id].context.pid)
@@ -1803,6 +1866,18 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
        return error;
 }
 
+static bool error_state_has_hang_signature(const struct i915_gpu_state *error)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
+               if (error->engine[i].batch_had_hang_signature)
+                       return true;
+       }
+
+       return false;
+}
+
 /**
  * i915_capture_error_state - capture an error record for later analysis
  * @dev: drm device
@@ -1813,12 +1888,14 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
  * to pick up.
  */
 void i915_capture_error_state(struct drm_i915_private *dev_priv,
+                             bool intentional,
                              u32 engine_mask,
                              const char *error_msg)
 {
        static bool warned;
        struct i915_gpu_state *error;
        unsigned long flags;
+       bool expected;
 
        if (!i915_modparams.error_capture)
                return;
@@ -1832,8 +1909,13 @@ void i915_capture_error_state(struct drm_i915_private 
*dev_priv,
                return;
        }
 
-       i915_error_capture_msg(dev_priv, error, engine_mask, error_msg);
-       DRM_INFO("%s\n", error->error_msg);
+       expected = intentional || error_state_has_hang_signature(error);
+       i915_error_capture_msg(dev_priv, error, engine_mask, error_msg, expected);
+
+       if (expected)
+               DRM_INFO("%s\n", error->error_msg);
+       else
+               DRM_WARN("%s\n", error->error_msg);
 
        if (!error->simulated) {
                spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index ff00e462697a..75f519f910cc 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2828,6 +2828,7 @@ static void i915_clear_error_registers(struct 
drm_i915_private *dev_priv)
 /**
  * i915_handle_error - handle a gpu error
  * @dev_priv: i915 device private
+ * @intentional: error was intentionally injected
  * @engine_mask: mask representing engines that are hung
  * @fmt: Error message format string
  *
@@ -2838,6 +2839,7 @@ static void i915_clear_error_registers(struct 
drm_i915_private *dev_priv)
  * of a ring dump etc.).
  */
 void i915_handle_error(struct drm_i915_private *dev_priv,
+                      bool intentional,
                       u32 engine_mask,
                       const char *fmt, ...)
 {
@@ -2859,7 +2861,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
         */
        intel_runtime_pm_get(dev_priv);
 
-       i915_capture_error_state(dev_priv, engine_mask, error_msg);
+       i915_capture_error_state(dev_priv, intentional, engine_mask, error_msg);
        i915_clear_error_registers(dev_priv);
 
        /*
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c 
b/drivers/gpu/drm/i915/intel_hangcheck.c
index 12ac270a5f93..7b9e8dec7d65 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -266,7 +266,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
         */
        tmp = I915_READ_CTL(engine);
        if (tmp & RING_WAIT) {
-               i915_handle_error(dev_priv, 0,
+               i915_handle_error(dev_priv, false, 0,
                                  "Kicking stuck wait on %s",
                                  engine->name);
                I915_WRITE_CTL(engine, tmp);
@@ -278,7 +278,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
                default:
                        return ENGINE_DEAD;
                case 1:
-                       i915_handle_error(dev_priv, 0,
+                       i915_handle_error(dev_priv, false, 0,
                                          "Kicking stuck semaphore on %s",
                                          engine->name);
                        I915_WRITE_CTL(engine, tmp);
@@ -407,7 +407,7 @@ static void hangcheck_declare_hang(struct drm_i915_private 
*i915,
                                 "%s, ", engine->name);
        msg[len-2] = '\0';
 
-       return i915_handle_error(i915, hung, "%s", msg);
+       return i915_handle_error(i915, false, hung, "%s", msg);
 }
 
 /*
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c 
b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 71ce06680d66..7900bba9fe16 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -846,7 +846,7 @@ static int igt_handle_error(void *arg)
        engine->hangcheck.stalled = true;
        engine->hangcheck.seqno = intel_engine_get_seqno(engine);
 
-       i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);
+       i915_handle_error(i915, true, intel_engine_flag(engine), "%s", __func__);
 
        xchg(&i915->gpu_error.first_error, error);
 
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to