After finding the guilty batch and request, we can use it to find the
process that submitted the batch and then add the culprit into the error
state.

This is a slightly different approach from Ben's in that instead of
adding the extra information into the struct i915_hw_context, we use the
information already captured in struct drm_file which is then referenced
from the request.

v2: Also capture the workaround buffer for gen2, so that we can compare
    its contents against the intended batch for the active request.

Link: http://lists.freedesktop.org/archives/intel-gfx/2013-August/032280.html
Signed-off-by: Chris Wilson <[email protected]>
Cc: Ben Widawsky <[email protected]>
Cc: Mika Kuoppala <[email protected]>
---

Jesse would like similar information included with the error message, as
Mika is working on improving that error message, I delegate that task to
him. ;-)
-Chris

---
 drivers/gpu/drm/i915/i915_drv.h       |   6 +-
 drivers/gpu/drm/i915/i915_gem.c       |   1 +
 drivers/gpu/drm/i915/i915_gpu_error.c | 115 +++++++++++++++++++++-------------
 3 files changed, 78 insertions(+), 44 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 1d317f8c9f05..159f94637f3a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -359,7 +359,7 @@ struct drm_i915_error_state {
                        int page_count;
                        u32 gtt_offset;
                        u32 *pages[0];
-               } *ringbuffer, *batchbuffer, *ctx, *hws_page;
+               } *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
 
                struct drm_i915_error_request {
                        long jiffies;
@@ -374,6 +374,9 @@ struct drm_i915_error_state {
                                u32 pp_dir_base;
                        };
                } vm_info;
+
+               pid_t pid;
+               char comm[TASK_COMM_LEN];
        } ring[I915_NUM_RINGS];
 
        struct drm_i915_error_buffer {
@@ -1825,6 +1828,7 @@ struct drm_i915_gem_request {
 
 struct drm_i915_file_private {
        struct drm_i915_private *dev_priv;
+       struct drm_file *file;
 
        struct {
                spinlock_t lock;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 157877bd7a0b..605e4ab636e8 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4981,6 +4981,7 @@ int i915_gem_open(struct drm_device *dev, struct drm_file 
*file)
 
        file->driver_priv = file_priv;
        file_priv->dev_priv = dev->dev_private;
+       file_priv->file = file;
 
        spin_lock_init(&file_priv->mm.lock);
        INIT_LIST_HEAD(&file_priv->mm.request_list);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index 0ff158939f1d..c1da48b5189d 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -302,13 +302,28 @@ void i915_error_printf(struct drm_i915_error_state_buf 
*e, const char *f, ...)
        va_end(args);
 }
 
+static void print_error_obj(struct drm_i915_error_state_buf *m,
+                           struct drm_i915_error_object *obj)
+{
+       int page, offset, elt;
+
+       for (page = offset = 0; page < obj->page_count; page++) {
+               for (elt = 0; elt < PAGE_SIZE/4; elt++) {
+                       err_printf(m, "%08x :  %08x\n", offset,
+                                  obj->pages[page][elt]);
+                       offset += 4;
+               }
+       }
+}
+
 int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
                            const struct i915_error_state_file_priv *error_priv)
 {
        struct drm_device *dev = error_priv->dev;
        drm_i915_private_t *dev_priv = dev->dev_private;
        struct drm_i915_error_state *error = error_priv->error;
-       int i, j, page, offset, elt;
+       int i, j, offset, elt;
+       int max_hangcheck_score;
 
        if (!error) {
                err_printf(m, "no error state collected\n");
@@ -318,6 +333,20 @@ int i915_error_state_to_str(struct 
drm_i915_error_state_buf *m,
        err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
                   error->time.tv_usec);
        err_printf(m, "Kernel: " UTS_RELEASE "\n");
+       max_hangcheck_score = 0;
+       for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
+               if (error->ring[i].hangcheck_score > max_hangcheck_score)
+                       max_hangcheck_score = error->ring[i].hangcheck_score;
+       }
+       for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
+               if (error->ring[i].hangcheck_score == max_hangcheck_score &&
+                   error->ring[i].pid != -1) {
+                       err_printf(m, "Active process (on ring %s): %s [%d]\n",
+                                  ring_str(i),
+                                  error->ring[i].comm,
+                                  error->ring[i].pid);
+               }
+       }
        err_printf(m, "PCI ID: 0x%04x\n", dev->pdev->device);
        err_printf(m, "EIR: 0x%08x\n", error->eir);
        err_printf(m, "IER: 0x%08x\n", error->ier);
@@ -363,17 +392,19 @@ int i915_error_state_to_str(struct 
drm_i915_error_state_buf *m,
                struct drm_i915_error_object *obj;
 
                if ((obj = error->ring[i].batchbuffer)) {
-                       err_printf(m, "%s --- gtt_offset = 0x%08x\n",
-                                  dev_priv->ring[i].name,
+                       err_puts(m, dev_priv->ring[i].name);
+                       if (error->ring[i].pid != -1)
+                               err_printf(m, " (submitted by %s [%d])",
+                                          error->ring[i].comm, 
error->ring[i].pid);
+                       err_printf(m, " --- gtt_offset = 0x%08x\n",
                                   obj->gtt_offset);
-                       offset = 0;
-                       for (page = 0; page < obj->page_count; page++) {
-                               for (elt = 0; elt < PAGE_SIZE/4; elt++) {
-                                       err_printf(m, "%08x :  %08x\n", offset,
-                                                  obj->pages[page][elt]);
-                                       offset += 4;
-                               }
-                       }
+                       print_error_obj(m, obj);
+               }
+
+               if ((obj = error->ring[i].wa_batchbuffer)) {
+                       err_printf(m, "%s (w/a) --- gtt_offset = 0x%08x\n",
+                                  dev_priv->ring[i].name, obj->gtt_offset);
+                       print_error_obj(m, obj);
                }
 
                if (error->ring[i].num_requests) {
@@ -392,15 +423,7 @@ int i915_error_state_to_str(struct 
drm_i915_error_state_buf *m,
                        err_printf(m, "%s --- ringbuffer = 0x%08x\n",
                                   dev_priv->ring[i].name,
                                   obj->gtt_offset);
-                       offset = 0;
-                       for (page = 0; page < obj->page_count; page++) {
-                               for (elt = 0; elt < PAGE_SIZE/4; elt++) {
-                                       err_printf(m, "%08x :  %08x\n",
-                                                  offset,
-                                                  obj->pages[page][elt]);
-                                       offset += 4;
-                               }
-                       }
+                       print_error_obj(m, obj);
                }
 
                if ((obj = error->ring[i].hws_page)) {
@@ -725,9 +748,9 @@ static void i915_gem_record_fences(struct drm_device *dev,
        }
 }
 
-static struct drm_i915_error_object *
-i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
-                            struct intel_ring_buffer *ring)
+static struct drm_i915_gem_request *
+i915_error_first_request(struct drm_i915_private *dev_priv,
+                        struct intel_ring_buffer *ring)
 {
        struct drm_i915_gem_request *request;
        u32 seqno;
@@ -735,29 +758,12 @@ i915_error_first_batchbuffer(struct drm_i915_private 
*dev_priv,
        if (!ring->get_seqno)
                return NULL;
 
-       if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
-               struct drm_i915_gem_object *obj;
-               u32 acthd = I915_READ(ACTHD);
-
-               if (WARN_ON(ring->id != RCS))
-                       return NULL;
-
-               obj = ring->scratch.obj;
-               if (obj != NULL &&
-                   acthd >= i915_gem_obj_ggtt_offset(obj) &&
-                   acthd < i915_gem_obj_ggtt_offset(obj) + obj->base.size)
-                       return i915_error_ggtt_object_create(dev_priv, obj);
-       }
-
        seqno = ring->get_seqno(ring, false);
        list_for_each_entry(request, &ring->request_list, list) {
                if (i915_seqno_passed(seqno, request->seqno))
                        continue;
 
-               /* We need to copy these to an anonymous buffer as the simplest
-                * method to avoid being overwritten by userspace.
-                */
-               return i915_error_object_create(dev_priv, request->batch_obj, 
request->ctx->vm);
+               return request;
        }
 
        return NULL;
@@ -911,8 +917,31 @@ static void i915_gem_record_rings(struct drm_device *dev,
 
                i915_record_ring_state(dev, ring, &error->ring[i]);
 
-               error->ring[i].batchbuffer =
-                       i915_error_first_batchbuffer(dev_priv, ring);
+               error->ring[i].pid = -1;
+               request = i915_error_first_request(dev_priv, ring);
+               if (request) {
+                       /* We need to copy these to an anonymous buffer as the 
simplest
+                        * method to avoid being overwritten by userspace.
+                        */
+                       error->ring[i].batchbuffer =
+                               i915_error_object_create(dev_priv, 
request->batch_obj, request->ctx->vm);
+
+                       if (HAS_BROKEN_CS_TLB(dev_priv->dev) && 
ring->scratch.obj)
+                               error->ring[i].wa_batchbuffer =
+                                       i915_error_ggtt_object_create(dev_priv, 
ring->scratch.obj);
+
+                       if (request->file_priv) {
+                               struct task_struct *task;
+
+                               rcu_read_lock();
+                               task = pid_task(request->file_priv->file->pid, 
PIDTYPE_PID);
+                               if (task) {
+                                       strcpy(error->ring[i].comm, task->comm);
+                                       error->ring[i].pid = task->pid;
+                               }
+                               rcu_read_unlock();
+                       }
+               }
 
                error->ring[i].ringbuffer =
                        i915_error_ggtt_object_create(dev_priv, ring->obj);
-- 
1.9.rc1

_______________________________________________
Intel-gfx mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to