Usually we have no idea about the upper bound we need to wait to catch
up with userspace when idling the device, but in a few situations we
know the system was idle beforehand and can provide a short timeout in
order to very quickly catch a failure, long before hangcheck kicks in.

In particular, with a broken GPU we expect it to fail during the initial
GPU setup where do a couple of context switches to record the defaults.
This is a task that takes a few milliseconds even on the slowest of
devices, but we may have to wait 60s for hangcheck to give in and
declare the machine inoperable. In thiis a case where any gpu hang is
unacceptable, both from a timeliness and practical standpoint.

The other improvement is that in selftests, we do not need to arm an
independent timer to inject a wedge, as we can just limit the timeout on
the wait directly.

Signed-off-by: Chris Wilson <[email protected]>
Cc: Joonas Lahtinen <[email protected]>
Cc: Mika Kuoppala <[email protected]>
Cc: Tvrtko Ursulin <[email protected]>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |  6 +-
 drivers/gpu/drm/i915/i915_drv.h               |  2 +-
 drivers/gpu/drm/i915/i915_gem.c               | 46 ++++++++++------
 drivers/gpu/drm/i915/i915_gem_evict.c         |  3 +-
 drivers/gpu/drm/i915/i915_gem_gtt.c           |  2 +-
 drivers/gpu/drm/i915/i915_gem_shrinker.c      | 11 +++-
 drivers/gpu/drm/i915/i915_perf.c              |  4 +-
 drivers/gpu/drm/i915/i915_request.c           |  6 +-
 .../gpu/drm/i915/selftests/i915_gem_context.c |  3 +-
 drivers/gpu/drm/i915/selftests/i915_request.c |  4 +-
 .../gpu/drm/i915/selftests/igt_flush_test.c   | 55 +++----------------
 11 files changed, 65 insertions(+), 77 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index 544e5e7f011f..099f97ef2303 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4105,7 +4105,8 @@ fault_irq_set(struct drm_i915_private *i915,
 
        err = i915_gem_wait_for_idle(i915,
                                     I915_WAIT_LOCKED |
-                                    I915_WAIT_INTERRUPTIBLE);
+                                    I915_WAIT_INTERRUPTIBLE,
+                                    MAX_SCHEDULE_TIMEOUT);
        if (err)
                goto err_unlock;
 
@@ -4210,7 +4211,8 @@ i915_drop_caches_set(void *data, u64 val)
                if (val & DROP_ACTIVE)
                        ret = i915_gem_wait_for_idle(dev_priv,
                                                     I915_WAIT_INTERRUPTIBLE |
-                                                    I915_WAIT_LOCKED);
+                                                    I915_WAIT_LOCKED,
+                                                    MAX_SCHEDULE_TIMEOUT);
 
                if (val & DROP_RETIRE)
                        i915_retire_requests(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 09ab12458244..bec0a2796c37 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3154,7 +3154,7 @@ void i915_gem_init_swizzling(struct drm_i915_private 
*dev_priv);
 void i915_gem_fini(struct drm_i915_private *dev_priv);
 void i915_gem_cleanup_engines(struct drm_i915_private *dev_priv);
 int i915_gem_wait_for_idle(struct drm_i915_private *dev_priv,
-                          unsigned int flags);
+                          unsigned int flags, long timeout);
 int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv);
 void i915_gem_suspend_late(struct drm_i915_private *dev_priv);
 void i915_gem_resume(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 1a9dab302433..28d62f002276 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2267,7 +2267,9 @@ static int i915_gem_object_create_mmap_offset(struct 
drm_i915_gem_object *obj)
 
        /* Attempt to reap some mmap space from dead objects */
        do {
-               err = i915_gem_wait_for_idle(dev_priv, I915_WAIT_INTERRUPTIBLE);
+               err = i915_gem_wait_for_idle(dev_priv,
+                                            I915_WAIT_INTERRUPTIBLE,
+                                            MAX_SCHEDULE_TIMEOUT);
                if (err)
                        break;
 
@@ -3742,14 +3744,14 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, 
struct drm_file *file)
        return ret;
 }
 
-static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags)
+static long wait_for_timeline(struct i915_timeline *tl,
+                             unsigned int flags, long timeout)
 {
        struct i915_request *rq;
-       long ret;
 
        rq = i915_gem_active_get_unlocked(&tl->last_request);
        if (!rq)
-               return 0;
+               return timeout;
 
        /*
         * "Race-to-idle".
@@ -3763,10 +3765,10 @@ static int wait_for_timeline(struct i915_timeline *tl, 
unsigned int flags)
        if (flags & I915_WAIT_FOR_IDLE_BOOST)
                gen6_rps_boost(rq, NULL);
 
-       ret = i915_request_wait(rq, flags, MAX_SCHEDULE_TIMEOUT);
+       timeout = i915_request_wait(rq, flags, timeout);
        i915_request_put(rq);
 
-       return ret < 0 ? ret : 0;
+       return timeout;
 }
 
 static int wait_for_engines(struct drm_i915_private *i915)
@@ -3782,7 +3784,8 @@ static int wait_for_engines(struct drm_i915_private *i915)
        return 0;
 }
 
-int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
+int i915_gem_wait_for_idle(struct drm_i915_private *i915,
+                          unsigned int flags, long timeout)
 {
        GEM_TRACE("flags=%x (%s)\n",
                  flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked");
@@ -3798,9 +3801,9 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, 
unsigned int flags)
                lockdep_assert_held(&i915->drm.struct_mutex);
 
                list_for_each_entry(tl, &i915->gt.timelines, link) {
-                       err = wait_for_timeline(tl, flags);
-                       if (err)
-                               return err;
+                       timeout = wait_for_timeline(tl, flags, timeout);
+                       if (timeout < 0)
+                               return timeout;
                }
 
                err = wait_for_engines(i915);
@@ -3812,12 +3815,13 @@ int i915_gem_wait_for_idle(struct drm_i915_private 
*i915, unsigned int flags)
        } else {
                struct intel_engine_cs *engine;
                enum intel_engine_id id;
-               int err;
 
                for_each_engine(engine, i915, id) {
-                       err = wait_for_timeline(&engine->timeline, flags);
-                       if (err)
-                               return err;
+                       struct i915_timeline *tl = &engine->timeline;
+
+                       timeout = wait_for_timeline(tl, flags, timeout);
+                       if (timeout < 0)
+                               return timeout;
                }
        }
 
@@ -5052,7 +5056,8 @@ int i915_gem_suspend(struct drm_i915_private *dev_priv)
                ret = i915_gem_wait_for_idle(dev_priv,
                                             I915_WAIT_INTERRUPTIBLE |
                                             I915_WAIT_LOCKED |
-                                            I915_WAIT_FOR_IDLE_BOOST);
+                                            I915_WAIT_FOR_IDLE_BOOST,
+                                            MAX_SCHEDULE_TIMEOUT);
                if (ret && ret != -EIO)
                        goto err_unlock;
 
@@ -5356,9 +5361,12 @@ static int __intel_engines_record_defaults(struct 
drm_i915_private *i915)
        if (err)
                goto err_active;
 
-       err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
-       if (err)
+       if (i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, HZ / 5)) {
+               dev_err(i915->drm.dev, "Initial GPU setup failed; disabling\n");
+               i915_gem_set_wedged(i915);
+               err = -EIO;
                goto err_active;
+       }
 
        assert_kernel_context_is_current(i915);
 
@@ -5421,7 +5429,9 @@ static int __intel_engines_record_defaults(struct 
drm_i915_private *i915)
        if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
                goto out_ctx;
 
-       if (WARN_ON(i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED)))
+       if (WARN_ON(i915_gem_wait_for_idle(i915,
+                                          I915_WAIT_LOCKED,
+                                          MAX_SCHEDULE_TIMEOUT)))
                goto out_ctx;
 
        i915_gem_contexts_lost(i915);
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c 
b/drivers/gpu/drm/i915/i915_gem_evict.c
index 54814a196ee4..02b83a5ed96c 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -69,7 +69,8 @@ static int ggtt_flush(struct drm_i915_private *i915)
 
        err = i915_gem_wait_for_idle(i915,
                                     I915_WAIT_INTERRUPTIBLE |
-                                    I915_WAIT_LOCKED);
+                                    I915_WAIT_LOCKED,
+                                    MAX_SCHEDULE_TIMEOUT);
        if (err)
                return err;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c 
b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 2ad319e17e39..abd81fb9b0b6 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -2793,7 +2793,7 @@ void i915_gem_gtt_finish_pages(struct drm_i915_gem_object 
*obj,
        struct i915_ggtt *ggtt = &dev_priv->ggtt;
 
        if (unlikely(ggtt->do_idle_maps)) {
-               if (i915_gem_wait_for_idle(dev_priv, 0)) {
+               if (i915_gem_wait_for_idle(dev_priv, 0, MAX_SCHEDULE_TIMEOUT)) {
                        DRM_ERROR("Failed to wait for idle; VT'd may hang.\n");
                        /* Wait a bit, in hopes it avoids the hang */
                        udelay(10);
diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c 
b/drivers/gpu/drm/i915/i915_gem_shrinker.c
index 55e84e71f526..c61f5b80fee3 100644
--- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
@@ -172,7 +172,9 @@ i915_gem_shrink(struct drm_i915_private *i915,
         * we will free as much as we can and hope to get a second chance.
         */
        if (flags & I915_SHRINK_ACTIVE)
-               i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
+               i915_gem_wait_for_idle(i915,
+                                      I915_WAIT_LOCKED,
+                                      MAX_SCHEDULE_TIMEOUT);
 
        trace_i915_gem_shrink(i915, target, flags);
        i915_retire_requests(i915);
@@ -392,7 +394,8 @@ shrinker_lock_uninterruptible(struct drm_i915_private 
*i915, bool *unlock,
        unsigned long timeout = jiffies + msecs_to_jiffies_timeout(timeout_ms);
 
        do {
-               if (i915_gem_wait_for_idle(i915, 0) == 0 &&
+               if (i915_gem_wait_for_idle(i915,
+                                          0, MAX_SCHEDULE_TIMEOUT) == 0 &&
                    shrinker_lock(i915, unlock))
                        break;
 
@@ -466,7 +469,9 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned 
long event, void *ptr
                return NOTIFY_DONE;
 
        /* Force everything onto the inactive lists */
-       ret = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
+       ret = i915_gem_wait_for_idle(i915,
+                                    I915_WAIT_LOCKED,
+                                    MAX_SCHEDULE_TIMEOUT);
        if (ret)
                goto out;
 
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 447407fee3b8..6bf10952c724 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1836,7 +1836,9 @@ static int gen8_configure_all_contexts(struct 
drm_i915_private *dev_priv,
         * So far the best way to work around this issue seems to be draining
         * the GPU from any submitted work.
         */
-       ret = i915_gem_wait_for_idle(dev_priv, wait_flags);
+       ret = i915_gem_wait_for_idle(dev_priv,
+                                    wait_flags,
+                                    MAX_SCHEDULE_TIMEOUT);
        if (ret)
                goto out;
 
diff --git a/drivers/gpu/drm/i915/i915_request.c 
b/drivers/gpu/drm/i915/i915_request.c
index 3248369dbcfb..5c2c93cbab12 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -206,7 +206,8 @@ static int reset_all_global_seqno(struct drm_i915_private 
*i915, u32 seqno)
        /* Carefully retire all requests without writing to the rings */
        ret = i915_gem_wait_for_idle(i915,
                                     I915_WAIT_INTERRUPTIBLE |
-                                    I915_WAIT_LOCKED);
+                                    I915_WAIT_LOCKED,
+                                    MAX_SCHEDULE_TIMEOUT);
        if (ret)
                return ret;
 
@@ -735,7 +736,8 @@ i915_request_alloc(struct intel_engine_cs *engine, struct 
i915_gem_context *ctx)
                /* Ratelimit ourselves to prevent oom from malicious clients */
                ret = i915_gem_wait_for_idle(i915,
                                             I915_WAIT_LOCKED |
-                                            I915_WAIT_INTERRUPTIBLE);
+                                            I915_WAIT_INTERRUPTIBLE,
+                                            MAX_SCHEDULE_TIMEOUT);
                if (ret)
                        goto err_unreserve;
 
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c 
b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
index 5fbe15f4effd..bebb3d37fac8 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
@@ -478,7 +478,8 @@ static int __igt_switch_to_kernel_context(struct 
drm_i915_private *i915,
                }
        }
 
-       err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
+       err = i915_gem_wait_for_idle(i915,
+                                    I915_WAIT_LOCKED, MAX_SCHEDULE_TIMEOUT);
        if (err)
                return err;
 
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c 
b/drivers/gpu/drm/i915/selftests/i915_request.c
index 43995fc3534d..c4aac6141e04 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -286,7 +286,9 @@ static int begin_live_test(struct live_test *t,
        t->func = func;
        t->name = name;
 
-       err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
+       err = i915_gem_wait_for_idle(i915,
+                                    I915_WAIT_LOCKED,
+                                    MAX_SCHEDULE_TIMEOUT);
        if (err) {
                pr_err("%s(%s): failed to idle before, with err=%d!",
                       func, name, err);
diff --git a/drivers/gpu/drm/i915/selftests/igt_flush_test.c 
b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
index 0d06f559243f..af66e3d4e23a 100644
--- a/drivers/gpu/drm/i915/selftests/igt_flush_test.c
+++ b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
@@ -9,52 +9,8 @@
 #include "../i915_selftest.h"
 #include "igt_flush_test.h"
 
-struct wedge_me {
-       struct delayed_work work;
-       struct drm_i915_private *i915;
-       const void *symbol;
-};
-
-static void wedge_me(struct work_struct *work)
-{
-       struct wedge_me *w = container_of(work, typeof(*w), work.work);
-
-       pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);
-
-       GEM_TRACE("%pS timed out.\n", w->symbol);
-       GEM_TRACE_DUMP();
-
-       i915_gem_set_wedged(w->i915);
-}
-
-static void __init_wedge(struct wedge_me *w,
-                        struct drm_i915_private *i915,
-                        long timeout,
-                        const void *symbol)
-{
-       w->i915 = i915;
-       w->symbol = symbol;
-
-       INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
-       schedule_delayed_work(&w->work, timeout);
-}
-
-static void __fini_wedge(struct wedge_me *w)
-{
-       cancel_delayed_work_sync(&w->work);
-       destroy_delayed_work_on_stack(&w->work);
-       w->i915 = NULL;
-}
-
-#define wedge_on_timeout(W, DEV, TIMEOUT)                              \
-       for (__init_wedge((W), (DEV), (TIMEOUT), __builtin_return_address(0)); \
-            (W)->i915;                                                 \
-            __fini_wedge((W)))
-
 int igt_flush_test(struct drm_i915_private *i915, unsigned int flags)
 {
-       struct wedge_me w;
-
        cond_resched();
 
        if (flags & I915_WAIT_LOCKED &&
@@ -63,8 +19,15 @@ int igt_flush_test(struct drm_i915_private *i915, unsigned 
int flags)
                i915_gem_set_wedged(i915);
        }
 
-       wedge_on_timeout(&w, i915, HZ)
-               i915_gem_wait_for_idle(i915, flags);
+       if (i915_gem_wait_for_idle(i915, flags, HZ / 5) == -ETIME) {
+               pr_err("%pS timed out, cancelling all further testing.\n",
+                      __builtin_return_address(0));
+
+               GEM_TRACE("%pS timed out.\n", __builtin_return_address(0));
+               GEM_TRACE_DUMP();
+
+               i915_gem_set_wedged(i915);
+       }
 
        return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
 }
-- 
2.18.0

_______________________________________________
Intel-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to