Chris Wilson <[email protected]> writes:

> Usually we have no idea about the upper bound we need to wait to catch
> up with userspace when idling the device, but in a few situations we
> know the system was idle beforehand and can provide a short timeout in
> order to very quickly catch a failure, long before hangcheck kicks in.
>
> In the following patches, we will use the timeout to curtain two overly
> long waits, where we know we can expect the GPU to complete within a
> reasonable time or declare it broken.
>
> In particular, with a broken GPU we expect it to fail during the initial
> GPU setup where do a couple of context switches to record the defaults.
> This is a task that takes a few milliseconds even on the slowest of
> devices, but we may have to wait 60s for hangcheck to give in and
> declare the machine inoperable. In this a case where any gpu hang is
> unacceptable, both from a timeliness and practical standpoint.
>
> The other improvement is that in selftests, we do not need to arm an
> independent timer to inject a wedge, as we can just limit the timeout on
> the wait directly.
>
> Signed-off-by: Chris Wilson <[email protected]>
> Cc: Joonas Lahtinen <[email protected]>
> Cc: Mika Kuoppala <[email protected]>
> Cc: Tvrtko Ursulin <[email protected]>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c           |  6 ++-
>  drivers/gpu/drm/i915/i915_drv.h               |  2 +-
>  drivers/gpu/drm/i915/i915_gem.c               | 43 +++++++++++--------
>  drivers/gpu/drm/i915/i915_gem_evict.c         |  3 +-
>  drivers/gpu/drm/i915/i915_gem_gtt.c           |  2 +-
>  drivers/gpu/drm/i915/i915_gem_shrinker.c      | 11 +++--
>  drivers/gpu/drm/i915/i915_perf.c              |  4 +-
>  drivers/gpu/drm/i915/i915_request.c           |  6 ++-
>  .../gpu/drm/i915/selftests/i915_gem_context.c |  4 +-
>  drivers/gpu/drm/i915/selftests/i915_request.c |  4 +-
>  .../gpu/drm/i915/selftests/igt_flush_test.c   |  2 +-
>  11 files changed, 56 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
> b/drivers/gpu/drm/i915/i915_debugfs.c
> index 544e5e7f011f..099f97ef2303 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -4105,7 +4105,8 @@ fault_irq_set(struct drm_i915_private *i915,
>  
>       err = i915_gem_wait_for_idle(i915,
>                                    I915_WAIT_LOCKED |
> -                                  I915_WAIT_INTERRUPTIBLE);
> +                                  I915_WAIT_INTERRUPTIBLE,
> +                                  MAX_SCHEDULE_TIMEOUT);
>       if (err)
>               goto err_unlock;
>  
> @@ -4210,7 +4211,8 @@ i915_drop_caches_set(void *data, u64 val)
>               if (val & DROP_ACTIVE)
>                       ret = i915_gem_wait_for_idle(dev_priv,
>                                                    I915_WAIT_INTERRUPTIBLE |
> -                                                  I915_WAIT_LOCKED);
> +                                                  I915_WAIT_LOCKED,
> +                                                  MAX_SCHEDULE_TIMEOUT);
>  
>               if (val & DROP_RETIRE)
>                       i915_retire_requests(dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 09ab12458244..bec0a2796c37 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3154,7 +3154,7 @@ void i915_gem_init_swizzling(struct drm_i915_private 
> *dev_priv);
>  void i915_gem_fini(struct drm_i915_private *dev_priv);
>  void i915_gem_cleanup_engines(struct drm_i915_private *dev_priv);
>  int i915_gem_wait_for_idle(struct drm_i915_private *dev_priv,
> -                        unsigned int flags);
> +                        unsigned int flags, long timeout);
>  int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv);
>  void i915_gem_suspend_late(struct drm_i915_private *dev_priv);
>  void i915_gem_resume(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 1a9dab302433..093d98ed7755 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2267,7 +2267,9 @@ static int i915_gem_object_create_mmap_offset(struct 
> drm_i915_gem_object *obj)
>  
>       /* Attempt to reap some mmap space from dead objects */
>       do {
> -             err = i915_gem_wait_for_idle(dev_priv, I915_WAIT_INTERRUPTIBLE);
> +             err = i915_gem_wait_for_idle(dev_priv,
> +                                          I915_WAIT_INTERRUPTIBLE,
> +                                          MAX_SCHEDULE_TIMEOUT);
>               if (err)
>                       break;
>  
> @@ -3742,14 +3744,14 @@ i915_gem_wait_ioctl(struct drm_device *dev, void 
> *data, struct drm_file *file)
>       return ret;
>  }
>  
> -static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags)
> +static long wait_for_timeline(struct i915_timeline *tl,
> +                           unsigned int flags, long timeout)
>  {
>       struct i915_request *rq;
> -     long ret;
>  
>       rq = i915_gem_active_get_unlocked(&tl->last_request);
>       if (!rq)
> -             return 0;
> +             return timeout;
>  
>       /*
>        * "Race-to-idle".
> @@ -3763,10 +3765,10 @@ static int wait_for_timeline(struct i915_timeline 
> *tl, unsigned int flags)
>       if (flags & I915_WAIT_FOR_IDLE_BOOST)
>               gen6_rps_boost(rq, NULL);
>  
> -     ret = i915_request_wait(rq, flags, MAX_SCHEDULE_TIMEOUT);
> +     timeout = i915_request_wait(rq, flags, timeout);
>       i915_request_put(rq);
>  
> -     return ret < 0 ? ret : 0;
> +     return timeout;
>  }
>  
>  static int wait_for_engines(struct drm_i915_private *i915)
> @@ -3782,7 +3784,8 @@ static int wait_for_engines(struct drm_i915_private 
> *i915)
>       return 0;
>  }
>  
> -int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
> +int i915_gem_wait_for_idle(struct drm_i915_private *i915,
> +                        unsigned int flags, long timeout)
>  {
>       GEM_TRACE("flags=%x (%s)\n",
>                 flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked");

Did you omit enhancing the trace on purpose?

Eventually i915_request_wait will assert for silly timeout values, but
should we add assertion here too as wait_for_timeline will
return what we put, for nonactive timelines?

> @@ -3798,9 +3801,9 @@ int i915_gem_wait_for_idle(struct drm_i915_private 
> *i915, unsigned int flags)
>               lockdep_assert_held(&i915->drm.struct_mutex);
>  
>               list_for_each_entry(tl, &i915->gt.timelines, link) {
> -                     err = wait_for_timeline(tl, flags);
> -                     if (err)
> -                             return err;
> +                     timeout = wait_for_timeline(tl, flags, timeout);
> +                     if (timeout < 0)
> +                             return timeout;
>               }
>  
>               err = wait_for_engines(i915);

To truely serve the caller, the remaining timeout could be passed to
wait_for_engines too, but that can be followup.

So the only real thing is the missing trace.
-Mika


> @@ -3812,12 +3815,13 @@ int i915_gem_wait_for_idle(struct drm_i915_private 
> *i915, unsigned int flags)
>       } else {
>               struct intel_engine_cs *engine;
>               enum intel_engine_id id;
> -             int err;
>  
>               for_each_engine(engine, i915, id) {
> -                     err = wait_for_timeline(&engine->timeline, flags);
> -                     if (err)
> -                             return err;
> +                     struct i915_timeline *tl = &engine->timeline;
> +
> +                     timeout = wait_for_timeline(tl, flags, timeout);
> +                     if (timeout < 0)
> +                             return timeout;
>               }
>       }
>  
> @@ -5052,7 +5056,8 @@ int i915_gem_suspend(struct drm_i915_private *dev_priv)
>               ret = i915_gem_wait_for_idle(dev_priv,
>                                            I915_WAIT_INTERRUPTIBLE |
>                                            I915_WAIT_LOCKED |
> -                                          I915_WAIT_FOR_IDLE_BOOST);
> +                                          I915_WAIT_FOR_IDLE_BOOST,
> +                                          MAX_SCHEDULE_TIMEOUT);
>               if (ret && ret != -EIO)
>                       goto err_unlock;
>  
> @@ -5356,7 +5361,9 @@ static int __intel_engines_record_defaults(struct 
> drm_i915_private *i915)
>       if (err)
>               goto err_active;
>  
> -     err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> +     err = i915_gem_wait_for_idle(i915,
> +                                  I915_WAIT_LOCKED,
> +                                  MAX_SCHEDULE_TIMEOUT);
>       if (err)
>               goto err_active;
>  
> @@ -5421,7 +5428,9 @@ static int __intel_engines_record_defaults(struct 
> drm_i915_private *i915)
>       if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
>               goto out_ctx;
>  
> -     if (WARN_ON(i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED)))
> +     if (WARN_ON(i915_gem_wait_for_idle(i915,
> +                                        I915_WAIT_LOCKED,
> +                                        MAX_SCHEDULE_TIMEOUT)))
>               goto out_ctx;
>  
>       i915_gem_contexts_lost(i915);
> diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c 
> b/drivers/gpu/drm/i915/i915_gem_evict.c
> index 54814a196ee4..02b83a5ed96c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_evict.c
> +++ b/drivers/gpu/drm/i915/i915_gem_evict.c
> @@ -69,7 +69,8 @@ static int ggtt_flush(struct drm_i915_private *i915)
>  
>       err = i915_gem_wait_for_idle(i915,
>                                    I915_WAIT_INTERRUPTIBLE |
> -                                  I915_WAIT_LOCKED);
> +                                  I915_WAIT_LOCKED,
> +                                  MAX_SCHEDULE_TIMEOUT);
>       if (err)
>               return err;
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c 
> b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 2ad319e17e39..abd81fb9b0b6 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -2793,7 +2793,7 @@ void i915_gem_gtt_finish_pages(struct 
> drm_i915_gem_object *obj,
>       struct i915_ggtt *ggtt = &dev_priv->ggtt;
>  
>       if (unlikely(ggtt->do_idle_maps)) {
> -             if (i915_gem_wait_for_idle(dev_priv, 0)) {
> +             if (i915_gem_wait_for_idle(dev_priv, 0, MAX_SCHEDULE_TIMEOUT)) {
>                       DRM_ERROR("Failed to wait for idle; VT'd may hang.\n");
>                       /* Wait a bit, in hopes it avoids the hang */
>                       udelay(10);
> diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c 
> b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> index 55e84e71f526..c61f5b80fee3 100644
> --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
> +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> @@ -172,7 +172,9 @@ i915_gem_shrink(struct drm_i915_private *i915,
>        * we will free as much as we can and hope to get a second chance.
>        */
>       if (flags & I915_SHRINK_ACTIVE)
> -             i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> +             i915_gem_wait_for_idle(i915,
> +                                    I915_WAIT_LOCKED,
> +                                    MAX_SCHEDULE_TIMEOUT);
>  
>       trace_i915_gem_shrink(i915, target, flags);
>       i915_retire_requests(i915);
> @@ -392,7 +394,8 @@ shrinker_lock_uninterruptible(struct drm_i915_private 
> *i915, bool *unlock,
>       unsigned long timeout = jiffies + msecs_to_jiffies_timeout(timeout_ms);
>  
>       do {
> -             if (i915_gem_wait_for_idle(i915, 0) == 0 &&
> +             if (i915_gem_wait_for_idle(i915,
> +                                        0, MAX_SCHEDULE_TIMEOUT) == 0 &&
>                   shrinker_lock(i915, unlock))
>                       break;
>  
> @@ -466,7 +469,9 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, 
> unsigned long event, void *ptr
>               return NOTIFY_DONE;
>  
>       /* Force everything onto the inactive lists */
> -     ret = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> +     ret = i915_gem_wait_for_idle(i915,
> +                                  I915_WAIT_LOCKED,
> +                                  MAX_SCHEDULE_TIMEOUT);
>       if (ret)
>               goto out;
>  
> diff --git a/drivers/gpu/drm/i915/i915_perf.c 
> b/drivers/gpu/drm/i915/i915_perf.c
> index 447407fee3b8..6bf10952c724 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -1836,7 +1836,9 @@ static int gen8_configure_all_contexts(struct 
> drm_i915_private *dev_priv,
>        * So far the best way to work around this issue seems to be draining
>        * the GPU from any submitted work.
>        */
> -     ret = i915_gem_wait_for_idle(dev_priv, wait_flags);
> +     ret = i915_gem_wait_for_idle(dev_priv,
> +                                  wait_flags,
> +                                  MAX_SCHEDULE_TIMEOUT);
>       if (ret)
>               goto out;
>  
> diff --git a/drivers/gpu/drm/i915/i915_request.c 
> b/drivers/gpu/drm/i915/i915_request.c
> index 3248369dbcfb..5c2c93cbab12 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -206,7 +206,8 @@ static int reset_all_global_seqno(struct drm_i915_private 
> *i915, u32 seqno)
>       /* Carefully retire all requests without writing to the rings */
>       ret = i915_gem_wait_for_idle(i915,
>                                    I915_WAIT_INTERRUPTIBLE |
> -                                  I915_WAIT_LOCKED);
> +                                  I915_WAIT_LOCKED,
> +                                  MAX_SCHEDULE_TIMEOUT);
>       if (ret)
>               return ret;
>  
> @@ -735,7 +736,8 @@ i915_request_alloc(struct intel_engine_cs *engine, struct 
> i915_gem_context *ctx)
>               /* Ratelimit ourselves to prevent oom from malicious clients */
>               ret = i915_gem_wait_for_idle(i915,
>                                            I915_WAIT_LOCKED |
> -                                          I915_WAIT_INTERRUPTIBLE);
> +                                          I915_WAIT_INTERRUPTIBLE,
> +                                          MAX_SCHEDULE_TIMEOUT);
>               if (ret)
>                       goto err_unreserve;
>  
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c 
> b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> index 5fbe15f4effd..ab2590242033 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> @@ -478,7 +478,9 @@ static int __igt_switch_to_kernel_context(struct 
> drm_i915_private *i915,
>               }
>       }
>  
> -     err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> +     err = i915_gem_wait_for_idle(i915,
> +                                  I915_WAIT_LOCKED,
> +                                  MAX_SCHEDULE_TIMEOUT);
>       if (err)
>               return err;
>  
> diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c 
> b/drivers/gpu/drm/i915/selftests/i915_request.c
> index 43995fc3534d..c4aac6141e04 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_request.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_request.c
> @@ -286,7 +286,9 @@ static int begin_live_test(struct live_test *t,
>       t->func = func;
>       t->name = name;
>  
> -     err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> +     err = i915_gem_wait_for_idle(i915,
> +                                  I915_WAIT_LOCKED,
> +                                  MAX_SCHEDULE_TIMEOUT);
>       if (err) {
>               pr_err("%s(%s): failed to idle before, with err=%d!",
>                      func, name, err);
> diff --git a/drivers/gpu/drm/i915/selftests/igt_flush_test.c 
> b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
> index 0d06f559243f..09ab037ce803 100644
> --- a/drivers/gpu/drm/i915/selftests/igt_flush_test.c
> +++ b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
> @@ -64,7 +64,7 @@ int igt_flush_test(struct drm_i915_private *i915, unsigned 
> int flags)
>       }
>  
>       wedge_on_timeout(&w, i915, HZ)
> -             i915_gem_wait_for_idle(i915, flags);
> +             i915_gem_wait_for_idle(i915, flags, MAX_SCHEDULE_TIMEOUT);
>  
>       return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
>  }
> -- 
> 2.18.0
_______________________________________________
Intel-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to