Re: [Intel-gfx] [PATCH v2] drm/i915: Separate out reset flags from the reset counter

Mika Kuoppala Tue, 30 Aug 2016 07:43:09 -0700

Chris Wilson <[email protected]> writes:

> In preparation for introducing a per-engine reset, we can first separate
> the mixing of the reset state from the global reset counter.
>
> The loss of atomicity in updating the reset state poses a small problem
> for handling the waiters. For requests, this is solved by advancing the
> seqno so that a waiter waking up after the reset knows the request is
> complete. For pending flips, we still rely on the increment of the
> global reset epoch (as well as the reset-in-progress flag) to signify
> when the hardware was reset.
>
> The advantage, now that we do not inspect the reset state during reset
> itself i.e. we no longer emit requests during reset, is that we can use
> the atomic updates of the state flags to ensure that only one reset
> worker is active.
>
> v2: Mika spotted that I transformed the i915_gem_wait_for_error() wakeup
> into a waiter wakeup.
>
> Signed-off-by: Chris Wilson <[email protected]>


The wait_for_request docs are still infested with references to reset_count,
which no longer applies to them. But that is not the fault of this patch.

Reviewed-by: Mika Kuoppala <[email protected]>

> Cc: Arun Siluvery <[email protected]>
> Cc: Mika Kuoppala <[email protected]>
> Link: http://patchwork.freedesktop.org/patch/msgid/[email protected]
> ---
>  drivers/gpu/drm/i915/i915_drv.c         |  16 ++---
>  drivers/gpu/drm/i915/i915_drv.h         |  46 +++++---------
>  drivers/gpu/drm/i915/i915_gem.c         |   2 +-
>  drivers/gpu/drm/i915/i915_gem_request.c |  13 ++--
>  drivers/gpu/drm/i915/i915_irq.c         | 103 ++++++++++++++------------------
>  drivers/gpu/drm/i915/intel_display.c    |  25 +++++---
>  drivers/gpu/drm/i915/intel_drv.h        |   4 +-
>  7 files changed, 92 insertions(+), 117 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 47fe07283d88..01b518dcbd7a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1602,7 +1602,7 @@ static int i915_drm_resume(struct drm_device *dev)
>       mutex_lock(&dev->struct_mutex);
>       if (i915_gem_init_hw(dev)) {
>               DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
> -             atomic_or(I915_WEDGED, &dev_priv->gpu_error.reset_counter);
> +             set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
>       }
>       mutex_unlock(&dev->struct_mutex);
>  
> @@ -1764,20 +1764,13 @@ int i915_reset(struct drm_i915_private *dev_priv)
>  {
>       struct drm_device *dev = &dev_priv->drm;
>       struct i915_gpu_error *error = &dev_priv->gpu_error;
> -     unsigned reset_counter;
>       int ret;
>  
>       mutex_lock(&dev->struct_mutex);
>  
>       /* Clear any previous failed attempts at recovery. Time to try again. */
> -     atomic_andnot(I915_WEDGED, &error->reset_counter);
> -
> -     /* Clear the reset-in-progress flag and increment the reset epoch. */
> -     reset_counter = atomic_inc_return(&error->reset_counter);
> -     if (WARN_ON(__i915_reset_in_progress(reset_counter))) {
> -             ret = -EIO;
> -             goto error;
> -     }
> +     __clear_bit(I915_WEDGED, &error->flags);
> +     error->reset_count++;
>  
>       pr_notice("drm/i915: Resetting chip after gpu hang\n");
>  
> @@ -1814,6 +1807,7 @@ int i915_reset(struct drm_i915_private *dev_priv)
>               goto error;
>       }
>  
> +     clear_bit(I915_RESET_IN_PROGRESS, &error->flags);
>       mutex_unlock(&dev->struct_mutex);
>  
>       /*
> @@ -1828,7 +1822,7 @@ int i915_reset(struct drm_i915_private *dev_priv)
>       return 0;
>  
>  error:
> -     atomic_or(I915_WEDGED, &error->reset_counter);
> +     set_bit(I915_WEDGED, &error->flags);
>       mutex_unlock(&dev->struct_mutex);
>       return ret;
>  }
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index c413587895cf..e574eaa65c4d 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1386,9 +1386,10 @@ struct i915_gpu_error {
>        * State variable controlling the reset flow and count
>        *
>        * This is a counter which gets incremented when reset is triggered,
> -      * and again when reset has been handled. So odd values (lowest bit set)
> -      * means that reset is in progress and even values that
> -      * (reset_counter >> 1):th reset was successfully completed.
> +      *
> +      * Before the reset commences, the I915_RESET_IN_PROGRESS bit is set
> +      * meaning that any waiters holding onto the struct_mutex should
> +      * relinquish the lock immediately in order for the reset to start.
>        *
>        * If reset is not completed succesfully, the I915_WEDGE bit is
>        * set meaning that hardware is terminally sour and there is no
> @@ -1403,10 +1404,11 @@ struct i915_gpu_error {
>        * naturally enforces the correct ordering between the bail-out of the
>        * waiter and the gpu reset work code.
>        */
> -     atomic_t reset_counter;
> +     unsigned long reset_count;
>  
> -#define I915_RESET_IN_PROGRESS_FLAG  1
> -#define I915_WEDGED                  (1 << 31)
> +     unsigned long flags;
> +#define I915_RESET_IN_PROGRESS       0
> +#define I915_WEDGED          (BITS_PER_LONG - 1)
>  
>       /**
>        * Waitqueue to signal when a hang is detected. Used to for waiters
> @@ -3234,44 +3236,24 @@ i915_gem_find_active_request(struct intel_engine_cs 
> *engine);
>  
>  void i915_gem_retire_requests(struct drm_i915_private *dev_priv);
>  
> -static inline u32 i915_reset_counter(struct i915_gpu_error *error)
> -{
> -     return atomic_read(&error->reset_counter);
> -}
> -
> -static inline bool __i915_reset_in_progress(u32 reset)
> -{
> -     return unlikely(reset & I915_RESET_IN_PROGRESS_FLAG);
> -}
> -
> -static inline bool __i915_reset_in_progress_or_wedged(u32 reset)
> -{
> -     return unlikely(reset & (I915_RESET_IN_PROGRESS_FLAG | I915_WEDGED));
> -}
> -
> -static inline bool __i915_terminally_wedged(u32 reset)
> -{
> -     return unlikely(reset & I915_WEDGED);
> -}
> -
>  static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
>  {
> -     return __i915_reset_in_progress(i915_reset_counter(error));
> +     return unlikely(test_bit(I915_RESET_IN_PROGRESS, &error->flags));
>  }
>  
> -static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error 
> *error)
> +static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
>  {
> -     return __i915_reset_in_progress_or_wedged(i915_reset_counter(error));
> +     return unlikely(test_bit(I915_WEDGED, &error->flags));
>  }
>  
> -static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
> +static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error 
> *error)
>  {
> -     return __i915_terminally_wedged(i915_reset_counter(error));
> +     return i915_reset_in_progress(error) | i915_terminally_wedged(error);
>  }
>  
>  static inline u32 i915_reset_count(struct i915_gpu_error *error)
>  {
> -     return ((i915_reset_counter(error) & ~I915_WEDGED) + 1) / 2;
> +     return READ_ONCE(error->reset_count);
>  }
>  
>  void i915_gem_reset(struct drm_device *dev);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 838a275e7fac..c06dacdae87f 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4525,7 +4525,7 @@ int i915_gem_init(struct drm_device *dev)
>                * for all other failure, such as an allocation failure, bail.
>                */
>               DRM_ERROR("Failed to initialize GPU, declaring it wedged\n");
> -             atomic_or(I915_WEDGED, &dev_priv->gpu_error.reset_counter);
> +             set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
>               ret = 0;
>       }
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c 
> b/drivers/gpu/drm/i915/i915_gem_request.c
> index ec613fd5e01c..24eb4b1b7540 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -233,16 +233,18 @@ void i915_gem_request_retire_upto(struct 
> drm_i915_gem_request *req)
>       } while (tmp != req);
>  }
>  
> -static int i915_gem_check_wedge(unsigned int reset_counter, bool 
> interruptible)
> +static int i915_gem_check_wedge(struct drm_i915_private *dev_priv)
>  {
> -     if (__i915_terminally_wedged(reset_counter))
> +     struct i915_gpu_error *error = &dev_priv->gpu_error;
> +
> +     if (i915_terminally_wedged(error))
>               return -EIO;
>  
> -     if (__i915_reset_in_progress(reset_counter)) {
> +     if (i915_reset_in_progress(error)) {
>               /* Non-interruptible callers can't handle -EAGAIN, hence return
>                * -EIO unconditionally for these.
>                */
> -             if (!interruptible)
> +             if (!dev_priv->mm.interruptible)
>                       return -EIO;
>  
>               return -EAGAIN;
> @@ -331,7 +333,6 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
>                      struct i915_gem_context *ctx)
>  {
>       struct drm_i915_private *dev_priv = engine->i915;
> -     unsigned int reset_counter = i915_reset_counter(&dev_priv->gpu_error);
>       struct drm_i915_gem_request *req;
>       u32 seqno;
>       int ret;
> @@ -340,7 +341,7 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
>        * EIO if the GPU is already wedged, or EAGAIN to drop the struct_mutex
>        * and restart.
>        */
> -     ret = i915_gem_check_wedge(reset_counter, dev_priv->mm.interruptible);
> +     ret = i915_gem_check_wedge(dev_priv);
>       if (ret)
>               return ERR_PTR(ret);
>  
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 82358d4e0cc2..ed172d7beecb 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2501,53 +2501,41 @@ static void i915_reset_and_wakeup(struct 
> drm_i915_private *dev_priv)
>  
>       kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
>  
> +     DRM_DEBUG_DRIVER("resetting chip\n");
> +     kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
> +
>       /*
> -      * Note that there's only one work item which does gpu resets, so we
> -      * need not worry about concurrent gpu resets potentially incrementing
> -      * error->reset_counter twice. We only need to take care of another
> -      * racing irq/hangcheck declaring the gpu dead for a second time. A
> -      * quick check for that is good enough: schedule_work ensures the
> -      * correct ordering between hang detection and this work item, and since
> -      * the reset in-progress bit is only ever set by code outside of this
> -      * work we don't need to worry about any other races.
> +      * In most cases it's guaranteed that we get here with an RPM
> +      * reference held, for example because there is a pending GPU
> +      * request that won't finish until the reset is done. This
> +      * isn't the case at least when we get here by doing a
> +      * simulated reset via debugs, so get an RPM reference.
>        */
> -     if (i915_reset_in_progress(&dev_priv->gpu_error)) {
> -             DRM_DEBUG_DRIVER("resetting chip\n");
> -             kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
> -
> -             /*
> -              * In most cases it's guaranteed that we get here with an RPM
> -              * reference held, for example because there is a pending GPU
> -              * request that won't finish until the reset is done. This
> -              * isn't the case at least when we get here by doing a
> -              * simulated reset via debugs, so get an RPM reference.
> -              */
> -             intel_runtime_pm_get(dev_priv);
> +     intel_runtime_pm_get(dev_priv);
>  
> -             intel_prepare_reset(dev_priv);
> +     intel_prepare_reset(dev_priv);
>  
> -             /*
> -              * All state reset _must_ be completed before we update the
> -              * reset counter, for otherwise waiters might miss the reset
> -              * pending state and not properly drop locks, resulting in
> -              * deadlocks with the reset work.
> -              */
> -             ret = i915_reset(dev_priv);
> +     /*
> +      * All state reset _must_ be completed before we update the
> +      * reset counter, for otherwise waiters might miss the reset
> +      * pending state and not properly drop locks, resulting in
> +      * deadlocks with the reset work.
> +      */
> +     ret = i915_reset(dev_priv);
>  
> -             intel_finish_reset(dev_priv);
> +     intel_finish_reset(dev_priv);
>  
> -             intel_runtime_pm_put(dev_priv);
> +     intel_runtime_pm_put(dev_priv);
>  
> -             if (ret == 0)
> -                     kobject_uevent_env(kobj,
> -                                        KOBJ_CHANGE, reset_done_event);
> +     if (ret == 0)
> +             kobject_uevent_env(kobj,
> +                                KOBJ_CHANGE, reset_done_event);
>  
> -             /*
> -              * Note: The wake_up also serves as a memory barrier so that
> -              * waiters see the update value of the reset counter atomic_t.
> -              */
> -             wake_up_all(&dev_priv->gpu_error.reset_queue);
> -     }
> +     /*
> +      * Note: The wake_up also serves as a memory barrier so that
> +      * waiters see the updated value of the dev_priv->gpu_error.
> +      */
> +     wake_up_all(&dev_priv->gpu_error.reset_queue);
>  }
>  
>  static void i915_report_and_clear_eir(struct drm_i915_private *dev_priv)
> @@ -2666,25 +2654,26 @@ void i915_handle_error(struct drm_i915_private 
> *dev_priv,
>       i915_capture_error_state(dev_priv, engine_mask, error_msg);
>       i915_report_and_clear_eir(dev_priv);
>  
> -     if (engine_mask) {
> -             atomic_or(I915_RESET_IN_PROGRESS_FLAG,
> -                             &dev_priv->gpu_error.reset_counter);
> +     if (!engine_mask)
> +             return;
>  
> -             /*
> -              * Wakeup waiting processes so that the reset function
> -              * i915_reset_and_wakeup doesn't deadlock trying to grab
> -              * various locks. By bumping the reset counter first, the woken
> -              * processes will see a reset in progress and back off,
> -              * releasing their locks and then wait for the reset completion.
> -              * We must do this for _all_ gpu waiters that might hold locks
> -              * that the reset work needs to acquire.
> -              *
> -              * Note: The wake_up serves as the required memory barrier to
> -              * ensure that the waiters see the updated value of the reset
> -              * counter atomic_t.
> -              */
> -             i915_error_wake_up(dev_priv);
> -     }
> +     if (test_and_set_bit(I915_RESET_IN_PROGRESS,
> +                          &dev_priv->gpu_error.flags))
> +             return;
> +
> +     /*
> +      * Wakeup waiting processes so that the reset function
> +      * i915_reset_and_wakeup doesn't deadlock trying to grab
> +      * various locks. By bumping the reset counter first, the woken
> +      * processes will see a reset in progress and back off,
> +      * releasing their locks and then wait for the reset completion.
> +      * We must do this for _all_ gpu waiters that might hold locks
> +      * that the reset work needs to acquire.
> +      *
> +      * Note: The wake_up also provides a memory barrier to ensure that the
> +      * waiters see the updated value of the reset flags.
> +      */
> +     i915_error_wake_up(dev_priv);
>  
>       i915_reset_and_wakeup(dev_priv);
>  }
> diff --git a/drivers/gpu/drm/i915/intel_display.c 
> b/drivers/gpu/drm/i915/intel_display.c
> index 19ffd024ddec..2e63e5cfa98d 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -3643,15 +3643,26 @@ void intel_finish_reset(struct drm_i915_private 
> *dev_priv)
>       mutex_unlock(&dev->mode_config.mutex);
>  }
>  
> +static bool abort_flip_on_reset(struct intel_crtc *crtc)
> +{
> +     struct i915_gpu_error *error = &to_i915(crtc->base.dev)->gpu_error;
> +
> +     if (i915_reset_in_progress(error))
> +             return true;
> +
> +     if (crtc->reset_count != i915_reset_count(error))
> +             return true;
> +
> +     return false;
> +}
> +
>  static bool intel_crtc_has_pending_flip(struct drm_crtc *crtc)
>  {
>       struct drm_device *dev = crtc->dev;
>       struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
> -     unsigned reset_counter;
>       bool pending;
>  
> -     reset_counter = i915_reset_counter(&to_i915(dev)->gpu_error);
> -     if (intel_crtc->reset_counter != reset_counter)
> +     if (abort_flip_on_reset(intel_crtc))
>               return false;
>  
>       spin_lock_irq(&dev->event_lock);
> @@ -11549,10 +11560,8 @@ static bool __pageflip_finished_cs(struct intel_crtc 
> *crtc,
>  {
>       struct drm_device *dev = crtc->base.dev;
>       struct drm_i915_private *dev_priv = to_i915(dev);
> -     unsigned reset_counter;
>  
> -     reset_counter = i915_reset_counter(&dev_priv->gpu_error);
> -     if (crtc->reset_counter != reset_counter)
> +     if (abort_flip_on_reset(crtc))
>               return true;
>  
>       /*
> @@ -12218,8 +12227,8 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
>       if (ret)
>               goto cleanup;
>  
> -     intel_crtc->reset_counter = i915_reset_counter(&dev_priv->gpu_error);
> -     if (__i915_reset_in_progress_or_wedged(intel_crtc->reset_counter)) {
> +     intel_crtc->reset_count = i915_reset_count(&dev_priv->gpu_error);
> +     if (i915_reset_in_progress_or_wedged(&dev_priv->gpu_error)) {
>               ret = -EIO;
>               goto cleanup;
>       }
> diff --git a/drivers/gpu/drm/i915/intel_drv.h 
> b/drivers/gpu/drm/i915/intel_drv.h
> index 570a7ca7983f..60e1cd915b85 100644
> --- a/drivers/gpu/drm/i915/intel_drv.h
> +++ b/drivers/gpu/drm/i915/intel_drv.h
> @@ -712,8 +712,8 @@ struct intel_crtc {
>  
>       struct intel_crtc_state *config;
>  
> -     /* reset counter value when the last flip was submitted */
> -     unsigned int reset_counter;
> +     /* global reset count when the last flip was submitted */
> +     unsigned int reset_count;
>  
>       /* Access to these should be protected by dev_priv->irq_lock. */
>       bool cpu_fifo_underrun_disabled;
> -- 
> 2.9.3
>
> _______________________________________________
> Intel-gfx mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Re: [Intel-gfx] [PATCH v2] drm/i915: Separate out reset flags from the reset counter

Reply via email to