On Thu, Dec 12, 2019 at 02:04:58PM +0000, Chris Wilson wrote:
> We monitor the health of the system via periodic heartbeat pulses. The
> pulses also provide the opportunity to perform garbage collection.
> However, we interpret an incomplete pulse (a missed heartbeat) as an
> indication that the system is no longer responsive, i.e. hung, and
> perform an engine or full GPU reset. Given that the preemption
> granularity can be very coarse on a system, we let the sysadmin override
> our legacy timeouts which were "optimised" for desktop applications.
> 
> The heartbeat interval can be adjusted per-engine using,
> 
>       /sys/class/drm/card?/engine/*/heartbeat_interval_ms
> 
> Signed-off-by: Chris Wilson <[email protected]>

Code looks good.
Performed light testing with other sysfs related patches in the series.
The heartbeat_interval_ms file exists and can be modified.

Reviewed-by: Steve Carbonari <[email protected]>
Tested-by: Steve Carbonari <[email protected]


> ---
>  drivers/gpu/drm/i915/Kconfig.profile         |  3 ++
>  drivers/gpu/drm/i915/gt/intel_engine_sysfs.c | 47 ++++++++++++++++++++
>  2 files changed, 50 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/Kconfig.profile 
> b/drivers/gpu/drm/i915/Kconfig.profile
> index 1f4e98a8532f..ba8767fc0d6e 100644
> --- a/drivers/gpu/drm/i915/Kconfig.profile
> +++ b/drivers/gpu/drm/i915/Kconfig.profile
> @@ -20,6 +20,9 @@ config DRM_I915_HEARTBEAT_INTERVAL
>         check the health of the GPU and undertake regular house-keeping of
>         internal driver state.
>  
> +       This is adjustable via
> +       /sys/class/drm/card?/engine/*/heartbeat_interval_ms
> +
>         May be 0 to disable heartbeats and therefore disable automatic GPU
>         hang detection.
>  
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c 
> b/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
> index d299c66cf7ec..33b4c00b93f2 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
> @@ -9,6 +9,7 @@
>  
>  #include "i915_drv.h"
>  #include "intel_engine.h"
> +#include "intel_engine_heartbeat.h"
>  #include "intel_engine_sysfs.h"
>  
>  struct kobj_engine {
> @@ -315,6 +316,49 @@ preempt_timeout_show(struct kobject *kobj, struct 
> kobj_attribute *attr,
>  static struct kobj_attribute preempt_timeout_attr =
>  __ATTR(preempt_timeout_ms, 0644, preempt_timeout_show, 
> preempt_timeout_store);
>  
> +static ssize_t
> +heartbeat_store(struct kobject *kobj, struct kobj_attribute *attr,
> +             const char *buf, size_t count)
> +{
> +     struct intel_engine_cs *engine = kobj_to_engine(kobj);
> +     unsigned long long delay;
> +     int err;
> +
> +     /*
> +      * We monitor the health of the system via periodic heartbeat pulses.
> +      * The pulses also provide the opportunity to perform garbage
> +      * collection.  However, we interpret an incomplete pulse (a missed
> +      * heartbeat) as an indication that the system is no longer responsive,
> +      * i.e. hung, and perform an engine or full GPU reset. Given that the
> +      * preemption granularity can be very coarse on a system, the optimal
> +      * value for any workload is unknowable!
> +      */
> +
> +     err = kstrtoull(buf, 0, &delay);
> +     if (err)
> +             return err;
> +
> +     if (delay >= jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT))
> +             return -EINVAL;
> +
> +     err = intel_engine_set_heartbeat(engine, delay);
> +     if (err)
> +             return err;
> +
> +     return count;
> +}
> +
> +static ssize_t
> +heartbeat_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
> +{
> +     struct intel_engine_cs *engine = kobj_to_engine(kobj);
> +
> +     return sprintf(buf, "%lu\n", engine->props.heartbeat_interval_ms);
> +}
> +
> +static struct kobj_attribute heartbeat_interval_attr =
> +__ATTR(heartbeat_interval_ms, 0644, heartbeat_show, heartbeat_store);
> +
>  static void kobj_engine_release(struct kobject *kobj)
>  {
>       kfree(kobj);
> @@ -357,6 +401,9 @@ void intel_engines_add_sysfs(struct drm_i915_private 
> *i915)
>               &all_caps_attr.attr,
>               &max_spin_attr.attr,
>               &stop_timeout_attr.attr,
> +#if CONFIG_DRM_I915_HEARTBEAT_INTERVAL
> +             &heartbeat_interval_attr.attr,
> +#endif
>               NULL
>       };
>  
_______________________________________________
Intel-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to