On Wed, 2017-11-22 at 12:46 +0000, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <[email protected]>
> 
> We add a PMU counter to expose the number of requests currently submitted
> to the GPU, plus the number of runnable requests waiting on GPU time.
> 
> This is useful to analyze the overall load of the system.
> 
> Signed-off-by: Tvrtko Ursulin <[email protected]>
> ---
>  drivers/gpu/drm/i915/i915_pmu.c | 30 +++++++++++++++++++++++++-----
>  include/uapi/drm/i915_drm.h     |  6 ++++++
>  2 files changed, 31 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
> index 112243720ff3..b2b4b32af35f 100644
> --- a/drivers/gpu/drm/i915/i915_pmu.c
> +++ b/drivers/gpu/drm/i915/i915_pmu.c
> @@ -36,7 +36,8 @@
>  #define ENGINE_SAMPLE_MASK \
>       (BIT(I915_SAMPLE_BUSY) | \
>        BIT(I915_SAMPLE_WAIT) | \
> -      BIT(I915_SAMPLE_SEMA))
> +      BIT(I915_SAMPLE_SEMA) | \
> +      BIT(I915_SAMPLE_QUEUED))
>  
>  #define ENGINE_SAMPLE_BITS (1 << I915_PMU_SAMPLE_BITS)
>  
> @@ -223,6 +224,12 @@ static void engines_sample(struct drm_i915_private 
> *dev_priv)
>  
>               update_sample(&engine->pmu.sample[I915_SAMPLE_SEMA],
>                             PERIOD, !!(val & RING_WAIT_SEMAPHORE));
> +
> +             if (engine->pmu.enable & BIT(I915_SAMPLE_QUEUED))
> +                     update_sample(&engine->pmu.sample[I915_SAMPLE_QUEUED],
> +                                   1 / I915_SAMPLE_QUEUED_SCALE,
> +                                   engine->queued +
> +                                   (last_seqno - current_seqno));
>       }
>  
>       if (fw)
> @@ -310,6 +317,10 @@ static int engine_event_init(struct perf_event *event)
>               if (INTEL_GEN(i915) < 6)
>                       return -ENODEV;
>               break;
> +     case I915_SAMPLE_QUEUED:
> +             if (INTEL_GEN(i915) < 8)
> +                     return -ENODEV;
> +             break;
>       default:
>               return -ENOENT;
>       }
> @@ -399,6 +410,10 @@ static u64 __i915_pmu_event_read(struct perf_event 
> *event)
>               } else if (sample == I915_SAMPLE_BUSY &&
>                          engine->pmu.busy_stats) {
>                       val = ktime_to_ns(intel_engine_get_busy_time(engine));
> +             } else if (sample == I915_SAMPLE_QUEUED) {
> +                     val =
> +                        div_u64(engine->pmu.sample[I915_SAMPLE_QUEUED].cur,
> +                                FREQUENCY);
>               } else {
>                       val = engine->pmu.sample[sample].cur;
>               }
> @@ -679,13 +694,18 @@ static ssize_t i915_pmu_event_show(struct device *dev,
>       I915_EVENT_STR(_name.unit, _unit)
>  
>  #define I915_ENGINE_EVENT(_name, _class, _instance, _sample) \
> -     I915_EVENT_ATTR(_name, __I915_PMU_ENGINE(_class, _instance, _sample)), \
> +     I915_EVENT_ATTR(_name, __I915_PMU_ENGINE(_class, _instance, _sample))
> +
> +#define I915_ENGINE_EVENT_NS(_name, _class, _instance, _sample) \
> +     I915_ENGINE_EVENT(_name, _class, _instance, _sample), \
>       I915_EVENT_STR(_name.unit, "ns")
>  
>  #define I915_ENGINE_EVENTS(_name, _class, _instance) \
> -     I915_ENGINE_EVENT(_name##_instance-busy, _class, _instance, 
> I915_SAMPLE_BUSY), \
> -     I915_ENGINE_EVENT(_name##_instance-sema, _class, _instance, 
> I915_SAMPLE_SEMA), \
> -     I915_ENGINE_EVENT(_name##_instance-wait, _class, _instance, 
> I915_SAMPLE_WAIT)
> +     I915_ENGINE_EVENT_NS(_name##_instance-busy, _class, _instance, 
> I915_SAMPLE_BUSY), \
> +     I915_ENGINE_EVENT_NS(_name##_instance-sema, _class, _instance, 
> I915_SAMPLE_SEMA), \
> +     I915_ENGINE_EVENT_NS(_name##_instance-wait, _class, _instance, 
> I915_SAMPLE_WAIT), \
> +     I915_ENGINE_EVENT(_name##_instance-queued, _class, _instance, 
> I915_SAMPLE_QUEUED), \
> +     I915_EVENT_STR(_name##_instance-queued.scale, 
> __stringify(I915_SAMPLE_QUEUED_SCALE))

We expose queued as an "instant" metric, i.e. the number of requests
at the very moment we query the metric, rather than an ever-growing
counter - is that right? I doubt such a metric will make sense for
perf-stat. Can we somehow restrict it to being queried via the uAPI
only and keep it out of perf-stat?

>  
>  static struct attribute *i915_pmu_events_attrs[] = {
>       I915_ENGINE_EVENTS(rcs, I915_ENGINE_CLASS_RENDER, 0),
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 915a6e85a855..20ee668d1428 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -111,9 +111,12 @@ enum drm_i915_pmu_engine_sample {
>       I915_SAMPLE_BUSY = 0,
>       I915_SAMPLE_WAIT = 1,
>       I915_SAMPLE_SEMA = 2,
> +     I915_SAMPLE_QUEUED = 3,
>       I915_ENGINE_SAMPLE_MAX /* non-ABI */
>  };
>  
> +#define I915_SAMPLE_QUEUED_SCALE 1e-2 /* No braces please. */
> +
>  #define I915_PMU_SAMPLE_BITS (4)
>  #define I915_PMU_SAMPLE_MASK (0xf)
>  #define I915_PMU_SAMPLE_INSTANCE_BITS (8)
> @@ -134,6 +137,9 @@ enum drm_i915_pmu_engine_sample {
>  #define I915_PMU_ENGINE_SEMA(class, instance) \
>       __I915_PMU_ENGINE(class, instance, I915_SAMPLE_SEMA)
>  
> +#define I915_PMU_ENGINE_QUEUED(class, instance) \
> +     __I915_PMU_ENGINE(class, instance, I915_SAMPLE_QUEUED)
> +
>  #define __I915_PMU_OTHER(x) (__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x))
>  
>  #define I915_PMU_ACTUAL_FREQUENCY    __I915_PMU_OTHER(0)

_______________________________________________
Intel-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to