The V3D engine has various hardware counters which might be interesting
to userspace performance analysis tools.

Expose new ioctls to create/destroy a performance monitor object and
query the counter values of this performance monitor.

Note that a performance monitor is given an ID that is only valid on the
file descriptor it has been allocated from. A performance monitor can be
attached to a CL submission and the driver will enable HW counters for
this request and update the performance monitor values at the end of the
job.

Signed-off-by: Boris Brezillon <[email protected]>
---
 drivers/gpu/drm/vc4/Makefile      |   1 +
 drivers/gpu/drm/vc4/vc4_drv.c     |  26 +++++
 drivers/gpu/drm/vc4/vc4_drv.h     |  43 +++++++++
 drivers/gpu/drm/vc4/vc4_gem.c     |  33 +++++--
 drivers/gpu/drm/vc4/vc4_irq.c     |  13 ++-
 drivers/gpu/drm/vc4/vc4_perfmon.c | 195 ++++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/vc4/vc4_regs.h    |  35 +------
 drivers/gpu/drm/vc4/vc4_v3d.c     |  64 ++++++-------
 include/uapi/drm/vc4_drm.h        |  75 +++++++++++++++
 9 files changed, 414 insertions(+), 71 deletions(-)
 create mode 100644 drivers/gpu/drm/vc4/vc4_perfmon.c

diff --git a/drivers/gpu/drm/vc4/Makefile b/drivers/gpu/drm/vc4/Makefile
index 719a771f3d5c..1100e34d1947 100644
--- a/drivers/gpu/drm/vc4/Makefile
+++ b/drivers/gpu/drm/vc4/Makefile
@@ -14,6 +14,7 @@ vc4-y := \
        vc4_vec.o \
        vc4_hvs.o \
        vc4_irq.o \
+       vc4_perfmon.o \
        vc4_plane.o \
        vc4_render_cl.o \
        vc4_trace_points.o \
diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index 5c62013f8ca3..ca0d4419ba5a 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -102,6 +102,7 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void 
*data,
        case DRM_VC4_PARAM_SUPPORTS_FIXED_RCL_ORDER:
        case DRM_VC4_PARAM_SUPPORTS_MADVISE:
        case DRM_VC4_PARAM_SUPPORTS_EXTENDED_CL:
+       case DRM_VC4_PARAM_SUPPORTS_PERFMON:
                args->value = true;
                break;
        default:
@@ -119,6 +120,26 @@ static void vc4_lastclose(struct drm_device *dev)
        drm_fbdev_cma_restore_mode(vc4->fbdev);
 }
 
+/*
+ * DRM .open hook: allocate the per-file private data, initialise its
+ * perfmon bookkeeping and attach it to @file.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+static int vc4_open(struct drm_device *dev, struct drm_file *file)
+{
+       struct vc4_file *priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+
+       if (!priv)
+               return -ENOMEM;
+
+       /* Set up the perfmon state before publishing the pointer. */
+       vc4_perfmon_open_file(priv);
+       file->driver_priv = priv;
+
+       return 0;
+}
+
+/*
+ * DRM .postclose hook: release everything vc4_open() set up for @file.
+ */
+static void vc4_close(struct drm_device *dev, struct drm_file *file)
+{
+       struct vc4_file *vc4file = file->driver_priv;
+
+       /* Drop the file's reference on every perfmon it still owns. */
+       vc4_perfmon_close_file(vc4file);
+
+       /* vc4_open() kzalloc()'ed this and nothing else frees it. */
+       kfree(vc4file);
+}
+
 static const struct vm_operations_struct vc4_vm_ops = {
        .fault = vc4_fault,
        .open = drm_gem_vm_open,
@@ -151,6 +172,9 @@ static const struct drm_ioctl_desc vc4_drm_ioctls[] = {
        DRM_IOCTL_DEF_DRV(VC4_GET_TILING, vc4_get_tiling_ioctl, 
DRM_RENDER_ALLOW),
        DRM_IOCTL_DEF_DRV(VC4_LABEL_BO, vc4_label_bo_ioctl, DRM_RENDER_ALLOW),
        DRM_IOCTL_DEF_DRV(VC4_GEM_MADVISE, vc4_gem_madvise_ioctl, 
DRM_RENDER_ALLOW),
+       DRM_IOCTL_DEF_DRV(VC4_PERFMON_CREATE, vc4_perfmon_create_ioctl, 
DRM_RENDER_ALLOW),
+       DRM_IOCTL_DEF_DRV(VC4_PERFMON_DESTROY, vc4_perfmon_destroy_ioctl, 
DRM_RENDER_ALLOW),
+       DRM_IOCTL_DEF_DRV(VC4_PERFMON_GET_VALUES, vc4_perfmon_get_values_ioctl, 
DRM_RENDER_ALLOW),
 };
 
 static struct drm_driver vc4_drm_driver = {
@@ -161,6 +185,8 @@ static struct drm_driver vc4_drm_driver = {
                            DRIVER_RENDER |
                            DRIVER_PRIME),
        .lastclose = vc4_lastclose,
+       .open = vc4_open,
+       .postclose = vc4_close,
        .irq_handler = vc4_irq,
        .irq_preinstall = vc4_irq_preinstall,
        .irq_postinstall = vc4_irq_postinstall,
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 3c54cc386443..d8156a00d77a 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -11,6 +11,8 @@
 #include <drm/drm_encoder.h>
 #include <drm/drm_gem_cma_helper.h>
 
+#include "uapi/drm/vc4_drm.h"
+
 /* Don't forget to update vc4_bo.c: bo_type_names[] when adding to
  * this.
  */
@@ -29,6 +31,13 @@ enum vc4_kernel_bo_type {
        VC4_BO_TYPE_COUNT
 };
 
+/*
+ * A performance monitor: a set of V3D events to count plus the values
+ * accumulated for them.
+ */
+struct vc4_perfmon {
+       /* Reference count; the object is kfree()'d when it drops to zero. */
+       refcount_t refcnt;
+
+       /* Number of valid entries in @events and @counters. */
+       u8 ncounters;
+
+       /* Event IDs written to the V3D_PCTRS registers on start. */
+       u8 events[DRM_VC4_MAX_PERF_COUNTERS];
+
+       /*
+        * Accumulated counter values, one per event. Sized when the
+        * object is allocated, hence the flexible array member (which
+        * must stay the last field).
+        */
+       u64 counters[];
+};
+
 struct vc4_dev {
        struct drm_device *dev;
 
@@ -158,6 +167,8 @@ struct vc4_dev {
        } hangcheck;
 
        struct semaphore async_modeset;
+
+       bool perfmon_active;
 };
 
 static inline struct vc4_dev *
@@ -410,6 +421,22 @@ struct vc4_exec_info {
        void *uniforms_v;
        uint32_t uniforms_p;
        uint32_t uniforms_size;
+
+       /* Pointer to a performance monitor object if the user requested it,
+        * NULL otherwise.
+        */
+       struct vc4_perfmon *perfmon;
+};
+
+/*
+ * Per-open file private data. Any driver-specific resource that has to be
+ * released when the DRM file is closed should be placed here.
+ */
+struct vc4_file {
+       struct {
+               /* Maps perfmon IDs to struct vc4_perfmon objects. */
+               struct idr idr;
+               /* Held around every lookup, insertion and removal on idr. */
+               struct mutex lock;
+       } perfmon;
 };
 
 static inline struct vc4_exec_info *
@@ -650,3 +677,19 @@ bool vc4_check_tex_size(struct vc4_exec_info *exec,
 /* vc4_validate_shader.c */
 struct vc4_validated_shader_info *
 vc4_validate_shader(struct drm_gem_cma_object *shader_obj);
+
+/* vc4_perfmon.c */
+void vc4_perfmon_get(struct vc4_perfmon *perfmon);
+void vc4_perfmon_put(struct vc4_perfmon *perfmon);
+void vc4_perfmon_start(struct vc4_dev *vc4, struct vc4_perfmon *perfmon);
+void vc4_perfmon_stop(struct vc4_dev *vc4, struct vc4_perfmon *perfmon,
+                     bool capture);
+struct vc4_perfmon *vc4_perfmon_find(struct vc4_file *vc4file, int id);
+void vc4_perfmon_open_file(struct vc4_file *vc4file);
+void vc4_perfmon_close_file(struct vc4_file *vc4file);
+int vc4_perfmon_create_ioctl(struct drm_device *dev, void *data,
+                            struct drm_file *file_priv);
+int vc4_perfmon_destroy_ioctl(struct drm_device *dev, void *data,
+                             struct drm_file *file_priv);
+int vc4_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
+                                struct drm_file *file_priv);
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index 06976c61422a..d2813c7412e3 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -454,6 +454,8 @@ vc4_submit_next_bin_job(struct drm_device *dev)
 
        vc4_flush_caches(dev);
 
+       vc4_perfmon_start(vc4, exec->perfmon);
+
        /* Either put the job in the binner if it uses the binner, or
         * immediately move it to the to-be-rendered queue.
         */
@@ -646,11 +648,11 @@ vc4_queue_submit(struct drm_device *dev, struct 
vc4_exec_info *exec,
 
        list_add_tail(&exec->head, &vc4->bin_job_list);
 
-       /* If no job was executing, kick ours off.  Otherwise, it'll
-        * get started when the previous job's flush done interrupt
-        * occurs.
+       /* If no job was executing and the previous job did not activate
+        * the performance monitor, kick ours off.  Otherwise, it'll get
+        * started when the previous job's flush/render done interrupt occurs.
         */
-       if (vc4_first_bin_job(vc4) == exec) {
+       if (vc4_first_bin_job(vc4) == exec && !vc4->perfmon_active) {
                vc4_submit_next_bin_job(dev);
                vc4_queue_hangcheck(dev);
        }
@@ -913,6 +915,9 @@ vc4_complete_exec(struct drm_device *dev, struct 
vc4_exec_info *exec)
        vc4->bin_alloc_used &= ~exec->bin_slots;
        spin_unlock_irqrestore(&vc4->job_lock, irqflags);
 
+       /* Release the reference we had on the perf monitor. */
+       vc4_perfmon_put(exec->perfmon);
+
        mutex_lock(&vc4->power_lock);
        if (--vc4->power_refcount == 0) {
                pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
@@ -1050,7 +1055,8 @@ vc4_wait_bo_ioctl(struct drm_device *dev, void *data,
 }
 
 static int
-vc4_parse_cl_chunk(struct vc4_dev *vc4, struct vc4_exec_info *exec,
+vc4_parse_cl_chunk(struct vc4_dev *vc4, struct vc4_file *vc4file,
+                  struct vc4_exec_info *exec,
                   const union drm_vc4_submit_cl_chunk *chunk)
 {
        switch(chunk->dummy.type) {
@@ -1063,6 +1069,20 @@ vc4_parse_cl_chunk(struct vc4_dev *vc4, struct 
vc4_exec_info *exec,
                exec->args->bin_cl_size = chunk->bin.size;
                break;
 
+       case VC4_PERFMON_CHUNK:
+               if (chunk->perfmon.pad)
+                       return -EINVAL;
+
+               if (exec->perfmon)
+                       return -EINVAL;
+
+               exec->perfmon = vc4_perfmon_find(vc4file,
+                                                chunk->perfmon.id);
+               if (!exec->perfmon)
+                       return -EINVAL;
+
+               break;
+
        default:
                return -EINVAL;
        }
@@ -1087,6 +1107,7 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
                    struct drm_file *file_priv)
 {
        struct vc4_dev *vc4 = to_vc4_dev(dev);
+       struct vc4_file *vc4file = file_priv->driver_priv;
        struct drm_vc4_submit_cl *args = data;
        struct vc4_exec_info *exec;
        struct ww_acquire_ctx acquire_ctx;
@@ -1154,7 +1175,7 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
                exec->args->bin_cl = 0;
                exec->args->bin_cl_size = 0;
                for (i = 0; i < exec->nchunks; i++) {
-                       ret = vc4_parse_cl_chunk(vc4, exec,
+                       ret = vc4_parse_cl_chunk(vc4, vc4file, exec,
                                                 &exec->chunks[i]);
                        if (ret)
                                goto fail;
diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c
index 7d7af3a93d94..181f1fa05c7a 100644
--- a/drivers/gpu/drm/vc4/vc4_irq.c
+++ b/drivers/gpu/drm/vc4/vc4_irq.c
@@ -110,7 +110,9 @@ vc4_irq_finish_bin_job(struct drm_device *dev)
                return;
 
        vc4_move_job_to_render(dev, exec);
-       vc4_submit_next_bin_job(dev);
+
+       if (!exec->perfmon)
+               vc4_submit_next_bin_job(dev);
 }
 
 static void
@@ -122,6 +124,7 @@ vc4_cancel_bin_job(struct drm_device *dev)
        if (!exec)
                return;
 
+       vc4_perfmon_stop(vc4, exec->perfmon, false);
        list_move_tail(&exec->head, &vc4->bin_job_list);
        vc4_submit_next_bin_job(dev);
 }
@@ -135,6 +138,14 @@ vc4_irq_finish_render_job(struct drm_device *dev)
        if (!exec)
                return;
 
+       vc4_perfmon_stop(vc4, exec->perfmon, true);
+
+       /* perfmon may have stalled the binner, re-arm the dequeuing
+        * logic.
+        */
+       if (exec->perfmon)
+               vc4_submit_next_bin_job(dev);
+
        vc4->finished_seqno++;
        list_move_tail(&exec->head, &vc4->job_done_list);
        if (exec->fence) {
diff --git a/drivers/gpu/drm/vc4/vc4_perfmon.c 
b/drivers/gpu/drm/vc4/vc4_perfmon.c
new file mode 100644
index 000000000000..f736a25661e1
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_perfmon.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (C) 2017 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * DOC: VC4 V3D performance monitor module
+ *
+ * The V3D block provides 16 hardware counters which can count various events.
+ */
+
+#include "vc4_drv.h"
+#include "vc4_regs.h"
+
+/* Take an extra reference on @perfmon; a NULL @perfmon is a no-op. */
+void vc4_perfmon_get(struct vc4_perfmon *perfmon)
+{
+       if (!perfmon)
+               return;
+
+       refcount_inc(&perfmon->refcnt);
+}
+
+/*
+ * Drop a reference on @perfmon and free it once the last one is gone.
+ * A NULL @perfmon is a no-op.
+ */
+void vc4_perfmon_put(struct vc4_perfmon *perfmon)
+{
+       if (!perfmon)
+               return;
+
+       if (refcount_dec_and_test(&perfmon->refcnt))
+               kfree(perfmon);
+}
+
+/*
+ * Program and enable the hardware counters described by @perfmon.
+ *
+ * Does nothing when @perfmon is NULL, and warns and bails out if a
+ * monitor is already flagged as active on @vc4.
+ */
+void vc4_perfmon_start(struct vc4_dev *vc4, struct vc4_perfmon *perfmon)
+{
+       unsigned int i;
+       u32 mask;
+
+       if (!perfmon || WARN_ON(vc4->perfmon_active))
+               return;
+
+       /* Write each requested event ID to the matching V3D_PCTRS register. */
+       for (i = 0; i < perfmon->ncounters; i++)
+               V3D_WRITE(V3D_PCTRS(i), perfmon->events[i]);
+
+       /* One bit per counter in use (bits 0..ncounters-1). */
+       mask = GENMASK(perfmon->ncounters - 1, 0);
+       /*
+        * NOTE(review): the counters are enabled (PCTRE) before the write
+        * to PCTRC, which presumably clears them -- confirm that nothing
+        * can be counted between the two writes, or swap them.
+        */
+       V3D_WRITE(V3D_PCTRE, V3D_PCTRE_EN | mask);
+       V3D_WRITE(V3D_PCTRC, mask);
+       vc4->perfmon_active = true;
+}
+
+/*
+ * Stop the hardware counters that were started for @perfmon.
+ *
+ * When @capture is true the current V3D_PCTR register values are added
+ * to @perfmon->counters first; otherwise they are not read at all.
+ * Does nothing when @perfmon is NULL, and warns and bails out if no
+ * monitor is flagged as active on @vc4.
+ */
+void vc4_perfmon_stop(struct vc4_dev *vc4, struct vc4_perfmon *perfmon,
+                     bool capture)
+{
+       unsigned int i;
+
+       if (!perfmon || WARN_ON(!vc4->perfmon_active))
+               return;
+
+       if (capture) {
+               /* Accumulate, so one perfmon can span several jobs. */
+               for (i = 0; i < perfmon->ncounters; i++)
+                       perfmon->counters[i] += V3D_READ(V3D_PCTR(i));
+       }
+
+       /* Writing 0 clears V3D_PCTRE_EN along with the per-counter bits. */
+       V3D_WRITE(V3D_PCTRE, 0);
+       vc4->perfmon_active = false;
+}
+
+/*
+ * Look up the perfmon registered under @id on @vc4file.
+ *
+ * Returns the perfmon with an extra reference held (release it with
+ * vc4_perfmon_put()), or NULL if @id is unknown.
+ */
+struct vc4_perfmon *vc4_perfmon_find(struct vc4_file *vc4file, int id)
+{
+       struct vc4_perfmon *found;
+
+       mutex_lock(&vc4file->perfmon.lock);
+       found = idr_find(&vc4file->perfmon.idr, id);
+       /*
+        * Pin the object before unlocking so that a concurrent destroy
+        * cannot free it under the caller.
+        */
+       if (found)
+               refcount_inc(&found->refcnt);
+       mutex_unlock(&vc4file->perfmon.lock);
+
+       return found;
+}
+
+/* Initialise the perfmon ID table and its lock for a newly opened file. */
+void vc4_perfmon_open_file(struct vc4_file *vc4file)
+{
+       idr_init(&vc4file->perfmon.idr);
+       mutex_init(&vc4file->perfmon.lock);
+}
+
+/*
+ * idr_for_each() callback: drop the reference the ID table holds on one
+ * perfmon (@elem). @id and @data are unused. Always returns 0 so the
+ * iteration visits every entry.
+ */
+static int vc4_perfmon_idr_del(int id, void *elem, void *data)
+{
+       vc4_perfmon_put(elem);
+       return 0;
+}
+
+/*
+ * Tear down the perfmon ID table of a file that is being closed: drop
+ * the table's reference on every perfmon still registered, then destroy
+ * the IDR itself.
+ */
+void vc4_perfmon_close_file(struct vc4_file *vc4file)
+{
+       mutex_lock(&vc4file->perfmon.lock);
+       /* A perfmon is only freed here if nobody else holds a reference. */
+       idr_for_each(&vc4file->perfmon.idr, vc4_perfmon_idr_del, NULL);
+       idr_destroy(&vc4file->perfmon.idr);
+       mutex_unlock(&vc4file->perfmon.lock);
+}
+
+/*
+ * DRM_IOCTL_VC4_PERFMON_CREATE handler.
+ *
+ * Validates the request in @data (a struct drm_vc4_perfmon_create),
+ * allocates a perfmon for it and registers it in the ID table of the
+ * calling file. On success the new ID is written back to the request.
+ *
+ * Returns 0 on success, -EINVAL for a bad counter count or event ID,
+ * -ENOMEM if the allocation fails, or the idr_alloc() error code.
+ */
+int vc4_perfmon_create_ioctl(struct drm_device *dev, void *data,
+                            struct drm_file *file_priv)
+{
+       struct drm_vc4_perfmon_create *req = data;
+       struct vc4_file *vc4file = file_priv->driver_priv;
+       struct vc4_perfmon *perfmon;
+       unsigned int idx;
+       int id;
+
+       /* At least one counter, and no more than the hardware provides. */
+       if (!req->ncounters)
+               return -EINVAL;
+       if (req->ncounters > DRM_VC4_MAX_PERF_COUNTERS)
+               return -EINVAL;
+
+       /* Reject unknown event IDs before allocating anything. */
+       for (idx = 0; idx < req->ncounters; idx++) {
+               if (req->events[idx] >= VC4_PERFCNT_NUM_EVENTS)
+                       return -EINVAL;
+       }
+
+       /* The counter values live in the trailing array of the object. */
+       perfmon = kzalloc(sizeof(*perfmon) + (req->ncounters * sizeof(u64)),
+                         GFP_KERNEL);
+       if (!perfmon)
+               return -ENOMEM;
+
+       /* This initial reference belongs to the ID table. */
+       refcount_set(&perfmon->refcnt, 1);
+       perfmon->ncounters = req->ncounters;
+       for (idx = 0; idx < req->ncounters; idx++)
+               perfmon->events[idx] = req->events[idx];
+
+       mutex_lock(&vc4file->perfmon.lock);
+       id = idr_alloc(&vc4file->perfmon.idr, perfmon, 0, INT_MAX,
+                      GFP_KERNEL);
+       mutex_unlock(&vc4file->perfmon.lock);
+
+       if (id >= 0) {
+               req->id = id;
+               return 0;
+       }
+
+       /* Never published, so a plain kfree() is enough. */
+       kfree(perfmon);
+       return id;
+}
+
+/*
+ * DRM_IOCTL_VC4_PERFMON_DESTROY handler.
+ *
+ * Removes the perfmon named by the request in @data (a struct
+ * drm_vc4_perfmon_destroy) from the calling file's ID table and drops
+ * the table's reference on it.
+ *
+ * Returns 0 on success or -EINVAL if the ID is not registered.
+ */
+int vc4_perfmon_destroy_ioctl(struct drm_device *dev, void *data,
+                             struct drm_file *file_priv)
+{
+       struct drm_vc4_perfmon_destroy *req = data;
+       struct vc4_file *vc4file = file_priv->driver_priv;
+       struct vc4_perfmon *removed;
+
+       mutex_lock(&vc4file->perfmon.lock);
+       removed = idr_remove(&vc4file->perfmon.idr, req->id);
+       mutex_unlock(&vc4file->perfmon.lock);
+
+       if (removed) {
+               /* Freed now unless someone else still holds a reference. */
+               vc4_perfmon_put(removed);
+               return 0;
+       }
+
+       return -EINVAL;
+}
+
+/*
+ * DRM_IOCTL_VC4_PERFMON_GET_VALUES handler.
+ *
+ * Copies the accumulated counter values of the perfmon named by the
+ * request in @data (a struct drm_vc4_perfmon_get_values) to the user
+ * buffer at values_ptr: ncounters u64 entries.
+ *
+ * Returns 0 on success, -EINVAL if the ID is not registered, or -EFAULT
+ * if the copy to user space fails.
+ */
+int vc4_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
+                                struct drm_file *file_priv)
+{
+       struct vc4_file *vc4file = file_priv->driver_priv;
+       struct drm_vc4_perfmon_get_values *req = data;
+       struct vc4_perfmon *perfmon;
+       int ret = 0;
+
+       /*
+        * Use the shared lookup helper instead of open-coding it; it
+        * returns with a reference held, or NULL for an unknown ID.
+        */
+       perfmon = vc4_perfmon_find(vc4file, req->id);
+       if (!perfmon)
+               return -EINVAL;
+
+       if (copy_to_user(u64_to_user_ptr(req->values_ptr), perfmon->counters,
+                        perfmon->ncounters * sizeof(u64)))
+               ret = -EFAULT;
+
+       /* Balance the reference taken by the lookup. */
+       vc4_perfmon_put(perfmon);
+       return ret;
+}
diff --git a/drivers/gpu/drm/vc4/vc4_regs.h b/drivers/gpu/drm/vc4/vc4_regs.h
index 55677bd50f66..b9749cb24063 100644
--- a/drivers/gpu/drm/vc4/vc4_regs.h
+++ b/drivers/gpu/drm/vc4/vc4_regs.h
@@ -122,38 +122,9 @@
 #define V3D_VPMBASE  0x00504
 #define V3D_PCTRC    0x00670
 #define V3D_PCTRE    0x00674
-#define V3D_PCTR0    0x00680
-#define V3D_PCTRS0   0x00684
-#define V3D_PCTR1    0x00688
-#define V3D_PCTRS1   0x0068c
-#define V3D_PCTR2    0x00690
-#define V3D_PCTRS2   0x00694
-#define V3D_PCTR3    0x00698
-#define V3D_PCTRS3   0x0069c
-#define V3D_PCTR4    0x006a0
-#define V3D_PCTRS4   0x006a4
-#define V3D_PCTR5    0x006a8
-#define V3D_PCTRS5   0x006ac
-#define V3D_PCTR6    0x006b0
-#define V3D_PCTRS6   0x006b4
-#define V3D_PCTR7    0x006b8
-#define V3D_PCTRS7   0x006bc
-#define V3D_PCTR8    0x006c0
-#define V3D_PCTRS8   0x006c4
-#define V3D_PCTR9    0x006c8
-#define V3D_PCTRS9   0x006cc
-#define V3D_PCTR10   0x006d0
-#define V3D_PCTRS10  0x006d4
-#define V3D_PCTR11   0x006d8
-#define V3D_PCTRS11  0x006dc
-#define V3D_PCTR12   0x006e0
-#define V3D_PCTRS12  0x006e4
-#define V3D_PCTR13   0x006e8
-#define V3D_PCTRS13  0x006ec
-#define V3D_PCTR14   0x006f0
-#define V3D_PCTRS14  0x006f4
-#define V3D_PCTR15   0x006f8
-#define V3D_PCTRS15  0x006fc
+# define V3D_PCTRE_EN  BIT(31)
+#define V3D_PCTR(x)  (0x00680 + ((x) * 8))
+#define V3D_PCTRS(x) (0x00684 + ((x) * 8))
 #define V3D_DBGE     0x00f00
 #define V3D_FDBGO    0x00f04
 #define V3D_FDBGB    0x00f08
diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
index 622cd43840b8..35c00050d18b 100644
--- a/drivers/gpu/drm/vc4/vc4_v3d.c
+++ b/drivers/gpu/drm/vc4/vc4_v3d.c
@@ -68,38 +68,38 @@ static const struct {
        REGDEF(V3D_VPMBASE),
        REGDEF(V3D_PCTRC),
        REGDEF(V3D_PCTRE),
-       REGDEF(V3D_PCTR0),
-       REGDEF(V3D_PCTRS0),
-       REGDEF(V3D_PCTR1),
-       REGDEF(V3D_PCTRS1),
-       REGDEF(V3D_PCTR2),
-       REGDEF(V3D_PCTRS2),
-       REGDEF(V3D_PCTR3),
-       REGDEF(V3D_PCTRS3),
-       REGDEF(V3D_PCTR4),
-       REGDEF(V3D_PCTRS4),
-       REGDEF(V3D_PCTR5),
-       REGDEF(V3D_PCTRS5),
-       REGDEF(V3D_PCTR6),
-       REGDEF(V3D_PCTRS6),
-       REGDEF(V3D_PCTR7),
-       REGDEF(V3D_PCTRS7),
-       REGDEF(V3D_PCTR8),
-       REGDEF(V3D_PCTRS8),
-       REGDEF(V3D_PCTR9),
-       REGDEF(V3D_PCTRS9),
-       REGDEF(V3D_PCTR10),
-       REGDEF(V3D_PCTRS10),
-       REGDEF(V3D_PCTR11),
-       REGDEF(V3D_PCTRS11),
-       REGDEF(V3D_PCTR12),
-       REGDEF(V3D_PCTRS12),
-       REGDEF(V3D_PCTR13),
-       REGDEF(V3D_PCTRS13),
-       REGDEF(V3D_PCTR14),
-       REGDEF(V3D_PCTRS14),
-       REGDEF(V3D_PCTR15),
-       REGDEF(V3D_PCTRS15),
+       REGDEF(V3D_PCTR(0)),
+       REGDEF(V3D_PCTRS(0)),
+       REGDEF(V3D_PCTR(1)),
+       REGDEF(V3D_PCTRS(1)),
+       REGDEF(V3D_PCTR(2)),
+       REGDEF(V3D_PCTRS(2)),
+       REGDEF(V3D_PCTR(3)),
+       REGDEF(V3D_PCTRS(3)),
+       REGDEF(V3D_PCTR(4)),
+       REGDEF(V3D_PCTRS(4)),
+       REGDEF(V3D_PCTR(5)),
+       REGDEF(V3D_PCTRS(5)),
+       REGDEF(V3D_PCTR(6)),
+       REGDEF(V3D_PCTRS(6)),
+       REGDEF(V3D_PCTR(7)),
+       REGDEF(V3D_PCTRS(7)),
+       REGDEF(V3D_PCTR(8)),
+       REGDEF(V3D_PCTRS(8)),
+       REGDEF(V3D_PCTR(9)),
+       REGDEF(V3D_PCTRS(9)),
+       REGDEF(V3D_PCTR(10)),
+       REGDEF(V3D_PCTRS(10)),
+       REGDEF(V3D_PCTR(11)),
+       REGDEF(V3D_PCTRS(11)),
+       REGDEF(V3D_PCTR(12)),
+       REGDEF(V3D_PCTRS(12)),
+       REGDEF(V3D_PCTR(13)),
+       REGDEF(V3D_PCTRS(13)),
+       REGDEF(V3D_PCTR(14)),
+       REGDEF(V3D_PCTRS(14)),
+       REGDEF(V3D_PCTR(15)),
+       REGDEF(V3D_PCTRS(15)),
        REGDEF(V3D_DBGE),
        REGDEF(V3D_FDBGO),
        REGDEF(V3D_FDBGB),
diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
index ddcaa72da82a..088e519caf6e 100644
--- a/include/uapi/drm/vc4_drm.h
+++ b/include/uapi/drm/vc4_drm.h
@@ -42,6 +42,9 @@ extern "C" {
 #define DRM_VC4_GET_TILING                        0x09
 #define DRM_VC4_LABEL_BO                          0x0a
 #define DRM_VC4_GEM_MADVISE                       0x0b
+#define DRM_VC4_PERFMON_CREATE                    0x0c
+#define DRM_VC4_PERFMON_DESTROY                   0x0d
+#define DRM_VC4_PERFMON_GET_VALUES                0x0e
 
 #define DRM_IOCTL_VC4_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + 
DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
 #define DRM_IOCTL_VC4_WAIT_SEQNO          DRM_IOWR(DRM_COMMAND_BASE + 
DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
@@ -55,6 +58,9 @@ extern "C" {
 #define DRM_IOCTL_VC4_GET_TILING          DRM_IOWR(DRM_COMMAND_BASE + 
DRM_VC4_GET_TILING, struct drm_vc4_get_tiling)
 #define DRM_IOCTL_VC4_LABEL_BO            DRM_IOWR(DRM_COMMAND_BASE + 
DRM_VC4_LABEL_BO, struct drm_vc4_label_bo)
 #define DRM_IOCTL_VC4_GEM_MADVISE         DRM_IOWR(DRM_COMMAND_BASE + 
DRM_VC4_GEM_MADVISE, struct drm_vc4_gem_madvise)
+#define DRM_IOCTL_VC4_PERFMON_CREATE      DRM_IOWR(DRM_COMMAND_BASE + 
DRM_VC4_PERFMON_CREATE, struct drm_vc4_perfmon_create)
+#define DRM_IOCTL_VC4_PERFMON_DESTROY     DRM_IOWR(DRM_COMMAND_BASE + 
DRM_VC4_PERFMON_DESTROY, struct drm_vc4_perfmon_destroy)
+#define DRM_IOCTL_VC4_PERFMON_GET_VALUES  DRM_IOWR(DRM_COMMAND_BASE + 
DRM_VC4_PERFMON_GET_VALUES, struct drm_vc4_perfmon_get_values)
 
 struct drm_vc4_submit_rcl_surface {
        __u32 hindex; /* Handle index, or ~0 if not present. */
@@ -71,9 +77,11 @@ struct drm_vc4_submit_rcl_surface {
 
 /**
  * @VC4_BIN_CL_CHUNK: binner CL chunk
+ * @VC4_PERFMON_CHUNK: performance monitor chunk
  */
 enum {
        VC4_BIN_CL_CHUNK,
+       VC4_PERFMON_CHUNK,
 };
 
 /**
@@ -102,6 +110,20 @@ struct drm_vc4_submit_cl_bin_chunk {
 };
 
 /**
+ * struct drm_vc4_submit_cl_perfmon_chunk - performance monitor extension
+ *
+ * @type: extension type, should be set to %VC4_PERFMON_CHUNK
+ * @id: id of the performance monitor previously allocated with
+ *     %DRM_IOCTL_VC4_PERFMON_CREATE
+ * @pad: unused, must be set to zero (a non-zero value is rejected with
+ *     -EINVAL)
+ */
+struct drm_vc4_submit_cl_perfmon_chunk {
+       __u32 type;
+       __u32 id;
+       __u64 pad;
+};
+
+/**
  * union drm_vc4_submit_cl_chunk - CL chunk
  *
  * CL chunks allow us to easily extend the set of arguments one can pass
@@ -111,6 +133,7 @@ struct drm_vc4_submit_cl_bin_chunk {
 union drm_vc4_submit_cl_chunk {
        struct drm_vc4_submit_cl_dummy_chunk dummy;
        struct drm_vc4_submit_cl_bin_chunk bin;
+       struct drm_vc4_submit_cl_perfmon_chunk perfmon;
 };
 
 /**
@@ -369,6 +392,7 @@ struct drm_vc4_get_hang_state {
 #define DRM_VC4_PARAM_SUPPORTS_FIXED_RCL_ORDER 6
 #define DRM_VC4_PARAM_SUPPORTS_MADVISE         7
 #define DRM_VC4_PARAM_SUPPORTS_EXTENDED_CL     8
+#define DRM_VC4_PARAM_SUPPORTS_PERFMON         9
 
 struct drm_vc4_get_param {
        __u32 param;
@@ -413,6 +437,57 @@ struct drm_vc4_gem_madvise {
        __u32 pad;
 };
 
+/*
+ * Events that a performance monitor can count. These are the values
+ * accepted in drm_vc4_perfmon_create.events[].
+ */
+enum {
+       VC4_PERFCNT_FEP_VALID_PRIMS_NO_RENDER,
+       VC4_PERFCNT_FEP_VALID_PRIMS_RENDER,
+       VC4_PERFCNT_FEP_CLIPPED_QUADS,
+       VC4_PERFCNT_FEP_VALID_QUADS,
+       VC4_PERFCNT_TLB_QUADS_NOT_PASSING_STENCIL,
+       VC4_PERFCNT_TLB_QUADS_NOT_PASSING_Z_AND_STENCIL,
+       VC4_PERFCNT_TLB_QUADS_PASSING_Z_AND_STENCIL,
+       VC4_PERFCNT_TLB_QUADS_ZERO_COVERAGE,
+       VC4_PERFCNT_TLB_QUADS_NON_ZERO_COVERAGE,
+       VC4_PERFCNT_TLB_QUADS_WRITTEN_TO_COLOR_BUF,
+       VC4_PERFCNT_PLB_PRIMS_OUTSIDE_VIEWPORT,
+       VC4_PERFCNT_PLB_PRIMS_NEED_CLIPPING,
+       VC4_PERFCNT_PSE_PRIMS_REVERSED,
+       VC4_PERFCNT_QPU_TOTAL_IDLE_CYCLES,
+       VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_VERTEX_COORD_SHADING,
+       VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_FRAGMENT_SHADING,
+       VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_EXEC_VALID_INST,
+       VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_WAITING_TMUS,
+       VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_WAITING_SCOREBOARD,
+       VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_WAITING_VARYINGS,
+       VC4_PERFCNT_QPU_TOTAL_INST_CACHE_HIT,
+       VC4_PERFCNT_QPU_TOTAL_INST_CACHE_MISS,
+       VC4_PERFCNT_QPU_TOTAL_UNIFORM_CACHE_HIT,
+       VC4_PERFCNT_QPU_TOTAL_UNIFORM_CACHE_MISS,
+       VC4_PERFCNT_TMU_TOTAL_TEXT_QUADS_PROCESSED,
+       VC4_PERFCNT_TMU_TOTAL_TEXT_CACHE_MISS,
+       VC4_PERFCNT_VPM_TOTAL_CLK_CYCLES_VDW_STALLED,
+       VC4_PERFCNT_VPM_TOTAL_CLK_CYCLES_VCD_STALLED,
+       VC4_PERFCNT_L2C_TOTAL_L2_CACHE_HIT,
+       VC4_PERFCNT_L2C_TOTAL_L2_CACHE_MISS,
+       /* Number of events above, not an event itself; keep it last. */
+       VC4_PERFCNT_NUM_EVENTS,
+};
+
+#define DRM_VC4_MAX_PERF_COUNTERS      16
+
+/**
+ * struct drm_vc4_perfmon_create - argument of %DRM_IOCTL_VC4_PERFMON_CREATE
+ *
+ * @id: filled in by the kernel with the id of the new performance monitor
+ * @ncounters: number of valid entries in @events; must be between 1 and
+ *     %DRM_VC4_MAX_PERF_COUNTERS
+ * @events: events to count, one VC4_PERFCNT_xxx value (below
+ *     %VC4_PERFCNT_NUM_EVENTS) per counter
+ */
+struct drm_vc4_perfmon_create {
+       __u32 id;
+       __u32 ncounters;
+       __u8 events[DRM_VC4_MAX_PERF_COUNTERS];
+};
+
+/**
+ * struct drm_vc4_perfmon_destroy - argument of %DRM_IOCTL_VC4_PERFMON_DESTROY
+ *
+ * @id: id of the performance monitor to destroy, as returned by
+ *     %DRM_IOCTL_VC4_PERFMON_CREATE on the same file descriptor
+ */
+struct drm_vc4_perfmon_destroy {
+       __u32 id;
+};
+
+/**
+ * struct drm_vc4_perfmon_get_values - argument of
+ *     %DRM_IOCTL_VC4_PERFMON_GET_VALUES
+ *
+ * @id: id of the performance monitor to query
+ * @values_ptr: user pointer to an array of __u64 that receives one value
+ *     per counter of the monitor, in the order the events were passed
+ *     at creation time
+ *
+ * NOTE(review): on ABIs that align __u64 to 8 bytes there are 4 bytes of
+ * implicit padding between @id and @values_ptr -- confirm whether an
+ * explicit pad field is wanted.
+ */
+struct drm_vc4_perfmon_get_values {
+       __u32 id;
+       __u64 values_ptr;
+};
+
 #if defined(__cplusplus)
 }
 #endif
-- 
2.11.0

_______________________________________________
dri-devel mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Reply via email to