On 04/03/2013 11:59 AM, Matt Turner wrote:
On Wed, Apr 3, 2013 at 10:25 AM, Eric Anholt <[email protected]> wrote:
The way we were allocating registers before, packing into low register
numbers for Ironlake, resulted in an overly-constrained dependency graph
for instruction scheduling.  Improves GLBenchmark 2.1 performance by
3.4% +/- 0.6% (n=26).
---
  src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp |    2 ++
  src/mesa/program/register_allocate.c              |   31 +++++++++++++++++++--
  src/mesa/program/register_allocate.h              |    1 +
  3 files changed, 31 insertions(+), 3 deletions(-)
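
To make the scheduling claim concrete, here is a small standalone sketch (not
part of the patch; the toy instruction format and register numbers are invented
for illustration) that counts the dependency edges a scheduler has to respect.
Packing every result into the same low register serializes otherwise independent
instructions through write-after-write and write-after-read edges, while rotating
the destinations leaves them free to be reordered:

#include <stdio.h>

/* One toy instruction: writes dst, reads src (register numbers only). */
struct inst { int dst, src; };

/* Count the edges a scheduler must respect: true (RAW), anti (WAR) and
 * output (WAW) dependencies between every earlier/later instruction pair. */
static int
count_dep_edges(const struct inst *insts, int n)
{
   int edges = 0;
   for (int i = 0; i < n; i++) {
      for (int j = i + 1; j < n; j++) {
         if (insts[j].src == insts[i].dst ||   /* read after write  */
             insts[j].dst == insts[i].src ||   /* write after read  */
             insts[j].dst == insts[i].dst)     /* write after write */
            edges++;
      }
   }
   return edges;
}

int
main(void)
{
   /* Four independent computations; only the destination numbers differ. */
   struct inst packed[4]  = { {2, 1}, {2, 1}, {2, 1}, {2, 1} }; /* reuse reg 2   */
   struct inst rotated[4] = { {2, 1}, {3, 1}, {4, 1}, {5, 1} }; /* rotate dests  */

   printf("packed: %d edges, round-robin: %d edges\n",
          count_dep_edges(packed, 4), count_dep_edges(rotated, 4));
   return 0;
}

Built with any C99 compiler, this reports 6 edges for the packed layout and 0
for the rotated one.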

diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 4ee7bbc..b9b0303 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -108,6 +108,8 @@ brw_alloc_reg_set(struct brw_context *brw, int reg_width)

     uint8_t *ra_reg_to_grf = ralloc_array(brw, uint8_t, ra_reg_count);
     struct ra_regs *regs = ra_alloc_reg_set(brw, ra_reg_count);
+   if (intel->gen >= 6)
+      ra_set_allocate_round_robin(regs);
     int *classes = ralloc_array(brw, int, class_count);
     int aligned_pairs_class = -1;

diff --git a/src/mesa/program/register_allocate.c b/src/mesa/program/register_allocate.c
index a9064c3..5f45662 100644
--- a/src/mesa/program/register_allocate.c
+++ b/src/mesa/program/register_allocate.c
@@ -70,6 +70,7 @@
   * this during ra_set_finalize().
   */

+#include <stdbool.h>
  #include <ralloc.h>

  #include "main/imports.h"
@@ -93,6 +94,8 @@ struct ra_regs {

     struct ra_class **classes;
     unsigned int class_count;
+
+   bool round_robin;
  };

  struct ra_class {
@@ -185,6 +188,22 @@ ra_alloc_reg_set(void *mem_ctx, unsigned int count)
     return regs;
  }

+/**
+ * The register allocator by default prefers to allocate low register numbers,
+ * since it was written for hardware (gen4/5 Intel) whose thread count is
+ * limited by the number of registers used in a given shader.
+ *
+ * However, for hardware without that restriction, densely packed register
+ * allocation can put serious constraints on instruction scheduling.  This
+ * function tells the allocator to rotate around the registers if possible as
+ * it allocates the nodes.
+ */
+void
+ra_set_allocate_round_robin(struct ra_regs *regs)
+{
+   regs->round_robin = true;
+}
+
  static void
  ra_add_conflict_list(struct ra_regs *regs, unsigned int r1, unsigned int r2)
  {
@@ -436,16 +455,19 @@ GLboolean
  ra_select(struct ra_graph *g)
  {
     int i;
+   int start_search_reg = 0;

     while (g->stack_count != 0) {
-      unsigned int r;
+      unsigned int ri;
+      unsigned int r = -1;
        int n = g->stack[g->stack_count - 1];
        struct ra_class *c = g->regs->classes[g->nodes[n].class];

        /* Find the lowest-numbered reg which is not used by a member
         * of the graph adjacent to us.
         */
-      for (r = 0; r < g->regs->count; r++) {
+      for (ri = 0; ri < g->regs->count; ri++) {
+         r = (start_search_reg + ri) % g->regs->count;
          if (!c->regs[r])
             continue;

@@ -461,12 +483,15 @@ ra_select(struct ra_graph *g)
          if (i == g->nodes[n].adjacency_count)
             break;
        }
-      if (r == g->regs->count)
+      if (ri == g->regs->count)
          return GL_FALSE;

        g->nodes[n].reg = r;
        g->nodes[n].in_stack = GL_FALSE;
        g->stack_count--;
+
+      if (g->regs->round_robin)
+         start_search_reg = r;
     }
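
For reference, a minimal standalone sketch of the rotating search, assuming a
toy REG_COUNT and per-node conflict table rather than Mesa's ra_graph/ra_class
structures, and using the r + 1 advance that the follow-up below asks for
(with the posted "start_search_reg = r", a node that does not conflict with
the previous one would simply be handed the same register again):

#include <stdbool.h>
#include <stdio.h>

#define REG_COUNT 8   /* toy register file size, not a real GRF count */

/* Assign a register to each node in turn, starting each search just past the
 * previous pick so successive allocations rotate through the register file. */
static void
assign_round_robin(bool forbidden[][REG_COUNT], int node_count, int out_reg[])
{
   int start_search_reg = 0;

   for (int n = 0; n < node_count; n++) {
      out_reg[n] = -1;

      for (int ri = 0; ri < REG_COUNT; ri++) {
         int r = (start_search_reg + ri) % REG_COUNT;

         if (forbidden[n][r])
            continue;

         out_reg[n] = r;
         start_search_reg = r + 1;   /* the "+ 1" is the tweak requested below */
         break;
      }
   }
}

int
main(void)
{
   /* Three nodes with no conflicts: they end up on registers 0, 1 and 2
    * instead of all being packed onto register 0. */
   bool forbidden[3][REG_COUNT] = {{false}};
   int reg[3];

   assign_round_robin(forbidden, 3, reg);
   printf("%d %d %d\n", reg[0], reg[1], reg[2]);
   return 0;
}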

With the s/= r/= r + 1/ change mentioned on IRC to make this work for
8-wide too, it gets my

Reviewed-by: Matt Turner <[email protected]>

With that fixed,
Reviewed-by: Kenneth Graunke <[email protected]>

3.4% is pretty awesome...nice work!