Hi,

this is probably more of an RFC than a patch, as I'm not sure whether
reassoc is the right place to fix it.  On top of that, the heuristic
might be a bit "ad hoc".  Maybe we can also work around it in the
vectorizer?

The following function is vectorized in a very inefficient way because we
construct vectors from scalar loads.

uint64_t
foo (uint8_t *pix, int i_stride)
{
  uint32_t sum = 0, sqr = 0;
  int x, y;
  for (y = 0; y < 16; y++)
    {
      for (x = 0; x < 16; x++)
        sum += pix[x];
      pix += i_stride;
    }
  return sum;
}

The reason for this is that reassoc reorders the last three operands of
the summation sequence by rank, introducing a temporary in the process
that breaks the "homogeneity" (sum_n = sum_{n - 1} + pix[n]) of the sum.

This patch adds a function likely_vectorizable_p that checks if an
operand vector contains only operands of the same rank except the last
one.  In that case the sequence is likely vectorizable and the easier
vectorization will outweigh any CSE opportunity we can expose by
swapping operands.

Bootstrapped and regtested on x86, regtested on rv64gcv.

Regards
 Robin

gcc/ChangeLog:

        * tree-ssa-reassoc.cc (likely_vectorizable_p): New function.
        (reassociate_bb): Use new function.
        (dump_ops_vector): Change prototype.
        (debug_ops_vector): Change prototype.

gcc/testsuite/ChangeLog:

        * gcc.dg/tree-ssa/reassoc-52.c: New test.
---
 gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c |  27 ++++++
 gcc/tree-ssa-reassoc.cc                    | 102 ++++++++++++++++++++-
 2 files changed, 124 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c b/gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c
new file mode 100644
index 00000000000..b117b7519bb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/reassoc-52.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-reassoc-details" } */
+
+typedef unsigned char uint8_t;
+typedef unsigned int uint32_t;
+typedef unsigned long uint64_t;
+
+uint64_t
+foo (uint8_t *pix, int i_stride)
+{
+  uint32_t sum = 0, sqr = 0;
+  int x, y;
+  for (y = 0; y < 16; y++)
+    {
+      for (x = 0; x < 16; x++)
+       sum += pix[x];
+      pix += i_stride;
+    }
+  return sum;
+}
+
+/* Ensure that we only add to sum variables and don't create a temporary that
+   does something else.  In doing so we enable a much more efficient
+   vectorization scheme.  */
+
+/* { dg-final { scan-tree-dump-times "\\s+sum_\\d+\\s+=\\s+sum_\\d+\\s+\\\+" 16 "reassoc1" } } */
+/* { dg-final { scan-tree-dump-times "\\s+sum_\\d+\\s=.*\\\+\\ssum_\\d+;" 1 "reassoc1" } } */
diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc
index 556ecdebe2d..13f8a1070b6 100644
--- a/gcc/tree-ssa-reassoc.cc
+++ b/gcc/tree-ssa-reassoc.cc
@@ -6992,6 +6992,96 @@ rank_ops_for_fma (vec<operand_entry *> *ops)
     }
   return mult_num;
 }
+
+/* Check if the operand vector contains operands that all have the same rank
+   apart from the last one.  The last one must be a PHI node which links back
+   to the first operand.
+   This can indicate an easily vectorizable sequence in which case we don't
+   want to reorder the last three elements for rank reasons.
+
+   Consider the following example:
+
+    for (int y = 0; y < 16; y++)
+      {
+       for (int x = 0; x < 16; x++)
+         {
+           sum += pix[x];
+         }
+         ...
+      }
+
+    # sum_201 = PHI <sum_230(3), 0(2)>
+    # y_149 = PHI <y_24(3), 0(2)>
+
+    _33 = *pix_214;
+    _34 = (unsigned intD.4) _33;
+    sum_35 = sum_201 + _34;
+
+    _46 = MEM[(uint8_tD.2849 *)pix_214 + 1B];
+    _47 = (unsigned intD.4) _46;
+    sum_48 = sum_35 + _47;
+    _49 = (intD.1) _46;
+
+    All loads of pix are of the same rank, just the sum PHI has a different
+    one in {..., _46, _33, sum_201}.
+    swap_ops_for_binary_stmt will move sum_201 before _46 and _33 so that
+    the first operation _33 OP _46 is exposed as an optimization opportunity.
+    In doing so it will trip up the vectorizer which now needs to work much
+    harder because it can't recognize the whole sequence as doing the same
+    operation.  */
+
+static bool
+likely_vectorizable_p (vec<operand_entry *> *ops)
+{
+  operand_entry *oe;
+  unsigned int i;
+  unsigned int rank = 0;
+
+  if (ops->length () < 3)
+    return false;
+
+  /* Check if the last operand is a PHI.  */
+  operand_entry *oelast = (*ops)[ops->length () - 1];
+
+  if (TREE_CODE (oelast->op) != SSA_NAME)
+    return false;
+
+  gimple *glast = SSA_NAME_DEF_STMT (oelast->op);
+
+  if (gimple_code (glast) != GIMPLE_PHI
+      || gimple_phi_num_args (glast) != 2)
+    return false;
+
+  /* If so, check if its first argument is a gimple assign.  */
+  tree phi_arg = gimple_phi_arg_def (glast, 0);
+
+  if (TREE_CODE (phi_arg) != SSA_NAME)
+    return false;
+
+  gimple *garg_def = SSA_NAME_DEF_STMT (phi_arg);
+  if (!is_gimple_assign (garg_def))
+    return false;
+
+  /* It must link back to the beginning of our sequence.  */
+  if (gimple_assign_rhs2 (garg_def) != (*ops)[0]->op)
+    return false;
+
+  /* Make sure everything apart from the PHI has the same rank.  */
+  FOR_EACH_VEC_ELT (*ops, i, oe)
+    {
+      if (i < ops->length () - 1)
+       {
+         if (rank == 0)
+           rank = oe->rank;
+         else
+           if (rank != oe->rank)
+             return false;
+       }
+    }
+
+  return true;
+}
+
 /* Reassociate expressions in basic block BB and its post-dominator as
    children.
 
@@ -7182,6 +7272,8 @@ reassociate_bb (basic_block bb)
                      mult_num = rank_ops_for_fma (&ops);
                    }
 
+                 bool vectorizable_p = likely_vectorizable_p (&ops);
+
                  /* Only rewrite the expression tree to parallel in the
                     last reassoc pass to avoid useless work back-and-forth
                     with initial linearization.  */
@@ -7208,7 +7300,7 @@ reassociate_bb (basic_block bb)
                         binary op are chosen wisely.  */
                      int len = ops.length ();
                      if (len >= 3
-                         && (!has_fma
+                         && ((!has_fma && !vectorizable_p)
                              /* width > 1 means ranking ops results in better
                                 parallelism.  Check current value to avoid
                                 calling get_reassociation_width again.  */
@@ -7346,13 +7438,13 @@ branch_fixup (void)
   reassoc_branch_fixups.release ();
 }
 
-void dump_ops_vector (FILE *file, vec<operand_entry *> ops);
-void debug_ops_vector (vec<operand_entry *> ops);
+void dump_ops_vector (FILE *file, const vec<operand_entry *> &ops);
+void debug_ops_vector (const vec<operand_entry *> &ops);
 
 /* Dump the operand entry vector OPS to FILE.  */
 
 void
-dump_ops_vector (FILE *file, vec<operand_entry *> ops)
+dump_ops_vector (FILE *file, const vec<operand_entry *> &ops)
 {
   operand_entry *oe;
   unsigned int i;
@@ -7368,7 +7460,7 @@ dump_ops_vector (FILE *file, vec<operand_entry *> ops)
 /* Dump the operand entry vector OPS to STDERR.  */
 
 DEBUG_FUNCTION void
-debug_ops_vector (vec<operand_entry *> ops)
+debug_ops_vector (const vec<operand_entry *> &ops)
 {
   dump_ops_vector (stderr, ops);
 }
-- 
2.46.2

Reply via email to