[PATCH] Fix SLP vectorization of shifts (PR tree-optimization/48616)

Jakub Jelinek Fri, 15 Apr 2011 23:16:58 -0700

Hi!

As the attached testcase shows, while the current detection of what
shifts are by scalar and what shifts are by vector shift count
may work well for loop vectorizer (vect_internal_def being vector
shift, vect_external_def or vect_constant_def scalar shift),
it is incorrect for SLP, where vect_external_def and vect_constant_def
may be either scalar or vector shift (vect_external_def is e.g.
if def_stmt is in a different bb and either it can be the same
SSA_NAMEs in all shifts, or different, for vect_constant_def it
can be either the same constant in all cases, or different)
and in theory vect_internal_def could be also scalar shift (tried
to test that in fn3's first bb, though currently SLP doesn't attempt
to vectorize that, missed-optimization).


The following patch fixes that.  Bootstrapped/regtested on x86_64-linux
and i686-linux and additionally tested on the testcase with additional
-mxop.  Ok for trunk/4.6?

For better test coverage, perhaps the testcase should be duplicated
after effective target xop and xop_runtime are added and additionally
perhaps the testcase in gcc.dg/vect with -mxop testing what is SLP
vectorized from the dumps would be nice.

2011-04-16  Jakub Jelinek  <[email protected]>

        PR tree-optimization/48616
        * tree-vect-stmts.c (vectorizable_shift): If SLP, determine
        whether the shift is by scalar or vector based on whether all SLP
        scalar stmts have the same rhs.

        * gcc.dg/pr48616.c: New test.

--- gcc/tree-vect-stmts.c.jj    2011-03-14 14:12:15.000000000 +0100
+++ gcc/tree-vect-stmts.c       2011-04-15 20:44:30.000000000 +0200
@@ -2077,7 +2077,7 @@ vectorizable_shift (gimple stmt, gimple_
   VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
   tree vop0, vop1;
   unsigned int k;
-  bool scalar_shift_arg = false;
+  bool scalar_shift_arg = true;
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
   int vf;
 
@@ -2159,8 +2159,34 @@ vectorizable_shift (gimple stmt, gimple_
   /* Determine whether the shift amount is a vector, or scalar.  If the
      shift/rotate amount is a vector, use the vector/vector shift optabs.  */
 
+  if (dt[1] == vect_internal_def && !slp_node)
+    scalar_shift_arg = false;
+  else if (dt[1] == vect_constant_def
+          || dt[1] == vect_external_def
+          || dt[1] == vect_internal_def)
+    {
+      /* In SLP, need to check whether the shift count is the same,
+        in loops if it is a constant or invariant, it is always
+        a scalar shift.  */
+      if (slp_node)
+       {
+         VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
+         gimple slpstmt;
+
+         FOR_EACH_VEC_ELT (gimple, stmts, k, slpstmt)
+           if (!operand_equal_p (gimple_assign_rhs2 (slpstmt), op1, 0))
+             scalar_shift_arg = false;
+       }
+    }
+  else
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+       fprintf (vect_dump, "operand mode requires invariant argument.");
+      return false;
+    }
+
   /* Vector shifted by vector.  */
-  if (dt[1] == vect_internal_def)
+  if (!scalar_shift_arg)
     {
       optab = optab_for_tree_code (code, vectype, optab_vector);
       if (vect_print_dump_info (REPORT_DETAILS))
@@ -2168,13 +2194,12 @@ vectorizable_shift (gimple stmt, gimple_
     }
   /* See if the machine has a vector shifted by scalar insn and if not
      then see if it has a vector shifted by vector insn.  */
-  else if (dt[1] == vect_constant_def || dt[1] == vect_external_def)
+  else
     {
       optab = optab_for_tree_code (code, vectype, optab_scalar);
       if (optab
           && optab_handler (optab, TYPE_MODE (vectype)) != CODE_FOR_nothing)
         {
-          scalar_shift_arg = true;
           if (vect_print_dump_info (REPORT_DETAILS))
             fprintf (vect_dump, "vector/scalar shift/rotate found.");
         }
@@ -2185,6 +2210,8 @@ vectorizable_shift (gimple stmt, gimple_
                && (optab_handler (optab, TYPE_MODE (vectype))
                       != CODE_FOR_nothing))
             {
+             scalar_shift_arg = false;
+
               if (vect_print_dump_info (REPORT_DETAILS))
                 fprintf (vect_dump, "vector/vector shift/rotate found.");
 
@@ -2197,12 +2224,6 @@ vectorizable_shift (gimple stmt, gimple_
             }
         }
     }
-  else
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "operand mode requires invariant argument.");
-      return false;
-    }
 
   /* Supportable by target?  */
   if (!optab)
--- gcc/testsuite/gcc.dg/pr48616.c.jj   2011-04-15 21:02:27.000000000 +0200
+++ gcc/testsuite/gcc.dg/pr48616.c      2011-04-15 21:00:24.000000000 +0200
@@ -0,0 +1,134 @@
+/* PR tree-optimization/48616 */
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+extern void abort (void);
+int a[4] __attribute__((aligned (32)));
+int b[4] __attribute__((aligned (32)));
+int c[4] __attribute__((aligned (32)));
+int d[4] __attribute__((aligned (32)));
+int e[4] __attribute__((aligned (32)));
+
+__attribute__((noinline, noclone))
+int
+foo (int x)
+{
+  asm ("" : "+r" (x));
+  return x;
+}
+
+__attribute__((noinline, noclone))
+void
+fn1 (int i)
+{
+  a[0] = b[0] << c[0];
+  a[1] = b[1] << c[1];
+  a[2] = b[2] << c[2];
+  a[3] = b[3] << c[3];
+  if (i)
+    {
+      d[0] = e[0] >> c[0];
+      d[1] = e[1] >> c[1];
+      d[2] = e[2] >> c[2];
+      d[3] = e[3] >> c[3];
+    }
+}
+
+__attribute__((noinline, noclone))
+void
+fn2 (int i)
+{
+  a[0] = b[0] << 1;
+  a[1] = b[1] << 2;
+  a[2] = b[2] << 3;
+  a[3] = b[3] << 4;
+  if (i)
+    {
+      d[0] = e[0] >> 1;
+      d[1] = e[1] >> 2;
+      d[2] = e[2] >> 3;
+      d[3] = e[3] >> 4;
+    }
+}
+
+__attribute__((noinline, noclone))
+void
+fn3 (int i, int j)
+{
+  int x = foo (j);
+  a[0] = b[0] << x;
+  a[1] = b[1] << x;
+  a[2] = b[2] << x;
+  a[3] = b[3] << x;
+  if (i)
+    {
+      d[0] = e[0] >> x;
+      d[1] = e[1] >> x;
+      d[2] = e[2] >> x;
+      d[3] = e[3] >> x;
+    }
+}
+
+__attribute__((noinline, noclone))
+void
+fn4 (int i)
+{
+  a[0] = b[0] << 1;
+  a[1] = b[1] << 1;
+  a[2] = b[2] << 1;
+  a[3] = b[3] << 1;
+  if (i)
+    {
+      d[0] = e[0] >> 1;
+      d[1] = e[1] >> 1;
+      d[2] = e[2] >> 1;
+      d[3] = e[3] >> 1;
+    }
+}
+
+int
+main ()
+{
+  int i;
+  int *t;
+  for (i = 0; i < 4; i++)
+    {
+      b[i] = 32;
+      c[i] = i + 1;
+      e[i] = 32;
+    }
+  asm volatile ("" : : "r" (b) : "memory");
+  asm volatile ("" : : "r" (c) : "memory");
+  asm volatile ("" : "=r" (t) : "0" (d) : "memory");
+  fn1 (t != 0);
+  for (i = 0; i < 4; i++)
+    {
+      if (a[i] != (32 << (i + 1)) || d[i] != (32 >> (i + 1)))
+       abort ();
+      a[i] = 0;
+      d[i] = 0;
+    }
+  fn2 (t != 0);
+  for (i = 0; i < 4; i++)
+    {
+      if (a[i] != (32 << (i + 1)) || d[i] != (32 >> (i + 1)))
+       abort ();
+      a[i] = 0;
+      d[i] = 0;
+    }
+  fn3 (t != 0, t != 0);
+  for (i = 0; i < 4; i++)
+    {
+      if (a[i] != (32 << 1) || d[i] != (32 >> 1))
+       abort ();
+      a[i] = 0;
+      d[i] = 0;
+    }
+  fn4 (t != 0);
+  for (i = 0; i < 4; i++)
+    {
+      if (a[i] != (32 << 1) || d[i] != (32 >> 1))
+       abort ();
+    }
+  return 0;
+}

        Jakub

[PATCH] Fix SLP vectorization of shifts (PR tree-optimization/48616)

Reply via email to