Hi!

As the attached testcase shows, while the current detection of what
shifts are by scalar and what shifts are by vector shift count
may work well for loop vectorizer (vect_internal_def being vector
shift, vect_external_def or vect_constant_def scalar shift),
it is incorrect for SLP, where vect_external_def and vect_constant_def
may be either scalar or vector shift (vect_external_def is e.g.
if def_stmt is in a different bb and either it can be the same
SSA_NAMEs in all shifts, or different, for vect_constant_def it
can be either the same constant in all cases, or different)
and in theory vect_internal_def could be also scalar shift (tried
to test that in fn3's first bb, though currently SLP doesn't attempt
to vectorize that, missed-optimization).

The following patch fixes that.  Bootstrapped/regtested on x86_64-linux
and i686-linux and additionally tested on the testcase with additional
-mxop.  Ok for trunk/4.6?

For better test coverage, perhaps the testcase should be duplicated
after effective target xop and xop_runtime are added and additionally
perhaps the testcase in gcc.dg/vect with -mxop testing what is SLP
vectorized from the dumps would be nice.

2011-04-16  Jakub Jelinek  <ja...@redhat.com>

        PR tree-optimization/48616
        * tree-vect-stmts.c (vectorizable_shift): If SLP, determine
        whether the shift is by scalar or vector based on whether all SLP
        scalar stmts have the same rhs.

        * gcc.dg/pr48616.c: New test.

--- gcc/tree-vect-stmts.c.jj    2011-03-14 14:12:15.000000000 +0100
+++ gcc/tree-vect-stmts.c       2011-04-15 20:44:30.000000000 +0200
@@ -2077,7 +2077,7 @@ vectorizable_shift (gimple stmt, gimple_
   VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
   tree vop0, vop1;
   unsigned int k;
-  bool scalar_shift_arg = false;
+  bool scalar_shift_arg = true;
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
   int vf;
 
@@ -2159,8 +2159,34 @@ vectorizable_shift (gimple stmt, gimple_
   /* Determine whether the shift amount is a vector, or scalar.  If the
      shift/rotate amount is a vector, use the vector/vector shift optabs.  */
 
+  if (dt[1] == vect_internal_def && !slp_node)
+    scalar_shift_arg = false;
+  else if (dt[1] == vect_constant_def
+          || dt[1] == vect_external_def
+          || dt[1] == vect_internal_def)
+    {
+      /* In SLP, need to check whether the shift count is the same,
+        in loops if it is a constant or invariant, it is always
+        a scalar shift.  */
+      if (slp_node)
+       {
+         VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
+         gimple slpstmt;
+
+         FOR_EACH_VEC_ELT (gimple, stmts, k, slpstmt)
+           if (!operand_equal_p (gimple_assign_rhs2 (slpstmt), op1, 0))
+             scalar_shift_arg = false;
+       }
+    }
+  else
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+       fprintf (vect_dump, "operand mode requires invariant argument.");
+      return false;
+    }
+
   /* Vector shifted by vector.  */
-  if (dt[1] == vect_internal_def)
+  if (!scalar_shift_arg)
     {
       optab = optab_for_tree_code (code, vectype, optab_vector);
       if (vect_print_dump_info (REPORT_DETAILS))
@@ -2168,13 +2194,12 @@ vectorizable_shift (gimple stmt, gimple_
     }
   /* See if the machine has a vector shifted by scalar insn and if not
      then see if it has a vector shifted by vector insn.  */
-  else if (dt[1] == vect_constant_def || dt[1] == vect_external_def)
+  else
     {
       optab = optab_for_tree_code (code, vectype, optab_scalar);
       if (optab
           && optab_handler (optab, TYPE_MODE (vectype)) != CODE_FOR_nothing)
         {
-          scalar_shift_arg = true;
           if (vect_print_dump_info (REPORT_DETAILS))
             fprintf (vect_dump, "vector/scalar shift/rotate found.");
         }
@@ -2185,6 +2210,8 @@ vectorizable_shift (gimple stmt, gimple_
                && (optab_handler (optab, TYPE_MODE (vectype))
                       != CODE_FOR_nothing))
             {
+             scalar_shift_arg = false;
+
               if (vect_print_dump_info (REPORT_DETAILS))
                 fprintf (vect_dump, "vector/vector shift/rotate found.");
 
@@ -2197,12 +2224,6 @@ vectorizable_shift (gimple stmt, gimple_
             }
         }
     }
-  else
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "operand mode requires invariant argument.");
-      return false;
-    }
 
   /* Supportable by target?  */
   if (!optab)
--- gcc/testsuite/gcc.dg/pr48616.c.jj   2011-04-15 21:02:27.000000000 +0200
+++ gcc/testsuite/gcc.dg/pr48616.c      2011-04-15 21:00:24.000000000 +0200
@@ -0,0 +1,134 @@
+/* PR tree-optimization/48616 */
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+extern void abort (void);
+int a[4] __attribute__((aligned (32)));
+int b[4] __attribute__((aligned (32)));
+int c[4] __attribute__((aligned (32)));
+int d[4] __attribute__((aligned (32)));
+int e[4] __attribute__((aligned (32)));
+
+__attribute__((noinline, noclone))
+int
+foo (int x)
+{
+  asm ("" : "+r" (x));
+  return x;
+}
+
+__attribute__((noinline, noclone))
+void
+fn1 (int i)
+{
+  a[0] = b[0] << c[0];
+  a[1] = b[1] << c[1];
+  a[2] = b[2] << c[2];
+  a[3] = b[3] << c[3];
+  if (i)
+    {
+      d[0] = e[0] >> c[0];
+      d[1] = e[1] >> c[1];
+      d[2] = e[2] >> c[2];
+      d[3] = e[3] >> c[3];
+    }
+}
+
+__attribute__((noinline, noclone))
+void
+fn2 (int i)
+{
+  a[0] = b[0] << 1;
+  a[1] = b[1] << 2;
+  a[2] = b[2] << 3;
+  a[3] = b[3] << 4;
+  if (i)
+    {
+      d[0] = e[0] >> 1;
+      d[1] = e[1] >> 2;
+      d[2] = e[2] >> 3;
+      d[3] = e[3] >> 4;
+    }
+}
+
+__attribute__((noinline, noclone))
+void
+fn3 (int i, int j)
+{
+  int x = foo (j);
+  a[0] = b[0] << x;
+  a[1] = b[1] << x;
+  a[2] = b[2] << x;
+  a[3] = b[3] << x;
+  if (i)
+    {
+      d[0] = e[0] >> x;
+      d[1] = e[1] >> x;
+      d[2] = e[2] >> x;
+      d[3] = e[3] >> x;
+    }
+}
+
+__attribute__((noinline, noclone))
+void
+fn4 (int i)
+{
+  a[0] = b[0] << 1;
+  a[1] = b[1] << 1;
+  a[2] = b[2] << 1;
+  a[3] = b[3] << 1;
+  if (i)
+    {
+      d[0] = e[0] >> 1;
+      d[1] = e[1] >> 1;
+      d[2] = e[2] >> 1;
+      d[3] = e[3] >> 1;
+    }
+}
+
+int
+main ()
+{
+  int i;
+  int *t;
+  for (i = 0; i < 4; i++)
+    {
+      b[i] = 32;
+      c[i] = i + 1;
+      e[i] = 32;
+    }
+  asm volatile ("" : : "r" (b) : "memory");
+  asm volatile ("" : : "r" (c) : "memory");
+  asm volatile ("" : "=r" (t) : "0" (d) : "memory");
+  fn1 (t != 0);
+  for (i = 0; i < 4; i++)
+    {
+      if (a[i] != (32 << (i + 1)) || d[i] != (32 >> (i + 1)))
+       abort ();
+      a[i] = 0;
+      d[i] = 0;
+    }
+  fn2 (t != 0);
+  for (i = 0; i < 4; i++)
+    {
+      if (a[i] != (32 << (i + 1)) || d[i] != (32 >> (i + 1)))
+       abort ();
+      a[i] = 0;
+      d[i] = 0;
+    }
+  fn3 (t != 0, t != 0);
+  for (i = 0; i < 4; i++)
+    {
+      if (a[i] != (32 << 1) || d[i] != (32 >> 1))
+       abort ();
+      a[i] = 0;
+      d[i] = 0;
+    }
+  fn4 (t != 0);
+  for (i = 0; i < 4; i++)
+    {
+      if (a[i] != (32 << 1) || d[i] != (32 >> 1))
+       abort ();
+    }
+  return 0;
+}

        Jakub

Reply via email to