PR tree-optimization/87914 shows one example (IIRC I've seen others
recently) where we fail to perform outer loop vectorization because we
do a poor job of identifying "safe" nested cycles.  This patch improves
the situation.
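
To make the failure mode concrete, here is a reduced sketch of the loop
nest shape involved (hand-simplified from the testcase below, so the
function name and constants are illustrative only):

  void
  rot_all (unsigned *a)
  {
    for (int i = 0; i < 16; ++i)       /* outer loop: the one we vectorize */
      {
        unsigned x = a[i];
        for (unsigned s = 16; s > 0; s >>= 1)   /* inner loop */
          /* x is carried around the inner backedge, a nested cycle once
             the outer loop is vectorized; s is defined inside the loop
             nest, so the shift amount becomes a vector operand and we
             need the vector/vector shift optabs (hence the new
             vect_var_shift effective target, AVX2 on x86).  */
          x = (x >> s) | (x << s);
        a[i] = x;
      }
  }

Previously vect_is_simple_reduction rejected the inner-loop cycle and
vectorizable_reduction punted on the shifts, so the outer loop was left
unvectorized.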

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

I've also built SPEC CPU 2006 with and without LTO on a Haswell machine.

I do expect fallout since the reduction code is still incredibly 
fragile...

Richard.

From 854d80f1822ae6b37afa865ae49d64ceaee68b26 Mon Sep 17 00:00:00 2001
From: Richard Guenther <rguent...@suse.de>
Date: Wed, 7 Nov 2018 12:19:45 +0100
Subject: [PATCH] fix-pr87914

2018-11-07  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/87914
        * tree-vect-loop.c (vect_is_simple_reduction): Improve detection
        of nested cycles.
        (vectorizable_reduction): Handle shifts and rotates by dispatching
        to vectorizable_shift.
        * tree-vect-stmts.c (vect_get_vec_def_for_operand_1): Handle
        in-loop uses of vect_nested_cycle defs.  Merge cycle and internal
        def cases.
        (vectorizable_shift): Export and handle being called as
        vect_nested_cycle.
        (vect_analyze_stmt): Call vectorizable_shift after
        vectorizable_reduction.
        * tree-vectorizer.h (vectorizable_shift): Declare.

        * lib/target-supports.exp (check_effective_target_vect_var_shift): New.
        (check_avx2_available): Likewise.
        * g++.dg/vect/pr87914.cc: New testcase.

diff --git a/gcc/testsuite/g++.dg/vect/pr87914.cc b/gcc/testsuite/g++.dg/vect/pr87914.cc
new file mode 100644
index 00000000000..12fbba3af2f
--- /dev/null
+++ b/gcc/testsuite/g++.dg/vect/pr87914.cc
@@ -0,0 +1,49 @@
+// { dg-do run }
+// { dg-additional-options "-fopenmp-simd" }
+// { dg-additional-options "-mavx2" { target { avx2_runtime } } }
+
+extern "C" int memcmp(const void *s1, const void *s2, __SIZE_TYPE__ n);
+extern "C" void abort(void);
+
+template <typename T>
+T reverseBits(T x)
+{
+  unsigned int s = sizeof(x) * 8;
+  T mask = ~T(0);
+  while ((s >>= 1) > 0)
+    {
+      mask ^= (mask << s);
+      x = ((x >> s) & mask) | ((x << s) & ~mask); // unsupported use in stmt
+    }
+  return x;
+}
+
+void __attribute__((noinline,noipa))
+test_reverseBits(unsigned* x)
+{
+#pragma omp simd aligned(x:32)
+  for (int i = 0; i < 16; ++i)
+    x[i] = reverseBits(x[i]); // couldn't vectorize loop
+}
+
+int main()
+{
+  unsigned arr[16] __attribute__((aligned(32)))
+    = { 0x01020304, 0x05060708, 0x0a0b0c0d, 0x0e0f1011,
+        0x11121314, 0x45065708, 0xfa0b3c0du, 0x0e0f1211,
+        0x21222324, 0x55066708, 0xfa0b2c0du, 0x1e0f1011,
+        0x31323334, 0x65067708, 0xfa0b5c0du, 0x0e3f1011 };
+  unsigned arr2[16]
+    = { 0x20c04080, 0x10e060a0, 0xb030d050, 0x8808f070u,
+        0x28c84888, 0x10ea60a2, 0xb03cd05f, 0x8848f070u,
+        0x24c44484, 0x10e660aa, 0xb034d05f, 0x8808f078u, 
+        0x2ccc4c8c, 0x10ee60a6, 0xb03ad05f, 0x8808fc70u };
+
+  test_reverseBits (arr);
+
+  if (memcmp (arr, arr2, sizeof (arr)) != 0)
+    abort ();
+  return 0;
+}
+
+// { dg-final { scan-tree-dump "OUTER LOOP VECTORIZED" "vect" { target { vect_var_shift && vect_int } } } }
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 9780e53dfc0..1d5ad9abdca 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -5316,6 +5316,15 @@ proc check_effective_target_vect_shift { } {
                 && [check_effective_target_s390_vx]) }}]
 }
 
+# Return 1 if the target supports hardware vector shifts with a variable
+# (non-constant) shift amount.
+
+proc check_effective_target_vect_var_shift { } {
+    return [check_cached_effective_target_indexed vect_var_shift {
+      expr {(([istarget i?86-*-*] || [istarget x86_64-*-*])
+            && [check_avx2_available])
+      }}]
+}
+
 proc check_effective_target_whole_vector_shift { } {
     if { [istarget i?86-*-*] || [istarget x86_64-*-*]
         || [istarget ia64-*-*]
@@ -7150,6 +7159,19 @@ proc check_avx_available { } {
   return 0;
 }
 
+# Return true if we are compiling for AVX2 target.
+
+proc check_avx2_available { } {
+  if { [check_no_compiler_messages avx2_available assembly {
+    #ifndef __AVX2__
+    #error unsupported
+    #endif
+  } ""] } {
+    return 1;
+  }
+  return 0;
+}
+
 # Return true if we are compiling for SSSE3 target.
 
 proc check_ssse3_available { } {
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 51be405b5a0..e392aab1d52 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2880,6 +2880,11 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
           return NULL;
         }
 
+      /* For inner loop reductions in nested vectorization there are no
+         constraints on the number of uses in the inner loop.  */
+      if (loop == vect_loop->inner)
+       continue;
+
       nloop_uses++;
       if (nloop_uses > 1)
         {
@@ -2938,13 +2943,19 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
       else
        /* We can have more than one loop-closed PHI.  */
        lcphis.safe_push (as_a <gphi *> (use_stmt));
-      if (nloop_uses > 1)
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "reduction used in loop.\n");
-         return NULL;
-       }
+    }
+
+  /* If this isn't a nested cycle or if the nested cycle reduction value
+     is used outside of the inner loop, we cannot handle uses of the
+     reduction value.  */
+  bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
+  if ((!nested_in_vect_loop || !lcphis.is_empty ())
+      && nloop_uses > 1)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "reduction used in loop.\n");
+      return NULL;
     }
 
   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
@@ -3005,9 +3016,15 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
     }
 
   gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
-  bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
   code = orig_code = gimple_assign_rhs_code (def_stmt);
 
+  if (nested_in_vect_loop && !check_reduction)
+    {
+      if (dump_enabled_p ())
+       report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
+      return def_stmt_info;
+    }
+
   /* We can handle "res -= x[i]", which is non-associative by
      simply rewriting this into "res += -x[i]".  Avoid changing
      gimple instruction for the first simple tests and only do this
@@ -6488,6 +6505,19 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   vec_mode = TYPE_MODE (vectype_in);
   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
 
+  if (nested_cycle)
+    {
+      def_bb = gimple_bb (reduc_def_phi);
+      def_stmt_loop = def_bb->loop_father;
+      def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
+                                       loop_preheader_edge (def_stmt_loop));
+      stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
+      if (def_arg_stmt_info
+         && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
+             == vect_double_reduction_def))
+        double_reduc = true;
+    }
+
   if (code == COND_EXPR)
     {
       /* Only call during the analysis stage, otherwise we'll lose
@@ -6502,20 +6532,26 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
          return false;
         }
     }
-  else
+  else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
+          || code == LROTATE_EXPR || code == RROTATE_EXPR)
     {
-      /* 4. Supportable by target?  */
-
-      if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
-         || code == LROTATE_EXPR || code == RROTATE_EXPR)
+      /* Only call during the analysis stage, otherwise we'll lose
+        STMT_VINFO_TYPE.  We only support this for nested cycles
+        without double reductions at the moment.  */
+      if (!nested_cycle
+         || double_reduc
+         || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
+                                               NULL, cost_vec)))
        {
-         /* Shifts and rotates are only supported by vectorizable_shifts,
-            not vectorizable_reduction.  */
           if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "unsupported shift or rotation.\n");
+                            "unsupported shift or rotation in reduction\n");
          return false;
        }
+    }
+  else
+    {
+      /* 4. Supportable by target?  */
 
       /* 4.1. check support for the operation in the loop  */
       optab = optab_for_tree_code (code, vectype_in, optab_default);
@@ -6620,19 +6656,6 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
        orig_code = cond_reduc_op_code;
     }
 
-  if (nested_cycle)
-    {
-      def_bb = gimple_bb (reduc_def_phi);
-      def_stmt_loop = def_bb->loop_father;
-      def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
-                                       loop_preheader_edge (def_stmt_loop));
-      stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
-      if (def_arg_stmt_info
-         && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
-             == vect_double_reduction_def))
-        double_reduc = true;
-    }
-
   reduc_fn = IFN_LAST;
 
   if (reduction_type == TREE_CODE_REDUCTION
@@ -7003,6 +7026,12 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
           /* Multiple types are not supported for condition.  */
           break;
         }
+      if (code == LSHIFT_EXPR
+         || code == RSHIFT_EXPR)
+       {
+         vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
+         break;
+       }
 
       /* Handle uses.  */
       if (j == 0)
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 7127c17c788..8133149b2dc 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1461,6 +1461,16 @@ vect_get_vec_def_for_operand_1 (stmt_vec_info def_stmt_info,
       /* Code should use vect_get_vec_def_for_operand.  */
       gcc_unreachable ();
 
+    /* Operand is defined by a loop header phi.  In case of nested
+       cycles we may also have uses of the backedge def.  */
+    case vect_reduction_def:
+    case vect_double_reduction_def:
+    case vect_nested_cycle:
+    case vect_induction_def:
+      gcc_assert (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
+                 || dt == vect_nested_cycle);
+      /* Fallthru.  */
+
     /* operand is defined inside the loop.  */
     case vect_internal_def:
       {
@@ -1480,23 +1490,6 @@ vect_get_vec_def_for_operand_1 (stmt_vec_info def_stmt_info,
        return vec_oprnd;
       }
 
-    /* operand is defined by a loop header phi.  */
-    case vect_reduction_def:
-    case vect_double_reduction_def:
-    case vect_nested_cycle:
-    case vect_induction_def:
-      {
-       gcc_assert (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI);
-
-       /* Get the def from the vectorized stmt.  */
-       vec_stmt_info = STMT_VINFO_VEC_STMT (def_stmt_info);
-       if (gphi *phi = dyn_cast <gphi *> (vec_stmt_info->stmt))
-         vec_oprnd = PHI_RESULT (phi);
-       else
-         vec_oprnd = gimple_get_lhs (vec_stmt_info->stmt);
-       return vec_oprnd;
-      }
-
     default:
       gcc_unreachable ();
     }
@@ -5363,7 +5356,7 @@ vect_supportable_shift (enum tree_code code, tree scalar_type)
    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
    Return true if STMT_INFO is vectorizable in this way.  */
 
-static bool
+bool
 vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                    stmt_vec_info *vec_stmt, slp_tree slp_node,
                    stmt_vector_for_cost *cost_vec)
@@ -5401,6 +5394,7 @@ vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
     return false;
 
   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
+      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle
       && ! vec_stmt)
     return false;
 
@@ -5480,7 +5474,8 @@ vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
      shift/rotate amount is a vector, use the vector/vector shift optabs.  */
 
   if ((dt[1] == vect_internal_def
-       || dt[1] == vect_induction_def)
+       || dt[1] == vect_induction_def
+       || dt[1] == vect_nested_cycle)
       && !slp_node)
     scalar_shift_arg = false;
   else if (dt[1] == vect_constant_def
@@ -9540,7 +9535,6 @@ vect_analyze_stmt (stmt_vec_info stmt_info, bool *need_to_vectorize,
          || vectorizable_simd_clone_call (stmt_info, NULL, NULL, node,
                                           cost_vec)
          || vectorizable_conversion (stmt_info, NULL, NULL, node, cost_vec)
-         || vectorizable_shift (stmt_info, NULL, NULL, node, cost_vec)
          || vectorizable_operation (stmt_info, NULL, NULL, node, cost_vec)
          || vectorizable_assignment (stmt_info, NULL, NULL, node, cost_vec)
          || vectorizable_load (stmt_info, NULL, NULL, node, node_instance,
@@ -9549,6 +9543,7 @@ vect_analyze_stmt (stmt_vec_info stmt_info, bool *need_to_vectorize,
          || vectorizable_reduction (stmt_info, NULL, NULL, node,
                                     node_instance, cost_vec)
          || vectorizable_induction (stmt_info, NULL, NULL, node, cost_vec)
+         || vectorizable_shift (stmt_info, NULL, NULL, node, cost_vec)
          || vectorizable_condition (stmt_info, NULL, NULL, NULL, 0, node,
                                     cost_vec)
          || vectorizable_comparison (stmt_info, NULL, NULL, NULL, node,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 1434eeaf270..72a12aea8f3 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1487,6 +1487,9 @@ vect_analyze_stmt (stmt_vec_info, bool *, slp_tree,
 extern bool vectorizable_condition (stmt_vec_info, gimple_stmt_iterator *,
                                    stmt_vec_info *, tree, int, slp_tree,
                                    stmt_vector_for_cost *);
+extern bool vectorizable_shift (stmt_vec_info, gimple_stmt_iterator *,
+                               stmt_vec_info *, slp_tree,
+                               stmt_vector_for_cost *);
 extern void vect_get_load_cost (stmt_vec_info, int, bool,
                                unsigned int *, unsigned int *,
                                stmt_vector_for_cost *,