The following fixes a vectorization issue that appears when trying
to vectorize the bwaves mat_times_vec kernel after interchange was
performed by the interchange pass.  That interchange inserts the
following code for the former reduction created by LIM store-motion:

  <bb 13> [local count: 161061274]:
  # m_58 = PHI <1(10), m_84(20)>
...
  <bb 14> [local count: 912680551]:
  # l_35 = PHI <1(13), l_57(21)>
...
  y__I_lsm.113_140 = *y_139(D)[_31];
  y__I_lsm.113_94 = m_58 != 1 ? y__I_lsm.113_140 : 0.0;
...
  *y_139(D)[_31] = _101;


so we have a COND_EXPR with a test on an integer IV m_58 with
double values.  Note that the m_58 != 1 condition is invariant
in the l loop.

Currently we vectorize this condition using V8SImode vectors
causing a vectorization factor of 8 and thus forcing the scalar
path for the bwaves case (the loops have an iteration count of 5).

The following patch makes the vectorizer handle invariant conditions
in the first place and second handle widening of operands of invariant
conditions transparently (the promotion will happen on the invariant
scalars).  This makes it possible to use a vectorization factor of 4,
reducing the bwaves runtime from 208s before interchange
(via 190s after interchange) to 172s after interchange and vectorization
with AVX256 (on a Haswell machine).

For the vectorizable_condition part to work I need to avoid
pulling apart the condition from the COND_EXPR during pattern
detection.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

Richard.

2017-12-06  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/81303
        * tree-vect-stmts.c (vect_is_simple_cond): For invariant
        conditions try to create a comparison vector type matching
        the data vector type.
        (vectorizable_condition): Adjust.
        * tree-vect-patterns.c (vect_recog_mask_conversion_pattern):
        Leave invariant conditions alone in case we can vectorize those.

        * gcc.target/i386/vectorize9.c: New testcase.
        * gcc.target/i386/vectorize10.c: New testcase.

Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c       (revision 255438)
+++ gcc/tree-vect-stmts.c       (working copy)
@@ -7792,7 +7792,8 @@ vectorizable_load (gimple *stmt, gimple_
 
 static bool
 vect_is_simple_cond (tree cond, vec_info *vinfo,
-                    tree *comp_vectype, enum vect_def_type *dts)
+                    tree *comp_vectype, enum vect_def_type *dts,
+                    tree vectype)
 {
   tree lhs, rhs;
   tree vectype1 = NULL_TREE, vectype2 = NULL_TREE;
@@ -7845,6 +7846,20 @@ vect_is_simple_cond (tree cond, vec_info
     return false;
 
   *comp_vectype = vectype1 ? vectype1 : vectype2;
+  /* Invariant comparison.  */
+  if (! *comp_vectype)
+    {
+      tree scalar_type = TREE_TYPE (lhs);
+      /* If we can widen the comparison to match vectype do so.  */
+      if (INTEGRAL_TYPE_P (scalar_type)
+         && tree_int_cst_lt (TYPE_SIZE (scalar_type),
+                             TYPE_SIZE (TREE_TYPE (vectype))))
+       scalar_type = build_nonstandard_integer_type
+         (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))),
+          TYPE_UNSIGNED (scalar_type));
+      *comp_vectype = get_vectype_for_scalar_type (scalar_type);
+    }
+
   return true;
 }
 
@@ -7942,7 +7957,7 @@ vectorizable_condition (gimple *stmt, gi
   else_clause = gimple_assign_rhs3 (stmt);
 
   if (!vect_is_simple_cond (cond_expr, stmt_info->vinfo,
-                           &comp_vectype, &dts[0])
+                           &comp_vectype, &dts[0], vectype)
       || !comp_vectype)
     return false;
 
Index: gcc/tree-vect-patterns.c
===================================================================
--- gcc/tree-vect-patterns.c    (revision 255438)
+++ gcc/tree-vect-patterns.c    (working copy)
@@ -3976,6 +3976,32 @@ vect_recog_mask_conversion_pattern (vec<
          || TYPE_VECTOR_SUBPARTS (vectype1) == TYPE_VECTOR_SUBPARTS (vectype2))
        return NULL;
 
+      /* If rhs1 is invariant and we can promote it leave the COND_EXPR
+         in place, we can handle it in vectorizable_condition.  This avoids
+        unnecessary promotion stmts and increased vectorization factor.  */
+      if (COMPARISON_CLASS_P (rhs1)
+         && INTEGRAL_TYPE_P (rhs1_type)
+         && TYPE_VECTOR_SUBPARTS (vectype1) < TYPE_VECTOR_SUBPARTS (vectype2))
+       {
+         gimple *dummy;
+         enum vect_def_type dt;
+         if (vect_is_simple_use (TREE_OPERAND (rhs1, 0), stmt_vinfo->vinfo,
+                                 &dummy, &dt)
+             && dt == vect_external_def
+             && vect_is_simple_use (TREE_OPERAND (rhs1, 1), stmt_vinfo->vinfo,
+                                    &dummy, &dt)
+             && (dt == vect_external_def
+                 || dt == vect_constant_def))
+           {
+             tree wide_scalar_type = build_nonstandard_integer_type
+               (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype1))),
+                TYPE_UNSIGNED (rhs1_type));
+             tree vectype3 = get_vectype_for_scalar_type (wide_scalar_type);
+             if (expand_vec_cond_expr_p (vectype1, vectype3, TREE_CODE (rhs1)))
+               return NULL;
+           }
+       }
+
       /* If rhs1 is a comparison we need to move it into a
         separate statement.  */
       if (TREE_CODE (rhs1) != SSA_NAME)
Index: gcc/testsuite/gcc.target/i386/vectorize9.c
===================================================================
--- gcc/testsuite/gcc.target/i386/vectorize9.c  (nonexistent)
+++ gcc/testsuite/gcc.target/i386/vectorize9.c  (working copy)
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details 
-fno-tree-loop-im -mavx2 -mprefer-vector-width=256" } */
+
+double x[1024][1024], red[1024];
+void foo (void)
+{
+  for (int i = 0; i < 1024; ++i)
+    for (int j = 0; j < 1024; ++j)
+      {
+       double v = i == 0 ? 0.0 : red[j];
+       v = v + x[i][j];
+       red[j] = v;
+      }
+}
+
+/* { dg-final { scan-tree-dump "vectorization factor = 4" "vect" } } */
Index: gcc/testsuite/gcc.target/i386/vectorize10.c
===================================================================
--- gcc/testsuite/gcc.target/i386/vectorize10.c (nonexistent)
+++ gcc/testsuite/gcc.target/i386/vectorize10.c (working copy)
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O -ftree-loop-vectorize -fdump-tree-vect-details 
-fno-tree-loop-im -msse2 -mno-avx" } */
+
+double x[1024][1024], red[1024];
+void foo (void)
+{
+  for (int i = 0; i < 1024; ++i)
+    for (int j = 0; j < 1024; ++j)
+      {
+       double v = i == 0 ? 0.0 : red[j];
+       v = v + x[i][j];
+       red[j] = v;
+      }
+}
+
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */

Reply via email to