The following changes the gate to perform vectorization of BB reductions
to use needs_fold_left_reduction_p which in turn requires handling
TYPE_OVERFLOW_UNDEFINED types in the epilogue code generation by
promoting any operations generated there to use unsigned arithmetic.

The following does this, there's currently only v16qi where x86
supports a .REDUC_PLUS reduction for integral modes so I had to
add a x86 specific testcase using GIMPLE IL.

Bootstrap and regtest ongoing on x86_64-unknown-linux-gnu.

The next plan is to remove the restriction to .REDUC_PLUS, factoring
out some of the general non-ifn way of doing a reduction epilog
from loop reduction handling.  I had a stab at doing in-order
reductions already but then those are really too similar to
having general SLP discovery from N scalar defs (and then replacing
those with extracts), at least since there's no
fold_left_plus that doesn't add to an existing scalar I can't
seem to easily just handle that case, possibly discovering
{ x_0, x_1, ..., x_n-1 }, extracting x_0, shifting the vector
to { x_1, ..., x_n-1, <undef> } and using mask_fold_left_plus
with accumulating to x_0 and the <undef> element masked would do.
But I'm not sure that's worth the trouble?

In principle with generic N scalar defs we could do a forward
discovery from grouped loads and see where that goes (and of
course handle in-order reductions that way).

        * tree-vect-slp.cc (vect_slp_check_for_roots): Use
        !needs_fold_left_reduction_p to decide whether we can
        handle the reduction with association.
        (vectorize_slp_instance_root_stmt): For TYPE_OVERFLOW_UNDEFINED
        reductions perform all arithmetic in an unsigned type.

        * gcc.target/i386/vect-reduc-2.c: New testcase.
---
 gcc/testsuite/gcc.target/i386/vect-reduc-2.c | 77 ++++++++++++++++++++
 gcc/tree-vect-slp.cc                         | 44 +++++++----
 2 files changed, 107 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-reduc-2.c

diff --git a/gcc/testsuite/gcc.target/i386/vect-reduc-2.c 
b/gcc/testsuite/gcc.target/i386/vect-reduc-2.c
new file mode 100644
index 00000000000..62559ef8e7b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-reduc-2.c
@@ -0,0 +1,77 @@
+/* { dg-do compile } */
+/* { dg-options "-fgimple -O2 -msse2 -fdump-tree-slp2-optimized" } */
+
+signed char x[16];
+
+signed char __GIMPLE (ssa,guessed_local(1073741824))
+foo ()
+{
+  signed char _1;
+  signed char _3;
+  signed char _5;
+  signed char _6;
+  signed char _8;
+  signed char _9;
+  signed char _11;
+  signed char _12;
+  signed char _14;
+  signed char _15;
+  signed char _17;
+  signed char _18;
+  signed char _20;
+  signed char _21;
+  signed char _23;
+  signed char _24;
+  signed char _26;
+  signed char _27;
+  signed char _29;
+  signed char _30;
+  signed char _32;
+  signed char _33;
+  signed char _35;
+  signed char _36;
+  signed char _38;
+  signed char _39;
+  signed char _41;
+  signed char _42;
+  signed char _44;
+  signed char _45;
+  signed char _47;
+
+  __BB(2,guessed_local(1073741824)):
+  _1 = x[15];
+  _3 = x[1];
+  _5 = _1 + _3;
+  _6 = x[2];
+  _8 = _5 + _6;
+  _9 = x[3];
+  _11 = _8 + _9;
+  _12 = x[4];
+  _14 = _11 + _12;
+  _15 = x[5];
+  _17 = _14 + _15;
+  _18 = x[6];
+  _20 = _17 + _18;
+  _21 = x[7];
+  _23 = _20 + _21;
+  _24 = x[8];
+  _26 = _23 + _24;
+  _27 = x[9];
+  _29 = _26 + _27;
+  _30 = x[10];
+  _32 = _29 + _30;
+  _33 = x[11];
+  _35 = _32 + _33;
+  _36 = x[12];
+  _38 = _35 + _36;
+  _39 = x[13];
+  _41 = _38 + _39;
+  _42 = x[14];
+  _44 = _41 + _42;
+  _45 = x[0];
+  _47 = _44 + _45;
+  return _47;
+
+}
+
+/* { dg-final { scan-tree-dump "optimized: basic block part vectorized" "slp2" 
} } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 7020bd9fa0e..07d68f2052b 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -7217,13 +7217,10 @@ vect_slp_check_for_roots (bb_vec_info bb_vinfo)
        }
       else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
               && (associative_tree_code (code) || code == MINUS_EXPR)
-              /* ???  The flag_associative_math and TYPE_OVERFLOW_WRAPS
-                 checks pessimize a two-element reduction.  PR54400.
+              /* ???  This pessimizes a two-element reduction.  PR54400.
                  ???  In-order reduction could be handled if we only
                  traverse one operand chain in vect_slp_linearize_chain.  */
-              && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
-                  || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
-                      && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
+              && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
               /* Ops with constants at the tail can be stripped here.  */
               && TREE_CODE (rhs) == SSA_NAME
               && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
@@ -9161,9 +9158,23 @@ vectorize_slp_instance_root_stmt (slp_tree node, 
slp_instance instance)
       /* We may end up with more than one vector result, reduce them
         to one vector.  */
       tree vec_def = vec_defs[0];
+      tree vectype = TREE_TYPE (vec_def);
+      tree compute_vectype = vectype;
+      if (TYPE_OVERFLOW_UNDEFINED (vectype))
+       {
+         compute_vectype = unsigned_type_for (vectype);
+         vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
+                                 compute_vectype, vec_def);
+       }
       for (unsigned i = 1; i < vec_defs.length (); ++i)
-       vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
-                               vec_def, vec_defs[i]);
+       {
+         tree def = vec_defs[i];
+         if (TYPE_OVERFLOW_UNDEFINED (vectype))
+           def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
+                               compute_vectype, def);
+         vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
+                                 vec_def, def);
+       }
       vec_defs.release ();
       /* ???  Support other schemes than direct internal fn.  */
       internal_fn reduc_fn;
@@ -9171,21 +9182,26 @@ vectorize_slp_instance_root_stmt (slp_tree node, 
slp_instance instance)
          || reduc_fn == IFN_LAST)
        gcc_unreachable ();
       tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
-                                     TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
+                                     TREE_TYPE (compute_vectype), vec_def);
       if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
        {
          tree rem_def = NULL_TREE;
          for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
-           if (!rem_def)
-             rem_def = def;
-           else
-             rem_def = gimple_build (&epilogue, reduc_code,
-                                     TREE_TYPE (scalar_def),
-                                     rem_def, def);
+           {
+             def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
+             if (!rem_def)
+               rem_def = def;
+             else
+               rem_def = gimple_build (&epilogue, reduc_code,
+                                       TREE_TYPE (scalar_def),
+                                       rem_def, def);
+           }
          scalar_def = gimple_build (&epilogue, reduc_code,
                                     TREE_TYPE (scalar_def),
                                     scalar_def, rem_def);
        }
+      scalar_def = gimple_convert (&epilogue,
+                                  TREE_TYPE (vectype), scalar_def);
       gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
       gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
       gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
-- 
2.35.3

Reply via email to