The following adds the ability to discover a reduction chain on a
series of statements that invoke undefined behavior on integer overflow.
This inhibits the reassoc pass from associating stmts in the way
naturally leading to a reduction chain. The common mistake on the
source side is to rely on the += operator to sum multiple inputs.
After the refactoring of how we handle reduction chains we can
easily use vect_slp_linearize_chain to do this our selves and
rely on the vectorizer punning operations to unsigned given reduction
vectorization always associates.
Bootstrapped and tested on x86_64-unknown-linux-gnu, will push shortly.
Richard.
PR tree-optimization/120687
* tree-vect-slp.cc (vect_analyze_slp_reduc_chain): When
there's no natural reduction chain see if vect_slp_linearize_chain
can recover one and built the SLP instance manually in that
case.
(vect_schedule_slp): Deal with NULL lanes when looking for
stores to remove.
* gcc.dg/vect/vect-reduc-chain-4.c: New testcase.
---
.../gcc.dg/vect/vect-reduc-chain-4.c | 31 ++++
gcc/tree-vect-loop.cc | 4 +
gcc/tree-vect-slp.cc | 138 +++++++++++++++++-
3 files changed, 171 insertions(+), 2 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-chain-4.c
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-4.c
b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-4.c
new file mode 100644
index 00000000000..d79676d8d82
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-4.c
@@ -0,0 +1,31 @@
+#include "tree-vect.h"
+
+int q[32];
+
+int __attribute__((noipa))
+foo ()
+{
+ int res = 0;
+ for (int i = 0; i < 8; ++i)
+ res += q[4*i] + q[4*i+1] + q[4*i+2] + q[4*i+3];
+ return res;
+}
+
+int main()
+{
+ check_vect ();
+
+ int sum = 0;
+#pragma GCC novector
+ for (int i = 0; i < 32; ++i)
+ {
+ q[i] = i;
+ sum += i;
+ }
+
+ if (foo () != sum)
+ abort ();
+}
+
+/* { dg-final { scan-tree-dump "vectorizing a reduction chain" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 455205069f6..be684a529db 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8220,6 +8220,10 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
/* Leave the scalar phi in place. */
return true;
+ if (reduc_info && reduc_info->is_reduc_chain && dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "vectorizing a reduction chain\n");
+
vec_num = vect_get_num_copies (loop_vinfo, slp_node);
/* Check whether we should use a single PHI node and accumulate
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 13a2995592b..53fe643be2c 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -4230,9 +4230,142 @@ vect_analyze_slp_reduc_chain (loop_vec_info vinfo,
first = false;
}
while (!is_a <gphi *> (STMT_VINFO_STMT (next_stmt)));
- if (fail || scalar_stmts.length () <= 1)
+ if (fail)
return false;
+ if (scalar_stmts.length () == 1
+ && code.is_tree_code ()
+ && associative_tree_code ((tree_code)code))
+ {
+ auto_vec<chain_op_t> chain;
+ auto_vec<std::pair<tree_code, gimple *> > worklist;
+ gimple *op_stmt = NULL, *other_op_stmt = NULL;
+ vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
+ scalar_stmts[0]->stmt, op_stmt, other_op_stmt,
+ NULL);
+ if (chain.length () < 3)
+ {
+ scalar_stmts.release ();
+ return false;
+ }
+
+ scalar_stmts.truncate (0);
+ stmt_vec_info tail = NULL;
+ for (auto el : chain)
+ {
+ if (el.dt == vect_external_def
+ || el.dt == vect_constant_def
+ || el.code != (tree_code) code)
+ {
+ scalar_stmts.release ();
+ return false;
+ }
+ stmt_vec_info stmt = vinfo->lookup_def (el.op);
+ if (STMT_VINFO_REDUC_IDX (stmt) != -1
+ || STMT_VINFO_REDUC_DEF (stmt))
+ {
+ gcc_assert (tail == NULL);
+ tail = stmt;
+ continue;
+ }
+ scalar_stmts.safe_push (stmt);
+ }
+ gcc_assert (tail);
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Starting SLP discovery of reduction chain for\n");
+ for (unsigned i = 0; i < scalar_stmts.length (); ++i)
+ dump_printf_loc (MSG_NOTE, vect_location,
+ " %G", scalar_stmts[i]->stmt);
+ }
+
+ unsigned int group_size = scalar_stmts.length ();
+ bool *matches = XALLOCAVEC (bool, group_size);
+ poly_uint64 max_nunits = 1;
+ unsigned tree_size = 0;
+ slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
+ &max_nunits, matches, limit,
+ &tree_size, bst_map);
+ if (!node)
+ {
+ scalar_stmts.release ();
+ return false;
+ }
+
+ unsigned cycle_id = vinfo->reduc_infos.length ();
+ vect_reduc_info reduc_info = new vect_reduc_info_s ();
+ vinfo->reduc_infos.safe_push (reduc_info);
+ VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (next_stmt);
+ VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (next_stmt);
+ VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (next_stmt);
+ VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
+ reduc_info->is_reduc_chain = true;
+
+ /* Build the node for the PHI and possibly the conversion(s?). */
+ slp_tree phis = vect_create_new_slp_node (2, ERROR_MARK);
+ SLP_TREE_REPRESENTATIVE (phis) = next_stmt;
+ phis->cycle_info.id = cycle_id;
+ SLP_TREE_LANES (phis) = group_size;
+ SLP_TREE_VECTYPE (phis) = SLP_TREE_VECTYPE (node);
+ /* ??? vect_cse_slp_nodes cannot cope with cycles without any
+ SLP_TREE_SCALAR_STMTS. */
+ SLP_TREE_SCALAR_STMTS (phis).create (group_size);
+ for (unsigned i = 0; i < group_size; ++i)
+ SLP_TREE_SCALAR_STMTS (phis).quick_push (next_stmt);
+
+ slp_tree reduc = vect_create_new_slp_node (2, ERROR_MARK);
+ SLP_TREE_REPRESENTATIVE (reduc) = scalar_stmt;
+ SLP_TREE_CHILDREN (reduc).quick_push (phis);
+ SLP_TREE_CHILDREN (reduc).quick_push (node);
+ reduc->cycle_info.id = cycle_id;
+ SLP_TREE_REDUC_IDX (reduc) = 0;
+ SLP_TREE_LANES (reduc) = group_size;
+ SLP_TREE_VECTYPE (reduc) = SLP_TREE_VECTYPE (node);
+ /* ??? For the reduction epilogue we need a live lane. */
+ SLP_TREE_SCALAR_STMTS (reduc).create (group_size);
+ SLP_TREE_SCALAR_STMTS (reduc).quick_push (scalar_stmt);
+ for (unsigned i = 1; i < group_size; ++i)
+ SLP_TREE_SCALAR_STMTS (reduc).quick_push (NULL);
+
+ edge le = loop_latch_edge (LOOP_VINFO_LOOP (vinfo));
+ SLP_TREE_CHILDREN (phis).quick_push (NULL);
+ SLP_TREE_CHILDREN (phis).quick_push (NULL);
+ SLP_TREE_CHILDREN (phis)[le->dest_idx] = reduc;
+ SLP_TREE_REF_COUNT (reduc)++;
+
+ /* Create a new SLP instance. */
+ slp_instance new_instance = XNEW (class _slp_instance);
+ SLP_INSTANCE_TREE (new_instance) = reduc;
+ SLP_INSTANCE_LOADS (new_instance) = vNULL;
+ SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
+ SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
+ SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
+ new_instance->reduc_phis = NULL;
+ new_instance->cost_vec = vNULL;
+ new_instance->subgraph_entries = vNULL;
+
+ vinfo->slp_instances.safe_push (new_instance);
+
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Final SLP tree for instance %p:\n",
+ (void *) new_instance);
+ vect_print_slp_graph (MSG_NOTE, vect_location,
+ SLP_INSTANCE_TREE (new_instance));
+ }
+
+ return true;
+ }
+
+ if (scalar_stmts.length () <= 1)
+ {
+ scalar_stmts.release ();
+ return false;
+ }
+
scalar_stmts.reverse ();
stmt_vec_info reduc_phi_info = next_stmt;
@@ -12046,7 +12179,8 @@ vect_schedule_slp (vec_info *vinfo, const
vec<slp_instance> &slp_instances)
/* Remove vectorized stores original scalar stmts. */
for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
{
- if (!STMT_VINFO_DATA_REF (store_info)
+ if (!store_info
+ || !STMT_VINFO_DATA_REF (store_info)
|| !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
break;
--
2.51.0