https://gcc.gnu.org/g:97a7d568f3d72076c3f9da062007e3298da9243f
commit r16-5417-g97a7d568f3d72076c3f9da062007e3298da9243f Author: Richard Biener <[email protected]> Date: Tue Nov 18 15:20:44 2025 +0100 tree-optimization/122722 - better SLP reduction group discovery The following improves the all-or-nothing discovery of reduction groups to consider sub-groups by trying toplevel "matches" candidates for this. For simplicity and to limit compile-time failed sub-group matches are not decomposed further, only the originally failed part is tried again to discover more sub-groups. Any remaining fails get picked up by the current single-reduction handling. PR tree-optimization/122722 * tree-vect-slp.cc (vect_analyze_slp_reductions): New function, split out from vect_analyze_slp. Try SLP sub-groups. (vect_analyze_slp_reduction_group): New helper. * gcc.dg/vect/slp-reduc-14.c: New testcase. Diff: --- gcc/testsuite/gcc.dg/vect/slp-reduc-14.c | 15 ++ gcc/tree-vect-slp.cc | 244 +++++++++++++++++++++++-------- 2 files changed, 201 insertions(+), 58 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-14.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-14.c new file mode 100644 index 000000000000..7966d23716ad --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-14.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ + +void foo (int * __restrict sums, int *a, int *b, int n) +{ + for (int i = 0; i < n; ++i) + { + sums[0] = sums[0] + a[2*i]; + sums[1] = sums[1] + a[2*i+1]; + sums[2] = sums[2] + b[2*i]; + sums[3] = sums[3] + b[2*i+1]; + } +} + +/* { dg-final { scan-tree-dump-times "SLP discovery of size 2 reduction group" 2 "vect" } } */ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 07e22ea7ccfa..0ab15fde469d 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -4723,6 +4723,189 @@ vect_analyze_slp_reduction (loop_vec_info vinfo, return false; } +/* Analyze a single SLP reduction group. If successful add a SLP instance + for it and return true, otherwise return false and have *MATCHES + populated. */ + +static bool +vect_analyze_slp_reduction_group (loop_vec_info loop_vinfo, + vec<stmt_vec_info> scalar_stmts, + scalar_stmts_to_slp_tree_map_t *bst_map, + unsigned max_tree_size, unsigned *limit, + bool *matches) +{ + /* Try to form a reduction group. */ + unsigned int group_size = scalar_stmts.length (); + if (!matches) + matches = XALLOCAVEC (bool, group_size); + poly_uint64 max_nunits = 1; + unsigned tree_size = 0; + slp_tree node = vect_build_slp_tree (loop_vinfo, scalar_stmts, + group_size, + &max_nunits, matches, limit, + &tree_size, bst_map); + if (!node) + return false; + + /* Create a new SLP instance. */ + slp_instance new_instance = XNEW (class _slp_instance); + SLP_INSTANCE_TREE (new_instance) = node; + SLP_INSTANCE_LOADS (new_instance) = vNULL; + SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL; + SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL; + SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_group; + new_instance->reduc_phis = NULL; + new_instance->cost_vec = vNULL; + new_instance->subgraph_entries = vNULL; + + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "SLP size %u vs. limit %u.\n", + tree_size, max_tree_size); + + loop_vinfo->slp_instances.safe_push (new_instance); + + /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with + the number of scalar stmts in the root in a few places. + Verify that assumption holds. */ + gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance)) + .length () == group_size); + + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "SLP discovery of size %d reduction group " + "succeeded\n", group_size); + dump_printf_loc (MSG_NOTE, vect_location, + "Final SLP tree for instance %p:\n", + (void *) new_instance); + vect_print_slp_graph (MSG_NOTE, vect_location, + SLP_INSTANCE_TREE (new_instance)); + } + + return true; +} + +/* Analyze reductions in LOOP_VINFO and populate SLP instances + accordingly. Returns false if something fails. */ + +static bool +vect_analyze_slp_reductions (loop_vec_info loop_vinfo, + unsigned max_tree_size, unsigned *limit, + scalar_stmts_to_slp_tree_map_t *bst_map, + bool force_single_lane) +{ + if (loop_vinfo->reductions.is_empty ()) + return true; + + /* Collect reduction statements we can combine into + a SLP reduction. */ + vec<stmt_vec_info> scalar_stmts; + scalar_stmts.create (loop_vinfo->reductions.length ()); + for (auto next_info : loop_vinfo->reductions) + { + next_info = vect_stmt_to_vectorize (next_info); + if ((STMT_VINFO_RELEVANT_P (next_info) + || STMT_VINFO_LIVE_P (next_info)) + /* ??? Make sure we didn't skip a conversion around a + reduction path. In that case we'd have to reverse + engineer that conversion stmt following the chain using + reduc_idx and from the PHI using reduc_def. */ + && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def + || (STMT_VINFO_DEF_TYPE (next_info) + == vect_double_reduction_def))) + { + /* Do not discover SLP reductions combining lane-reducing + ops, that will fail later. */ + if (!force_single_lane + && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info))) + scalar_stmts.quick_push (next_info); + /* Do SLP discovery for single-lane reductions. */ + else if (! vect_analyze_slp_reduction (loop_vinfo, next_info, + max_tree_size, limit, + bst_map, + force_single_lane)) + { + scalar_stmts.release (); + return false; + } + } + } + + if (scalar_stmts.length () > 1) + { + /* Try to form a reduction group. */ + unsigned int group_size = scalar_stmts.length (); + bool *matches = XALLOCAVEC (bool, group_size); + if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts, bst_map, + max_tree_size, limit, matches)) + return true; + + /* When analysis as a single SLP reduction group failed try to + form sub-groups by collecting matching lanes. Do not recurse + that on failure (to limit compile-time costs), but recurse + for the initial non-matching parts. Everything not covered + by a sub-group gets single-reduction treatment. */ + vec<stmt_vec_info> cands = vNULL; + while (matches[0]) + { + cands.truncate (0); + cands.reserve (group_size, true); + for (unsigned i = 0; i < group_size; ++i) + if (matches[i]) + cands.quick_push (scalar_stmts[i]); + + /* Try to form a reduction group. */ + if (vect_analyze_slp_reduction_group (loop_vinfo, cands, bst_map, + max_tree_size, limit, NULL)) + cands = vNULL; + else + { + /* Do SLP discovery for single-lane reductions. */ + for (auto stmt_info : cands) + if (! vect_analyze_slp_reduction (loop_vinfo, + vect_stmt_to_vectorize + (stmt_info), + max_tree_size, limit, + bst_map, force_single_lane)) + { + scalar_stmts.release (); + cands.release (); + return false; + } + } + /* Remove the handled stmts from scalar_stmts and try again, + possibly repeating the above with updated matches[]. */ + unsigned j = 0; + for (unsigned i = 0; i < group_size; ++i) + if (!matches[i]) + { + scalar_stmts[j] = scalar_stmts[i]; + ++j; + } + scalar_stmts.truncate (j); + group_size = scalar_stmts.length (); + if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts, + bst_map, max_tree_size, limit, + matches)) + return true; + } + } + /* Do SLP discovery for single-lane reductions. */ + for (auto stmt_info : scalar_stmts) + if (! vect_analyze_slp_reduction (loop_vinfo, + vect_stmt_to_vectorize (stmt_info), + max_tree_size, limit, + bst_map, force_single_lane)) + { + scalar_stmts.release (); + return false; + } + + scalar_stmts.release (); + return true; +} + /* Analyze an SLP instance starting from a group of grouped stores. Call vect_build_slp_tree to build a tree of packed stmts if possible. Return FALSE if it's impossible to SLP any stmt in the group. */ @@ -5617,64 +5800,9 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size, if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) { /* Find SLP sequences starting from groups of reductions. */ - if (loop_vinfo->reductions.length () > 0) - { - /* Collect reduction statements we can combine into - a SLP reduction. */ - vec<stmt_vec_info> scalar_stmts; - scalar_stmts.create (loop_vinfo->reductions.length ()); - for (auto next_info : loop_vinfo->reductions) - { - next_info = vect_stmt_to_vectorize (next_info); - if ((STMT_VINFO_RELEVANT_P (next_info) - || STMT_VINFO_LIVE_P (next_info)) - /* ??? Make sure we didn't skip a conversion around a - reduction path. In that case we'd have to reverse - engineer that conversion stmt following the chain using - reduc_idx and from the PHI using reduc_def. */ - && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def - || (STMT_VINFO_DEF_TYPE (next_info) - == vect_double_reduction_def))) - { - /* Do not discover SLP reductions combining lane-reducing - ops, that will fail later. */ - if (!force_single_lane - && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info))) - scalar_stmts.quick_push (next_info); - /* Do SLP discovery for single-lane reductions. */ - else if (! vect_analyze_slp_reduction (loop_vinfo, next_info, - max_tree_size, &limit, - bst_map, - force_single_lane)) - return opt_result::failure_at (vect_location, - "SLP build failed.\n"); - } - } - /* Save for re-processing on failure. */ - vec<stmt_vec_info> saved_stmts = scalar_stmts.copy (); - vec<stmt_vec_info> roots = vNULL; - vec<tree> remain = vNULL; - if (scalar_stmts.length () <= 1 - || !vect_build_slp_instance (loop_vinfo, - slp_inst_kind_reduc_group, - scalar_stmts, roots, remain, - max_tree_size, &limit, bst_map, - force_single_lane)) - { - if (scalar_stmts.length () <= 1) - scalar_stmts.release (); - /* Do SLP discovery for single-lane reductions. */ - for (auto stmt_info : saved_stmts) - if (! vect_analyze_slp_reduction (loop_vinfo, - vect_stmt_to_vectorize - (stmt_info), - max_tree_size, &limit, - bst_map, force_single_lane)) - return opt_result::failure_at (vect_location, - "SLP build failed.\n"); - } - saved_stmts.release (); - } + if (!vect_analyze_slp_reductions (loop_vinfo, max_tree_size, &limit, + bst_map, force_single_lane)) + return opt_result::failure_at (vect_location, "SLP build failed.\n"); /* Make sure to vectorize only-live stmts, usually inductions. */ for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
