https://gcc.gnu.org/g:97a7d568f3d72076c3f9da062007e3298da9243f

commit r16-5417-g97a7d568f3d72076c3f9da062007e3298da9243f
Author: Richard Biener <[email protected]>
Date:   Tue Nov 18 15:20:44 2025 +0100

    tree-optimization/122722 - better SLP reduction group discovery
    
    The following improves the all-or-nothing discovery of reduction
    groups to consider sub-groups by trying toplevel "matches" candidates
    for this.  For simplicity and to limit compile-time failed sub-group
    matches are not decomposed further, only the originally failed part
    is tried again to discover more sub-groups.  Any remaining fails
    get picked up by the current single-reduction handling.
    
            PR tree-optimization/122722
            * tree-vect-slp.cc (vect_analyze_slp_reductions): New
            function, split out from vect_analyze_slp.  Try SLP
            sub-groups.
            (vect_analyze_slp_reduction_group): New helper.
    
            * gcc.dg/vect/slp-reduc-14.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/vect/slp-reduc-14.c |  15 ++
 gcc/tree-vect-slp.cc                     | 244 +++++++++++++++++++++++--------
 2 files changed, 201 insertions(+), 58 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-14.c 
b/gcc/testsuite/gcc.dg/vect/slp-reduc-14.c
new file mode 100644
index 000000000000..7966d23716ad
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-14.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
+
+void foo (int * __restrict sums, int *a, int *b, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      sums[0] = sums[0] + a[2*i];
+      sums[1] = sums[1] + a[2*i+1];
+      sums[2] = sums[2] + b[2*i];
+      sums[3] = sums[3] + b[2*i+1];
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "SLP discovery of size 2 reduction group" 
2 "vect" } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 07e22ea7ccfa..0ab15fde469d 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -4723,6 +4723,189 @@ vect_analyze_slp_reduction (loop_vec_info vinfo,
   return false;
 }
 
+/* Analyze a single SLP reduction group.  If successful add a SLP instance
+   for it and return true, otherwise return false and have *MATCHES
+   populated.  */
+
+static bool
+vect_analyze_slp_reduction_group (loop_vec_info loop_vinfo,
+                                 vec<stmt_vec_info> scalar_stmts,
+                                 scalar_stmts_to_slp_tree_map_t *bst_map,
+                                 unsigned max_tree_size, unsigned *limit,
+                                 bool *matches)
+{
+  /* Try to form a reduction group.  */
+  unsigned int group_size = scalar_stmts.length ();
+  if (!matches)
+    matches = XALLOCAVEC (bool, group_size);
+  poly_uint64 max_nunits = 1;
+  unsigned tree_size = 0;
+  slp_tree node = vect_build_slp_tree (loop_vinfo, scalar_stmts,
+                                      group_size,
+                                      &max_nunits, matches, limit,
+                                      &tree_size, bst_map);
+  if (!node)
+    return false;
+
+  /* Create a new SLP instance.  */
+  slp_instance new_instance = XNEW (class _slp_instance);
+  SLP_INSTANCE_TREE (new_instance) = node;
+  SLP_INSTANCE_LOADS (new_instance) = vNULL;
+  SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
+  SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
+  SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_group;
+  new_instance->reduc_phis = NULL;
+  new_instance->cost_vec = vNULL;
+  new_instance->subgraph_entries = vNULL;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "SLP size %u vs. limit %u.\n",
+                    tree_size, max_tree_size);
+
+  loop_vinfo->slp_instances.safe_push (new_instance);
+
+  /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
+     the number of scalar stmts in the root in a few places.
+     Verify that assumption holds.  */
+  gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
+             .length () == group_size);
+
+  if (dump_enabled_p ())
+    {
+      dump_printf_loc (MSG_NOTE, vect_location,
+                      "SLP discovery of size %d reduction group "
+                      "succeeded\n", group_size);
+      dump_printf_loc (MSG_NOTE, vect_location,
+                      "Final SLP tree for instance %p:\n",
+                      (void *) new_instance);
+      vect_print_slp_graph (MSG_NOTE, vect_location,
+                           SLP_INSTANCE_TREE (new_instance));
+    }
+
+  return true;
+}
+
+/* Analyze reductions in LOOP_VINFO and populate SLP instances
+   accordingly.  Returns false if something fails.  */
+
+static bool
+vect_analyze_slp_reductions (loop_vec_info loop_vinfo,
+                            unsigned max_tree_size, unsigned *limit,
+                            scalar_stmts_to_slp_tree_map_t *bst_map,
+                            bool force_single_lane)
+{
+  if (loop_vinfo->reductions.is_empty ())
+    return true;
+
+  /* Collect reduction statements we can combine into
+     a SLP reduction.  */
+  vec<stmt_vec_info> scalar_stmts;
+  scalar_stmts.create (loop_vinfo->reductions.length ());
+  for (auto next_info : loop_vinfo->reductions)
+    {
+      next_info = vect_stmt_to_vectorize (next_info);
+      if ((STMT_VINFO_RELEVANT_P (next_info)
+          || STMT_VINFO_LIVE_P (next_info))
+         /* ???  Make sure we didn't skip a conversion around a
+            reduction path.  In that case we'd have to reverse
+            engineer that conversion stmt following the chain using
+            reduc_idx and from the PHI using reduc_def.  */
+         && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
+             || (STMT_VINFO_DEF_TYPE (next_info)
+                 == vect_double_reduction_def)))
+       {
+         /* Do not discover SLP reductions combining lane-reducing
+            ops, that will fail later.  */
+         if (!force_single_lane
+             && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
+           scalar_stmts.quick_push (next_info);
+         /* Do SLP discovery for single-lane reductions.  */
+         else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
+                                                max_tree_size, limit,
+                                                bst_map,
+                                                force_single_lane))
+           {
+             scalar_stmts.release ();
+             return false;
+           }
+       }
+    }
+
+  if (scalar_stmts.length () > 1)
+    {
+      /* Try to form a reduction group.  */
+      unsigned int group_size = scalar_stmts.length ();
+      bool *matches = XALLOCAVEC (bool, group_size);
+      if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts, bst_map,
+                                           max_tree_size, limit, matches))
+       return true;
+
+      /* When analysis as a single SLP reduction group failed try to
+        form sub-groups by collecting matching lanes.  Do not recurse
+        that on failure (to limit compile-time costs), but recurse
+        for the initial non-matching parts.  Everything not covered
+        by a sub-group gets single-reduction treatment.  */
+      vec<stmt_vec_info> cands = vNULL;
+      while (matches[0])
+       {
+         cands.truncate (0);
+         cands.reserve (group_size, true);
+         for (unsigned i = 0; i < group_size; ++i)
+           if (matches[i])
+             cands.quick_push (scalar_stmts[i]);
+
+         /* Try to form a reduction group.  */
+         if (vect_analyze_slp_reduction_group (loop_vinfo, cands, bst_map,
+                                               max_tree_size, limit, NULL))
+           cands = vNULL;
+         else
+           {
+             /* Do SLP discovery for single-lane reductions.  */
+             for (auto stmt_info : cands)
+               if (! vect_analyze_slp_reduction (loop_vinfo,
+                                                 vect_stmt_to_vectorize
+                                                   (stmt_info),
+                                                 max_tree_size, limit,
+                                                 bst_map, force_single_lane))
+                 {
+                   scalar_stmts.release ();
+                   cands.release ();
+                   return false;
+                 }
+           }
+         /* Remove the handled stmts from scalar_stmts and try again,
+            possibly repeating the above with updated matches[].  */
+         unsigned j = 0;
+         for (unsigned i = 0; i < group_size; ++i)
+           if (!matches[i])
+             {
+               scalar_stmts[j] = scalar_stmts[i];
+               ++j;
+             }
+         scalar_stmts.truncate (j);
+         group_size = scalar_stmts.length ();
+         if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts,
+                                               bst_map, max_tree_size, limit,
+                                               matches))
+           return true;
+       }
+    }
+  /* Do SLP discovery for single-lane reductions.  */
+  for (auto stmt_info : scalar_stmts)
+    if (! vect_analyze_slp_reduction (loop_vinfo,
+                                     vect_stmt_to_vectorize (stmt_info),
+                                     max_tree_size, limit,
+                                     bst_map, force_single_lane))
+      {
+       scalar_stmts.release ();
+       return false;
+      }
+
+  scalar_stmts.release ();
+  return true;
+}
+
 /* Analyze an SLP instance starting from a group of grouped stores.  Call
    vect_build_slp_tree to build a tree of packed stmts if possible.
    Return FALSE if it's impossible to SLP any stmt in the group.  */
@@ -5617,64 +5800,9 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
max_tree_size,
   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
     {
       /* Find SLP sequences starting from groups of reductions.  */
-      if (loop_vinfo->reductions.length () > 0)
-       {
-         /* Collect reduction statements we can combine into
-            a SLP reduction.  */
-         vec<stmt_vec_info> scalar_stmts;
-         scalar_stmts.create (loop_vinfo->reductions.length ());
-         for (auto next_info : loop_vinfo->reductions)
-           {
-             next_info = vect_stmt_to_vectorize (next_info);
-             if ((STMT_VINFO_RELEVANT_P (next_info)
-                  || STMT_VINFO_LIVE_P (next_info))
-                 /* ???  Make sure we didn't skip a conversion around a
-                    reduction path.  In that case we'd have to reverse
-                    engineer that conversion stmt following the chain using
-                    reduc_idx and from the PHI using reduc_def.  */
-                 && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
-                     || (STMT_VINFO_DEF_TYPE (next_info)
-                         == vect_double_reduction_def)))
-               {
-                 /* Do not discover SLP reductions combining lane-reducing
-                    ops, that will fail later.  */
-                 if (!force_single_lane
-                     && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
-                   scalar_stmts.quick_push (next_info);
-                 /* Do SLP discovery for single-lane reductions.  */
-                 else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
-                                                        max_tree_size, &limit,
-                                                        bst_map,
-                                                        force_single_lane))
-                   return opt_result::failure_at (vect_location,
-                                                  "SLP build failed.\n");
-               }
-           }
-         /* Save for re-processing on failure.  */
-         vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
-         vec<stmt_vec_info> roots = vNULL;
-         vec<tree> remain = vNULL;
-         if (scalar_stmts.length () <= 1
-             || !vect_build_slp_instance (loop_vinfo,
-                                          slp_inst_kind_reduc_group,
-                                          scalar_stmts, roots, remain,
-                                          max_tree_size, &limit, bst_map,
-                                          force_single_lane))
-           {
-             if (scalar_stmts.length () <= 1)
-               scalar_stmts.release ();
-             /* Do SLP discovery for single-lane reductions.  */
-             for (auto stmt_info : saved_stmts)
-               if (! vect_analyze_slp_reduction (loop_vinfo,
-                                                 vect_stmt_to_vectorize
-                                                   (stmt_info),
-                                                 max_tree_size, &limit,
-                                                 bst_map, force_single_lane))
-                 return opt_result::failure_at (vect_location,
-                                                "SLP build failed.\n");
-           }
-         saved_stmts.release ();
-       }
+      if (!vect_analyze_slp_reductions (loop_vinfo, max_tree_size, &limit,
+                                       bst_map, force_single_lane))
+       return opt_result::failure_at (vect_location, "SLP build failed.\n");
 
       /* Make sure to vectorize only-live stmts, usually inductions.  */
       for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))

Reply via email to