https://gcc.gnu.org/g:0d4b254b20a9193ab261d02f8a063e21816f85e4

commit r15-5168-g0d4b254b20a9193ab261d02f8a063e21816f85e4
Author: Richard Biener <rguent...@suse.de>
Date:   Tue Nov 12 10:31:30 2024 +0100

    tree-optimization/116973 - SLP permute lower heuristic and single-lane SLP
    
    When forcing single-lane SLP to emulate non-SLP behavior, we need to
    disable the heuristics designed to optimize SLP loads and instead
    resort, in all cases, to an interleaving scheme, as requested by
    forcing single-lane SLP.
    
    This fixes the remaining fallout for --param vect-force-slp=1 on x86.
    
            PR tree-optimization/116973
            * tree-vect-slp.cc (vect_lower_load_permutations): Add
            force_single_lane parameter.  Disable heuristic that keeps
            some load-permutations when it is set.
            (vect_analyze_slp): Pass force_single_lane to
            vect_lower_load_permutations.

Diff:
---
 gcc/tree-vect-slp.cc | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index eebac1955de7..d69fdc04b9df 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -4402,7 +4402,8 @@ vllp_cmp (const void *a_, const void *b_)
 static void
 vect_lower_load_permutations (loop_vec_info loop_vinfo,
                              scalar_stmts_to_slp_tree_map_t *bst_map,
-                             const array_slice<slp_tree> &loads)
+                             const array_slice<slp_tree> &loads,
+                             bool force_single_lane)
 {
   /* We at this point want to lower without a fixed VF or vector
      size in mind which means we cannot actually compute whether we
@@ -4494,7 +4495,8 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
         extracting it from the larger load.
         ???  Long-term some of the lowering should move to where
         the vector types involved are fixed.  */
-      if (ld_lanes_lanes == 0
+      if (!force_single_lane
+         && ld_lanes_lanes == 0
          && contiguous
          && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
          && pow2p_hwi (SLP_TREE_LANES (load))
@@ -4668,7 +4670,8 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
 
 static void
 vect_lower_load_permutations (loop_vec_info loop_vinfo,
-                             scalar_stmts_to_slp_tree_map_t *bst_map)
+                             scalar_stmts_to_slp_tree_map_t *bst_map,
+                             bool force_single_lane)
 {
   /* Gather and sort loads across all instances.  */
   hash_set<slp_tree> visited;
@@ -4696,14 +4699,16 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
       if (STMT_VINFO_GROUPED_ACCESS (a0))
        vect_lower_load_permutations (loop_vinfo, bst_map,
                                      make_array_slice (&loads[firsti],
-                                                       i - firsti));
+                                                       i - firsti),
+                                     force_single_lane);
       firsti = i;
     }
   if (firsti < loads.length ()
       && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
     vect_lower_load_permutations (loop_vinfo, bst_map,
                                  make_array_slice (&loads[firsti],
-                                                   loads.length () - firsti));
+                                                   loads.length () - firsti),
+                                 force_single_lane);
 }
 
 /* Check if there are stmts in the loop can be vectorized using SLP.  Build SLP
@@ -5097,7 +5102,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
      like schemes.  */
   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
     {
-      vect_lower_load_permutations (loop_vinfo, bst_map);
+      vect_lower_load_permutations (loop_vinfo, bst_map, force_single_lane);
       if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,

Reply via email to