https://gcc.gnu.org/g:7164d982663738c255a1a71a5d4f38dc51c2a3cb

commit r15-3442-g7164d982663738c255a1a71a5d4f38dc51c2a3cb
Author: Richard Biener <rguent...@suse.de>
Date:   Mon Sep 2 15:00:05 2024 +0200

    Also lower SLP grouped loads with just one consumer
    
    This makes sure to produce interleaving schemes or load-lanes
    for single-element interleaving and other permutes that otherwise
    would use more than three vectors.
    
    It exposes the latent issue that single-element interleaving with
    large gaps can be inefficient - the mitigation in get_group_load_store_type
    doesn't trigger when we clear the load permutation.
    
    It also exposes the fact that not all permutes can be lowered in
    the best way in a vector-length-agnostic manner, so I've added an
    exception to keep power-of-two sized, contiguous, aligned chunks
    unlowered (unless we want load-lanes).  The optimal handling
    of load/store vectorization is going to continue to be a learning
    process.
    
            * tree-vect-slp.cc (vect_lower_load_permutations): Also
            process single-use grouped loads.
            Avoid lowering contiguous aligned power-of-two sized
            chunks, those are better handled by the vector size
            specific SLP code generation.
            * tree-vect-stmts.cc (get_group_load_store_type): Drop
            the unrelated requirement of a load permutation for the
            single-element interleaving limit.
    
            * gcc.dg/vect/slp-46.c: Remove XFAIL.

Diff:
---
 gcc/testsuite/gcc.dg/vect/slp-46.c |  2 +-
 gcc/tree-vect-slp.cc               | 56 ++++++++++++++++++++++++++------------
 gcc/tree-vect-stmts.cc             |  1 -
 3 files changed, 39 insertions(+), 20 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/slp-46.c b/gcc/testsuite/gcc.dg/vect/slp-46.c
index b44a673f7de..016580e7a95 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-46.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-46.c
@@ -98,4 +98,4 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail { vect_load_lanes && vect_variable_length } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 2b05032790e..d35e0609174 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -4315,6 +4315,37 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
          && ld_lanes_lanes == 0)
        continue;
 
+      /* Build the permute to get the original load permutation order.  */
+      bool contiguous = true;
+      lane_permutation_t final_perm;
+      final_perm.create (SLP_TREE_LANES (load));
+      for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
+       {
+         final_perm.quick_push
+           (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
+         if (i != 0
+             && (SLP_TREE_LOAD_PERMUTATION (load)[i]
+                 != SLP_TREE_LOAD_PERMUTATION (load)[i-1] + 1))
+           contiguous = false;
+       }
+
+      /* When the load permutation accesses a contiguous unpermuted,
+        power-of-two aligned and sized chunk leave the load alone.
+        We can likely (re-)load it more efficiently rather than
+        extracting it from the larger load.
+        ???  Long-term some of the lowering should move to where
+        the vector types involved are fixed.  */
+      if (ld_lanes_lanes == 0
+         && contiguous
+         && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
+         && pow2p_hwi (SLP_TREE_LANES (load))
+         && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
+         && group_lanes % SLP_TREE_LANES (load) == 0)
+       {
+         final_perm.release ();
+         continue;
+       }
+
       /* First build (and possibly re-use) a load node for the
         unpermuted group.  Gaps in the middle and on the end are
         represented with NULL stmts.  */
@@ -4338,13 +4369,6 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
                                         &max_nunits, matches, &limit,
                                         &tree_size, bst_map);
 
-      /* Build the permute to get the original load permutation order.  */
-      lane_permutation_t final_perm;
-      final_perm.create (SLP_TREE_LANES (load));
-      for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
-       final_perm.quick_push
-         (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
-
       if (ld_lanes_lanes != 0)
        {
          /* ???  If this is not in sync with what get_load_store_type
@@ -4503,20 +4527,16 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
          && STMT_VINFO_GROUPED_ACCESS (b0)
          && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
        continue;
-      /* Just one SLP load of a possible group, leave those alone.  */
-      if (i == firsti + 1)
-       {
-         firsti = i;
-         continue;
-       }
-      /* Now we have multiple SLP loads of the same group from
+      /* Now we have one or multiple SLP loads of the same group from
         firsti to i - 1.  */
-      vect_lower_load_permutations (loop_vinfo, bst_map,
-                                   make_array_slice (&loads[firsti],
-                                                     i - firsti));
+      if (STMT_VINFO_GROUPED_ACCESS (a0))
+       vect_lower_load_permutations (loop_vinfo, bst_map,
+                                     make_array_slice (&loads[firsti],
+                                                       i - firsti));
       firsti = i;
     }
-  if (firsti < loads.length () - 1)
+  if (firsti < loads.length ()
+      && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
     vect_lower_load_permutations (loop_vinfo, bst_map,
                                  make_array_slice (&loads[firsti],
                                                    loads.length () - firsti));
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 16f6889d853..25b120c158e 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2187,7 +2187,6 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
             blow up memory, see PR65518).  */
          if (loop_vinfo
              && *memory_access_type == VMAT_CONTIGUOUS
-             && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
              && single_element_p
              && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
            {

Reply via email to