[PATCH] Do fewer redundant vect_transform_slp_perm_load calls

Richard Biener Fri, 12 Sep 2025 14:00:13 -0700

The following tries to call vect_transform_slp_perm_load exactly
once during analysis and once during transform.  A second call
during analysis still remains in get_load_store_type.  As a temporary
measure, this records n_perms in the load-store info and verifies it
against the value computed at the transform stage.


Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

        * tree-vectorizer.h (vect_load_store_data::n_perms): New.
        * tree-vect-stmts.cc (vectorizable_load): Analyze
        SLP_TREE_LOAD_PERMUTATION only once and remember n_perms.
        Verify the transform-time n_perms against the value stored
        during analysis.
---
 gcc/tree-vect-stmts.cc | 47 +++++++++++++++++++++++-------------------
 gcc/tree-vectorizer.h  |  1 +
 2 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 7eabf169a2b..d0ae19baebb 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -9478,6 +9478,7 @@ vectorizable_load (vec_info *vinfo,
 
   /* ???  The following checks should really be part of
      get_load_store_type.  */
+  unsigned n_perms = -1U;
   if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
       && !((memory_access_type == VMAT_ELEMENTWISE
            || mat_gather_scatter_p (memory_access_type))
@@ -9485,7 +9486,7 @@ vectorizable_load (vec_info *vinfo,
     {
       slp_perm = true;
 
-      if (!loop_vinfo)
+      if (!loop_vinfo && cost_vec)
        {
          /* In BB vectorization we may not actually use a loaded vector
             accessing elements in excess of DR_GROUP_SIZE.  */
@@ -9508,17 +9509,21 @@ vectorizable_load (vec_info *vinfo,
            }
        }
 
-      auto_vec<tree> tem;
-      unsigned n_perms;
-      if (!vect_transform_slp_perm_load (vinfo, slp_node, tem, NULL, vf,
-                                        true, &n_perms))
+      if (cost_vec)
        {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-                            vect_location,
-                            "unsupported load permutation\n");
-         return false;
+         if (!vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, vf,
+                                            true, &n_perms))
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+                                vect_location,
+                                "unsupported load permutation\n");
+             return false;
+           }
+         ls.n_perms = n_perms;
        }
+      else
+       n_perms = ls.n_perms;
     }
 
   if (slp_node->ldst_lanes
@@ -9989,18 +9994,19 @@ vectorizable_load (vec_info *vinfo,
        }
       if (slp_perm)
        {
-         unsigned n_perms;
          if (costing_p)
            {
-             unsigned n_loads;
-             vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, vf,
-                                           true, &n_perms, &n_loads);
+             gcc_assert (n_perms != -1U);
              inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
                                               slp_node, 0, vect_body);
            }
          else
-           vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
-                                         false, &n_perms);
+           {
+             unsigned n_perms2;
+             vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
+                                           false, &n_perms2);
+             gcc_assert (n_perms == n_perms2);
+           }
        }
 
       if (costing_p)
@@ -11378,25 +11384,24 @@ vectorizable_load (vec_info *vinfo,
 
   if (slp_perm)
     {
-      unsigned n_perms;
       /* For SLP we know we've seen all possible uses of dr_chain so
         direct vect_transform_slp_perm_load to DCE the unused parts.
         ???  This is a hack to prevent compile-time issues as seen
         in PR101120 and friends.  */
       if (costing_p)
        {
-         vect_transform_slp_perm_load (vinfo, slp_node, vNULL, nullptr, vf,
-                                       true, &n_perms, nullptr);
+         gcc_assert (n_perms != -1U);
          if (n_perms != 0)
            inside_cost = record_stmt_cost (cost_vec, n_perms, vec_perm,
                                            slp_node, 0, vect_body);
        }
       else
        {
+         unsigned n_perms2;
          bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain,
-                                                 gsi, vf, false, &n_perms,
+                                                 gsi, vf, false, &n_perms2,
                                                  nullptr, true);
-         gcc_assert (ok);
+         gcc_assert (ok && n_perms == n_perms2);
        }
       dr_chain.release ();
     }
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 749a9830e07..6ac4299ede2 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -288,6 +288,7 @@ struct vect_load_store_data : vect_data {
   } gs;
   tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
   auto_vec<int> elsvals;
+  unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION
 };
 
 /* A computation tree of an SLP instance.  Each node corresponds to a group of
-- 
2.51.0

[PATCH] Do fewer redundant vect_transform_slp_perm_load calls

Reply via email to