Propagate out ncopies == 1.

Multiple types in SLP are handled by creating the appropriate number of
vectorized stmts for each SLP node, so ncopies is always 1 in
vectorizable_load.  Propagate the constant into its remaining uses,
drop the checks that can no longer trigger, and re-declare ncopies
locally where it still names a derived count of vector stmts.

                * tree-vect-stmts.cc (vectorizable_load): Propagate out
                ncopies == 1 and remove dead checks (step 3).
---
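(Not part of the patch: a minimal, self-contained C sketch of the
simplification, with hypothetical values.  Since SLP guarantees
ncopies == 1, bounds like "vec_num * ncopies" reduce to "vec_num" and
each per-copy loop body runs exactly once, so a later step can drop
the loop structure entirely.)

    #include <stdio.h>

    int
    main (void)
    {
      int vec_num = 4;   /* hypothetical number of vector stmts */
      int ncopies = 1;   /* always 1 in the SLP-only scheme */

      /* Replacing the bound "vec_num * ncopies" with "vec_num" and the
         loop bound "ncopies" with 1 is behavior-preserving precisely
         because ncopies == 1.  */
      for (int j = 0; j < ncopies; j++)
        for (int i = 0; i < vec_num; i++)
          printf ("copy %d, vector %d -> index %d\n", j, i, vec_num * j + i);
      return 0;
    }
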
 gcc/tree-vect-stmts.cc | 46 +++++++++++-------------------------------
 1 file changed, 12 insertions(+), 34 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index eca7e70adf4..2efa000034c 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -9836,7 +9836,6 @@ vectorizable_load (vec_info *vinfo,
   tree dataref_ptr = NULL_TREE;
   tree dataref_offset = NULL_TREE;
   gimple *ptr_incr = NULL;
-  int ncopies;
   int i, j;
   unsigned int group_size;
   poly_uint64 group_gap_adj;
@@ -9938,16 +9937,9 @@ vectorizable_load (vec_info *vinfo,
   else
     vf = 1;
 
-  /* Multiple types in SLP are handled by creating the appropriate number of
-     vectorized stmts for each SLP node.  Hence, NCOPIES is always 1 in
-     case of SLP.  */
-  ncopies = 1;
-
-  gcc_assert (ncopies >= 1);
-
   /* FORNOW. This restriction should be relaxed.  */
   if (nested_in_vect_loop
-      && (ncopies > 1 || SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1))
+      && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -9955,20 +9947,6 @@ vectorizable_load (vec_info *vinfo,
       return false;
     }
 
-  /* Invalidate assumptions made by dependence analysis when vectorization
-     on the unrolled body effectively re-orders stmts.  */
-  if (ncopies > 1
-      && STMT_VINFO_MIN_NEG_DIST (stmt_info) != 0
-      && maybe_gt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
-                  STMT_VINFO_MIN_NEG_DIST (stmt_info)))
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "cannot perform implicit CSE when unrolling "
-                        "with negative dependence distance\n");
-      return false;
-    }
-
   elem_type = TREE_TYPE (vectype);
   mode = TYPE_MODE (vectype);
 
@@ -10018,7 +9996,7 @@ vectorizable_load (vec_info *vinfo,
   int maskload_elsval = 0;
   bool need_zeroing = false;
   if (!get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask, VLS_LOAD,
-                           ncopies, &memory_access_type, &poffset,
+                           1, &memory_access_type, &poffset,
                            &alignment_support_scheme, &misalignment, &gs_info,
                            &lanes_ifn, &elsvals))
     return false;
@@ -10194,8 +10172,7 @@ vectorizable_load (vec_info *vinfo,
   gcc_assert (memory_access_type == SLP_TREE_MEMORY_ACCESS_TYPE (slp_node));
 
   if (dump_enabled_p () && !costing_p)
-    dump_printf_loc (MSG_NOTE, vect_location,
-                     "transform load. ncopies = %d\n", ncopies);
+    dump_printf_loc (MSG_NOTE, vect_location, "transform load.\n");
 
   /* Transform.  */
 
@@ -10443,6 +10420,7 @@ vectorizable_load (vec_info *vinfo,
       /* For SLP permutation support we need to load the whole group,
         not only the number of vector stmts the permutation result
         fits in.  */
+      int ncopies;
       if (slp_perm)
        {
          /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
@@ -10869,7 +10847,7 @@ vectorizable_load (vec_info *vinfo,
       /* For costing some adjacent vector loads, we'd like to cost with
         the total number of them once instead of cost each one by one. */
       unsigned int n_adjacent_loads = 0;
-      ncopies = slp_node->vec_stmts_size / group_size;
+      int ncopies = slp_node->vec_stmts_size / group_size;
       for (j = 0; j < ncopies; j++)
        {
          if (costing_p)
@@ -11029,7 +11007,7 @@ vectorizable_load (vec_info *vinfo,
       gcc_assert (!grouped_load && !slp_perm);
 
       unsigned int inside_cost = 0, prologue_cost = 0;
-      for (j = 0; j < ncopies; j++)
+      for (j = 0; j < 1; j++)
        {
          /* 1. Create the vector or array pointer update chain.  */
          if (j == 0 && !costing_p)
@@ -11065,7 +11043,7 @@ vectorizable_load (vec_info *vinfo,
                  if (loop_masks)
                    final_mask
                      = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
-                                           vec_num * ncopies, vectype,
+                                           vec_num, vectype,
                                            vec_num * j + i);
                  if (vec_mask)
                    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
@@ -11098,7 +11076,7 @@ vectorizable_load (vec_info *vinfo,
                      if (loop_lens)
                        final_len
                          = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
-                                              vec_num * ncopies, vectype,
+                                              vec_num, vectype,
                                               vec_num * j + i, 1);
                      else
                        final_len
@@ -11394,7 +11372,7 @@ vectorizable_load (vec_info *vinfo,
   /* For costing some adjacent vector loads, we'd like to cost with
      the total number of them once instead of cost each one by one. */
   unsigned int n_adjacent_loads = 0;
-  for (j = 0; j < ncopies; j++)
+  for (j = 0; j < 1; j++)
     {
       /* 1. Create the vector or array pointer update chain.  */
       if (j == 0 && !costing_p)
@@ -11475,7 +11453,7 @@ vectorizable_load (vec_info *vinfo,
                vec_mask = vec_masks[vec_num * j + i];
              if (loop_masks)
                final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
-                                                vec_num * ncopies, vectype,
+                                                vec_num, vectype,
                                                 vec_num * j + i);
              if (vec_mask)
                final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
@@ -11526,7 +11504,7 @@ vectorizable_load (vec_info *vinfo,
                    unsigned factor
                      = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode);
                    final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
-                                                  vec_num * ncopies, vectype,
+                                                  vec_num, vectype,
                                                   vec_num * j + i, factor);
                  }
                else if (final_mask)
@@ -11942,7 +11920,7 @@ vectorizable_load (vec_info *vinfo,
              if (alignment_support_scheme == dr_explicit_realign_optimized)
                {
                  gcc_assert (phi);
-                 if (i == vec_num - 1 && j == ncopies - 1)
+                 if (i == vec_num - 1 && j == 0)
                    add_phi_arg (phi, lsq, loop_latch_edge (containing_loop),
                                 UNKNOWN_LOCATION);
                  msq = lsq;
-- 
2.43.0
