https://gcc.gnu.org/g:76c33109074b8e7cf6c326116b46792070122c7b

commit r16-403-g76c33109074b8e7cf6c326116b46792070122c7b
Author: Richard Biener <rguent...@suse.de>
Date:   Mon Mar 17 15:04:28 2025 +0100

    tree-optimization/115777 - STLF fails with BB vectorization of loop
    
    The following tries to address the situation of us BB vectorizing a
    loop body that swaps consecutive elements of an array, as in
    bubble-sort.  This causes the vector store in the previous iteration
    to fail to forward to the vector load in the current iteration since
    there is only a partial overlap.
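    
    To illustrate (a sketch of mine, not part of the patch; it assumes
    unsigned int elements and 8-byte two-lane vectors as on x86), the
    inner swap loop of the testcase below behaves like this:
    
      for (int k = i; k > 0 && v[k-1] > v[k]; --k)
        {
          /* BB vectorization turns the swap into one 8-byte vector load
             of v[k-1..k], a lane permute and one 8-byte vector store
             back to v[k-1..k].  */
          T tmp = v[k-1];
          v[k-1] = v[k];
          v[k] = tmp;
        }
      /* The next iteration (k-1) loads v[k-2..k-1], which overlaps the
         just-stored v[k-1..k] in only one element, so the store cannot
         be forwarded and the load has to wait for it to retire.  */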
    
    We try to detect this situation by looking for a load-to-store
    data dependence and analyzing it with respect to the containing loop
    for a provably problematic access.  Currently the search for a
    problematic pair is limited to loads and stores in the same SLP
    instance, which means the problematic load happens in the next
    loop iteration and larger dependence distances are not considered.
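    
    As a concrete instance (numbers are for the PR testcase, assuming
    4-byte unsigned int elements and a group size of two): relative to
    one iteration of the inner loop the vector store covers bytes
    [0, 8) of the group, while the load of the next iteration, with
    step -4, covers bytes [-4, 4).  The ranges overlap at distance 4,
    which is not a multiple of the 8-byte vector load size, so the load
    node is marked avoid_stlf_fail and later emitted elementwise.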
    
    On x86 with generic costing this avoids vectorizing the loop body,
    but once you do core-specific tuning the cost saved on the vector
    store vs. the scalar stores still makes vectorization appear
    profitable; at least the STLF issue is avoided then.
    
    For example on my Zen4 machine with -O2 -march=znver4 the testcase in
    the PR is improving from
      insertion_sort  =>     2327
    to
      insertion_sort  =>      997
    but plain -O2 (or -fno-tree-slp-vectorize) gives
      insertion_sort  =>      183
    In the end a better target-side cost model for small-vector
    vectorization is needed to reject this vectorization from that side.
    
    I'll note this is a machine-independent heuristic (similar to the
    avoid-store-forwarding RTL optimization pass); I expect that uarchs
    implementing vectors will suffer from this kind of issue.  I know
    some aarch64 uarchs can forward from upper/lower part stores, but
    this isn't considered at the moment.  The actual vector size/overlap
    distance check could be moved to a target hook if that turns out to
    be necessary.
    
    There might be the chance to use a smaller vector size for the loads,
    avoiding the penalty rather than falling back to elementwise accesses;
    that's not implemented either.
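    
    For reference, the fallback that is implemented keeps the vector
    store but makes the load elementwise, roughly (a GIMPLE-like sketch
    of mine, names illustrative):
    
      _1 = v[k-1];                      /* 4-byte scalar load */
      _2 = v[k];                        /* 4-byte scalar load */
      vect_v = {_2, _1};                /* build the swapped vector */
      MEM <vector(2) unsigned int> [&v[k-1]] = vect_v;
    
    Each 4-byte scalar load is fully contained in the previous
    iteration's 8-byte store, so store-to-load forwarding can succeed.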
    
            PR tree-optimization/115777
            * tree-vectorizer.h (_slp_tree::avoid_stlf_fail): New member.
            * tree-vect-slp.cc (_slp_tree::_slp_tree): Initialize it.
            (vect_print_slp_tree): Dump it.
            * tree-vect-data-refs.cc (vect_slp_analyze_instance_dependence):
            For dataflow dependent loads of a store check whether there's
            a cross-iteration data dependence that for sure prohibits
            store-to-load forwarding and mark involved loads.
            * tree-vect-stmts.cc (get_group_load_store_type): For avoid_stlf_fail
            marked loads use VMAT_ELEMENTWISE.
    
            * gcc.dg/vect/bb-slp-pr115777.c: New testcase.

Diff:
---
 gcc/testsuite/gcc.dg/vect/bb-slp-pr115777.c | 15 +++++
 gcc/tree-vect-data-refs.cc                  | 91 +++++++++++++++++++++++++++++
 gcc/tree-vect-slp.cc                        |  4 +-
 gcc/tree-vect-stmts.cc                      |  8 +++
 gcc/tree-vectorizer.h                       |  3 +
 5 files changed, 120 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr115777.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr115777.c
new file mode 100644
index 000000000000..bba0dc75f6fc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr115777.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+
+typedef unsigned int T;
+
+#define SWAP(A, B) do { T tmp = A; A = B; B = tmp; } while (0)
+
+void
+insertion_sort(T *v, int n)
+{
+  for (int i = 1; i < n; ++i)
+    for (int k = i; k > 0 && v[k-1] > v[k]; --k)
+      SWAP(v[k-1], v[k]);
+}
+
+/* { dg-final { scan-tree-dump "using element-wise load" "slp1" { target { { x86_64-*-* i?86-*-* } && { ! ia32 } } } } } */
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index c9395e33fcdf..231a3cab4f80 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -1203,6 +1203,97 @@ vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
     for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
 
+  /* If this is a SLP instance with a store check if there's a dependent
+     load that cannot be forwarded from a previous iteration of a loop
+     both are in.  This is to avoid situations like that in PR115777.  */
+  if (res && store)
+    {
+      stmt_vec_info store_info
+       = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (store)[0]);
+      class loop *store_loop = gimple_bb (store_info->stmt)->loop_father;
+      if (! loop_outer (store_loop))
+       return res;
+      vec<loop_p> loop_nest;
+      loop_nest.create (1);
+      loop_nest.quick_push (store_loop);
+      data_reference *drs = nullptr;
+      for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
+       {
+         if (! STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (load)[0]))
+           continue;
+         stmt_vec_info load_info
+           = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (load)[0]);
+         if (gimple_bb (load_info->stmt)->loop_father != store_loop)
+           continue;
+
+         /* For now concern ourselves with write-after-read as we also
+            only look for re-use of the store within the same SLP instance.
+            We can still get a RAW here when the instance contains a PHI
+            with a backedge though, thus this test.  */
+         if (! vect_stmt_dominates_stmt_p (STMT_VINFO_STMT (load_info),
+                                           STMT_VINFO_STMT (store_info)))
+           continue;
+
+         if (! drs)
+           {
+             drs = create_data_ref (loop_preheader_edge (store_loop),
+                                    store_loop,
+                                    DR_REF (STMT_VINFO_DATA_REF (store_info)),
+                                    store_info->stmt, false, false);
+             if (! DR_BASE_ADDRESS (drs)
+                 || TREE_CODE (DR_STEP (drs)) != INTEGER_CST)
+               break;
+           }
+         data_reference *drl
+           = create_data_ref (loop_preheader_edge (store_loop),
+                              store_loop,
+                              DR_REF (STMT_VINFO_DATA_REF (load_info)),
+                              load_info->stmt, true, false);
+
+         /* See whether the DRs have a known constant distance throughout
+            the containing loop iteration.  */
+         if (! DR_BASE_ADDRESS (drl)
+             || ! operand_equal_p (DR_STEP (drs), DR_STEP (drl))
+             || ! operand_equal_p (DR_BASE_ADDRESS (drs),
+                                   DR_BASE_ADDRESS (drl))
+             || ! operand_equal_p (DR_OFFSET (drs), DR_OFFSET (drl)))
+           {
+             free_data_ref (drl);
+             continue;
+           }
+
+         /* If the next iteration load overlaps with a non-power-of-two offset
+            we are surely failing any STLF attempt.  */
+         HOST_WIDE_INT step = TREE_INT_CST_LOW (DR_STEP (drl));
+         unsigned HOST_WIDE_INT sizes
+           = (TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drs))))
+              * DR_GROUP_SIZE (store_info));
+         unsigned HOST_WIDE_INT sizel
+           = (TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drl))))
+              * DR_GROUP_SIZE (load_info));
+         if (ranges_overlap_p (TREE_INT_CST_LOW (DR_INIT (drl)) + step, sizel,
+                               TREE_INT_CST_LOW (DR_INIT (drs)), sizes))
+           {
+             unsigned HOST_WIDE_INT dist
+               = absu_hwi (TREE_INT_CST_LOW (DR_INIT (drl)) + step
+                           - TREE_INT_CST_LOW (DR_INIT (drs)));
+             poly_uint64 loadsz = tree_to_poly_uint64
+                                    (TYPE_SIZE_UNIT (SLP_TREE_VECTYPE (load)));
+             poly_uint64 storesz = tree_to_poly_uint64
+                                   (TYPE_SIZE_UNIT (SLP_TREE_VECTYPE (store)));
+             /* When the overlap aligns with vector sizes used for the loads
+                and the vector stores are larger or equal to the loads
+                forwarding should work.  */
+             if (maybe_gt (loadsz, storesz) || ! multiple_p (dist, loadsz))
+               load->avoid_stlf_fail = true;
+           }
+         free_data_ref (drl);
+       }
+      if (drs)
+       free_data_ref (drs);
+      loop_nest.release ();
+    }
+
   return res;
 }
 
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 9bf142d0faf5..562e2227c7c4 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -122,6 +122,7 @@ _slp_tree::_slp_tree ()
   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
   SLP_TREE_CODE (this) = ERROR_MARK;
   this->ldst_lanes = false;
+  this->avoid_stlf_fail = false;
   SLP_TREE_VECTYPE (this) = NULL_TREE;
   SLP_TREE_REPRESENTATIVE (this) = NULL;
   SLP_TREE_MEMORY_ACCESS_TYPE (this) = VMAT_INVARIANT;
@@ -3104,7 +3105,8 @@ vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                                         SLP_TREE_REF_COUNT (node));
   if (SLP_TREE_VECTYPE (node))
     dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
-  dump_printf (metadata, "\n");
+  dump_printf (metadata, "%s\n",
+              node->avoid_stlf_fail ? " (avoid-stlf-fail)" : "");
   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
     {
       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 537ae6c2f614..ea0b42627815 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2134,6 +2134,14 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
                            : vect_store_lanes_supported (vectype, group_size,
                                                         masked_p))) != IFN_LAST)
            *memory_access_type = VMAT_LOAD_STORE_LANES;
+         else if (!loop_vinfo && slp_node->avoid_stlf_fail)
+           {
+             *memory_access_type = VMAT_ELEMENTWISE;
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "using element-wise load to avoid disrupting "
+                                "cross iteration store-to-load forwarding\n");
+           }
          else
            *memory_access_type = VMAT_CONTIGUOUS;
 
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 63991c3d9775..3d11559fe82b 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -266,6 +266,9 @@ struct _slp_tree {
   /* Whether uses of this load or feeders of this store are suitable
      for load/store-lanes.  */
   bool ldst_lanes;
+  /* For BB vect, flag to indicate this load node should be vectorized
+     so as to avoid STLF fails because of related stores.  */
+  bool avoid_stlf_fail;
 
   int vertex;
