This enables SLP store group splitting also for loop vectorization.
For the existing testcase gcc.dg/vect/vect-complex-5.c this then
generates much better code, likewise for the PR97428 testcase.

Both of those have an opportunity to split the group into two equal
(vector-sized) halves.  Still, the patch enables quite arbitrary
splitting, since the interleaving scheme generally results in quite
awkward code even for small groups.  If any problems surface with
this, it's easy to restrict the splitting to known-good cases.  Are
there any additional constraints for non-constant sized vectors?
Note this interacts with vector size iteration (but comparing
interleaving cost with SLP cost of a smaller vector size doesn't
reliably pick the smaller vector size).

Bootstrapped / tested on x86_64-unknown-linux-gnu.

2020-10-15  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/97428
        * tree-vect-slp.c (vect_analyze_slp_instance): Split store
        groups also for loop vectorization.

        * gcc.dg/vect/vect-complex-5.c: Expect to SLP.
        * gcc.dg/vect/pr97428.c: Likewise.
---
 gcc/testsuite/gcc.dg/vect/pr97428.c        |  1 +
 gcc/testsuite/gcc.dg/vect/vect-complex-5.c |  2 +-
 gcc/tree-vect-slp.c                        | 46 ++++++++++++++++++----
 3 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr97428.c 
b/gcc/testsuite/gcc.dg/vect/pr97428.c
index b5b02dca9de..49d53738256 100644
--- a/gcc/testsuite/gcc.dg/vect/pr97428.c
+++ b/gcc/testsuite/gcc.dg/vect/pr97428.c
@@ -40,4 +40,5 @@ void foo_i2(dcmlx4_t dst[], const dcmlx_t src[], int n)
    load and store groups.  */
 /* { dg-final { scan-tree-dump "Detected interleaving load of size 8" "vect" } 
} */
 /* { dg-final { scan-tree-dump "Detected interleaving store of size 16" "vect" 
} } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } 
} */
 /* { dg-final { scan-tree-dump-not "gap of 6 elements" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-complex-5.c 
b/gcc/testsuite/gcc.dg/vect/vect-complex-5.c
index a2e3590ed98..06486375449 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-complex-5.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-complex-5.c
@@ -40,4 +40,4 @@ main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } 
} */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } 
} */
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index d0fa6dce8a8..1310a12cbba 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2283,20 +2283,20 @@ vect_analyze_slp_instance (vec_info *vinfo,
       scalar_stmts.release ();
     }
 
-  /* For basic block SLP, try to break the group up into multiples of the
-     vector size.  */
+  /* Try to break the group up into pieces.  */
   unsigned HOST_WIDE_INT const_nunits;
-  if (is_a <bb_vec_info> (vinfo)
-      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
+  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
       && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info))
       && nunits.is_constant (&const_nunits))
     {
-      /* We consider breaking the group only on VF boundaries from the existing
-        start.  */
       for (i = 0; i < group_size; i++)
-       if (!matches[i]) break;
+       if (!matches[i])
+         break;
 
-      if (i >= const_nunits && i < group_size)
+      /* For basic block SLP, try to break the group up into multiples of the
+        vector size.  */
+      if (is_a <bb_vec_info> (vinfo)
+         && (i >= const_nunits && i < group_size))
        {
          /* Split into two groups at the first vector boundary before i.  */
          gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
@@ -2323,6 +2323,36 @@ vect_analyze_slp_instance (vec_info *vinfo,
                                              rest, max_tree_size);
          return res;
        }
+
+      /* For loop vectorization split into arbitrary pieces of size > 1.  */
+      if (is_a <loop_vec_info> (vinfo)
+         && (i > 1 && i < group_size))
+       {
+         gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
+         unsigned group1_size = i;
+
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "Splitting SLP group at stmt %u\n", i);
+
+         stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
+                                                          group1_size);
+         /* Loop vectorization cannot handle gaps in stores, make sure
+            the split group appears as strided.  */
+         STMT_VINFO_STRIDED_P (rest) = 1;
+         DR_GROUP_GAP (rest) = 0;
+         STMT_VINFO_STRIDED_P (stmt_info) = 1;
+         DR_GROUP_GAP (stmt_info) = 0;
+
+         bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
+                                               max_tree_size);
+         if (i + 1 < group_size)
+           res |= vect_analyze_slp_instance (vinfo, bst_map,
+                                             rest, max_tree_size);
+
+         return res;
+       }
+
       /* Even though the first vector did not all match, we might be able to 
SLP
         (some) of the remainder.  FORNOW ignore this possibility.  */
     }
-- 
2.26.2

Reply via email to