Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> On Tue, Apr 6, 2021 at 12:03 PM Richard Sandiford via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> As noted in the PR, we were no longer using ST3 for the testcase and
>> instead stored each lane individually.  This is because we'd split
>> the store group during SLP and couldn't recover when SLP failed.
>>
>> However, we seem to get better code with ST3 and ST4 even if
>> SLP would have succeeded, such as for vect-complex-5.c.
>> I think the best thing for GCC 11 would therefore be to skip
>> the split entirely if we could use IFN_STORE_LANES.  A downside
>> of this is that SLP can handle smaller iteration counts than
>> IFN_STORE_LANES can, but we don't have the infrastructure to
>> choose reliably based on that.
>>
>> Tested on aarch64-linux-gnu (with and without SVE), arm-linux-gnueabihf,
>> armeb-eabi and x86_64-linux-gnu.  OK to install?
>
> One of the cases where splitting helps is if you have say V2DFmode
> and a group size of 4 but if you break up the group into sizes of 2
> then you get two V2DFmode group size 2 SLP subgraphs.  So I wonder
> if, since you look for a vector type, want to only disable splitting
> in case the resulting vector type has the same number of lanes
> as the group size?  (and if not, instead limit where we consider splitting)

Hmm, yeah.  If we can apply SLP to full vectors within a loop iteration
then I agree it's still better to split.  I think the test should favour
ST3/ST4 more though: split only if one of the new group sizes is a
multiple of the vector size.  If the vector type has more lanes than the
group size then we should still use ST3/ST4.

How does this version look?  Tested as before.

Thanks,
Richard


gcc/
        PR tree-optimization/99873
        * tree-vect-slp.c (vect_slp_prefer_store_lanes_p): New function.
        (vect_build_slp_instance): Don't split store groups that could
        use IFN_STORE_LANES.

gcc/testsuite/
        * gcc.dg/vect/slp-21.c: Only expect 2 of the loops to use SLP
        if IFN_STORE_LANES is available.
        * vect/vect-complex-5.c: Expect no loops to use SLP if
        IFN_STORE_LANES is available.
        * gcc.target/aarch64/pr99873_1.c: New test.
        * gcc.target/aarch64/pr99873_2.c: Likewise.
        * gcc.target/aarch64/pr99873_3.c: Likewise.
        * gcc.target/aarch64/sve/pr99873_1.c: Likewise.
        * gcc.target/aarch64/sve/pr99873_2.c: Likewise.
        * gcc.target/aarch64/sve/pr99873_3.c: Likewise.
---
 gcc/testsuite/gcc.dg/vect/slp-21.c            |  4 +--
 gcc/testsuite/gcc.dg/vect/vect-complex-5.c    |  3 +-
 gcc/testsuite/gcc.target/aarch64/pr99873_1.c  | 17 ++++++++++
 gcc/testsuite/gcc.target/aarch64/pr99873_2.c  | 20 ++++++++++++
 gcc/testsuite/gcc.target/aarch64/pr99873_3.c  | 20 ++++++++++++
 .../gcc.target/aarch64/sve/pr99873_1.c        | 15 +++++++++
 .../gcc.target/aarch64/sve/pr99873_2.c        | 18 +++++++++++
 .../gcc.target/aarch64/sve/pr99873_3.c        | 18 +++++++++++
 gcc/tree-vect-slp.c                           | 31 ++++++++++++++++++-
 9 files changed, 142 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr99873_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr99873_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr99873_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/pr99873_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/pr99873_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/pr99873_3.c

diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index ceec7f5c410..eadbd521966 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2458,6 +2458,34 @@ vect_match_slp_patterns (slp_instance instance, vec_info 
*vinfo,
   return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
 }
 
+/* STMT_INFO is a store group of size GROUP_SIZE that we are considering
+   splitting into two, with the first split group having size NEW_GROUP_SIZE.
+   Return true if we could use IFN_STORE_LANES instead and if that appears
+   to be the better approach.  */
+
+static bool
+vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
+                              unsigned int group_size,
+                              unsigned int new_group_size)
+{
+  tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
+  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
+  if (!vectype)
+    return false;
+  /* Allow the split if one of the two new groups would operate on full
+     vectors *within* rather than across one scalar loop iteration.
+     This is purely a heuristic, but it should work well for group
+     sizes of 3 and 4, where the possible splits are:
+
+       3->2+1:  OK if the vector has exactly two elements
+       4->2+2:  Likewise
+       4->3+1:  Less clear-cut.  */
+  if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
+      || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
+    return false;
+  return vect_store_lanes_supported (vectype, group_size, false);
+}
+
 /* Analyze an SLP instance starting from a group of grouped stores.  Call
    vect_build_slp_tree to build a tree of packed stmts if possible.
    Return FALSE if it's impossible to SLP any stmt in the loop.  */
@@ -2693,7 +2721,8 @@ vect_build_slp_instance (vec_info *vinfo,
 
       /* For loop vectorization split into arbitrary pieces of size > 1.  */
       if (is_a <loop_vec_info> (vinfo)
-         && (i > 1 && i < group_size))
+         && (i > 1 && i < group_size)
+         && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
        {
          unsigned group1_size = i;
 
diff --git a/gcc/testsuite/gcc.dg/vect/slp-21.c 
b/gcc/testsuite/gcc.dg/vect/slp-21.c
index bf8f434dd50..85393975b45 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-21.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-21.c
@@ -210,7 +210,7 @@ int main (void)
 
    Not all vect_perm targets support that, and it's a bit too specific to have
    its own effective-target selector, so we just test targets directly.  */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { 
target { aarch64*-*-* arm*-*-* powerpc64*-*-* } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target { vect_strided4 && { ! { aarch64*-*-* arm*-*-* powerpc64*-*-* } } } } } 
} */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { 
target powerpc64*-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target { vect_strided4 && { ! powerpc64*-*-* } } } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect"  { 
target { ! { vect_strided4 } } } } } */
   
diff --git a/gcc/testsuite/gcc.dg/vect/vect-complex-5.c 
b/gcc/testsuite/gcc.dg/vect/vect-complex-5.c
index 81fdb67ce81..addcf60438c 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-complex-5.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-complex-5.c
@@ -40,4 +40,5 @@ main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
xfail { ! vect_hw_misalign } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { 
target vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target { ! vect_load_lanes } xfail { ! vect_hw_misalign } } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr99873_1.c 
b/gcc/testsuite/gcc.target/aarch64/pr99873_1.c
new file mode 100644
index 00000000000..bc4d81e3ae5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr99873_1.c
@@ -0,0 +1,17 @@
+/* { dg-options "-O3" } */
+
+#pragma GCC target "+nosve"
+
+void
+f (int *restrict x, int *restrict y, int *restrict z, int n)
+{
+  for (int i = 0; i < n; i += 3)
+    {
+      x[i] = y[i] + z[i];
+      x[i + 1] = y[i + 1] - z[i + 1];
+      x[i + 2] = y[i + 2] | z[i + 2];
+    }
+}
+
+/* { dg-final { scan-assembler-times {\tld3\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst3\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr99873_2.c 
b/gcc/testsuite/gcc.target/aarch64/pr99873_2.c
new file mode 100644
index 00000000000..b73fbdc0a18
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr99873_2.c
@@ -0,0 +1,20 @@
+/* { dg-options "-O3" } */
+
+#include <stdint.h>
+
+#pragma GCC target "+nosve"
+
+void __attribute ((noipa))
+foo (uint64_t *__restrict x, uint64_t *__restrict y, int n)
+{
+  for (int i = 0; i < n; i += 4)
+    {
+      x[i] += y[i];
+      x[i + 1] += y[i + 1];
+      x[i + 2] |= y[i + 2];
+      x[i + 3] |= y[i + 3];
+    }
+}
+
+/* { dg-final { scan-assembler-not {\tld4\t} } } */
+/* { dg-final { scan-assembler-not {\tst4\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr99873_3.c 
b/gcc/testsuite/gcc.target/aarch64/pr99873_3.c
new file mode 100644
index 00000000000..ccbab6d74be
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr99873_3.c
@@ -0,0 +1,20 @@
+/* { dg-options "-O3" } */
+
+#include <stdint.h>
+
+#pragma GCC target "+nosve"
+
+void __attribute ((noipa))
+foo (uint32_t *__restrict x, uint32_t *__restrict y, int n)
+{
+  for (int i = 0; i < n; i += 4)
+    {
+      x[i] += y[i];
+      x[i + 1] += y[i + 1];
+      x[i + 2] |= y[i + 2];
+      x[i + 3] |= y[i + 3];
+    }
+}
+
+/* { dg-final { scan-assembler-times {\tld4\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst4\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr99873_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_1.c
new file mode 100644
index 00000000000..f4b95da2afa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_1.c
@@ -0,0 +1,15 @@
+/* { dg-options "-O3" } */
+
+void
+f (int *restrict x, int *restrict y, int *restrict z, int n)
+{
+  for (int i = 0; i < n; i += 3)
+    {
+      x[i] = y[i] + z[i];
+      x[i + 1] = y[i + 1] - z[i + 1];
+      x[i + 2] = y[i + 2] | z[i + 2];
+    }
+}
+
+/* { dg-final { scan-assembler-times {\tld3w\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst3w\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr99873_2.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_2.c
new file mode 100644
index 00000000000..03dc4ef930d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_2.c
@@ -0,0 +1,18 @@
+/* { dg-options "-O3" } */
+
+#include <stdint.h>
+
+void __attribute ((noipa))
+foo (uint64_t *__restrict x, uint64_t *__restrict y, int n)
+{
+  for (int i = 0; i < n; i += 4)
+    {
+      x[i] += y[i];
+      x[i + 1] += y[i + 1];
+      x[i + 2] |= y[i + 2];
+      x[i + 3] |= y[i + 3];
+    }
+}
+
+/* { dg-final { scan-assembler-times {\tld4d\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst4d\t} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr99873_3.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_3.c
new file mode 100644
index 00000000000..87a0141e508
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr99873_3.c
@@ -0,0 +1,18 @@
+/* { dg-options "-O3" } */
+
+#include <stdint.h>
+
+void __attribute ((noipa))
+foo (uint32_t *__restrict x, uint32_t *__restrict y, int n)
+{
+  for (int i = 0; i < n; i += 4)
+    {
+      x[i] += y[i];
+      x[i + 1] += y[i + 1];
+      x[i + 2] |= y[i + 2];
+      x[i + 3] |= y[i + 3];
+    }
+}
+
+/* { dg-final { scan-assembler-times {\tld4w\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tst4w\t} 1 } } */

Reply via email to