https://gcc.gnu.org/g:d34cda720988674bcf8a24267c9e1ec61335d6de

commit r15-3509-gd34cda720988674bcf8a24267c9e1ec61335d6de
Author: Richard Biener <rguent...@suse.de>
Date:   Fri Sep 29 12:54:17 2023 +0200

    Handle non-grouped stores as single-lane SLP
    
    The following enables single-lane loop SLP discovery for non-grouped stores
    and adjusts vectorizable_store to properly handle those.
    
    For gfortran.dg/vect/vect-8.f90 we vectorize one additional loop,
    not running into the "not falling back to strided accesses" bail-out.
    I have not investigated in detail.
    
    There is a set of i386 target assembler test FAILs,
    gcc.target/i386/pr88531-2[bc].c in particular fail because the
    target cannot identify SLP emulated gathers, see another mail from me.
    Others need adjustment, I've adjusted one with this patch only.
    In particular there are gcc.target/i386/cond_op_fma_*-1.c FAILs
    that are because we no longer fold a VEC_COND_EXPR during the
    region value-numbering we do after vectorization since we
    code-generate a { 0.0, ... } constant in the VEC_COND_EXPR now
    instead of having a separate statement which gets forwarded
    and then triggers folding.  This leads to slightly different
    code generation.  The solution is probably to use gimple_build
    when building stmts or, in this case, directly emit .COND_FMA
    instead of .FMA and a VEC_COND_EXPR.
    
    gcc.dg/vect/slp-19a.c mixes contiguous 8-lane SLP with a single
    lane contiguous store from one lane of the 8-lane load and we
    expect to use load-lanes for this reason but the heuristic for
    forcing single-lane rediscovery as implemented doesn't trigger
    here as it treats both SLP instances separately.  FAILs on RISC-V.
    
    gcc.dg/vect/slp-19c.c shows we fail to implement an interleaving
    scheme for group_size 12 (by extension using the group_size 3
    scheme to reduce to 4 lanes and then continue with a pow2 scheme
    would work);  we are also not considering load-lanes because of
    the above reason, but aarch64 cannot do ld12.  FAILs on AARCH64
    (load requires three vectors) and x86_64.
    
    gcc.dg/vect/slp-19c.c FAILs with variable-length vectors because
    of "SLP induction not supported for variable-length vectors".
    
    gcc.target/aarch64/pr110449.c will FAIL because the (contested)
    optimization in r14-2367-g224fd59b2dc8a5 was only applied to
    loop-vect but not SLP vect.  I'll leave it to target maintainers
    to either XFAIL (the optimization is bad) or remove the test.
    
            * tree-vect-slp.cc (vect_analyze_slp): Perform single-lane
            loop SLP discovery for non-grouped stores.  Move check on the root
            for re-doing SLP analysis with a single lane for load/store-lanes
            earlier and make sure we are dealing with a grouped access.
            * tree-vect-stmts.cc (vectorizable_store): Always set
            vec_num for SLP.
    
            * gcc.dg/vect/O3-pr39675-2.c: Adjust expected number of SLP.
            * gcc.dg/vect/fast-math-vect-call-1.c: Likewise.
            * gcc.dg/vect/no-scevccp-slp-31.c: Likewise.
            * gcc.dg/vect/slp-12b.c: Likewise.
            * gcc.dg/vect/slp-12c.c: Likewise.
            * gcc.dg/vect/slp-19a.c: Likewise.
            * gcc.dg/vect/slp-19b.c: Likewise.
            * gcc.dg/vect/slp-4-big-array.c: Likewise.
            * gcc.dg/vect/slp-4.c: Likewise.
            * gcc.dg/vect/slp-5.c: Likewise.
            * gcc.dg/vect/slp-7.c: Likewise.
            * gcc.dg/vect/slp-perm-7.c: Likewise.
            * gcc.dg/vect/slp-37.c: Likewise.
            * gcc.dg/vect/fast-math-vect-call-2.c: Likewise.
            * gcc.dg/vect/slp-26.c: RISC-V can now SLP two instances.
            * gcc.dg/vect/vect-outer-slp-3.c: Disable vectorization of
            initialization loop.
            * gcc.dg/vect/slp-reduc-5.c: Likewise.
            * gcc.dg/vect/no-scevccp-outer-12.c: Un-XFAIL.  SLP can handle
            inner loop inductions with multiple vector stmt copies.
            * gfortran.dg/vect/vect-8.f90: Adjust expected number of
            vectorized loops.
            * gcc.target/i386/vectorize1.c: Adjust what we scan for.

Diff:
---
 gcc/testsuite/gcc.dg/vect/O3-pr39675-2.c          |  2 +-
 gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c |  2 +-
 gcc/testsuite/gcc.dg/vect/fast-math-vect-call-2.c |  2 +-
 gcc/testsuite/gcc.dg/vect/no-scevccp-outer-12.c   |  3 +-
 gcc/testsuite/gcc.dg/vect/no-scevccp-slp-31.c     |  5 ++-
 gcc/testsuite/gcc.dg/vect/slp-12b.c               |  2 +-
 gcc/testsuite/gcc.dg/vect/slp-12c.c               |  2 +-
 gcc/testsuite/gcc.dg/vect/slp-19a.c               |  2 +-
 gcc/testsuite/gcc.dg/vect/slp-19b.c               |  2 +-
 gcc/testsuite/gcc.dg/vect/slp-26.c                |  3 +-
 gcc/testsuite/gcc.dg/vect/slp-37.c                |  2 +-
 gcc/testsuite/gcc.dg/vect/slp-4-big-array.c       |  2 +-
 gcc/testsuite/gcc.dg/vect/slp-4.c                 |  2 +-
 gcc/testsuite/gcc.dg/vect/slp-5.c                 |  2 +-
 gcc/testsuite/gcc.dg/vect/slp-7.c                 |  4 +-
 gcc/testsuite/gcc.dg/vect/slp-perm-7.c            |  2 +-
 gcc/testsuite/gcc.dg/vect/slp-reduc-5.c           |  3 +-
 gcc/testsuite/gcc.dg/vect/vect-outer-slp-3.c      |  1 +
 gcc/testsuite/gcc.target/i386/vectorize1.c        |  4 +-
 gcc/testsuite/gfortran.dg/vect/vect-8.f90         |  2 +-
 gcc/tree-vect-slp.cc                              | 45 ++++++++++++++++++-----
 gcc/tree-vect-stmts.cc                            | 11 ++++--
 22 files changed, 69 insertions(+), 36 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/O3-pr39675-2.c 
b/gcc/testsuite/gcc.dg/vect/O3-pr39675-2.c
index c3f0f6dc1be..ddaac56cc0b 100644
--- a/gcc/testsuite/gcc.dg/vect/O3-pr39675-2.c
+++ b/gcc/testsuite/gcc.dg/vect/O3-pr39675-2.c
@@ -27,5 +27,5 @@ foo ()
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target 
vect_strided4 } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { 
target vect_strided4 } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target vect_strided4 } } } */
   
diff --git a/gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c 
b/gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c
index ad22f6e82b3..6c9b7c37b6e 100644
--- a/gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c
+++ b/gcc/testsuite/gcc.dg/vect/fast-math-vect-call-1.c
@@ -101,4 +101,4 @@ main ()
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" { target { 
vect_call_copysignf && vect_call_sqrtf } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { 
target { { vect_call_copysignf && vect_call_sqrtf } && vect_perm3_int } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { 
target { { vect_call_copysignf && vect_call_sqrtf } && vect_perm3_int } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/fast-math-vect-call-2.c 
b/gcc/testsuite/gcc.dg/vect/fast-math-vect-call-2.c
index d51e17ff656..ed42a21cedc 100644
--- a/gcc/testsuite/gcc.dg/vect/fast-math-vect-call-2.c
+++ b/gcc/testsuite/gcc.dg/vect/fast-math-vect-call-2.c
@@ -132,4 +132,4 @@ main ()
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 6 "vect" { target 
vect_call_lrint } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { 
target vect_call_lrint } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 6 "vect" { 
target vect_call_lrint } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-12.c 
b/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-12.c
index c2d3031bc0c..6ace6ad022e 100644
--- a/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-12.c
+++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-12.c
@@ -46,5 +46,4 @@ int main (void)
   return 0;
 }
 
-/* Until we support multiple types in the inner loop  */
-/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED." 1 "vect" { xfail 
{ ! { aarch64*-*-* riscv*-*-* } } } } } */
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED." 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-slp-31.c 
b/gcc/testsuite/gcc.dg/vect/no-scevccp-slp-31.c
index 22817a57ef8..f6ac5f60298 100644
--- a/gcc/testsuite/gcc.dg/vect/no-scevccp-slp-31.c
+++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-slp-31.c
@@ -53,6 +53,7 @@ int main (void)
   return 0;
 }
 
+/* We cannot handle grouped accesses in outer loops.  */
+/* { dg-final { scan-tree-dump-not "OUTER LOOP VECTORIZED" "vect" } } */
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect"  } 
} */
-  
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect"  } 
} */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-12b.c 
b/gcc/testsuite/gcc.dg/vect/slp-12b.c
index e2ea24d6c53..8e06e3bfa93 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-12b.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-12b.c
@@ -47,6 +47,6 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target { 
vect_strided2 && vect_int_mult } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect"  { target { 
! { vect_strided2 && vect_int_mult } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect"  { 
target { vect_strided2 && vect_int_mult } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect"  { 
target { vect_strided2 && vect_int_mult } } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect"  { 
target { ! { vect_strided2 && vect_int_mult } } } } } */
   
diff --git a/gcc/testsuite/gcc.dg/vect/slp-12c.c 
b/gcc/testsuite/gcc.dg/vect/slp-12c.c
index 9c48dff3bf4..a3536e3053b 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-12c.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-12c.c
@@ -49,5 +49,5 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target { 
vect_int_mult } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect"  { target { 
! vect_int_mult } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { 
target vect_int_mult } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target vect_int_mult } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { 
target { ! vect_int_mult } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-19a.c 
b/gcc/testsuite/gcc.dg/vect/slp-19a.c
index ca7a0a8e456..6c21416046d 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-19a.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-19a.c
@@ -57,5 +57,5 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target 
vect_strided8 } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { 
! vect_strided8 } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { 
target vect_strided8 } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target vect_strided8 } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { 
target { ! vect_strided8} } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-19b.c 
b/gcc/testsuite/gcc.dg/vect/slp-19b.c
index 4d53ac698db..10b84aab3b5 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-19b.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-19b.c
@@ -54,5 +54,5 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target 
vect_strided4 } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { 
! vect_strided4 } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { 
target vect_strided4 } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target vect_strided4 } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { 
target { ! vect_strided4 } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-26.c 
b/gcc/testsuite/gcc.dg/vect/slp-26.c
index cfb763bf519..cdb5d9c694b 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-26.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-26.c
@@ -50,4 +50,5 @@ int main (void)
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { 
! { mips_msa || { amdgcn-*-* || { riscv_v || loongarch_sx } } } } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { 
mips_msa || { amdgcn-*-* || { riscv_v || loongarch_sx } } } } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { 
target { ! { mips_msa || { amdgcn-*-* || { riscv_v || loongarch_sx } } } } } } 
} */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { 
target { mips_msa || { amdgcn-*-* || { riscv_v || loongarch_sx } } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { 
target { mips_msa || { amdgcn-*-* || loongarch_sx } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target riscv_v } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-37.c 
b/gcc/testsuite/gcc.dg/vect/slp-37.c
index caee2bb508f..8a430e63847 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-37.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-37.c
@@ -60,4 +60,4 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target 
vect_hw_misalign } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { 
target vect_hw_misalign } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target vect_hw_misalign } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-4-big-array.c 
b/gcc/testsuite/gcc.dg/vect/slp-4-big-array.c
index fcda45ff368..f738a613324 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-4-big-array.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-4-big-array.c
@@ -131,5 +131,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect"  } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect"  } 
} */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 6 "vect"  } 
} */
 
diff --git a/gcc/testsuite/gcc.dg/vect/slp-4.c 
b/gcc/testsuite/gcc.dg/vect/slp-4.c
index 29e741df02b..1ecad7415ef 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-4.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-4.c
@@ -125,5 +125,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect"  } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect"  } 
} */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 6 "vect"  } 
} */
   
diff --git a/gcc/testsuite/gcc.dg/vect/slp-5.c 
b/gcc/testsuite/gcc.dg/vect/slp-5.c
index 6d51f6a7323..484898c2afd 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-5.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-5.c
@@ -124,5 +124,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect"  } 
} */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 5 "vect"  } 
} */
   
diff --git a/gcc/testsuite/gcc.dg/vect/slp-7.c 
b/gcc/testsuite/gcc.dg/vect/slp-7.c
index 2845a99dedf..f83fdc96d16 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-7.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-7.c
@@ -125,6 +125,6 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect"  { target 
vect_short_mult } } }*/
 /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect"  { target { 
! { vect_short_mult } } } } }*/
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect"  { 
target vect_short_mult } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect"  { 
target { ! { vect_short_mult } } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 5 "vect"  { 
target vect_short_mult } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect"  { 
target { ! { vect_short_mult } } } } } */
  
diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-7.c 
b/gcc/testsuite/gcc.dg/vect/slp-perm-7.c
index f15736ef729..9c522ba4705 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-perm-7.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-7.c
@@ -97,6 +97,6 @@ int main (int argc, const char* argv[])
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target 
vect_perm } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { 
target { vect_perm3_int || vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target { vect_perm3_int || vect_load_lanes } } } } */
 /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } 
} } */
 /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } 
} } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c 
b/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c
index 11f5a7414cf..0cde79d9e49 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-5.c
@@ -36,6 +36,7 @@ int main (void)
 
   check_vect ();
 
+#pragma GCC novector
   for (i = 0; i < N; i++)
     c[i] = (i+3) * -1;
 
@@ -44,6 +45,6 @@ int main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail 
vect_no_int_min_max } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail 
vect_no_int_min_max } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { 
xfail vect_no_int_min_max } } } */
 /* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 0 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-outer-slp-3.c 
b/gcc/testsuite/gcc.dg/vect/vect-outer-slp-3.c
index 3dce51426b5..d315db5632b 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-outer-slp-3.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-outer-slp-3.c
@@ -30,6 +30,7 @@ int main ()
 {
   check_vect ();
 
+#pragma GCC novector
   for (int i = 0; i < 40; ++i)
     image[i] = 1.;
 
diff --git a/gcc/testsuite/gcc.target/i386/vectorize1.c 
b/gcc/testsuite/gcc.target/i386/vectorize1.c
index f3b9bfba382..14a8c5f28b3 100644
--- a/gcc/testsuite/gcc.target/i386/vectorize1.c
+++ b/gcc/testsuite/gcc.target/i386/vectorize1.c
@@ -1,6 +1,6 @@
 /* PR middle-end/28915 */
 /* { dg-do compile } */
-/* { dg-options "-msse -O2 -ftree-vectorize -fdump-tree-vect" } */
+/* { dg-options "-msse -O2 -ftree-vectorize -fdump-tree-vect-optimized" } */
 
 extern char lanip[3][40];
 typedef struct
@@ -17,4 +17,4 @@ int set_names (void)
       tt1.t[ln] = lanip[1];
 }
 
-/* { dg-final { scan-tree-dump "vect_cst" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" } } */
diff --git a/gcc/testsuite/gfortran.dg/vect/vect-8.f90 
b/gcc/testsuite/gfortran.dg/vect/vect-8.f90
index 557a523e2bd..2a3fa90740e 100644
--- a/gcc/testsuite/gfortran.dg/vect/vect-8.f90
+++ b/gcc/testsuite/gfortran.dg/vect/vect-8.f90
@@ -708,5 +708,5 @@ END SUBROUTINE kernel
 
 ! { dg-final { scan-tree-dump-times "vectorized 2\[56\] loops" 1 "vect" { 
target aarch64_sve } } }
 ! { dg-final { scan-tree-dump-times "vectorized 2\[45\] loops" 1 "vect" { 
target { aarch64*-*-* && { ! aarch64_sve } } } } }
-! { dg-final { scan-tree-dump-times "vectorized 2\[234\] loops" 1 "vect" { 
target { vect_intdouble_cvt && { ! aarch64*-*-* } } } } }
+! { dg-final { scan-tree-dump-times "vectorized 2\[345\] loops" 1 "vect" { 
target { vect_intdouble_cvt && { ! aarch64*-*-* } } } } }
 ! { dg-final { scan-tree-dump-times "vectorized 17 loops" 1 "vect" { target { 
{ ! vect_intdouble_cvt } && { ! aarch64*-*-* } } } } }
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index b6839c7707b..3d2973698e2 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -4548,6 +4548,7 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
 opt_result
 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 {
+  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   unsigned int i;
   stmt_vec_info first_element;
   slp_instance instance;
@@ -4564,6 +4565,28 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
max_tree_size)
     vect_analyze_slp_instance (vinfo, bst_map, first_element,
                               slp_inst_kind_store, max_tree_size, &limit);
 
+  /* For loops also start SLP discovery from non-grouped stores.  */
+  if (loop_vinfo)
+    {
+      data_reference_p dr;
+      FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
+       if (DR_IS_WRITE (dr))
+         {
+           stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
+           /* Grouped stores are already handled above.  */
+           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+             continue;
+           vec<stmt_vec_info> stmts;
+           vec<stmt_vec_info> roots = vNULL;
+           vec<tree> remain = vNULL;
+           stmts.create (1);
+           stmts.quick_push (stmt_info);
+           vect_build_slp_instance (vinfo, slp_inst_kind_store,
+                                    stmts, roots, remain, max_tree_size,
+                                    &limit, bst_map, NULL);
+         }
+    }
+
   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
     {
       for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
@@ -4750,6 +4773,18 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
max_tree_size)
          int group_size = SLP_TREE_LANES (slp_root);
          tree vectype = SLP_TREE_VECTYPE (slp_root);
 
+         stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
+         gimple *rep = STMT_VINFO_STMT (rep_info);
+         bool masked = (is_gimple_call (rep)
+                        && gimple_call_internal_p (rep)
+                        && internal_fn_mask_index
+                             (gimple_call_internal_fn (rep)) != -1);
+         if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
+             || slp_root->ldst_lanes
+             || (vect_store_lanes_supported (vectype, group_size, masked)
+                 == IFN_LAST))
+           continue;
+
          auto_vec<slp_tree> loads;
          hash_set<slp_tree> visited;
          vect_gather_slp_loads (loads, slp_root, visited);
@@ -4773,17 +4808,9 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
max_tree_size)
                  }
            }
 
-         gimple *rep = STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE (slp_root));
-         bool masked = (is_gimple_call (rep)
-                        && gimple_call_internal_p (rep)
-                        && internal_fn_mask_index
-                             (gimple_call_internal_fn (rep)) != -1);
          /* If the loads and stores can use load/store-lanes force re-discovery
             with single lanes.  */
-         if (loads_permuted
-             && !slp_root->ldst_lanes
-             && vect_store_lanes_supported (vectype, group_size, masked)
-             != IFN_LAST)
+         if (loads_permuted)
            {
              bool can_use_lanes = true;
              FOR_EACH_VEC_ELT (loads, j, load_node)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 25b120c158e..f6c5b7a7e87 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -8355,10 +8355,12 @@ vectorizable_store (vec_info *vinfo,
       return vectorizable_scan_store (vinfo, stmt_info, gsi, vec_stmt, 
ncopies);
     }
 
-  if (grouped_store)
+  if (grouped_store || slp)
     {
       /* FORNOW */
-      gcc_assert (!loop || !nested_in_vect_loop_p (loop, stmt_info));
+      gcc_assert (!grouped_store
+                 || !loop
+                 || !nested_in_vect_loop_p (loop, stmt_info));
 
       if (slp)
         {
@@ -8367,8 +8369,9 @@ vectorizable_store (vec_info *vinfo,
              group.  */
           vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
          first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
-         gcc_assert (DR_GROUP_FIRST_ELEMENT (first_stmt_info)
-                     == first_stmt_info);
+         gcc_assert (!STMT_VINFO_GROUPED_ACCESS (first_stmt_info)
+                     || (DR_GROUP_FIRST_ELEMENT (first_stmt_info)
+                         == first_stmt_info));
          first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
          op = vect_get_store_rhs (first_stmt_info);
         }

Reply via email to