The following makes sure to use a VEC_PERM SLP node to produce
lane duplications for non-grouped SLP loads as those are later
not lowered by load permutation lowering.
For some reason gcc.dg/vect/pr106081.c now fails permute optimization,
in particular it no longer elides the vector reversal for the reduction.
Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
* tree-vect-slp.cc (vect_build_slp_tree_2): Use a VEC_PERM
SLP node to duplicate lanes for non-grouped loads.
* gcc.dg/vect/pr106081.c: Adjust.
---
gcc/testsuite/gcc.dg/vect/pr106081.c | 2 +-
gcc/tree-vect-slp.cc | 38 +++++++++++++++++++++++++++-
2 files changed, 38 insertions(+), 2 deletions(-)
diff --git a/gcc/testsuite/gcc.dg/vect/pr106081.c
b/gcc/testsuite/gcc.dg/vect/pr106081.c
index 8f97af2d642..1864320c803 100644
--- a/gcc/testsuite/gcc.dg/vect/pr106081.c
+++ b/gcc/testsuite/gcc.dg/vect/pr106081.c
@@ -30,4 +30,4 @@ test(double *k)
}
/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */
-/* { dg-final { scan-tree-dump-times "VEC_PERM" 4 "optimized" { target
x86_64-*-* i?86-*-* } } } */
+/* { dg-final { scan-tree-dump-times "VEC_PERM" 5 "optimized" { target
x86_64-*-* i?86-*-* } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index af00c5e35dd..b34064103bd 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -2088,7 +2088,43 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
}
else
{
- SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
+ if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
+ {
+ /* Do not use SLP_TREE_LOAD_PERMUTATION for non-grouped
+ accesses. Instead when duplicated do so via a
+ VEC_PERM node. */
+ if (!any_permute)
+ load_permutation.release ();
+ else
+ {
+ gcc_assert (group_size != 1);
+ vec<stmt_vec_info> stmts2;
+ stmts2.create (1);
+ stmts2.quick_push (stmt_info);
+ bool matches2;
+ slp_tree unperm_load
+ = vect_build_slp_tree (vinfo, stmts2, 1,
+ &this_max_nunits, &matches2,
+ limit, &this_tree_size, bst_map);
+ gcc_assert (unperm_load);
+ lane_permutation_t lperm;
+ lperm.create (group_size);
+ for (unsigned j = 0; j < load_permutation.length (); ++j)
+ {
+ gcc_assert (load_permutation[j] == 0);
+ lperm.quick_push (std::make_pair (0, 0));
+ }
+ SLP_TREE_CODE (node) = VEC_PERM_EXPR;
+ SLP_TREE_CHILDREN (node).safe_push (unperm_load);
+ SLP_TREE_LANE_PERMUTATION (node) = lperm;
+ load_permutation.release ();
+ *max_nunits = this_max_nunits;
+ (*tree_size)++;
+ return node;
+ }
+ }
+ else
+ SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
return node;
}
}
--
2.43.0