On Tue, Apr 6, 2021 at 12:05 PM Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > Many of the gcc.target/sve/slp-perm*.c tests started failing > after the introduction of separate SLP permute nodes. > This patch adds variable-length support using a similar > technique to vect_transform_slp_perm_load. > > As there, the idea is to detect when every permute mask vector > is the same and can be generated using a regular stepped sequence. > We can easily handle those cases for variable-length, but still > need to restrict the general case to constant-length. > > Again copying vect_transform_slp_perm_load, the idea is to distinguish > the two cases regardless of whether the length is variable or not, > partly to increase testing coverage and partly because it avoids > generating redundant trees. > > Doing this means that we can also use SLP for the two-vector > permute in pr88834.c, which we couldn't before VEC_PERM_EXPR > nodes were introduced. The patch therefore makes pr88834.c > check that we don't regress back to not using SLP and adds > pr88834_ld3.c to check for the original problem in the PR. > > Tested on aarch64-linux-gnu (with and without SVE), arm-linux-gnueabihf, > armeb-eabi and x86_64-linux-gnu. OK to install?
OK. Thanks, Richard. > Richard > > > gcc/ > PR tree-optimization/97513 > * tree-vect-slp.c (vect_add_slp_permutation): New function, > split out from... > (vectorizable_slp_permutation): ...here. Detect cases in which > all VEC_PERM_EXPRs are guaranteed to have the same stepped > permute vector and only generate one permute vector for that case. > Extend that case to handle variable-length vectors. > > gcc/testsuite/ > * gcc.target/aarch64/sve/pr88834.c: Expect the vectorizer to use SLP. > * gcc.target/aarch64/sve/pr88834_ld3.c: New test. > --- > .../gcc.target/aarch64/sve/pr88834.c | 5 +- > .../gcc.target/aarch64/sve/pr88834_ld3.c | 16 ++ > gcc/tree-vect-slp.c | 218 ++++++++++++------ > 3 files changed, 167 insertions(+), 72 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/pr88834_ld3.c > > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c > index b0c03da3aeb..aef2104774a 100644 > --- a/gcc/tree-vect-slp.c > +++ b/gcc/tree-vect-slp.c > @@ -5813,6 +5813,57 @@ vect_transform_slp_perm_load (vec_info *vinfo, > return true; > } > > +/* Produce the next vector result for SLP permutation NODE by adding a vector > + statement at GSI. If MASK_VEC is nonnull, add: > + > + <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC> > + > + otherwise add: > + > + <new SSA name> = FIRST_DEF. */ > + > +static void > +vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi, > + slp_tree node, tree first_def, tree second_def, > + tree mask_vec) > +{ > + tree vectype = SLP_TREE_VECTYPE (node); > + > + /* ??? We SLP match existing vector element extracts but > + allow punning which we need to re-instantiate at uses > + but have no good way of explicitly representing. */ > + if (!types_compatible_p (TREE_TYPE (first_def), vectype)) > + { > + gassign *conv_stmt > + = gimple_build_assign (make_ssa_name (vectype), > + build1 (VIEW_CONVERT_EXPR, vectype, > first_def)); > + vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi); > + first_def = gimple_assign_lhs (conv_stmt); > + } > + gassign *perm_stmt; > + tree perm_dest = make_ssa_name (vectype); > + if (mask_vec) > + { > + if (!types_compatible_p (TREE_TYPE (second_def), vectype)) > + { > + gassign *conv_stmt > + = gimple_build_assign (make_ssa_name (vectype), > + build1 (VIEW_CONVERT_EXPR, > + vectype, second_def)); > + vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi); > + second_def = gimple_assign_lhs (conv_stmt); > + } > + perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR, > + first_def, second_def, > + mask_vec); > + } > + else > + /* We need a copy here in case the def was external. */ > + perm_stmt = gimple_build_assign (perm_dest, first_def); > + vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi); > + /* Store the vector statement in NODE. */ > + SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt); > +} > > /* Vectorize the SLP permutations in NODE as specified > in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP > @@ -5836,15 +5887,21 @@ vectorizable_slp_permutation (vec_info *vinfo, > gimple_stmt_iterator *gsi, > arbitrary mismatches. */ > slp_tree child; > unsigned i; > + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); > + bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node)); > FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) > - if (!vect_maybe_update_slp_op_vectype (child, vectype) > - || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype)) > - { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "Unsupported lane permutation\n"); > - return false; > - } > + { > + if (!vect_maybe_update_slp_op_vectype (child, vectype) > + || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype)) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "Unsupported lane permutation\n"); > + return false; > + } > + if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node)) > + repeating_p = false; > + } > > vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION > (node); > gcc_assert (perm.length () == SLP_TREE_LANES (node)); > @@ -5854,18 +5911,58 @@ vectorizable_slp_permutation (vec_info *vinfo, > gimple_stmt_iterator *gsi, > "vectorizing permutation"); > for (unsigned i = 0; i < perm.length (); ++i) > dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second); > + if (repeating_p) > + dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node)); > dump_printf (MSG_NOTE, "\n"); > } > > - poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); > - if (!nunits.is_constant ()) > - return false; > - unsigned HOST_WIDE_INT vf = 1; > - if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo)) > - if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf)) > - return false; > - unsigned olanes = vf * SLP_TREE_LANES (node); > - gcc_assert (multiple_p (olanes, nunits)); > + /* REPEATING_P is true if every output vector is guaranteed to use the > + same permute vector. We can handle that case for both variable-length > + and constant-length vectors, but we only handle other cases for > + constant-length vectors. > + > + Set: > + > + - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute > + mask vector that we want to build. > + > + - NCOPIES to the number of copies of PERM that we need in order > + to build the necessary permute mask vectors. > + > + - NOUTPUTS_PER_MASK to the number of output vectors we want to create > + for each permute mask vector. This is only relevant when GSI is > + nonnull. */ > + uint64_t npatterns; > + unsigned nelts_per_pattern; > + uint64_t ncopies; > + unsigned noutputs_per_mask; > + if (repeating_p) > + { > + /* We need a single permute mask vector that has the form: > + > + { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... } > + > + In other words, the original n-element permute in PERM is > + "unrolled" to fill a full vector. The stepped vector encoding > + that we use for permutes requires 3n elements. */ > + npatterns = SLP_TREE_LANES (node); > + nelts_per_pattern = ncopies = 3; > + noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node); > + } > + else > + { > + /* Calculate every element of every permute mask vector explicitly, > + instead of relying on the pattern described above. */ > + if (!nunits.is_constant (&npatterns)) > + return false; > + nelts_per_pattern = ncopies = 1; > + if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo)) > + if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies)) > + return false; > + noutputs_per_mask = 1; > + } > + unsigned olanes = ncopies * SLP_TREE_LANES (node); > + gcc_assert (repeating_p || multiple_p (olanes, nunits)); > > /* Compute the { { SLP operand, vector index}, lane } permutation sequence > from the { SLP operand, scalar lane } permutation as recorded in the > @@ -5875,16 +5972,22 @@ vectorizable_slp_permutation (vec_info *vinfo, > gimple_stmt_iterator *gsi, > auto_vec<unsigned> active_lane; > vperm.create (olanes); > active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true); > - for (unsigned i = 0; i < vf; ++i) > + for (unsigned i = 0; i < ncopies; ++i) > { > for (unsigned pi = 0; pi < perm.length (); ++pi) > { > std::pair<unsigned, unsigned> p = perm[pi]; > tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]); > - unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant (); > - unsigned vi = (active_lane[p.first] + p.second) / vnunits; > - unsigned vl = (active_lane[p.first] + p.second) % vnunits; > - vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), > vl)); > + if (repeating_p) > + vperm.quick_push ({{p.first, 0}, p.second + > active_lane[p.first]}); > + else > + { > + /* We checked above that the vectors are constant-length. */ > + unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant (); > + unsigned vi = (active_lane[p.first] + p.second) / vnunits; > + unsigned vl = (active_lane[p.first] + p.second) % vnunits; > + vperm.quick_push ({{p.first, vi}, vl}); > + } > } > /* Advance to the next group. */ > for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j) > @@ -5896,7 +5999,10 @@ vectorizable_slp_permutation (vec_info *vinfo, > gimple_stmt_iterator *gsi, > dump_printf_loc (MSG_NOTE, vect_location, "as"); > for (unsigned i = 0; i < vperm.length (); ++i) > { > - if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))) > + if (i != 0 > + && (repeating_p > + ? multiple_p (i, npatterns) > + : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))) > dump_printf (MSG_NOTE, ","); > dump_printf (MSG_NOTE, " vops%u[%u][%u]", > vperm[i].first.first, vperm[i].first.second, > @@ -5913,11 +6019,10 @@ vectorizable_slp_permutation (vec_info *vinfo, > gimple_stmt_iterator *gsi, > somehow? */ > std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U); > std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U); > - unsigned int const_nunits = nunits.to_constant (); > unsigned int index = 0; > - unsigned int mask_element; > + poly_uint64 mask_element; > vec_perm_builder mask; > - mask.new_vector (const_nunits, const_nunits, 1); > + mask.new_vector (nunits, npatterns, nelts_per_pattern); > unsigned int count = mask.encoded_nelts (); > mask.quick_grow (count); > vec_perm_indices indices; > @@ -5932,7 +6037,7 @@ vectorizable_slp_permutation (vec_info *vinfo, > gimple_stmt_iterator *gsi, > || second_vec == vperm[i].first) > { > second_vec = vperm[i].first; > - mask_element += const_nunits; > + mask_element += nunits; > } > else > { > @@ -5948,8 +6053,7 @@ vectorizable_slp_permutation (vec_info *vinfo, > gimple_stmt_iterator *gsi, > > if (index == count) > { > - indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, > - const_nunits); > + indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits); > bool identity_p = indices.series_p (0, 1, 0, 1); > if (!identity_p > && !can_vec_perm_const_p (TYPE_MODE (vectype), indices)) > @@ -5977,51 +6081,25 @@ vectorizable_slp_permutation (vec_info *vinfo, > gimple_stmt_iterator *gsi, > if (second_vec.first == -1U) > second_vec = first_vec; > > - /* Generate the permute statement if necessary. */ > - slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first]; > - tree first_def > - = vect_get_slp_vect_def (first_node, first_vec.second); > - /* ??? We SLP match existing vector element extracts but > - allow punning which we need to re-instantiate at uses > - but have no good way of explicitely representing. */ > - if (!types_compatible_p (TREE_TYPE (first_def), vectype)) > - { > - gassign *conv_stmt; > - conv_stmt = gimple_build_assign (make_ssa_name (vectype), > - build1 (VIEW_CONVERT_EXPR, > - vectype, > first_def)); > - vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi); > - first_def = gimple_assign_lhs (conv_stmt); > - } > - gassign *perm_stmt; > - tree perm_dest = make_ssa_name (vectype); > + slp_tree > + first_node = SLP_TREE_CHILDREN (node)[first_vec.first], > + second_node = SLP_TREE_CHILDREN (node)[second_vec.first]; > + > + tree mask_vec = NULL_TREE; > if (!identity_p) > + mask_vec = vect_gen_perm_mask_checked (vectype, indices); > + > + for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi) > { > - slp_tree second_node > - = SLP_TREE_CHILDREN (node)[second_vec.first]; > + tree first_def > + = vect_get_slp_vect_def (first_node, > + first_vec.second + vi); > tree second_def > - = vect_get_slp_vect_def (second_node, second_vec.second); > - if (!types_compatible_p (TREE_TYPE (second_def), vectype)) > - { > - gassign *conv_stmt; > - conv_stmt = gimple_build_assign (make_ssa_name > (vectype), > - build1 > - (VIEW_CONVERT_EXPR, > - vectype, > second_def)); > - vect_finish_stmt_generation (vinfo, NULL, conv_stmt, > gsi); > - second_def = gimple_assign_lhs (conv_stmt); > - } > - tree mask_vec = vect_gen_perm_mask_checked (vectype, > indices); > - perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR, > - first_def, second_def, > - mask_vec); > + = vect_get_slp_vect_def (second_node, > + second_vec.second + vi); > + vect_add_slp_permutation (vinfo, gsi, node, first_def, > + second_def, mask_vec); > } > - else > - /* We need a copy here in case the def was external. */ > - perm_stmt = gimple_build_assign (perm_dest, first_def); > - vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi); > - /* Store the vector statement in NODE. */ > - SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt); > } > > index = 0; > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr88834.c > b/gcc/testsuite/gcc.target/aarch64/sve/pr88834.c > index 7e7be4ed6fd..818291ea219 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/pr88834.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr88834.c > @@ -11,5 +11,6 @@ f (int *restrict x, int *restrict y, int *restrict z, int n) > } > } > > -/* { dg-final { scan-assembler-times {\tld2w\t{z[0-9]+.s - z[0-9]+.s}, > p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 2 } } */ > -/* { dg-final { scan-assembler-times {\tst2w\t{z[0-9]+.s - z[0-9]+.s}, > p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 1 } } */ > +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.d, all\n} } } */ > +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+.s, p[0-7]/z, > \[x[0-9]+, x[0-9]+, lsl 2\]\n} 2 } } */ > +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+.s, p[0-7], \[x[0-9]+, > x[0-9]+, lsl 2\]\n} 1 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr88834_ld3.c > b/gcc/testsuite/gcc.target/aarch64/sve/pr88834_ld3.c > new file mode 100644 > index 00000000000..5936b878606 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr88834_ld3.c > @@ -0,0 +1,16 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3" } */ > + > +void > +f (int *restrict x, int *restrict y, int *restrict z, int n) > +{ > + for (int i = 0; i < n; i += 3) > + { > + x[i] = y[i] + z[i]; > + x[i + 1] = y[i + 1] - z[i + 1]; > + x[i + 2] = y[i + 2] | z[i + 2]; > + } > +} > + > +/* { dg-final { scan-assembler-times {\tld3w\t{z[0-9]+.s - z[0-9]+.s}, > p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 2 } } */ > +/* { dg-final { scan-assembler-times {\tst3w\t{z[0-9]+.s - z[0-9]+.s}, > p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 1 } } */