On Tue, Apr 6, 2021 at 12:05 PM Richard Sandiford via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Many of the gcc.target/sve/slp-perm*.c tests started failing
> after the introduction of separate SLP permute nodes.
> This patch adds variable-length support using a similar
> technique to vect_transform_slp_perm_load.
>
> As there, the idea is to detect when every permute mask vector
> is the same and can be generated using a regular stepped sequence.
> We can easily handle those cases for variable-length, but still
> need to restrict the general case to constant-length.
>
> Again copying vect_transform_slp_perm_load, the idea is to distinguish
> the two cases regardless of whether the length is variable or not,
> partly to increase testing coverage and partly because it avoids
> generating redundant trees.
>
> Doing this means that we can also use SLP for the two-vector
> permute in pr88834.c, which we couldn't before VEC_PERM_EXPR
> nodes were introduced.  The patch therefore makes pr88834.c
> check that we don't regress back to not using SLP and adds
> pr88834_ld3.c to check for the original problem in the PR.
>
> Tested on aarch64-linux-gnu (with and without SVE), arm-linux-gnueabihf,
> armeb-eabi and x86_64-linux-gnu.  OK to install?

OK.

Thanks,
Richard.

> Richard
>
>
> gcc/
>         PR tree-optimization/97513
>         * tree-vect-slp.c (vect_add_slp_permutation): New function,
>         split out from...
>         (vectorizable_slp_permutation): ...here.  Detect cases in which
>         all VEC_PERM_EXPRs are guaranteed to have the same stepped
>         permute vector and only generate one permute vector for that case.
>         Extend that case to handle variable-length vectors.
>
> gcc/testsuite/
>         * gcc.target/aarch64/sve/pr88834.c: Expect the vectorizer to use SLP.
>         * gcc.target/aarch64/sve/pr88834_ld3.c: New test.
> ---
>  .../gcc.target/aarch64/sve/pr88834.c          |   5 +-
>  .../gcc.target/aarch64/sve/pr88834_ld3.c      |  16 ++
>  gcc/tree-vect-slp.c                           | 218 ++++++++++++------
>  3 files changed, 167 insertions(+), 72 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/pr88834_ld3.c
>
> diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> index b0c03da3aeb..aef2104774a 100644
> --- a/gcc/tree-vect-slp.c
> +++ b/gcc/tree-vect-slp.c
> @@ -5813,6 +5813,57 @@ vect_transform_slp_perm_load (vec_info *vinfo,
>    return true;
>  }
>
> +/* Produce the next vector result for SLP permutation NODE by adding a vector
> +   statement at GSI.  If MASK_VEC is nonnull, add:
> +
> +      <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
> +
> +   otherwise add:
> +
> +      <new SSA name> = FIRST_DEF.  */
> +
> +static void
> +vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
> +                         slp_tree node, tree first_def, tree second_def,
> +                         tree mask_vec)
> +{
> +  tree vectype = SLP_TREE_VECTYPE (node);
> +
> +  /* ???  We SLP match existing vector element extracts but
> +     allow punning which we need to re-instantiate at uses
> +     but have no good way of explicitly representing.  */
> +  if (!types_compatible_p (TREE_TYPE (first_def), vectype))
> +    {
> +      gassign *conv_stmt
> +       = gimple_build_assign (make_ssa_name (vectype),
> +                              build1 (VIEW_CONVERT_EXPR, vectype, 
> first_def));
> +      vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
> +      first_def = gimple_assign_lhs (conv_stmt);
> +    }
> +  gassign *perm_stmt;
> +  tree perm_dest = make_ssa_name (vectype);
> +  if (mask_vec)
> +    {
> +      if (!types_compatible_p (TREE_TYPE (second_def), vectype))
> +       {
> +         gassign *conv_stmt
> +           = gimple_build_assign (make_ssa_name (vectype),
> +                                  build1 (VIEW_CONVERT_EXPR,
> +                                          vectype, second_def));
> +         vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
> +         second_def = gimple_assign_lhs (conv_stmt);
> +       }
> +      perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
> +                                      first_def, second_def,
> +                                      mask_vec);
> +    }
> +  else
> +    /* We need a copy here in case the def was external.  */
> +    perm_stmt = gimple_build_assign (perm_dest, first_def);
> +  vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
> +  /* Store the vector statement in NODE.  */
> +  SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
> +}
>
>  /* Vectorize the SLP permutations in NODE as specified
>     in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
> @@ -5836,15 +5887,21 @@ vectorizable_slp_permutation (vec_info *vinfo, 
> gimple_stmt_iterator *gsi,
>       arbitrary mismatches.  */
>    slp_tree child;
>    unsigned i;
> +  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> +  bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
>    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
> -    if (!vect_maybe_update_slp_op_vectype (child, vectype)
> -       || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
> -      {
> -       if (dump_enabled_p ())
> -         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                          "Unsupported lane permutation\n");
> -       return false;
> -      }
> +    {
> +      if (!vect_maybe_update_slp_op_vectype (child, vectype)
> +         || !types_compatible_p (SLP_TREE_VECTYPE (child), vectype))
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "Unsupported lane permutation\n");
> +         return false;
> +       }
> +      if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
> +       repeating_p = false;
> +    }
>
>    vec<std::pair<unsigned, unsigned> > &perm = SLP_TREE_LANE_PERMUTATION 
> (node);
>    gcc_assert (perm.length () == SLP_TREE_LANES (node));
> @@ -5854,18 +5911,58 @@ vectorizable_slp_permutation (vec_info *vinfo, 
> gimple_stmt_iterator *gsi,
>                        "vectorizing permutation");
>        for (unsigned i = 0; i < perm.length (); ++i)
>         dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
> +      if (repeating_p)
> +       dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
>        dump_printf (MSG_NOTE, "\n");
>      }
>
> -  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> -  if (!nunits.is_constant ())
> -    return false;
> -  unsigned HOST_WIDE_INT vf = 1;
> -  if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
> -    if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&vf))
> -      return false;
> -  unsigned olanes = vf * SLP_TREE_LANES (node);
> -  gcc_assert (multiple_p (olanes, nunits));
> +  /* REPEATING_P is true if every output vector is guaranteed to use the
> +     same permute vector.  We can handle that case for both variable-length
> +     and constant-length vectors, but we only handle other cases for
> +     constant-length vectors.
> +
> +     Set:
> +
> +     - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
> +       mask vector that we want to build.
> +
> +     - NCOPIES to the number of copies of PERM that we need in order
> +       to build the necessary permute mask vectors.
> +
> +     - NOUTPUTS_PER_MASK to the number of output vectors we want to create
> +       for each permute mask vector.  This is only relevant when GSI is
> +       nonnull.  */
> +  uint64_t npatterns;
> +  unsigned nelts_per_pattern;
> +  uint64_t ncopies;
> +  unsigned noutputs_per_mask;
> +  if (repeating_p)
> +    {
> +      /* We need a single permute mask vector that has the form:
> +
> +          { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
> +
> +        In other words, the original n-element permute in PERM is
> +        "unrolled" to fill a full vector.  The stepped vector encoding
> +        that we use for permutes requires 3n elements.  */
> +      npatterns = SLP_TREE_LANES (node);
> +      nelts_per_pattern = ncopies = 3;
> +      noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
> +    }
> +  else
> +    {
> +      /* Calculate every element of every permute mask vector explicitly,
> +        instead of relying on the pattern described above.  */
> +      if (!nunits.is_constant (&npatterns))
> +       return false;
> +      nelts_per_pattern = ncopies = 1;
> +      if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
> +       if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
> +         return false;
> +      noutputs_per_mask = 1;
> +    }
> +  unsigned olanes = ncopies * SLP_TREE_LANES (node);
> +  gcc_assert (repeating_p || multiple_p (olanes, nunits));
>
>    /* Compute the { { SLP operand, vector index}, lane } permutation sequence
>       from the { SLP operand, scalar lane } permutation as recorded in the
> @@ -5875,16 +5972,22 @@ vectorizable_slp_permutation (vec_info *vinfo, 
> gimple_stmt_iterator *gsi,
>    auto_vec<unsigned> active_lane;
>    vperm.create (olanes);
>    active_lane.safe_grow_cleared (SLP_TREE_CHILDREN (node).length (), true);
> -  for (unsigned i = 0; i < vf; ++i)
> +  for (unsigned i = 0; i < ncopies; ++i)
>      {
>        for (unsigned pi = 0; pi < perm.length (); ++pi)
>         {
>           std::pair<unsigned, unsigned> p = perm[pi];
>           tree vtype = SLP_TREE_VECTYPE (SLP_TREE_CHILDREN (node)[p.first]);
> -         unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
> -         unsigned vi = (active_lane[p.first] + p.second) / vnunits;
> -         unsigned vl = (active_lane[p.first] + p.second) % vnunits;
> -         vperm.quick_push (std::make_pair (std::make_pair (p.first, vi), 
> vl));
> +         if (repeating_p)
> +           vperm.quick_push ({{p.first, 0}, p.second + 
> active_lane[p.first]});
> +         else
> +           {
> +             /* We checked above that the vectors are constant-length.  */
> +             unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
> +             unsigned vi = (active_lane[p.first] + p.second) / vnunits;
> +             unsigned vl = (active_lane[p.first] + p.second) % vnunits;
> +             vperm.quick_push ({{p.first, vi}, vl});
> +           }
>         }
>        /* Advance to the next group.  */
>        for (unsigned j = 0; j < SLP_TREE_CHILDREN (node).length (); ++j)
> @@ -5896,7 +5999,10 @@ vectorizable_slp_permutation (vec_info *vinfo, 
> gimple_stmt_iterator *gsi,
>        dump_printf_loc (MSG_NOTE, vect_location, "as");
>        for (unsigned i = 0; i < vperm.length (); ++i)
>         {
> -         if (i != 0 && multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype)))
> +         if (i != 0
> +             && (repeating_p
> +                 ? multiple_p (i, npatterns)
> +                 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
>             dump_printf (MSG_NOTE, ",");
>           dump_printf (MSG_NOTE, " vops%u[%u][%u]",
>                        vperm[i].first.first, vperm[i].first.second,
> @@ -5913,11 +6019,10 @@ vectorizable_slp_permutation (vec_info *vinfo, 
> gimple_stmt_iterator *gsi,
>       somehow?  */
>    std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
>    std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
> -  unsigned int const_nunits = nunits.to_constant ();
>    unsigned int index = 0;
> -  unsigned int mask_element;
> +  poly_uint64 mask_element;
>    vec_perm_builder mask;
> -  mask.new_vector (const_nunits, const_nunits, 1);
> +  mask.new_vector (nunits, npatterns, nelts_per_pattern);
>    unsigned int count = mask.encoded_nelts ();
>    mask.quick_grow (count);
>    vec_perm_indices indices;
> @@ -5932,7 +6037,7 @@ vectorizable_slp_permutation (vec_info *vinfo, 
> gimple_stmt_iterator *gsi,
>                || second_vec == vperm[i].first)
>         {
>           second_vec = vperm[i].first;
> -         mask_element += const_nunits;
> +         mask_element += nunits;
>         }
>        else
>         {
> @@ -5948,8 +6053,7 @@ vectorizable_slp_permutation (vec_info *vinfo, 
> gimple_stmt_iterator *gsi,
>
>        if (index == count)
>         {
> -         indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
> -                             const_nunits);
> +         indices.new_vector (mask, second_vec.first == -1U ? 1 : 2, nunits);
>           bool identity_p = indices.series_p (0, 1, 0, 1);
>           if (!identity_p
>               && !can_vec_perm_const_p (TYPE_MODE (vectype), indices))
> @@ -5977,51 +6081,25 @@ vectorizable_slp_permutation (vec_info *vinfo, 
> gimple_stmt_iterator *gsi,
>               if (second_vec.first == -1U)
>                 second_vec = first_vec;
>
> -             /* Generate the permute statement if necessary.  */
> -             slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
> -             tree first_def
> -               = vect_get_slp_vect_def (first_node, first_vec.second);
> -             /* ???  We SLP match existing vector element extracts but
> -                allow punning which we need to re-instantiate at uses
> -                but have no good way of explicitely representing.  */
> -             if (!types_compatible_p (TREE_TYPE (first_def), vectype))
> -               {
> -                 gassign *conv_stmt;
> -                 conv_stmt = gimple_build_assign (make_ssa_name (vectype),
> -                                                  build1 (VIEW_CONVERT_EXPR,
> -                                                          vectype, 
> first_def));
> -                 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
> -                 first_def = gimple_assign_lhs (conv_stmt);
> -               }
> -             gassign *perm_stmt;
> -             tree perm_dest = make_ssa_name (vectype);
> +             slp_tree
> +               first_node = SLP_TREE_CHILDREN (node)[first_vec.first],
> +               second_node = SLP_TREE_CHILDREN (node)[second_vec.first];
> +
> +             tree mask_vec = NULL_TREE;
>               if (!identity_p)
> +               mask_vec = vect_gen_perm_mask_checked (vectype, indices);
> +
> +             for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
>                 {
> -                 slp_tree second_node
> -                   = SLP_TREE_CHILDREN (node)[second_vec.first];
> +                 tree first_def
> +                   = vect_get_slp_vect_def (first_node,
> +                                            first_vec.second + vi);
>                   tree second_def
> -                   = vect_get_slp_vect_def (second_node, second_vec.second);
> -                 if (!types_compatible_p (TREE_TYPE (second_def), vectype))
> -                   {
> -                     gassign *conv_stmt;
> -                     conv_stmt = gimple_build_assign (make_ssa_name 
> (vectype),
> -                                                      build1
> -                                                        (VIEW_CONVERT_EXPR,
> -                                                         vectype, 
> second_def));
> -                     vect_finish_stmt_generation (vinfo, NULL, conv_stmt, 
> gsi);
> -                     second_def = gimple_assign_lhs (conv_stmt);
> -                   }
> -                 tree mask_vec = vect_gen_perm_mask_checked (vectype, 
> indices);
> -                 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
> -                                                  first_def, second_def,
> -                                                  mask_vec);
> +                   = vect_get_slp_vect_def (second_node,
> +                                            second_vec.second + vi);
> +                 vect_add_slp_permutation (vinfo, gsi, node, first_def,
> +                                           second_def, mask_vec);
>                 }
> -             else
> -               /* We need a copy here in case the def was external.  */
> -               perm_stmt = gimple_build_assign (perm_dest, first_def);
> -             vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
> -             /* Store the vector statement in NODE.  */
> -             SLP_TREE_VEC_STMTS (node).quick_push (perm_stmt);
>             }
>
>           index = 0;
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr88834.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr88834.c
> index 7e7be4ed6fd..818291ea219 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/pr88834.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr88834.c
> @@ -11,5 +11,6 @@ f (int *restrict x, int *restrict y, int *restrict z, int n)
>      }
>  }
>
> -/* { dg-final { scan-assembler-times {\tld2w\t{z[0-9]+.s - z[0-9]+.s}, 
> p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 2 } } */
> -/* { dg-final { scan-assembler-times {\tst2w\t{z[0-9]+.s - z[0-9]+.s}, 
> p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 1 } } */
> +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.d, all\n} } } */
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+.s, p[0-7]/z, 
> \[x[0-9]+, x[0-9]+, lsl 2\]\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+.s, p[0-7], \[x[0-9]+, 
> x[0-9]+, lsl 2\]\n} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr88834_ld3.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr88834_ld3.c
> new file mode 100644
> index 00000000000..5936b878606
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr88834_ld3.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +
> +void
> +f (int *restrict x, int *restrict y, int *restrict z, int n)
> +{
> +  for (int i = 0; i < n; i += 3)
> +    {
> +      x[i] = y[i] + z[i];
> +      x[i + 1] = y[i + 1] - z[i + 1];
> +      x[i + 2] = y[i + 2] | z[i + 2];
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-times {\tld3w\t{z[0-9]+.s - z[0-9]+.s}, 
> p[0-7]/z, \[x[0-9]+, x[0-9]+, lsl 2\]\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tst3w\t{z[0-9]+.s - z[0-9]+.s}, 
> p[0-7], \[x[0-9]+, x[0-9]+, lsl 2\]\n} 1 } } */

Reply via email to