On Mon, Nov 3, 2025 at 9:10 AM Robin Dapp <[email protected]> wrote:
>
> Hi,
>
> New try, even though Richi didn't like the previous attempt ;)
> I played with vect_transform_slp_perm_load without really getting
> anywhere and figured a clearer version might now be acceptable.
>
> Consecutive load permutations like {0, 1, 2, 3} or {4, 5, 6, 7} in a
> group of 8 only read a part of the group, leaving a gap.
>
> For strided accesses we can elide the permutation and, instead of
> accessing the whole group, use only the number of SLP lanes.  This
> effectively increases the vector size as we don't load the gaps.  On
> top of that, we do not need to emit the permutes at all.
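>
> As an illustration (a made-up sketch, not the PR's testcase), with a
> runtime stride both subsets of such a group show up as consecutive
> permutations:
>
>   void f (int *restrict b, int *restrict a, int stride, int n)
>   {
>     for (int i = 0; i < n; i++)
>       {
>         b[4 * i + 0] = a[stride * i + 0] + a[stride * i + 4];
>         b[4 * i + 1] = a[stride * i + 1] + a[stride * i + 5];
>         b[4 * i + 2] = a[stride * i + 2] + a[stride * i + 6];
>         b[4 * i + 3] = a[stride * i + 3] + a[stride * i + 7];
>       }
>   }
>
> The two SLP load nodes for a[] would use the permutations {0, 1, 2, 3}
> and {4, 5, 6, 7} of a group of 8, and each can become a strided load
> of 4 consecutive elements instead of a full-group load plus permute.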
>
> Bootstrapped and regtested on x86 and power10.  Regtested on aarch64 and
> rv64gcv_zvl512b.
>
> Regards
>  Robin
>
> gcc/ChangeLog:
>
>         * tree-vect-stmts.cc (has_consecutive_load_permutation): New.
>         (get_load_store_type): Elide load permutation if possible and
>         decrease group size.
>         (vectorizable_load): Ditto.
>         * tree-vectorizer.h (struct vect_load_store_data): Add
>         consecutive_load_perm.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/autovec/pr118019-2.c: Expect vlse32.
> ---
>  .../gcc.target/riscv/rvv/autovec/pr118019-2.c |  2 +-
>  gcc/tree-vect-stmts.cc                        | 54 ++++++++++++++++++-
>  gcc/tree-vectorizer.h                         |  3 ++
>  3 files changed, 56 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
> index c8c1a7291fb..d3436b78377 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
> @@ -47,4 +47,4 @@ x264_pixel_satd_8x4 (uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2)
>    return (((uint16_t) sum) + ((uint32_t) sum >> 16)) >> 1;
>  }
>
> -/* { dg-final { scan-assembler-times "vlse64" 8 } } */
> +/* { dg-final { scan-assembler-times "vlse32" 4 } } */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 83acbb3ff67..c86ba4f5752 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -2049,6 +2049,31 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype,
>    return NULL_TREE;
>  }
>
> +/* Check if the load permutation of NODE only refers to a consecutive
> +   subset of the group indices where GROUP_SIZE is the size of the
> +   dataref's group.  We also require that the length of the permutation
> +   divides the group size and is a power of two.
> +   Such load permutations can be elided in strided access schemes as
> +   we can "jump over" the gap they leave.  */
> +
> +static bool
> +has_consecutive_load_permutation (slp_tree node, unsigned group_size)
> +{
> +  load_permutation_t perm = SLP_TREE_LOAD_PERMUTATION (node);
> +  if (!perm.exists ()
> +      || perm.length () <= 1
> +      || !pow2p_hwi (perm.length ())
> +      || group_size % perm.length ())
> +    return false;
> +
> +  unsigned int start = perm[0];
> +  for (unsigned int i = 1; i < perm.length (); i++)
> +    if (perm[i] != start + (unsigned int)i)
> +      return false;
> +

We have a check (or multiple?) like this for BB vectorization where we
can also elide such permutes.  It would be nice to put this helper into
tree-vect-slp.cc and use it throughout.  One "copy" is in
vect_optimize_slp_pass::remove_redundant_permutations:

      /* In basic block vectorization we allow any subchain of an interleaving
         chain.
         FORNOW: not in loop SLP because of realignment complications.  */
      if (is_a <bb_vec_info> (m_vinfo))
        {
          bool subchain_p = true;
          stmt_vec_info next_load_info = NULL;
          stmt_vec_info load_info;
          unsigned j;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
            {
              if (j != 0
                  && (next_load_info != load_info
                      || ! load_info
                      || DR_GROUP_GAP (load_info) != 1))
                {
                  subchain_p = false;
                  break;
                }
              next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
            }
          if (subchain_p)
            {
              SLP_TREE_LOAD_PERMUTATION (node).release ();
              continue;
            }

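A factored-out helper could look roughly like this (a sketch only: the
name vect_load_perm_subchain_p is made up, it would need a declaration
in tree-vectorizer.h, and the strided user would keep its extra
power-of-two and divisibility checks):

  /* Return true if the load permutation of NODE selects a consecutive
     subchain of a group of GROUP_SIZE elements, storing the first
     selected index in *START.  */

  bool
  vect_load_perm_subchain_p (slp_tree node, unsigned group_size,
                             unsigned *start)
  {
    load_permutation_t perm = SLP_TREE_LOAD_PERMUTATION (node);
    if (!perm.exists () || perm.length () == 0)
      return false;

    *start = perm[0];
    for (unsigned i = 1; i < perm.length (); i++)
      if (perm[i] != *start + i)
        return false;

    return *start + perm.length () <= group_size;
  }
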
"copy" as in, semantically equivalent.  Maybe it's the only copy.

> +  return true;
> +}
> +
>  /* Analyze load or store SLP_NODE of type VLS_TYPE.  Return true
>     if there is a memory access type that the vectorized form can use,
>     storing it in *MEMORY_ACCESS_TYPE if so.  If we decide to use gathers
> @@ -2094,6 +2119,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>    *ls_type = NULL_TREE;
>    *slp_perm = false;
>    *n_perms = -1U;
> +  ls->consecutive_load_perm = false;
>
>    bool perm_ok = true;
>    poly_int64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
> @@ -2106,6 +2132,16 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>      {
>        first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
>        group_size = DR_GROUP_SIZE (first_stmt_info);
> +      /* If we have a strided access and the load permutation is consecutive
> +        we can reduce the group to the elements the permutation accesses.
> +        Then we release the permutation.  */
> +      if (STMT_VINFO_STRIDED_P (stmt_info)
> +         && has_consecutive_load_permutation (slp_node, group_size))
> +       {
> +         ls->consecutive_load_perm = true;
> +         group_size = SLP_TREE_LANES (slp_node);

When STMT_VINFO_STRIDED_P is set, DR_GROUP_GAP (first_stmt_info)
is always zero - it would be good to put an assert here, for
documentation purposes.
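
E.g. simply:

  gcc_assert (DR_GROUP_GAP (first_stmt_info) == 0);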

> +         SLP_TREE_LOAD_PERMUTATION (slp_node).release ();

This seems a bit "dangerous" to do early.  In fact your changes below ...

> +       }
>        gap = DR_GROUP_GAP (first_stmt_info);
>        single_element_p = (stmt_info == first_stmt_info
>                           && !DR_GROUP_NEXT_ELEMENT (stmt_info));
> @@ -9876,7 +9912,14 @@ vectorizable_load (vec_info *vinfo,
>
>        if (grouped_load)
>         {
> -         first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
> +         /* If we elided a consecutive load permutation, don't
> +            use the original first statement (which could be elided)
> +            but the one the load permutation starts with.
> +            This ensures the stride_base below is correct.  */

... here and below suggest this is only valid for VMAT_STRIDED_SLP.
IIRC the BB vectorization case I talked about above uses
VMAT_CONTIGUOUS, and with VLA vectors you'll end up with
that or VMAT_GATHER_SCATTER_*?

So in get_load_store_type, can we do this only when choosing the
VMAT_*s for which we have made this work?  And assert
!ls.consecutive_load_perm in the others?  I'd rather name it
ls.subchain_p.

Note that for the testcase in question on x86 we can perform a single
load for the whole group and split that cheaply into low/high parts,
so I wonder if this "optimization" should only be done if the permute
isn't supported?
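
A gate for that could look roughly like the following in
get_load_store_type (sketch only: "group_vectype" - a vector type
spanning the whole group - is made up and nothing the patch computes;
perm is SLP_TREE_LOAD_PERMUTATION (slp_node)):

  /* Only elide the load permutation when the target cannot perform
     it natively as a single constant permute of the full group.  */
  vec_perm_builder sel (perm.length (), perm.length (), 1);
  for (unsigned i = 0; i < perm.length (); i++)
    sel.quick_push (perm[i]);
  vec_perm_indices indices (sel, 1, group_size);
  if (!can_vec_perm_const_p (TYPE_MODE (vectype),
                             TYPE_MODE (group_vectype), indices))
    {
      /* ... elide as in the patch ... */
    }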

> +         if (!ls.consecutive_load_perm)
> +           first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
> +         else
> +           first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
>           first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
>           ref_type = get_group_alias_ptr_type (first_stmt_info);
>         }
> @@ -9890,7 +9933,14 @@ vectorizable_load (vec_info *vinfo,
>        if (grouped_load)
>         {
>           if (memory_access_type == VMAT_STRIDED_SLP)
> -           group_size = DR_GROUP_SIZE (first_stmt_info);
> +           {
> +             /* If we elided a consecutive load permutation, adjust
> +                the group size here.  */
> +             if (!ls.consecutive_load_perm)
> +               group_size = DR_GROUP_SIZE (first_stmt_info);
> +             else
> +               group_size = SLP_TREE_LANES (slp_node);
> +           }
>           else /* VMAT_ELEMENTWISE */
>             group_size = SLP_TREE_LANES (slp_node);
>         }
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 359c994139b..25706913778 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -288,11 +288,14 @@ struct vect_load_store_data : vect_data {
>        tree decl;       // VMAT_GATHER_SCATTER_DECL
>    } gs;
>    tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
> +  /* Larger load/store type used for punning the vectype.  */
>    tree ls_type; // VMAT_GATHER_SCATTER_IFN
>    auto_vec<int> elsvals;
>    /* True if the load requires a load permutation.  */
>    bool slp_perm;    // SLP_TREE_LOAD_PERMUTATION
>    unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION
> +  /* Whether the load permutation is consecutive and simple.  */
> +  bool consecutive_load_perm; // VMAT_STRIDED_SLP
>  };
>
>  /* A computation tree of an SLP instance.  Each node corresponds to a group of
> --
> 2.51.0
>
