Kyrylo Tkachov <ktkac...@nvidia.com> writes:
> Hi all,
>
> Some vector rotate operations can be implemented in a single instruction
> rather than using the fallback SHL+USRA sequence.
> In particular, when the rotate amount is half the bitwidth of the element
> we can use a REV64, REV32 or REV16 instruction.
> This patch adds this transformation to the recently added splitter for vector
> rotates.
> Bootstrapped and tested on aarch64-none-linux-gnu.
>
> Signed-off-by: Kyrylo Tkachov <ktkac...@nvidia.com>
>
> gcc/
>
>       * config/aarch64/aarch64-protos.h (aarch64_emit_opt_vec_rotate):
>       Declare prototype.
>       * config/aarch64/aarch64.cc (aarch64_emit_opt_vec_rotate): Implement.
>       * config/aarch64/aarch64-simd.md (*aarch64_simd_rotate_imm<mode>):
>       Call the above.
>
> gcc/testsuite/
>
>       * gcc.target/aarch64/simd/pr117048_2.c: New test.

Sorry to be awkward, but I still think at least part of this should be
target-independent.  Any rotate by a whole number of bytes can be
expressed as a vector permutation in a target-independent way.
Target-independent code can then use the usual optab routines to query
whether the permutation is possible and/or try to generate it.
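
E.g. (using little-endian byte numbering, and assuming I've counted
correctly) rotating each 32-bit element of a V4SI vector left by 16 is
the constant byte permutation:

  { 2, 3, 0, 1,  6, 7, 4, 5,  10, 11, 8, 9,  14, 15, 12, 13 }

which is the same operation as the REV32 that the patch generates for
that case.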

I can see that it probably makes sense to leave it to target code to
decide when to use the permutation strategy vs. other approaches.
But the code that implements the strategy shouldn't need to be
target-specific.

E.g. we could have a routine:

  expand_rotate_as_vec_perm

which checks whether the rotation amount is suitable and tries to
generate the permutation if so.
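
Very roughly, something like this (a completely untested sketch; the
name, interface and choice of helpers are just guesses on my part, and
the big-endian handling in particular would need checking):

  /* Untested sketch: expand a rotate of each element of vector X
     (mode MODE) by the constant bit amount AMT as a byte permutation,
     putting the result in TARGET.  Return the result on success or
     NULL_RTX if the target can't do the permutation.  */

  rtx
  expand_rotate_as_vec_perm (machine_mode mode, rtx target, rtx x, rtx amt)
  {
    /* Only rotates by a whole number of bytes can be done this way.  */
    rtx amt_elt = unwrap_const_vec_duplicate (amt);
    if (!CONST_INT_P (amt_elt) || INTVAL (amt_elt) % BITS_PER_UNIT != 0)
      return NULL_RTX;

    /* Get the QImode vector mode in which to do the permutation.  */
    machine_mode qimode;
    if (!qimode_for_vec_perm (mode).exists (&qimode))
      return NULL_RTX;

    /* Keep the sketch simple: fixed-length vectors only.  A stepped
       vec_perm_builder encoding would be needed for VLA modes.  */
    unsigned int nbytes;
    if (!GET_MODE_SIZE (mode).is_constant (&nbytes))
      return NULL_RTX;

    /* Each byte of the result comes from a fixed byte of the input,
       so describe the rotate as a constant byte selector.  */
    unsigned int esize = GET_MODE_UNIT_SIZE (mode);
    unsigned int shift = (INTVAL (amt_elt) / BITS_PER_UNIT) % esize;
    unsigned int first = BYTES_BIG_ENDIAN ? shift : esize - shift;
    vec_perm_builder sel (nbytes, nbytes, 1);
    for (unsigned int e = 0; e < nbytes; e += esize)
      for (unsigned int i = 0; i < esize; i++)
        sel.quick_push (e + (first + i) % esize);

    /* Only use this strategy if the target can do it as a single
       constant permute; otherwise let the caller fall back to its own
       expansion (e.g. SHL+USRA).  */
    vec_perm_indices indices (sel, 1, nbytes);
    if (!can_vec_perm_const_p (qimode, qimode, indices, false))
      return NULL_RTX;

    rtx qi_x = lowpart_subreg (qimode, force_reg (mode, x), mode);
    rtx res = expand_vec_perm_const (qimode, qi_x, qi_x, sel, qimode,
                                     NULL_RTX);
    if (!res)
      return NULL_RTX;
    emit_move_insn (target, lowpart_subreg (mode, res, qimode));
    return target;
  }

The aarch64 splitter could then try that routine first and only fall
back to the SHL+USRA sequence when it returns NULL_RTX, and other
targets would get the same optimisation without duplicating the logic.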

Thanks,
Richard

> From e97509382b6bb755336ec4aa220fabd968e69502 Mon Sep 17 00:00:00 2001
> From: Kyrylo Tkachov <ktkac...@nvidia.com>
> Date: Wed, 16 Oct 2024 04:10:08 -0700
> Subject: [PATCH 4/6] aarch64: Optimize vector rotates into REV* instructions
>  where possible
>
> Some vector rotate operations can be implemented in a single instruction
> rather than using the fallback SHL+USRA sequence.
> In particular, when the rotate amount is half the bitwidth of the element
> we can use a REV64, REV32 or REV16 instruction.
> This patch adds this transformation to the recently added splitter for vector
> rotates.
> Bootstrapped and tested on aarch64-none-linux-gnu.
>
> Signed-off-by: Kyrylo Tkachov <ktkac...@nvidia.com>
>
> gcc/
>
>       * config/aarch64/aarch64-protos.h (aarch64_emit_opt_vec_rotate):
>       Declare prototype.
>       * config/aarch64/aarch64.cc (aarch64_emit_opt_vec_rotate): Implement.
>       * config/aarch64/aarch64-simd.md (*aarch64_simd_rotate_imm<mode>):
>       Call the above.
>
> gcc/testsuite/
>
>       * gcc.target/aarch64/simd/pr117048_2.c: New test.
> ---
>  gcc/config/aarch64/aarch64-protos.h           |  1 +
>  gcc/config/aarch64/aarch64-simd.md            |  3 +
>  gcc/config/aarch64/aarch64.cc                 | 49 ++++++++++++++
>  .../gcc.target/aarch64/simd/pr117048_2.c      | 66 +++++++++++++++++++
>  4 files changed, 119 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index d03c1fe798b..da0e657a513 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -776,6 +776,7 @@ bool aarch64_rnd_imm_p (rtx);
>  bool aarch64_constant_address_p (rtx);
>  bool aarch64_emit_approx_div (rtx, rtx, rtx);
>  bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
> +bool aarch64_emit_opt_vec_rotate (rtx, rtx, rtx);
>  tree aarch64_vector_load_decl (tree);
>  rtx aarch64_gen_callee_cookie (aarch64_isa_mode, arm_pcs);
>  void aarch64_expand_call (rtx, rtx, rtx, bool);
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 543179d9fce..44c40512f30 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -1313,6 +1313,9 @@
>           (match_dup 4))
>         (match_dup 3)))]
>    {
> +    if (aarch64_emit_opt_vec_rotate (operands[0], operands[1], operands[2]))
> +      DONE;
> +
>      operands[3] = reload_completed ? operands[0] : gen_reg_rtx (<MODE>mode);
>      rtx shft_amnt = unwrap_const_vec_duplicate (operands[2]);
>      int bitwidth = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 21d9a6b5a20..47859c4e31b 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -15998,6 +15998,55 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
>    return true;
>  }
>  
> +/* Emit an optimized sequence to perform a vector rotate
> +   of REG by the vector constant amount AMNT and place the result
> +   in DST.  Return true iff successful.  */
> +
> +bool
> +aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt)
> +{
> +  amnt = unwrap_const_vec_duplicate (amnt);
> +  gcc_assert (CONST_INT_P (amnt));
> +  HOST_WIDE_INT rotamnt = UINTVAL (amnt);
> +  machine_mode mode = GET_MODE (reg);
> +  /* Rotates by half the element width map down to REV* instructions.  */
> +  if (rotamnt == GET_MODE_UNIT_BITSIZE (mode) / 2)
> +    {
> +      machine_mode revmode;
> +      unsigned unspec;
> +      switch (mode)
> +     {
> +       case V2DImode:
> +         revmode = V4SImode;
> +         unspec = UNSPEC_REV64;
> +         break;
> +       case V4SImode:
> +         revmode = V8HImode;
> +         unspec = UNSPEC_REV32;
> +         break;
> +       case V2SImode:
> +         revmode = V4HImode;
> +         unspec = UNSPEC_REV32;
> +         break;
> +       /* We can implement a V8HI rotate by 8 with a REV16 instruction but
> +          that is a standard BSWAP code and it won't go through this rotate
> +          optimization path.  */
> +       default:
> +         return false;
> +     }
> +      rtx rev_reg = lowpart_subreg (revmode, reg, mode);
> +      rtx unspec_op
> +     = gen_rtx_UNSPEC (revmode, gen_rtvec (1, rev_reg), unspec);
> +      rtx tmp_reg
> +     = reload_completed ? lowpart_subreg (revmode, dst, mode)
> +                        : gen_reg_rtx (revmode);
> +      emit_set_insn (tmp_reg, unspec_op);
> +      emit_move_insn (dst, lowpart_subreg (mode, tmp_reg, revmode));
> +      return true;
> +    }
> +  return false;
> +}
> +
>  /* Return the number of instructions that can be issued per cycle.  */
>  static int
>  aarch64_sched_issue_rate (void)
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c b/gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c
> new file mode 100644
> index 00000000000..7821909859d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c
> @@ -0,0 +1,66 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +typedef char __attribute__ ((vector_size (16))) v16qi;
> +typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
> +typedef unsigned int __attribute__ ((vector_size (16))) v4si;
> +typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
> +typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
> +typedef unsigned int __attribute__ ((vector_size (8))) v2si;
> +
> +/*
> +** G1:
> +**   rev64   v0\.4s, v0\.4s
> +**   ret 
> +*/
> +v2di
> +G1 (v2di r)
> +{
> +  return (r >> 32) | (r << 32);
> +}
> +
> +/*
> +** G2:
> +**   rev32   v0\.8h, v0\.8h
> +**   ret 
> +*/
> +v4si
> +G2 (v4si r)
> +{
> +  return (r >> 16) | (r << 16);
> +}
> +
> +/*
> +** G3:
> +**   rev16   v0\.16b, v0\.16b
> +**   ret 
> +*/
> +v8hi
> +G3 (v8hi r)
> +{
> +  return (r >> 8) | (r << 8);
> +}
> +
> +/*
> +** G4:
> +**   rev32   v0\.4h, v0\.4h
> +**   ret 
> +*/
> +v2si
> +G4 (v2si r)
> +{
> +  return (r >> 16) | (r << 16);
> +}
> +
> +/*
> +** G5:
> +**   rev16   v0\.8b, v0\.8b
> +**   ret 
> +*/
> +v4hi
> +G5 (v4hi r)
> +{
> +  return (r >> 8) | (r << 8);
> +}
> +
