Tamar Christina <tamar.christ...@arm.com> writes:
> Hi All,
>
> This optimizes rounding shift right narrow (RSHRN) instructions into
> rounding add narrowing high (RADDHN) instructions with a zero vector as
> one operand, when the shift amount is half the element width of the
> original input type.
>
> i.e.
>
> uint32x4_t foo (uint64x2_t a, uint64x2_t b)
> {
>   return vrshrn_high_n_u64 (vrshrn_n_u64 (a, 32), b, 32);
> }
>
> now generates:
>
> foo:
>         movi    v3.4s, 0
>         raddhn  v0.2s, v0.2d, v3.2d
>         raddhn2 v0.4s, v1.2d, v3.2d
>         ret
>
> instead of:
>
> foo:
>         rshrn   v0.2s, v0.2d, 32
>         rshrn2  v0.4s, v1.2d, 32
>         ret
>
> On Arm cores this is an improvement in both latency and throughput.
> Because a vector zero is needed, I created a new function
> aarch64_gen_shareable_zero that creates zeros using V4SI and then takes
> a subreg of the zero to the desired type.  This allows CSE to share all
> the zero constants.
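>
> For reference, the per-lane identity being used can be modelled in
> scalar C.  This is only a sketch to illustrate the transformation, not
> part of the patch; the helper names are made up, and GCC's
> unsigned __int128 is used so the rounding carry out of 64 bits is kept:
>
> #include <stdint.h>
>
> /* RSHRN #n on a 64-bit lane: add the rounding constant 1 << (n - 1),
>    shift right by n, narrow to 32 bits.  */
> static uint32_t
> rshrn_lane (uint64_t x, unsigned n)
> {
>   unsigned __int128 sum = (unsigned __int128) x + ((uint64_t) 1 << (n - 1));
>   return (uint32_t) (sum >> n);
> }
>
> /* RADDHN on 64-bit lanes: rounding add, keep the high 32-bit half.  */
> static uint32_t
> raddhn_lane (uint64_t a, uint64_t b)
> {
>   unsigned __int128 sum = (unsigned __int128) a + b + ((uint64_t) 1 << 31);
>   return (uint32_t) (sum >> 32);
> }
>
> int
> main (void)
> {
>   uint64_t tests[] = { 0, 1, 0x7fffffffULL, 0x80000000ULL,
>                        0xdeadbeefcafebabeULL, UINT64_MAX };
>   /* With n == 32 (half the element width) and b == 0, the rounding
>      constants and shift amounts coincide, so the two agree everywhere.  */
>   for (unsigned i = 0; i < sizeof tests / sizeof tests[0]; i++)
>     if (rshrn_lane (tests[i], 32) != raddhn_lane (tests[i], 0))
>       __builtin_abort ();
>   return 0;
> }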
>
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
>
> Ok for master?

LGTM.  Just a couple of nits:

>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>       * config/aarch64/aarch64-protos.h (aarch64_gen_shareable_zero): New.
>       * config/aarch64/aarch64-simd.md (aarch64_rshrn<mode>,
>       aarch64_rshrn2<mode>): 

Missing description.

>       * config/aarch64/aarch64.c (aarch64_gen_shareable_zero): New.
>
> gcc/testsuite/ChangeLog:
>
>       * gcc.target/aarch64/advsimd-intrinsics/shrn-1.c: New test.
>       * gcc.target/aarch64/advsimd-intrinsics/shrn-2.c: New test.
>       * gcc.target/aarch64/advsimd-intrinsics/shrn-3.c: New test.
>       * gcc.target/aarch64/advsimd-intrinsics/shrn-4.c: New test.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index f7887d06139f01c1591c4e755538d94e5e608a52..f7f5cae82bc9198e54d0298f25f7c0f5902d5fb1 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -846,6 +846,7 @@ const char *aarch64_output_move_struct (rtx *operands);
>  rtx aarch64_return_addr_rtx (void);
>  rtx aarch64_return_addr (int, rtx);
>  rtx aarch64_simd_gen_const_vector_dup (machine_mode, HOST_WIDE_INT);
> +rtx aarch64_gen_shareable_zero (machine_mode);
>  bool aarch64_simd_mem_operand_p (rtx);
>  bool aarch64_sve_ld1r_operand_p (rtx);
>  bool aarch64_sve_ld1rq_operand_p (rtx);
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index c71658e2bf52b26bf9fc9fa702dd5446447f4d43..d7f8694add540e32628893a7b7471c08de6f760f 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -1956,20 +1956,32 @@ (define_expand "aarch64_rshrn<mode>"
>     (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>")]
>    "TARGET_SIMD"
>    {
> -    operands[2] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
> -                                              INTVAL (operands[2]));
> -    rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
> -    if (BYTES_BIG_ENDIAN)
> -      emit_insn (gen_aarch64_rshrn<mode>_insn_be (tmp, operands[1],
> -                             operands[2], CONST0_RTX (<VNARROWQ>mode)));
> +    if (INTVAL (operands[2]) == GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode))
> +      {
> +     rtx tmp0 = aarch64_gen_shareable_zero (<MODE>mode);
> +     emit_insn (gen_aarch64_raddhn<mode> (operands[0], operands[1], tmp0));
> +      }
>      else
> -      emit_insn (gen_aarch64_rshrn<mode>_insn_le (tmp, operands[1],
> -                             operands[2], CONST0_RTX (<VNARROWQ>mode)));
> -
> -    /* The intrinsic expects a narrow result, so emit a subreg that will get
> -       optimized away as appropriate.  */
> -    emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
> -                                              <VNARROWQ2>mode));
> +      {
> +     rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
> +     operands[2] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
> +                                                      INTVAL (operands[2]));
> +     if (BYTES_BIG_ENDIAN)
> +       emit_insn (
> +             gen_aarch64_rshrn<mode>_insn_be (tmp, operands[1],
> +                                              operands[2],
> +                                              CONST0_RTX (<VNARROWQ>mode)));
> +     else
> +       emit_insn (
> +             gen_aarch64_rshrn<mode>_insn_le (tmp, operands[1],
> +                                              operands[2],
> +                                              CONST0_RTX (<VNARROWQ>mode)));
> +
> +     /* The intrinsic expects a narrow result, so emit a subreg that will
> +        get optimized away as appropriate.  */
> +     emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
> +                                                  <VNARROWQ2>mode));
> +      }
>      DONE;
>    }
>  )
> @@ -2049,14 +2061,27 @@ (define_expand "aarch64_rshrn2<mode>"
>     (match_operand:SI 3 "aarch64_simd_shift_imm_offset_<vn_mode>")]
>    "TARGET_SIMD"
>    {
> -    operands[3] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
> -                                              INTVAL (operands[3]));
> -    if (BYTES_BIG_ENDIAN)
> -      emit_insn (gen_aarch64_rshrn2<mode>_insn_be (operands[0], operands[1],
> -                                               operands[2], operands[3]));
> +    if (INTVAL (operands[3]) == GET_MODE_UNIT_BITSIZE (<VNARROWQ2>mode))
> +      {
> +     rtx tmp = aarch64_gen_shareable_zero (<MODE>mode);
> +     emit_insn (gen_aarch64_raddhn2<mode> (operands[0], operands[1],
> +                                           operands[2], tmp));
> +      }
>      else
> -      emit_insn (gen_aarch64_rshrn2<mode>_insn_le (operands[0], operands[1],
> -                                               operands[2], operands[3]));
> +      {
> +     operands[3] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
> +                                                      INTVAL (operands[3]));
> +     if (BYTES_BIG_ENDIAN)
> +       emit_insn (gen_aarch64_rshrn2<mode>_insn_be (operands[0],
> +                                                    operands[1],
> +                                                    operands[2],
> +                                                    operands[3]));
> +     else
> +       emit_insn (gen_aarch64_rshrn2<mode>_insn_le (operands[0],
> +                                                    operands[1],
> +                                                    operands[2],
> +                                                    operands[3]));
> +      }
>      DONE;
>    }
>  )
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index fdf05505846721b02059df494d6395ae9423a8ef..11201ea3498beb270c0a7f8da5f5009d710535ee 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -20397,6 +20397,18 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
>      == SYMBOL_TINY_ABSOLUTE;
>  }
>  
> +/* Create a 0 constant that is based of V4SI to allow CSE to optimally share

based on

OK otherwise, thanks.  I think long-term we should create shareable
zeros in all contexts, a bit like we do for PTRUEs, but I realise
that isn't late stage 1 material.

Richard

> +   the constant creation.  */
> +
> +rtx
> +aarch64_gen_shareable_zero (machine_mode mode)
> +{
> +  machine_mode zmode = V4SImode;
> +  rtx tmp = gen_reg_rtx (zmode);
> +  emit_move_insn (tmp, CONST0_RTX (zmode));
> +  return lowpart_subreg (mode, tmp, zmode);
> +}
> +
>  /* Return a const_int vector of VAL.  */
>  rtx
>  aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..4bc3aa9563ee7d0dc46557d30d9a29149706229d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-1.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile { target { aarch64*-*-* } } } */
> +/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
> +
> +#include <arm_neon.h>
> +
> +uint8x16_t foo (uint32x4_t a, uint32x4_t b)
> +{
> +  uint16x4_t a1 = vrshrn_n_u32 (a, 16);
> +  uint16x8_t b1 = vrshrn_high_n_u32 (a1, b, 16);
> +  return vrshrn_high_n_u16 (vrshrn_n_u16 (b1, 8), b1, 8);
> +}
> +
> +/* { dg-final { scan-assembler-times {\tmovi\t} 1 } } */
> +/* { dg-final { scan-assembler-times {\traddhn\t} 2 } } */
> +/* { dg-final { scan-assembler-times {\traddhn2\t} 2 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..09d913e85524f06367c1c2cf51dda0f57578e9ae
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-2.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile { target { aarch64*-*-* } } } */
> +
> +#include <arm_neon.h>
> +
> +uint32x4_t foo (uint64x2_t a, uint64x2_t b)
> +{
> +  return vrshrn_high_n_u64 (vrshrn_n_u64 (a, 32), b, 32);
> +}
> +
> +/* { dg-final { scan-assembler-times {\traddhn\t} 1 } } */
> +/* { dg-final { scan-assembler-times {\traddhn2\t} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..bdccbb3410f049d7e45aabdcc3d2964fbabca807
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-3.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile { target { aarch64*-*-* } } } */
> +
> +#include <arm_neon.h>
> +
> +uint16x8_t foo (uint32x4_t a, uint32x4_t b)
> +{
> +  return vrshrn_high_n_u32 (vrshrn_n_u32 (a, 16), b, 16);
> +}
> +
> +/* { dg-final { scan-assembler-times {\traddhn\t} 1 } } */
> +/* { dg-final { scan-assembler-times {\traddhn2\t} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-4.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-4.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..4b23eddb85891975b8e122060e2a9ebfe56d842c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-4.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile { target { aarch64*-*-* } } } */
> +
> +#include <arm_neon.h>
> +
> +uint8x16_t foo (uint16x8_t a, uint16x8_t b)
> +{
> +  return vrshrn_high_n_u16 (vrshrn_n_u16 (a, 8), b, 8);
> +}
> +
> +/* { dg-final { scan-assembler-times {\traddhn\t} 1 } } */
> +/* { dg-final { scan-assembler-times {\traddhn2\t} 1 } } */
