Akram Ahmad <akram.ah...@arm.com> writes:
> GIMPLE code which performs a narrowing truncation on the result of a
> vector concatenation currently results in an unnecessary XTN being
> emitted after the UZP1 that concatenates the operands. In such cases,
> UZP1 should instead use a smaller arrangement specifier, which makes
> the XTN redundant. This can be seen in the following GIMPLE example:
>
>       int32x2_t foo (svint64_t a, svint64_t b)
>       {
>         vector(2) int vect__2.8;
>         long int _1;
>         long int _3;
>         vector(2) long int _12;
>
>         <bb 2> [local count: 1073741824]:
>         _1 = svaddv_s64 ({ -1, 0, 0, 0, 0, 0, 0, 0, ... }, a_6(D));
>         _3 = svaddv_s64 ({ -1, 0, 0, 0, 0, 0, 0, 0, ... }, b_7(D));
>         _12 = {_1, _3};
>         vect__2.8_13 = (vector(2) int) _12;
>         return vect__2.8_13;
>
>       }
>
> Original assembly generated:
>
>       foo:
>               ptrue   p3.b, all
>               uaddv   d0, p3, z0.d
>               uaddv   d1, p3, z1.d
>               uzp1    v0.2d, v0.2d, v1.2d
>               xtn     v0.2s, v0.2d
>               ret
>
> This patch therefore defines the *aarch64_trunc_concat<mode> insn, which
> truncates the result of the concatenation rather than concatenating the
> already-truncated operands (as *aarch64_narrow_trunc<mode> does). On
> little-endian targets, UZP1 with the narrower arrangement specifier picks
> the low half of each source element, so it performs both the truncation
> and the concatenation in a single instruction, giving the following
> optimised assembly:
>
>       foo:
>               ptrue   p3.b, all
>               uaddv   d0, p3, z0.d
>               uaddv   d1, p3, z1.d
>               uzp1    v0.2s, v0.2s, v1.2s
>               ret
>
> This patch passes all regression tests on aarch64 with no new failures.
> A supporting test for this optimisation is included and passes.
>
> OK for master? I do not have commit rights so I cannot push the patch
> myself.

Sorry, looks like I never reviewed this :(

>
> gcc/ChangeLog:
>
>       * config/aarch64/aarch64-simd.md (*aarch64_trunc_concat<mode>)
>       (*aarch64_float_trunc_concat<mode>): New insn definitions.
>       * config/aarch64/iterators.md (VQ_SDF): New mode iterator.
>       (VTRUNCD): New mode attribute for truncated modes.
>       (Vtruncd): New mode attribute for arrangement specifiers.
>
> gcc/testsuite/ChangeLog:
>
>       * gcc.target/aarch64/sve/truncated_concatenation_1.c: New test
>       covering the above example and the other modes handled by the
>       new insn definitions.
> ---
>  gcc/config/aarch64/aarch64-simd.md            | 32 +++++++++++++
>  gcc/config/aarch64/iterators.md               | 11 +++++
>  .../aarch64/sve/truncated_concatenation_1.c   | 46 +++++++++++++++++++
>  3 files changed, 89 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index cfe95bd4c31..90730960451 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -1872,6 +1872,38 @@
>    [(set_attr "type" "neon_permute<q>")]
>  )
>  
> +(define_insn "*aarch64_trunc_concat<mode>"
> +  [(set (match_operand:<VTRUNCD> 0 "register_operand" "=w")
> +     (truncate:<VTRUNCD>
> +       (vec_concat:VQN
> +         (match_operand:<VHALF> 1 "register_operand" "w")
> +         (match_operand:<VHALF> 2 "register_operand" "w"))))]
> +  "TARGET_SIMD"
> +{
> +  if (!BYTES_BIG_ENDIAN)
> +    return "uzp1\\t%0.<Vtruncd>, %1.<Vtruncd>, %2.<Vtruncd>";
> +  else
> +    return "uzp1\\t%0.<Vtruncd>, %2.<Vtruncd>, %1.<Vtruncd>";
> +}
> +  [(set_attr "type" "neon_permute<q>")]
> +)

It looks like <VNARROWQ> already provides <VTRUNCD> and that <Vntype>
already provides <Vtruncd>, so we probably don't need to add the new
mode attributes.
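
That is, something like the following (untested, and assuming the
existing <VNARROWQ>/<Vntype> entries cover all the VQN modes):

;; Sketch: reuse the existing narrowing attributes instead of adding new ones.
(define_insn "*aarch64_trunc_concat<mode>"
  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
	(truncate:<VNARROWQ>
	  (vec_concat:VQN
	    (match_operand:<VHALF> 1 "register_operand" "w")
	    (match_operand:<VHALF> 2 "register_operand" "w"))))]
  "TARGET_SIMD"
{
  if (!BYTES_BIG_ENDIAN)
    return "uzp1\\t%0.<Vntype>, %1.<Vntype>, %2.<Vntype>";
  else
    return "uzp1\\t%0.<Vntype>, %2.<Vntype>, %1.<Vntype>";
}
  [(set_attr "type" "neon_permute<q>")]
)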

Otherwise this looks good.

> +
> +(define_insn "*aarch64_float_trunc_concat<mode>"
> +  [(set (match_operand:<VTRUNCD> 0 "register_operand" "=w")
> +     (float_truncate:<VTRUNCD>
> +       (vec_concat:VQ_SDF
> +         (match_operand:<VHALF> 1 "register_operand" "w")
> +         (match_operand:<VHALF> 2 "register_operand" "w"))))]
> +  "TARGET_SIMD"
> +{
> +  if (!BYTES_BIG_ENDIAN)
> +    return "uzp1\\t%0.<Vtruncd>, %1.<Vtruncd>, %2.<Vtruncd>";
> +  else
> +    return "uzp1\\t%0.<Vtruncd>, %2.<Vtruncd>, %1.<Vtruncd>";
> +}
> +  [(set_attr "type" "neon_permute<q>")]
> +)

This doesn't look right though.  float_truncate is an arithmetic
operation that would need to convert, say, 1.0 double-precision
into 1.0f single-precision.  It doesn't simply drop the upper bits.
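
To illustrate with a standalone C sketch (not part of the patch; the
values are just for illustration):

  #include <stdint.h>
  #include <string.h>
  #include <stdio.h>

  int main (void)
  {
    double d = 1.0;                  /* bit pattern 0x3ff0000000000000 */

    /* What float_truncate must produce: the converted value.  */
    float converted = (float) d;     /* 1.0f, bit pattern 0x3f800000 */

    /* What a pure bit-level truncation would give (e.g. UZP1 picking the
       low 32 bits of each doubleword lane on little-endian), reinterpreted
       as a float.  */
    uint64_t bits;
    memcpy (&bits, &d, sizeof bits);
    uint32_t low = (uint32_t) bits;  /* 0x00000000 */
    float reinterpreted;
    memcpy (&reinterpreted, &low, sizeof reinterpreted);

    /* Prints "converted = 1.000000, bit-truncated = 0.000000".  */
    printf ("converted = %f, bit-truncated = %f\n", converted, reinterpreted);
    return 0;
  }

So the FP variant still needs a real narrowing conversion (FCVTN);
it can't be replaced by a permute.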

Thanks,
Richard

> +
>  ;; Packing doubles.
>  
>  (define_expand "vec_pack_trunc_<mode>"
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index d7cb27e1885..008629ecf63 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -181,6 +181,9 @@
>  ;; Advanced SIMD single Float modes.
>  (define_mode_iterator VDQSF [V2SF V4SF])
>  
> +;; Quad vector Float modes with single and double elements.
> +(define_mode_iterator VQ_SDF [V4SF V2DF])
> +
>  ;; Quad vector Float modes with half/single elements.
>  (define_mode_iterator VQ_HSF [V8HF V4SF])
>  
> @@ -1722,6 +1725,14 @@
>  (define_mode_attr Vnarrowq2 [(V8HI "v16qi") (V4SI "v8hi")
>                            (V2DI "v4si")])
>  
> +;; Truncated Advanced SIMD modes which preserve the number of lanes.
> +(define_mode_attr VTRUNCD [(V8HI "V8QI") (V4SI "V4HI")
> +                        (V4SF "V4HF") (V2DI "V2SI")
> +                        (V2DF "V2SF")])
> +(define_mode_attr Vtruncd [(V8HI "8b") (V4SI "4h")
> +                        (V4SF "4h") (V2DI "2s")
> +                        (V2DF "2s")])
> +
>  ;; Narrowed modes of vector modes.
>  (define_mode_attr VNARROW [(VNx8HI "VNx16QI")
>                          (VNx4SI "VNx8HI") (VNx4SF "VNx8HF")
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c b/gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c
> new file mode 100644
> index 00000000000..400428accd2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c
> @@ -0,0 +1,46 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -Wall -march=armv8.2-a+sve" } */
> +
> +#include <arm_neon.h>
> +#include <arm_sve.h>
> +
> +int8x8_t f1 (int16x4_t a, int16x4_t b) {
> +    int8x8_t ab = vdup_n_s8 (0);
> +    int16x8_t ab_concat = vcombine_s16 (a, b);
> +    ab = vmovn_s16 (ab_concat);
> +    return ab;
> +}
> +
> +int16x4_t f2 (int32x2_t a, int32x2_t b) {
> +    int16x4_t ab = vdup_n_s16 (0);
> +    int32x4_t ab_concat = vcombine_s32 (a, b);
> +    ab = vmovn_s32 (ab_concat);
> +    return ab;
> +}
> +
> +float16x4_t f3 (float32x2_t a, float32x2_t b) {
> +    float16x4_t ab = vdup_n_f16 (0);
> +    float32x4_t ab_concat = vcombine_f32 (a, b);
> +    ab = vcvt_f16_f32 (ab_concat);
> +    return ab;
> +}
> +
> +int32x2_t f4 (svint64_t a, svint64_t b) {
> +    int32x2_t ab = vdup_n_s32 (0);
> +    ab = vset_lane_s32 ((int)svaddv_s64 (svptrue_b64 (), a), ab, 0);
> +    ab = vset_lane_s32 ((int)svaddv_s64 (svptrue_b64 (), b), ab, 1);
> +    return ab;
> +}
> +
> +float32x2_t f5 (svfloat64_t a, svfloat64_t b) {
> +    float32x2_t ab = vdup_n_f32 (0);
> +    ab = vset_lane_f32 ((float)svaddv_f64 (svptrue_b64(), a), ab, 0);
> +    ab = vset_lane_f32 ((float)svaddv_f64 (svptrue_b64(), b), ab, 1);
> +    return ab;
> +}
> +
> +/* { dg-final { scan-assembler-not {\txtn\t} } }*/
> +/* { dg-final { scan-assembler-not {\tfcvtn\t} } }*/
> +/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.8b, v[0-9]+\.8b, v[0-9]+\.8b} 1 } }*/
> +/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.4h, v[0-9]+\.4h, v[0-9]+\.4h} 2 } }*/
> +/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.2s, v[0-9]+\.2s, v[0-9]+\.2s} 2 } }*/
> \ No newline at end of file
