Akram Ahmad <akram.ah...@arm.com> writes:
> GIMPLE code which performs a narrowing truncation on the result of a
> vector concatenation currently results in an unnecessary XTN being
> emitted following a UZP1 to concatenate the operands.  In cases such
> as this, UZP1 should instead use a smaller arrangement specifier to
> replace the XTN instruction.  This can be seen in the following GIMPLE
> example:
>
> int32x2_t foo (svint64_t a, svint64_t b)
> {
>   vector(2) int vect__2.8;
>   long int _1;
>   long int _3;
>   vector(2) long int _12;
>
>   <bb 2> [local count: 1073741824]:
>   _1 = svaddv_s64 ({ -1, 0, 0, 0, 0, 0, 0, 0, ... }, a_6(D));
>   _3 = svaddv_s64 ({ -1, 0, 0, 0, 0, 0, 0, 0, ... }, b_7(D));
>   _12 = {_1, _3};
>   vect__2.8_13 = (vector(2) int) _12;
>   return vect__2.8_13;
>
> }
>
> Original assembly generated:
>
> foo:
>         ptrue   p3.b, all
>         uaddv   d0, p3, z0.d
>         uaddv   d1, p3, z1.d
>         uzp1    v0.2d, v0.2d, v1.2d
>         xtn     v0.2s, v0.2d
>         ret
>
> This patch therefore defines the *aarch64_trunc_concat<mode> insn, which
> truncates the concatenation result rather than concatenating the
> truncated operands (as *aarch64_narrow_trunc<mode> does), so that the
> following optimised assembly is emitted instead:
>
> foo:
>         ptrue   p3.b, all
>         uaddv   d0, p3, z0.d
>         uaddv   d1, p3, z1.d
>         uzp1    v0.2s, v0.2s, v1.2s
>         ret
>
> This patch passes all regression tests on aarch64 with no new failures.
> A supporting test for this optimisation has also been written and passes.
>
> OK for master?  I do not have commit rights, so I cannot push the patch
> myself.
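Sorry, looks like I never reviewed this :(

> gcc/ChangeLog:
>
>         * config/aarch64/aarch64-simd.md (*aarch64_trunc_concat)
>         (*aarch64_float_trunc_concat): New insn definitions.
>         * config/aarch64/iterators.md (VQ_SDF): New mode iterator.
>         (VTRUNCD): New mode attribute for truncated modes.
>         (Vtruncd): New mode attribute for the arrangement specifier.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/sve/truncated_concatenation_1.c: New test
>         for the above example and the other modes covered by the insn
>         definitions.
> ---
>  gcc/config/aarch64/aarch64-simd.md            | 32 +++++++++++++
>  gcc/config/aarch64/iterators.md               | 11 +++++
>  .../aarch64/sve/truncated_concatenation_1.c   | 46 +++++++++++++++++++
>  3 files changed, 89 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index cfe95bd4c31..90730960451 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -1872,6 +1872,38 @@
>    [(set_attr "type" "neon_permute<q>")]
>  )
>
> +(define_insn "*aarch64_trunc_concat<mode>"
> +  [(set (match_operand:<VTRUNCD> 0 "register_operand" "=w")
> +     (truncate:<VTRUNCD>
> +       (vec_concat:VQN
> +         (match_operand:<VHALF> 1 "register_operand" "w")
> +         (match_operand:<VHALF> 2 "register_operand" "w"))))]
> +  "TARGET_SIMD"
> +{
> +  if (!BYTES_BIG_ENDIAN)
> +    return "uzp1\\t%0.<Vtruncd>, %1.<Vtruncd>, %2.<Vtruncd>";
> +  else
> +    return "uzp1\\t%0.<Vtruncd>, %2.<Vtruncd>, %1.<Vtruncd>";
> +}
> +  [(set_attr "type" "neon_permute<q>")]
> +)

It looks like <VNARROWQ> already provides <VTRUNCD> and that <Vntype>
already provides <Vtruncd>, so we probably don't need to add the new
mode attributes.  Otherwise this looks good.

Concretely, the integer pattern could then be written along the
following lines (an untested sketch, identical to the version above
except that it reuses the existing attributes):

;; Sketch: *aarch64_trunc_concat<mode> with the existing <VNARROWQ>
;; and <Vntype> attributes in place of the new <VTRUNCD>/<Vtruncd>.
(define_insn "*aarch64_trunc_concat<mode>"
  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
     (truncate:<VNARROWQ>
       (vec_concat:VQN
         (match_operand:<VHALF> 1 "register_operand" "w")
         (match_operand:<VHALF> 2 "register_operand" "w"))))]
  "TARGET_SIMD"
{
  if (!BYTES_BIG_ENDIAN)
    return "uzp1\\t%0.<Vntype>, %1.<Vntype>, %2.<Vntype>";
  else
    return "uzp1\\t%0.<Vntype>, %2.<Vntype>, %1.<Vntype>";
}
  [(set_attr "type" "neon_permute<q>")]
)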
> +
> +(define_insn "*aarch64_float_trunc_concat<mode>"
> +  [(set (match_operand:<VTRUNCD> 0 "register_operand" "=w")
> +     (float_truncate:<VTRUNCD>
> +       (vec_concat:VQ_SDF
> +         (match_operand:<VHALF> 1 "register_operand" "w")
> +         (match_operand:<VHALF> 2 "register_operand" "w"))))]
> +  "TARGET_SIMD"
> +{
> +  if (!BYTES_BIG_ENDIAN)
> +    return "uzp1\\t%0.<Vtruncd>, %1.<Vtruncd>, %2.<Vtruncd>";
> +  else
> +    return "uzp1\\t%0.<Vtruncd>, %2.<Vtruncd>, %1.<Vtruncd>";
> +}
> +  [(set_attr "type" "neon_permute<q>")]
> +)

This doesn't look right though.  float_truncate is an arithmetic
operation that would need to convert, say, 1.0 double-precision into
1.0f single-precision.  It doesn't simply drop the upper bits.
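To make the difference concrete, here is a standalone illustration
(not part of the patch): for 1.0, the two operations give different
bit patterns, because float_truncate rounds the value to the narrower
format whereas truncate keeps the low-order bits of the representation.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

int
main (void)
{
  double d = 1.0;

  /* float_truncate: an arithmetic conversion that rounds the value to
     single precision.  1.0 becomes 1.0f, whose bits are 0x3f800000.  */
  float f = (float) d;

  /* truncate: keep only the low 32 bits of the representation.  The
     low half of 0x3ff0000000000000 (the bits of 1.0) is 0x00000000,
     which is the representation of 0.0f, not of 1.0f.  */
  uint64_t dbits;
  memcpy (&dbits, &d, sizeof dbits);
  uint32_t tbits = (uint32_t) dbits;

  uint32_t fbits;
  memcpy (&fbits, &f, sizeof fbits);

  printf ("float_truncate: 0x%08x\n", (unsigned int) fbits);
  printf ("truncate:       0x%08x\n", (unsigned int) tbits);
  return 0;
}

Since UZP1 only rearranges lanes, it can implement the bit-level
truncate but not this value conversion; the double-to-single case
would still need a real conversion instruction such as FCVTN.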
Thanks,
Richard

> +
>  ;; Packing doubles.
>
>  (define_expand "vec_pack_trunc_<mode>"
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index d7cb27e1885..008629ecf63 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -181,6 +181,9 @@
>  ;; Advanced SIMD single Float modes.
>  (define_mode_iterator VDQSF [V2SF V4SF])
>
> +;; Quad vector Float modes with single and double elements.
> +(define_mode_iterator VQ_SDF [V4SF V2DF])
> +
>  ;; Quad vector Float modes with half/single elements.
>  (define_mode_iterator VQ_HSF [V8HF V4SF])
>
> @@ -1722,6 +1725,14 @@
>  (define_mode_attr Vnarrowq2 [(V8HI "v16qi") (V4SI "v8hi")
>                               (V2DI "v4si")])
>
> +;; Truncated Advanced SIMD modes which preserve the number of lanes.
> +(define_mode_attr VTRUNCD [(V8HI "V8QI") (V4SI "V4HI")
> +                           (V4SF "V4HF") (V2DI "V2SI")
> +                           (V2DF "V2SF")])
> +(define_mode_attr Vtruncd [(V8HI "8b") (V4SI "4h")
> +                           (V4SF "4h") (V2DI "2s")
> +                           (V2DF "2s")])
> +
>  ;; Narrowed modes of vector modes.
>  (define_mode_attr VNARROW [(VNx8HI "VNx16QI")
>                             (VNx4SI "VNx8HI") (VNx4SF "VNx8HF")
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c b/gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c
> new file mode 100644
> index 00000000000..400428accd2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/truncated_concatenation_1.c
> @@ -0,0 +1,46 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -Wall -march=armv8.2-a+sve" } */
> +
> +#include <arm_neon.h>
> +#include <arm_sve.h>
> +
> +int8x8_t f1 (int16x4_t a, int16x4_t b) {
> +    int8x8_t ab = vdup_n_s8 (0);
> +    int16x8_t ab_concat = vcombine_s16 (a, b);
> +    ab = vmovn_s16 (ab_concat);
> +    return ab;
> +}
> +
> +int16x4_t f2 (int32x2_t a, int32x2_t b) {
> +    int16x4_t ab = vdup_n_s16 (0);
> +    int32x4_t ab_concat = vcombine_s32 (a, b);
> +    ab = vmovn_s32 (ab_concat);
> +    return ab;
> +}
> +
> +float16x4_t f3 (float32x2_t a, float32x2_t b) {
> +    float16x4_t ab = vdup_n_f16 (0);
> +    float32x4_t ab_concat = vcombine_f32 (a, b);
> +    ab = vcvt_f16_f32 (ab_concat);
> +    return ab;
> +}
> +
> +int32x2_t f4 (svint64_t a, svint64_t b) {
> +    int32x2_t ab = vdup_n_s32 (0);
> +    ab = vset_lane_s32 ((int)svaddv_s64 (svptrue_b64 (), a), ab, 0);
> +    ab = vset_lane_s32 ((int)svaddv_s64 (svptrue_b64 (), b), ab, 1);
> +    return ab;
> +}
> +
> +float32x2_t f5 (svfloat64_t a, svfloat64_t b) {
> +    float32x2_t ab = vdup_n_f32 (0);
> +    ab = vset_lane_f32 ((float)svaddv_f64 (svptrue_b64(), a), ab, 0);
> +    ab = vset_lane_f32 ((float)svaddv_f64 (svptrue_b64(), b), ab, 1);
> +    return ab;
> +}
> +
> +/* { dg-final { scan-assembler-not {\txtn\t} } }*/
> +/* { dg-final { scan-assembler-not {\tfcvtn\t} } }*/
> +/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.8b, v[0-9]+\.8b, v[0-9]+\.8b} 1 } }*/
> +/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.4h, v[0-9]+\.4h, v[0-9]+\.4h} 2 } }*/
> +/* { dg-final { scan-assembler-times {\tuzp1\tv[0-9]+\.2s, v[0-9]+\.2s, v[0-9]+\.2s} 2 } }*/
> \ No newline at end of file