Wilco Dijkstra <wilco.dijkstra@arm.com> writes: > Use UZP1 instead of INS when combining low and high halves of vectors. > UZP1 has 3 operands which improves register allocation, and is faster on > some microarchitectures. > > Passes regress & bootstrap, OK for commit?
OK, thanks. We can add core-specific tuning later if a supported core strongly prefers INS for some reason, but I agree that the three-address nature of UZP1 makes it the better default choice. Richard > > gcc: > * config/aarch64/aarch64-simd.md (aarch64_combine_internal<mode>): > Use UZP1 instead of INS. > (aarch64_combine_internal_be<mode>): Likewise. > > gcc/testsuite: > * gcc.target/aarch64/ldp_stp_16.c: Update to check for UZP1. > * gcc.target/aarch64/pr109072_1.c: Likewise. > * gcc.target/aarch64/vec-init-14.c: Likewise. > * gcc.target/aarch64/vec-init-9.c: Likewise. > > --- > > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index > f8bb973a278c7964f3e3a4f7154a0ab62214b7cf..16b7445d9f72f77a98ab262e21fd24e6cc97eba0 > 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -4388,7 +4388,7 @@ > && (register_operand (operands[0], <VDBL>mode) > || register_operand (operands[2], <MODE>mode))" > {@ [ cons: =0 , 1 , 2 ; attrs: type , arch ] > - [ w , 0 , w ; neon_ins<dblq> , simd ] > ins\t%0.<single_type>[1], %2.<single_type>[0] > + [ w , w , w ; neon_permute<dblq> , simd ] > uzp1\t%0.2<single_type>, %1.2<single_type>, %2.2<single_type> > [ w , 0 , ?r ; neon_from_gp<dblq> , simd ] > ins\t%0.<single_type>[1], %<single_wx>2 > [ w , 0 , ?r ; f_mcr , * ] > fmov\t%0.d[1], %2 > [ w , 0 , Utv ; neon_load1_one_lane<dblq> , simd ] > ld1\t{%0.<single_type>}[1], %2 > @@ -4407,7 +4407,7 @@ > && (register_operand (operands[0], <VDBL>mode) > || register_operand (operands[2], <MODE>mode))" > {@ [ cons: =0 , 1 , 2 ; attrs: type , arch ] > - [ w , 0 , w ; neon_ins<dblq> , simd ] > ins\t%0.<single_type>[1], %2.<single_type>[0] > + [ w , w , w ; neon_permute<dblq> , simd ] > uzp1\t%0.2<single_type>, %1.2<single_type>, %2.2<single_type> > [ w , 0 , ?r ; neon_from_gp<dblq> , simd ] > ins\t%0.<single_type>[1], %<single_wx>2 > [ w , 0 , ?r ; f_mcr , * ] > fmov\t%0.d[1], %2 > [ w , 0 , Utv ; 
neon_load1_one_lane<dblq> , simd ] > ld1\t{%0.<single_type>}[1], %2 > diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c > b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c > index > f1f46e051a86d160a7f7f14872108da87b444ca1..95835aa2eb41c289e7b74f19bb56cf6fa23a3045 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c > +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_16.c > @@ -80,16 +80,16 @@ CONS2_FN (2, float); > > /* > ** cons2_4_float: { target aarch64_little_endian } > -** ins v0.s\[1\], v1.s\[0\] > -** stp d0, d0, \[x0\] > -** stp d0, d0, \[x0, #?16\] > +** uzp1 v([0-9])\.2s, v0\.2s, v1\.2s > +** stp d\1, d\1, \[x0\] > +** stp d\1, d\1, \[x0, #?16\] > ** ret > */ > /* > ** cons2_4_float: { target aarch64_big_endian } > -** ins v1.s\[1\], v0.s\[0\] > -** stp d1, d1, \[x0\] > -** stp d1, d1, \[x0, #?16\] > +** uzp1 v([0-9])\.2s, v1\.2s, v0\.2s > +** stp d\1, d\1, \[x0\] > +** stp d\1, d\1, \[x0, #?16\] > ** ret > */ > CONS2_FN (4, float); > @@ -125,8 +125,8 @@ CONS4_FN (2, float); > > /* > ** cons4_4_float: > -** ins v[0-9]+\.s[^\n]+ > -** ins v[0-9]+\.s[^\n]+ > +** uzp1 v[0-9]+\.2s[^\n]+ > +** uzp1 v[0-9]+\.2s[^\n]+ > ** zip1 v([0-9]+).4s, [^\n]+ > ** stp q\1, q\1, \[x0\] > ** stp q\1, q\1, \[x0, #?32\] > diff --git a/gcc/testsuite/gcc.target/aarch64/pr109072_1.c > b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c > index > 6c1d2b0bdccfb74b80d938a0d94413f0f9dda5ab..0fc195a598f3b82ff188b3151e77e1272254b78c > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/pr109072_1.c > +++ b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c > @@ -54,7 +54,7 @@ f32x2_1 (float32_t x) > > /* > ** f32x2_2: > -** ins v0\.s\[1\], v1.s\[0\] > +** uzp1 v0\.2s, v0\.2s, v1\.2s > ** ret > */ > float32x2_t > @@ -165,7 +165,7 @@ f64x2_1 (float64_t x) > > /* > ** f64x2_2: > -** ins v0\.d\[1\], v1.d\[0\] > +** uzp1 v0\.2d, v0\.2d, v1\.2d > ** ret > */ > float64x2_t > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-14.c > b/gcc/testsuite/gcc.target/aarch64/vec-init-14.c > 
index > 02875088cd98833882cdf15b14dcb426951e428f..1a2cc9fbf473ad0de2d8ef97d7efdbe40d959866 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/vec-init-14.c > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-14.c > @@ -67,7 +67,7 @@ int32x2_t s32_6(int32_t a0, int32_t a1) { > > /* > ** f32_1: > -** ins v0\.s\[1\], v1\.s\[0\] > +** uzp1 v0\.2s, v0\.2s, v1\.2s > ** ret > */ > float32x2_t f32_1(float32_t a0, float32_t a1) { > @@ -90,7 +90,7 @@ float32x2_t f32_2(float32_t a0, float32_t *ptr) { > /* > ** f32_3: > ** ldr s0, \[x0\] > -** ins v0\.s\[1\], v1\.s\[0\] > +** uzp1 v0\.2s, v0\.2s, v1\.2s > ** ret > */ > float32x2_t f32_3(float32_t a0, float32_t a1, float32_t *ptr) { > diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-9.c > b/gcc/testsuite/gcc.target/aarch64/vec-init-9.c > index > 8f68e06a55925b973a87723c7b5924264382e4b0..3cf05cf865e21fad482e5ffc8c769d0f15a57e74 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/vec-init-9.c > +++ b/gcc/testsuite/gcc.target/aarch64/vec-init-9.c > @@ -75,7 +75,7 @@ int64x2_t s64q_6(int64_t a0, int64_t a1) { > > /* > ** f64q_1: > -** ins v0\.d\[1\], v1\.d\[0\] > +** uzp1 v0\.2d, v0\.2d, v1\.2d > ** ret > */ > float64x2_t f64q_1(float64_t a0, float64_t a1) { > @@ -98,7 +98,7 @@ float64x2_t f64q_2(float64_t a0, float64_t *ptr) { > /* > ** f64q_3: > ** ldr d0, \[x0\] > -** ins v0\.d\[1\], v1\.d\[0\] > +** uzp1 v0\.2d, v0\.2d, v1\.2d > ** ret > */ > float64x2_t f64q_3(float64_t a0, float64_t a1, float64_t *ptr) { > @@ -140,7 +140,7 @@ float64x2_t f64q_6(float64_t a0, float64_t a1) { > > /* > ** s32q_1: > -** ins v0\.d\[1\], v1\.d\[0\] > +** uzp1 v0\.2d, v0\.2d, v1\.2d > ** ret > */ > int32x4_t s32q_1(int32x2_t a0, int32x2_t a1) { > @@ -157,7 +157,7 @@ int32x4_t s32q_2(int32x2_t a0, int32x2_t *ptr) { > /* > ** s32q_3: > ** ldr d0, \[x0\] > -** ins v0\.d\[1\], v1\.d\[0\] > +** uzp1 v0\.2d, v0\.2d, v1\.2d > ** ret > */ > int32x4_t s32q_3(int32x2_t a0, int32x2_t a1, int32x2_t *ptr) { > @@ -204,7 +204,7 @@ int32x4_t s32q_6(int32x2_t a0, 
int32x2_t a1) { > > /* > ** f32q_1: > -** ins v0\.d\[1\], v1\.d\[0\] > +** uzp1 v0\.2d, v0\.2d, v1\.2d > ** ret > */ > float32x4_t f32q_1(float32x2_t a0, float32x2_t a1) { > @@ -221,7 +221,7 @@ float32x4_t f32q_2(float32x2_t a0, float32x2_t *ptr) { > /* > ** f32q_3: > ** ldr d0, \[x0\] > -** ins v0\.d\[1\], v1\.d\[0\] > +** uzp1 v0\.2d, v0\.2d, v1\.2d > ** ret > */ > float32x4_t f32q_3(float32x2_t a0, float32x2_t a1, float32x2_t *ptr) {