This aarch64-specific vld2 intrinsics testcase was fixed by r16-1113-g069caa5cea91f (simple copy propagation for aggregates). I didn't include it in the original patch because I was testing on x86_64, but I have now gotten around to testing it, and there are no more extra movs, so let's add a testcase.

Tested for aarch64-linux-gnu.

	PR tree-optimization/89606

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vld2-1.c: New test.

Signed-off-by: Andrew Pinski <quic_apin...@quicinc.com>
---
 gcc/testsuite/gcc.target/aarch64/vld2-1.c | 45 +++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vld2-1.c

diff --git a/gcc/testsuite/gcc.target/aarch64/vld2-1.c b/gcc/testsuite/gcc.target/aarch64/vld2-1.c
new file mode 100644
index 00000000000..8a267674df1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vld2-1.c
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-forwprop1-details" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+/* PR tree-optimization/89606 */
+
+#include <arm_neon.h>
+
+/*
+**func1:
+**	ld2	{v0.2d - v1.2d}, \[x0\]
+**	ld2	{v0.d - v1.d}\[1\], \[x1\]
+**	ret
+*/
+float64x2x2_t func1(const double *p1, const double *p2)
+{
+  float64x2x2_t v = vld2q_f64(p1);
+  return vld2q_lane_f64(p2, v, 1);
+}
+
+/*
+**func2:
+**	ld2	{v0.2s - v1.2s}, \[x0\]
+**	ld2	{v0.s - v1.s}\[1\], \[x1\]
+**	ret
+*/
+float32x2x2_t func2(const float *p1, const float *p2)
+{
+  float32x2x2_t v = vld2_f32(p1);
+  return vld2_lane_f32(p2, v, 1);
+}
+
+/*
+**func3:
+**	ld2	{v([0-9]+).2s - v([0-9]+).2s}, \[x1\]
+**	ld2	{v\1.s - v\2.s}\[1\], \[x2\]
+**	stp	d\1, d\2, \[x0\]
+**	ret
+*/
+void func3(float32x2x2_t *p, const float *p1, const float *p2)
+{
+  float32x2x2_t v = vld2_f32(p1);
+  *p = vld2_lane_f32(p2, v, 1);
+}
+
+/* { dg-final { scan-tree-dump-times "after previous" 3 "forwprop1" } } */
-- 
2.43.0