Sorry for the slow review. Jennifer Schmitz <jschm...@nvidia.com> writes: > SVE loads and stores where the predicate is all-true can be optimized to > unpredicated instructions. For example, > svuint8_t foo (uint8_t *x) > { > return svld1 (svptrue_b8 (), x); > } > was compiled to: > foo: > ptrue p3.b, all > ld1b z0.b, p3/z, [x0] > ret > but can be compiled to: > foo: > ldr z0, [x0] > ret > > Late_combine2 had already been trying to do this, but was missing the > instruction: > (set (reg/i:VNx16QI 32 v0) > (unspec:VNx16QI [ > (const_vector:VNx16BI repeat [ > (const_int 1 [0x1]) > ]) > (mem:VNx16QI (reg/f:DI 0 x0 [orig:106 x ] [106]) > [0 MEM <svuint8_t> [(unsigned char *)x_2(D)]+0 S[16, 16] A8]) > ] UNSPEC_PRED_X)) > > This patch adds a new define_insn_and_split that matches the missing > instruction and splits it to an unpredicated load/store. Because LDR > offers fewer addressing modes than LD1[BHWD], the pattern is > guarded under reload_completed to only apply the transform once the > address modes have been chosen during RA. > > The patch was bootstrapped and tested on aarch64-linux-gnu, no regression. > OK for mainline? > > Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com> > > gcc/ > * config/aarch64/aarch64-sve.md (*aarch64_sve_ptrue<mode>_ldr_str): > Add define_insn_and_split to fold predicated SVE loads/stores with > ptrue predicates to unpredicated instructions. > > gcc/testsuite/ > * gcc.target/aarch64/sve/ptrue_ldr_str.c: New test. > * gcc.target/aarch64/sve/cost_model_14.c: Adjust expected outcome. > * gcc.target/aarch64/sve/cost_model_4.c: Adjust expected outcome. > * gcc.target/aarch64/sve/cost_model_5.c: Adjust expected outcome. > * gcc.target/aarch64/sve/cost_model_6.c: Adjust expected outcome. > * gcc.target/aarch64/sve/cost_model_7.c: Adjust expected outcome. > * gcc.target/aarch64/sve/pcs/varargs_2_f16.c: Adjust expected outcome. > * gcc.target/aarch64/sve/pcs/varargs_2_f32.c: Adjust expected outcome. > * gcc.target/aarch64/sve/pcs/varargs_2_f64.c: Adjust expected outcome. > * gcc.target/aarch64/sve/pcs/varargs_2_mf8.c: Adjust expected outcome. > * gcc.target/aarch64/sve/pcs/varargs_2_s16.c: Adjust expected outcome. > * gcc.target/aarch64/sve/pcs/varargs_2_s32.c: Adjust expected outcome. > * gcc.target/aarch64/sve/pcs/varargs_2_s64.c: Adjust expected outcome. > * gcc.target/aarch64/sve/pcs/varargs_2_s8.c: Adjust expected outcome. > * gcc.target/aarch64/sve/pcs/varargs_2_u16.c: Adjust expected outcome. > * gcc.target/aarch64/sve/pcs/varargs_2_u32.c: Adjust expected outcome. > * gcc.target/aarch64/sve/pcs/varargs_2_u64.c: Adjust expected outcome. > * gcc.target/aarch64/sve/pcs/varargs_2_u8.c: Adjust expected outcome. > * gcc.target/aarch64/sve/peel_ind_2.c: Adjust expected outcome. > * gcc.target/aarch64/sve/single_1.c: Adjust expected outcome. > * gcc.target/aarch64/sve/single_2.c: Adjust expected outcome. > * gcc.target/aarch64/sve/single_3.c: Adjust expected outcome. > * gcc.target/aarch64/sve/single_4.c: Adjust expected outcome. > --- > gcc/config/aarch64/aarch64-sve.md | 17 ++++ > .../aarch64/sve/acle/general/attributes_6.c | 8 +- > .../gcc.target/aarch64/sve/cost_model_14.c | 4 +- > .../gcc.target/aarch64/sve/cost_model_4.c | 3 +- > .../gcc.target/aarch64/sve/cost_model_5.c | 3 +- > .../gcc.target/aarch64/sve/cost_model_6.c | 3 +- > .../gcc.target/aarch64/sve/cost_model_7.c | 3 +- > .../aarch64/sve/pcs/varargs_2_f16.c | 93 +++++++++++++++++-- > .../aarch64/sve/pcs/varargs_2_f32.c | 93 +++++++++++++++++-- > .../aarch64/sve/pcs/varargs_2_f64.c | 93 +++++++++++++++++-- > .../aarch64/sve/pcs/varargs_2_mf8.c | 32 +++---- > .../aarch64/sve/pcs/varargs_2_s16.c | 93 +++++++++++++++++-- > .../aarch64/sve/pcs/varargs_2_s32.c | 93 +++++++++++++++++-- > .../aarch64/sve/pcs/varargs_2_s64.c | 93 +++++++++++++++++-- > .../gcc.target/aarch64/sve/pcs/varargs_2_s8.c | 34 +++---- > .../aarch64/sve/pcs/varargs_2_u16.c | 93 +++++++++++++++++-- > .../aarch64/sve/pcs/varargs_2_u32.c | 93 +++++++++++++++++-- > .../aarch64/sve/pcs/varargs_2_u64.c | 93 +++++++++++++++++-- > .../gcc.target/aarch64/sve/pcs/varargs_2_u8.c | 32 +++---- > .../gcc.target/aarch64/sve/peel_ind_2.c | 4 +- > .../gcc.target/aarch64/sve/ptrue_ldr_str.c | 31 +++++++ > .../gcc.target/aarch64/sve/single_1.c | 11 ++- > .../gcc.target/aarch64/sve/single_2.c | 11 ++- > .../gcc.target/aarch64/sve/single_3.c | 11 ++- > .../gcc.target/aarch64/sve/single_4.c | 11 ++- > 25 files changed, 907 insertions(+), 148 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/ptrue_ldr_str.c > > diff --git a/gcc/config/aarch64/aarch64-sve.md > b/gcc/config/aarch64/aarch64-sve.md > index d4af3706294..03b7194d200 100644 > --- a/gcc/config/aarch64/aarch64-sve.md > +++ b/gcc/config/aarch64/aarch64-sve.md > @@ -702,6 +702,23 @@ > } > ) > > +;; Fold predicated loads/stores with a PTRUE predicate to unpredicated > +;; loads/stores after RA. > +(define_insn_and_split "*aarch64_sve_ptrue<mode>_ldr_str" > + [(set (match_operand:SVE_FULL 0 "aarch64_sve_nonimmediate_operand") > + (unspec:SVE_FULL > + [(match_operand:<VPRED> 1 "aarch64_simd_imm_one") > + (match_operand:SVE_FULL 2 "aarch64_sve_nonimmediate_operand")] > + UNSPEC_PRED_X))]
Since this is a define_insn, it ought to have constraints for non-immediate operands. Probably: [(set (match_operand:SVE_FULL 0 "aarch64_sve_nonimmediate_operand" "=Utr,w") (unspec:SVE_FULL [(match_operand:<VPRED> 1 "aarch64_simd_imm_one") (match_operand:SVE_FULL 2 "aarch64_sve_nonimmediate_operand" "w,Utr")] UNSPEC_PRED_X))] > + "TARGET_SVE && reload_completed > + && (<MODE>mode == VNx16QImode || !BYTES_BIG_ENDIAN) > + && ((REG_P (operands[0]) && MEM_P (operands[2])) > + || (REG_P (operands[2]) && MEM_P (operands[0])))" > + "#" > + "&& 1" > + [(set (match_dup 0) > + (match_dup 2))]) > + > ;; Unpredicated moves that cannot use LDR and STR, i.e. partial vectors > ;; or vectors for which little-endian ordering isn't acceptable. Memory > ;; accesses require secondary reloads. > [...] > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s8.c > b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s8.c > index 02f4bec9a9c..9528ea3f48b 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s8.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/varargs_2_s8.c > @@ -8,9 +8,9 @@ > /* > ** callee_0: > ** ... > -** ld1b (z[0-9]+\.b), (p[0-7])/z, \[x1\] > +** ldr (z[0-9]+), \[x1\] > ** ... > -** st1b \1, \2, \[x0\] > +** str \1, \[x0\] > ** ... > ** ret > */ > @@ -23,15 +23,15 @@ callee_0 (int8_t *ptr, ...) > va_start (va, ptr); > vec = va_arg (va, svint8_t); > va_end (va); > - svst1 (svptrue_b8 (), ptr, vec); > +svst1 (svptrue_b8 (), ptr, vec); The last part looks like a spurious change. > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/single_1.c > b/gcc/testsuite/gcc.target/aarch64/sve/single_1.c > index d9bb97e12cd..b9c3d3e5241 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/single_1.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/single_1.c > @@ -40,12 +40,13 @@ TEST_LOOP (double, 3.0) > /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #2\.0e\+0\n} 1 } } > */ > /* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.d, #3\.0e\+0\n} 1 } } > */ > > -/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl32\n} 11 } } */ > +/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl32\n} 11 { > target aarch64_big_endian } } } */ > > -/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b,} 2 } } */ > -/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 3 } } */ > -/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 3 } } */ > -/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 3 } } */ > +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 3 { target > aarch64_big_endian } } } */ > +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 3 { target > aarch64_big_endian } } } */ > +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 3 { target > aarch64_big_endian } } } */ > +/* { dg-final { scan-assembler-times {\tstr\tz[0-9]+, \[x0\]} 2 { target > aarch64_big_endian } } } */ > +/* { dg-final { scan-assembler-times {\tstr\tz[0-9]+, \[x0\]} 11 { target > aarch64_little_endian } } } */ I should try it for myself, sorry, but: why do we still have 11 ptrues for big-endian, rather than 9, if the st1bs are converted to strs? Same for the other single_* tests. If 9 is correct, then the patch is OK with that change and the changes above. Thanks, Richard