Jennifer Schmitz <jschm...@nvidia.com> writes:
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index f7bccf532f8..1c06b8528e9 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -6416,13 +6416,30 @@ aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
>  void
>  aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
>  {
> -  expand_operand ops[3];
>    machine_mode mode = GET_MODE (dest);
> -  create_output_operand (&ops[0], dest, mode);
> -  create_input_operand (&ops[1], pred, GET_MODE(pred));
> -  create_input_operand (&ops[2], src, mode);
> -  temporary_volatile_ok v (true);
> -  expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
> +  if ((MEM_P (dest) || MEM_P (src))
> +      && known_eq (BYTES_PER_SVE_VECTOR, 16)
I suppose it's personal preference, but I think this would be more
obvious as:

  known_eq (GET_MODE_SIZE (mode), 16)

so that everything is defined/tested in terms of the mode that we want
to move.

> +      && aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
> +      && !BYTES_BIG_ENDIAN)
> +    {
> +      if (MEM_P (src))
> +        {
> +          rtx tmp = force_reg (V16QImode, adjust_address (src, V16QImode, 0));
> +          emit_move_insn (dest, lowpart_subreg (mode, tmp, V16QImode));
> +        }
> +      else
> +        emit_move_insn (adjust_address (dest, V16QImode, 0),
> +                        force_lowpart_subreg (V16QImode, src, mode));
> +    }
> +  else
> +    {
> +      expand_operand ops[3];
> +      create_output_operand (&ops[0], dest, mode);
> +      create_input_operand (&ops[1], pred, GET_MODE(pred));
> +      create_input_operand (&ops[2], src, mode);
> +      temporary_volatile_ok v (true);
> +      expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
> +    }
>  }
> 
>  /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
> [...]
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c b/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c
> index d99ce1202a9..370bd9e3bfe 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/struct_3_128.c
> @@ -473,17 +473,11 @@ SEL2 (struct, pst_uniform4)
>  **	sub	sp, sp, #144
>  **	add	(x[0-9]+), sp, #?31
>  **	and	x7, \1, #?(?:-32|4294967264)
> -**	ptrue	(p[0-7])\.b, vl16
> -**	st1w	z0\.s, \2, \[x7\]
> -**	add	(x[0-9]+), x7, #?32
> -** (
> -**	str	z1, \[\3\]
> -**	str	z2, \[\3, #1, mul vl\]
> -** |
> -**	stp	q1, q2, \[\3\]
> -** )
> -**	str	z3, \[\3, #2, mul vl\]
> -**	st1w	z4\.s, \2, \[x7, #6, mul vl\]
> +**	mov	x0, x7
> +**	str	q0, \[x0\], 32
> +**	stp	q1, q2, \[x0\]
> +**	str	z3, \[x0, #2, mul vl\]
> +**	str	q4, \[x7, 96\]
>  **	add	sp, sp, #?144
>  **	ret
>  */

Sorry for not noticing last time, but: there's no need for the temporary
register to be x0, so it should be handled using a capture rather than
hard-coding x0, as with the original ADD.

Also, the patch doesn't change the handling of unpredicated SVE stores,
and so doesn't change whether we use:

	str	z1, \[\3\]
	str	z2, \[\3, #1, mul vl\]

or:

	stp	q1, q2, \[\3\]

So I think we should keep the (...|...) part of the test as-is, with just
the captures updated.  There would be a benefit in "defending" the use of
STP if we paired q0+q1 and q2+q3, but until then, I think we should
continue to accept both.

Same for the other tests in the file.

Thanks,
Richard
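
P.S. For concreteness, with the GET_MODE_SIZE check the guarding condition
would read something like this (untested, just to show the shape; the
surrounding lines are copied from the quoted hunk above):

  if ((MEM_P (dest) || MEM_P (src))
      && known_eq (GET_MODE_SIZE (mode), 16)
      && aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
      && !BYTES_BIG_ENDIAN)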
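
And one possible shape for the struct_3_128.c body above, with the
temporary register captured rather than hard-coded and the (...|...)
alternative kept (untested; it assumes the MOV destination becomes
capture \2):

**	sub	sp, sp, #144
**	add	(x[0-9]+), sp, #?31
**	and	x7, \1, #?(?:-32|4294967264)
**	mov	(x[0-9]+), x7
**	str	q0, \[\2\], 32
** (
**	str	z1, \[\2\]
**	str	z2, \[\2, #1, mul vl\]
** |
**	stp	q1, q2, \[\2\]
** )
**	str	z3, \[\2, #2, mul vl\]
**	str	q4, \[x7, 96\]
**	add	sp, sp, #?144
**	ret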