On 10/08/17 14:12, Jackson Woodruff wrote: > Hi all, > > This patch changes patterns in aarch64-simd.md to replace > > movi v0.4s, 0 > str q0, [x0, 16] > > With: > > stp xzr, xzr, [x0, 16] > > When we are storing zeros to vectors like this: > > void f(uint32x4_t *p) { > uint32x4_t x = { 0, 0, 0, 0}; > p[1] = x; > } > > Bootstrapped and regtested on aarch64 with no regressions. > OK for trunk? > > Jackson > > gcc/ > > 2017-08-09 Jackson Woodruff <jackson.woodr...@arm.com> > > * aarch64-simd.md (mov<mode>): No longer force zero > immediate into register. > (*aarch64_simd_mov<mode>): Add new case for stp > using zero immediate. > > > gcc/testsuite > > 2017-08-09 Jackson Woodruff <jackson.woodr...@arm.com> > > * gcc.target/aarch64/simd/neon_str_zero.c: New. > > > patchfile > > > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index > 74de9b8c89dd5e4e3d87504594c969de0e0128ce..0149a742d34ae4fd5b3fd705b03c845f94aa1d59 > 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -23,7 +23,10 @@ > (match_operand:VALL_F16 1 "general_operand" ""))] > "TARGET_SIMD" > " > - if (GET_CODE (operands[0]) == MEM) > + if (GET_CODE (operands[0]) == MEM > + && !(aarch64_simd_imm_zero (operands[1], <MODE>mode) > + && aarch64_legitimate_address_p (<MODE>mode, operands[0], > + PARALLEL, 1))) > operands[1] = force_reg (<MODE>mode, operands[1]); > " > ) > @@ -94,63 +97,70 @@ > > (define_insn "*aarch64_simd_mov<mode>" > [(set (match_operand:VD 0 "nonimmediate_operand" > - "=w, m, w, ?r, ?w, ?r, w") > + "=w, m, m, w, ?r, ?w, ?r, w") > (match_operand:VD 1 "general_operand" > - "m, w, w, w, r, r, Dn"))] > + "m, Dz, w, w, w, r, r, Dn"))] > "TARGET_SIMD > - && (register_operand (operands[0], <MODE>mode) > - || register_operand (operands[1], <MODE>mode))" > + && ((register_operand (operands[0], <MODE>mode) > + || register_operand (operands[1], <MODE>mode)) > + || (memory_operand (operands[0], <MODE>mode) > + && immediate_operand (operands[1], <MODE>mode)))"
Allowing any immediate here seems too lax - it allows any immediate value which then could cause reload operations to be inserted (that in turn might cause register pressure calculations to be incorrect). Wouldn't it be better to use something like aarch64_simd_reg_or_zero? Similarly below. R. > { > switch (which_alternative) > { > case 0: return "ldr\\t%d0, %1"; > - case 1: return "str\\t%d1, %0"; > - case 2: return "mov\t%0.<Vbtype>, %1.<Vbtype>"; > - case 3: return "umov\t%0, %1.d[0]"; > - case 4: return "fmov\t%d0, %1"; > - case 5: return "mov\t%0, %1"; > - case 6: > + case 1: return "str\\txzr, %0"; > + case 2: return "str\\t%d1, %0"; > + case 3: return "mov\t%0.<Vbtype>, %1.<Vbtype>"; > + case 4: return "umov\t%0, %1.d[0]"; > + case 5: return "fmov\t%d0, %1"; > + case 6: return "mov\t%0, %1"; > + case 7: > return aarch64_output_simd_mov_immediate (operands[1], > <MODE>mode, 64); > default: gcc_unreachable (); > } > } > - [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\ > + [(set_attr "type" "neon_load1_1reg<q>, neon_stp, neon_store1_1reg<q>,\ > neon_logic<q>, neon_to_gp<q>, f_mcr,\ > mov_reg, neon_move<q>")] > ) > > (define_insn "*aarch64_simd_mov<mode>" > [(set (match_operand:VQ 0 "nonimmediate_operand" > - "=w, m, w, ?r, ?w, ?r, w") > + "=w, Ump, m, w, ?r, ?w, ?r, w") > (match_operand:VQ 1 "general_operand" > - "m, w, w, w, r, r, Dn"))] > + "m, Dz, w, w, w, r, r, Dn"))] > "TARGET_SIMD > - && (register_operand (operands[0], <MODE>mode) > - || register_operand (operands[1], <MODE>mode))" > + && ((register_operand (operands[0], <MODE>mode) > + || register_operand (operands[1], <MODE>mode)) > + || (memory_operand (operands[0], <MODE>mode) > + && immediate_operand (operands[1], <MODE>mode)))" > { > switch (which_alternative) > { > case 0: > return "ldr\\t%q0, %1"; > case 1: > - return "str\\t%q1, %0"; > + return "stp\\txzr, xzr, %0"; > case 2: > - return "mov\t%0.<Vbtype>, %1.<Vbtype>"; > + return "str\\t%q1, %0"; > case 3: > + return "mov\t%0.<Vbtype>, %1.<Vbtype>"; > case 4: > case 5: > - return "#"; > case 6: > + return "#"; > + case 7: > return aarch64_output_simd_mov_immediate (operands[1], <MODE>mode, 128); > default: > gcc_unreachable (); > } > } > [(set_attr "type" "neon_load1_1reg<q>, neon_store1_1reg<q>,\ > - neon_logic<q>, multiple, multiple, multiple,\ > - neon_move<q>") > - (set_attr "length" "4,4,4,8,8,8,4")] > + neon_stp, neon_logic<q>, multiple, multiple,\ > + multiple, neon_move<q>") > + (set_attr "length" "4,4,4,4,8,8,8,4")] > ) > > ;; When storing lane zero we can use the normal STR and its more permissive > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c > b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..07198de109432b530745cc540790303ae0245efb > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/simd/neon_str_zero.c > @@ -0,0 +1,22 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O1" } */ > + > +#include <arm_neon.h> > + > +void > +f (uint32x4_t *p) > +{ > + uint32x4_t x = { 0, 0, 0, 0}; > + p[1] = x; > + > + /* { dg-final { scan-assembler "stp\txzr, xzr," } } */ > +} > + > +void > +g (float32x2_t *p) > +{ > + float32x2_t x = {0.0, 0.0}; > + p[0] = x; > + > + /* { dg-final { scan-assembler "str\txzr, " } } */ > +} >