Hi Richard,

Sorry for the delay in getting back to this. I'm now working on a patch to adjust this.
> -----Original Message-----
> From: Richard Sandiford <[email protected]>
> Sent: Tuesday, December 14, 2021 10:48 AM
> To: Kyrylo Tkachov via Gcc-patches <[email protected]>
> Cc: Kyrylo Tkachov <[email protected]>
> Subject: Re: [PATCH][1/4][committed] aarch64: Add support for Armv8.8-a memory operations and memcpy expansion
>
> Kyrylo Tkachov via Gcc-patches <[email protected]> writes:
> > @@ -23568,6 +23568,28 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
> >    *dst = aarch64_progress_pointer (*dst);
> >  }
> >
> > +/* Expand a cpymem using the MOPS extension.  OPERANDS are taken
> > +   from the cpymem pattern.  Return true iff we succeeded.  */
> > +static bool
> > +aarch64_expand_cpymem_mops (rtx *operands)
> > +{
> > +  if (!TARGET_MOPS)
> > +    return false;
> > +  rtx addr_dst = XEXP (operands[0], 0);
> > +  rtx addr_src = XEXP (operands[1], 0);
> > +  rtx sz_reg = operands[2];
> > +
> > +  if (!REG_P (sz_reg))
> > +    sz_reg = force_reg (DImode, sz_reg);
> > +  if (!REG_P (addr_dst))
> > +    addr_dst = force_reg (DImode, addr_dst);
> > +  if (!REG_P (addr_src))
> > +    addr_src = force_reg (DImode, addr_src);
> > +  emit_insn (gen_aarch64_cpymemdi (addr_dst, addr_src, sz_reg));
> > +
> > +  return true;
> > +}
>
> On this, I think it would be better to adjust the original src and dst
> MEMs if possible, since they contain metadata about the size of the
> access and alias set information.  It looks like the code above
> generates an instruction with a wild read and a wild write instead.

Hmm, do you mean adjusting the addresses of the MEMs in operands with something like replace_equiv_address_nv?
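That is, an untested sketch of my reading of the suggestion (the copy_to_reg calls pick up your point further down about forcing fresh registers):

  static bool
  aarch64_expand_cpymem_mops (rtx *operands)
  {
    if (!TARGET_MOPS)
      return false;
    /* Copy the addresses and size into fresh registers, since the
       instruction clobbers all three.  */
    rtx addr_dst = copy_to_reg (XEXP (operands[0], 0));
    rtx addr_src = copy_to_reg (XEXP (operands[1], 0));
    rtx sz_reg = copy_to_reg (operands[2]);
    /* Rewrite the original MEMs with the new addresses rather than
       building wild ones, so their size and alias set metadata is
       preserved.  */
    rtx dst_mem = replace_equiv_address_nv (operands[0], addr_dst);
    rtx src_mem = replace_equiv_address_nv (operands[1], addr_src);
    /* ... then emit aarch64_cpymemdi in terms of dst_mem and src_mem
       instead of the bare address registers ... */
    return true;
  }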
> It should be possible to do that with a define_expand/define_insn
> pair, where the define_expand takes two extra operands for the MEMs,
> but the define_insn contains the same operands as now.

Could you please expand on this a bit? This path is reached from the cpymemdi expander, which already takes the two MEMs as operands, and it generates the aarch64_cpymemdi define_insn, which uses just the address registers as operands. Should we carry the MEMs around in the define_insn as well after expand?
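For concreteness, my first guess at what the call site would look like with the two extra MEM operands appended (hypothetical operand order, untested):

  /* The aarch64_cpymemdi define_expand would take the metadata-carrying
     MEMs as two extra operands alongside the address/size registers and
     use them for the BLK memory references in the pattern.  */
  emit_insn (gen_aarch64_cpymemdi (addr_dst, addr_src, sz_reg,
                                   dst_mem, src_mem));

Is that the sort of thing you mean?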
> Since the instruction clobbers the three registers, I think we have to
> use copy_to_reg (unconditionally) to force a fresh register.  The ultimate
> caller is not expecting the values of the registers in the original
> address to change.

Thanks,
Kyrill

> Thanks,
> Richard
>
> > +
> >  /* Expand cpymem, as if from a __builtin_memcpy.  Return true if
> >     we succeed, otherwise return false, indicating that a libcall to
> >     memcpy should be emitted.  */
> > @@ -23581,19 +23603,25 @@ aarch64_expand_cpymem (rtx *operands)
> >    rtx base;
> >    machine_mode cur_mode = BLKmode;
> >
> > -  /* Only expand fixed-size copies.  */
> > +  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
> >    if (!CONST_INT_P (operands[2]))
> > -    return false;
> > +    return aarch64_expand_cpymem_mops (operands);
> >
> >    unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
> >
> > -  /* Try to inline up to 256 bytes.  */
> > -  unsigned HOST_WIDE_INT max_copy_size = 256;
> > +  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
> > +  unsigned HOST_WIDE_INT max_copy_size
> > +    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
> >
> >    bool size_p = optimize_function_for_size_p (cfun);
> >
> > +  /* Large constant-sized cpymem should go through MOPS when possible.
> > +     It should be a win even for size optimization in the general case.
> > +     For speed optimization the choice between MOPS and the SIMD sequence
> > +     depends on the size of the copy, rather than number of instructions,
> > +     alignment etc.  */
> >    if (size > max_copy_size)
> > -    return false;
> > +    return aarch64_expand_cpymem_mops (operands);
> >
> >    int copy_bits = 256;
> >
> > @@ -23643,9 +23671,9 @@ aarch64_expand_cpymem (rtx *operands)
> >        nops += 2;
> >        n -= mode_bits;
> >
> > -      /* Emit trailing copies using overlapping unaligned accesses - this is
> > -         smaller and faster.  */
> > -      if (n > 0 && n < copy_bits / 2)
> > +      /* Emit trailing copies using overlapping unaligned accesses
> > +         (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
> > +      if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
> >         {
> >           machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
> >           int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
> > @@ -23657,9 +23685,25 @@ aarch64_expand_cpymem (rtx *operands)
> >      }
> >    rtx_insn *seq = get_insns ();
> >    end_sequence ();
> > +  /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
> > +     the constant size into a register.  */
> > +  unsigned mops_cost = 3 + 1;
> > +
> > +  /* If MOPS is available at this point we don't consider the libcall as it's
> > +     not a win even on code size.  At this point only consider MOPS if
> > +     optimizing for size.  For speed optimizations we will have chosen between
> > +     the two based on copy size already.  */
> > +  if (TARGET_MOPS)
> > +    {
> > +      if (size_p && mops_cost < nops)
> > +        return aarch64_expand_cpymem_mops (operands);
> > +      emit_insn (seq);
> > +      return true;
> > +    }
> >
> >    /* A memcpy libcall in the worst case takes 3 instructions to prepare the
> > -     arguments + 1 for the call.  */
> > +     arguments + 1 for the call.  When MOPS is not available and we're
> > +     optimizing for size a libcall may be preferable.  */
> >    unsigned libcall_cost = 4;
> >    if (size_p && libcall_cost < nops)
> >      return false;
> > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> > index 5297b2d3f95744ac72e36814c6676cc97478d48b..d623c1b00bf62bf24420813fb7a3a6bf09ff1dc0 100644
> > --- a/gcc/config/aarch64/aarch64.md
> > +++ b/gcc/config/aarch64/aarch64.md
> > @@ -143,6 +143,7 @@ (define_c_enum "unspec" [
> >      UNSPEC_AUTIBSP
> >      UNSPEC_CALLEE_ABI
> >      UNSPEC_CASESI
> > +    UNSPEC_CPYMEM
> >      UNSPEC_CRC32B
> >      UNSPEC_CRC32CB
> >      UNSPEC_CRC32CH
> > @@ -1572,6 +1573,18 @@ (define_split
> >  }
> >  )
> >
> > +(define_insn "aarch64_cpymemdi"
> > +  [(parallel [
> > +   (set (match_operand:DI 2 "register_operand" "+&r") (const_int 0))
> > +   (clobber (match_operand:DI 0 "register_operand" "+&r"))
> > +   (clobber (match_operand:DI 1 "register_operand" "+&r"))
> > +   (set (mem:BLK (match_dup 0))
> > +        (unspec:BLK [(mem:BLK (match_dup 1)) (match_dup 2)] UNSPEC_CPYMEM))])]
> > + "TARGET_MOPS"
> > + "cpyfp\t[%x0]!, [%x1]!, %x2!\;cpyfm\t[%x0]!, [%x1]!, %x2!\;cpyfe\t[%x0]!, [%x1]!, %x2!"
> > + [(set_attr "length" "12")]
> > +)
> > +
> >  ;; 0 is dst
> >  ;; 1 is src
> >  ;; 2 is size of copy in bytes
> > @@ -1580,9 +1593,9 @@ (define_split
> >  (define_expand "cpymemdi"
> >    [(match_operand:BLK 0 "memory_operand")
> >     (match_operand:BLK 1 "memory_operand")
> > -   (match_operand:DI 2 "immediate_operand")
> > +   (match_operand:DI 2 "general_operand")
> >     (match_operand:DI 3 "immediate_operand")]
> > -   "!STRICT_ALIGNMENT"
> > +   "!STRICT_ALIGNMENT || TARGET_MOPS"
> >  {
> >    if (aarch64_expand_cpymem (operands))
> >      DONE;
> > diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
> > index 32191cf1acf43302c4a544b85db60b7b6e14da48..7445ed106cca9cb8a4537414499f6f28476951bf 100644
> > --- a/gcc/config/aarch64/aarch64.opt
> > +++ b/gcc/config/aarch64/aarch64.opt
> > @@ -280,3 +280,7 @@ Target Joined UInteger Var(aarch64_autovec_preference) Init(0) IntegerRange(0, 4
> >
> >  -param=aarch64-loop-vect-issue-rate-niters=
> >  Target Joined UInteger Var(aarch64_loop_vect_issue_rate_niters) Init(6) IntegerRange(0, 65536) Param
> > +
> > +-param=aarch64-mops-memcpy-size-threshold=
> > +Target Joined UInteger Var(aarch64_mops_memcpy_size_threshold) Init(256) Param
> > +Constant memcpy size in bytes above which to start using MOPS sequence.
> > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> > index 510ed079b99374028e38d20f3edbac12d75f7842..9c38277e4f317332088555c9948ef19935ae890c 100644
> > --- a/gcc/doc/invoke.texi
> > +++ b/gcc/doc/invoke.texi
> > @@ -19135,6 +19135,9 @@ prior to Armv8.2-A is not supported.
> >  @item ls64
> >  Enable the 64-byte atomic load and store instructions for accelerators.
> >  This option is enabled by default for @option{-march=armv8.7-a}.
> > +@item mops
> > +Enable the instructions to accelerate memory operations like @code{memcpy},
> > +@code{memmove}, @code{memset}.
> >  @item flagm
> >  Enable the Flag Manipulation instructions Extension.
> >  @item pauth
> > diff --git a/gcc/testsuite/gcc.target/aarch64/mops_1.c b/gcc/testsuite/gcc.target/aarch64/mops_1.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..661c14192e8a84fd4641a4d818b8db46ab4f1b28
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/mops_1.c
> > @@ -0,0 +1,57 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -march=armv8.6-a+mops --param=aarch64-mops-memcpy-size-threshold=0" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +#include <stdlib.h>
> > +
> > +/* We want to inline variable-sized memcpy.
> > +** do_it_cpy:
> > +** cpyfp \[x1\]\!, \[x0\]\!, x2\!
> > +** cpyfm \[x1\]\!, \[x0\]\!, x2\!
> > +** cpyfe \[x1\]\!, \[x0\]\!, x2\!
> > +** ret
> > +*/
> > +void do_it_cpy (char * in, char * out, size_t size)
> > +{
> > +  __builtin_memcpy (out, in, size);
> > +}
> > +
> > +/*
> > +** do_it_cpy_large:
> > +** mov x2, 1024
> > +** cpyfp \[x1\]\!, \[x0\]\!, x2\!
> > +** cpyfm \[x1\]\!, \[x0\]\!, x2\!
> > +** cpyfe \[x1\]\!, \[x0\]\!, x2\!
> > +** ret
> > +*/
> > +void do_it_cpy_large (char * in, char * out)
> > +{
> > +  __builtin_memcpy (out, in, 1024);
> > +}
> > +
> > +/*
> > +** do_it_cpy_127:
> > +** mov x2, 127
> > +** cpyfp \[x1\]\!, \[x0\]\!, x2\!
> > +** cpyfm \[x1\]\!, \[x0\]\!, x2\!
> > +** cpyfe \[x1\]\!, \[x0\]\!, x2\!
> > +** ret
> > +*/
> > +void do_it_cpy_127 (char * in, char * out)
> > +{
> > +  __builtin_memcpy (out, in, 127);
> > +}
> > +
> > +/*
> > +** do_it_cpy_128:
> > +** mov x2, 128
> > +** cpyfp \[x1\]\!, \[x0\]\!, x2\!
> > +** cpyfm \[x1\]\!, \[x0\]\!, x2\!
> > +** cpyfe \[x1\]\!, \[x0\]\!, x2\!
> > +** ret
> > +*/
> > +void do_it_cpy_128 (char * in, char * out)
> > +{
> > +  __builtin_memcpy (out, in, 128);
> > +}
> > +
