On 06/06/14 09:50, James Greenhalgh wrote:
>
> Hi,
>
> The move_by_pieces infrastructure performs a copy by repeatedly trying
> the largest safe copy it can make.  So for a 15-byte copy we might see:
>
>   offset  amount  bytes copied
>        0       8  0-7
>        8       4  8-11
>       12       2  12-13
>       14       1  14
>
> However, we can implement a 15-byte copy like so:
>
>   offset  amount  bytes copied
>        0       8  0-7
>        7       8  7-14
>
> which can prove more efficient for both space and speed.
>
> In this patch we set MOVE_RATIO low to avoid using move_by_pieces, and
> implement the movmem pattern name to expand small block copy cases.
> Note that this optimization does not apply for -mstrict-align targets,
> which must continue copying byte-by-byte.
>
> Setting MOVE_RATIO low in this way causes two tests to begin failing;
> both are documented in the test cases as expected to fail for low
> MOVE_RATIO targets, which do not allow certain tree-level optimizations.
>
> Bootstrapped on aarch64-unknown-linux-gnu with no issues.
>
> OK for trunk?
>
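A rough C illustration of the two schemes quoted above (the function names
are made up, and __builtin_memcpy is used only to show which bytes each
move touches; the expander itself of course works at the RTL level):

  /* move_by_pieces style: 8 + 4 + 2 + 1 for a 15-byte copy.  */
  void
  copy15_by_pieces (char *d, const char *s)
  {
    __builtin_memcpy (d, s, 8);            /* bytes 0-7    */
    __builtin_memcpy (d + 8, s + 8, 4);    /* bytes 8-11   */
    __builtin_memcpy (d + 12, s + 12, 2);  /* bytes 12-13  */
    __builtin_memcpy (d + 14, s + 14, 1);  /* byte  14     */
  }

  /* Overlapping style: two 8-byte moves, the second starting at byte 7.  */
  void
  copy15_overlapping (char *d, const char *s)
  {
    __builtin_memcpy (d, s, 8);            /* bytes 0-7    */
    __builtin_memcpy (d + 7, s + 7, 8);    /* bytes 7-14   */
  }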
This is OK.

Hmm, I notice sra-12 fails on ARM as well.  Is that for the same reason?
If so, perhaps you could prepare a patch for that as well (consider it
pre-approved).

R.

> Thanks,
> James
>
> ---
> gcc/
>
> 2014-06-06  James Greenhalgh  <james.greenha...@arm.com>
>
>     * config/aarch64/aarch64-protos.h (aarch64_expand_movmem): New.
>     * config/aarch64/aarch64.c (aarch64_move_pointer): New.
>     (aarch64_progress_pointer): Likewise.
>     (aarch64_copy_one_block_and_progress_pointers): Likewise.
>     (aarch64_expand_movmem): Likewise.
>     * config/aarch64/aarch64.h (MOVE_RATIO): Set low.
>     * config/aarch64/aarch64.md (movmem<mode>): New.
>
> gcc/testsuite/
>
> 2014-06-06  James Greenhalgh  <james.greenha...@arm.com>
>
>     * gcc.dg/tree-ssa/pr42585.c: Skip for AArch64.
>     * gcc.dg/tree-ssa/sra-12.c: Likewise.
>
>
> 0001-AArch64-Implement-movmem-for-the-benefit-of-inline-m.patch
>
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 68d488d..c4f75b3 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -180,6 +180,7 @@ bool aarch64_cannot_change_mode_class (enum machine_mode,
>  enum aarch64_symbol_type
>  aarch64_classify_symbolic_expression (rtx, enum aarch64_symbol_context);
>  bool aarch64_constant_address_p (rtx);
> +bool aarch64_expand_movmem (rtx *);
>  bool aarch64_float_const_zero_rtx_p (rtx);
>  bool aarch64_function_arg_regno_p (unsigned);
>  bool aarch64_gen_movmemqi (rtx *);
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index b26e5f5..0ae21cd 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -9434,6 +9434,164 @@ aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
>    return false;
>  }
>  
> +/* Return a new RTX holding the result of moving POINTER forward by
> +   AMOUNT bytes.  */
> +
> +static rtx
> +aarch64_move_pointer (rtx pointer, int amount)
> +{
> +  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
> +
> +  return adjust_automodify_address (pointer, GET_MODE (pointer),
> +                                    next, amount);
> +}
> +
> +/* Return a new RTX holding the result of moving POINTER forward by the
> +   size of the mode it points to.  */
> +
> +static rtx
> +aarch64_progress_pointer (rtx pointer)
> +{
> +  HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
> +
> +  return aarch64_move_pointer (pointer, amount);
> +}
> +
> +/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
> +   MODE bytes.  */
> +
> +static void
> +aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
> +                                              enum machine_mode mode)
> +{
> +  rtx reg = gen_reg_rtx (mode);
> +
> +  /* "Cast" the pointers to the correct mode.  */
> +  *src = adjust_address (*src, mode, 0);
> +  *dst = adjust_address (*dst, mode, 0);
> +  /* Emit the memcpy.  */
> +  emit_move_insn (reg, *src);
> +  emit_move_insn (*dst, reg);
> +  /* Move the pointers forward.  */
> +  *src = aarch64_progress_pointer (*src);
> +  *dst = aarch64_progress_pointer (*dst);
> +}
> +
> +/* Expand movmem, as if from a __builtin_memcpy.  Return true if
> +   we succeed, otherwise return false.  */
> +
> +bool
> +aarch64_expand_movmem (rtx *operands)
> +{
> +  unsigned int n;
> +  rtx dst = operands[0];
> +  rtx src = operands[1];
> +  rtx base;
> +  bool speed_p = !optimize_function_for_size_p (cfun);
> +
> +  /* When optimizing for size, give a better estimate of the length of a
> +     memcpy call, but use the default otherwise.  */
> +  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
> +
> +  /* We can't do anything smart if the amount to copy is not constant.  */
> +  if (!CONST_INT_P (operands[2]))
> +    return false;
> +
> +  n = UINTVAL (operands[2]);
> +
> +  /* Try to keep the number of instructions low.  For cases below 16 bytes we
> +     need to make at most two moves.  For cases above 16 bytes it will be one
> +     move for each 16 byte chunk, then at most two additional moves.  */
> +  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
> +    return false;
> +
> +  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
> +  dst = adjust_automodify_address (dst, VOIDmode, base, 0);
> +
> +  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
> +  src = adjust_automodify_address (src, VOIDmode, base, 0);
> +
> +  /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
> +     1-byte chunk.  */
> +  if (n < 4)
> +    {
> +      if (n >= 2)
> +        {
> +          aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
> +          n -= 2;
> +        }
> +
> +      if (n == 1)
> +        aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
> +
> +      return true;
> +    }
> +
> +  /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
> +     4-byte chunk, partially overlapping with the previously copied chunk.  */
> +  if (n < 8)
> +    {
> +      aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
> +      n -= 4;
> +      if (n > 0)
> +        {
> +          int move = n - 4;
> +
> +          src = aarch64_move_pointer (src, move);
> +          dst = aarch64_move_pointer (dst, move);
> +          aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
> +        }
> +      return true;
> +    }
> +
> +  /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
> +     them, then (if applicable) an 8-byte chunk.  */
> +  while (n >= 8)
> +    {
> +      if (n / 16)
> +        {
> +          aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
> +          n -= 16;
> +        }
> +      else
> +        {
> +          aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
> +          n -= 8;
> +        }
> +    }
> +
> +  /* Finish the final bytes of the copy.  We can always do this in one
> +     instruction.  We either copy the exact amount we need, or partially
> +     overlap with the previous chunk we copied and copy 8-bytes.  */
> +  if (n == 0)
> +    return true;
> +  else if (n == 1)
> +    aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
> +  else if (n == 2)
> +    aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
> +  else if (n == 4)
> +    aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
> +  else
> +    {
> +      if (n == 3)
> +        {
> +          src = aarch64_move_pointer (src, -1);
> +          dst = aarch64_move_pointer (dst, -1);
> +          aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
> +        }
> +      else
> +        {
> +          int move = n - 8;
> +
> +          src = aarch64_move_pointer (src, move);
> +          dst = aarch64_move_pointer (dst, move);
> +          aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
> +        }
> +    }
> +
> +  return true;
> +}
> +
>  #undef TARGET_ADDRESS_COST
>  #define TARGET_ADDRESS_COST aarch64_address_cost
>  
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index ced5a5e..c5d144e 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -659,12 +659,14 @@ do {                                                 \
>  /* The base cost overhead of a memcpy call, for MOVE_RATIO and friends.  */
>  #define AARCH64_CALL_RATIO 8
>  
> -/* When optimizing for size, give a better estimate of the length of a memcpy
> -   call, but use the default otherwise.  But move_by_pieces_ninsns() counts
> -   memory-to-memory moves, and we'll have to generate a load & store for each,
> -   so halve the value to take that into account.  */
> +/* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure.
> +   move_by_pieces will continually copy the largest safe chunks.  So a
> +   7-byte copy is a 4-byte + 2-byte + byte copy.  This proves inefficient
> +   for both size and speed of copy, so we will instead use the "movmem"
> +   standard name to implement the copy.  This logic does not apply when
> +   targeting -mstrict-align, so keep a sensible default in that case.  */
>  #define MOVE_RATIO(speed) \
> -  (((speed) ? 15 : AARCH64_CALL_RATIO) / 2)
> +  (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
>  
>  /* For CLEAR_RATIO, when optimizing for size, give a better estimate
>     of the length of a memset call, but use the default otherwise.  */
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 6e605c19f0acbe88d53f460cb513d24dde6d658f..661d784b93e60fd2f636f5b5f03c10c6d53493dd 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -883,6 +883,24 @@ (define_split
>  }
>  )
>  
> +;; 0 is dst
> +;; 1 is src
> +;; 2 is size of move in bytes
> +;; 3 is alignment
> +
> +(define_expand "movmemdi"
> +  [(match_operand:BLK 0 "memory_operand")
> +   (match_operand:BLK 1 "memory_operand")
> +   (match_operand:DI 2 "immediate_operand")
> +   (match_operand:DI 3 "immediate_operand")]
> +   "!STRICT_ALIGNMENT"
> +{
> +  if (aarch64_expand_movmem (operands))
> +    DONE;
> +  FAIL;
> +}
> +)
> +
>  ;; Operands 1 and 3 are tied together by the final condition; so we allow
>  ;; fairly lax checking on the second memory operation.
>  (define_insn "load_pair<mode>"
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> index a970c85..07f575d 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> @@ -35,6 +35,6 @@ Cyc_string_ungetc (int ignore, struct _fat_ptr *sptr)
>  /* Whether the structs are totally scalarized or not depends on the
>     MOVE_RATIO macro definition in the back end.  The scalarization will
>     not take place when using small values for MOVE_RATIO.  */
> -/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> -/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
>  /* { dg-final { cleanup-tree-dump "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> index 59e5e6a..45aa963 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> @@ -21,5 +21,5 @@ int foo (struct S *p)
>    *p = l;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "avr*-*-* nds32*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "aarch64*-*-* avr*-*-* nds32*-*-*" } } } } */
>  /* { dg-final { cleanup-tree-dump "release_ssa" } } */
>