On 06/06/14 09:50, James Greenhalgh wrote:
>
> Hi,
>
> The move_by_pieces infrastructure performs a copy by repeatedly trying
> the largest safe copy it can make.  So for a 15-byte copy we might see:
>
>   offset  amount  bytes copied
>        0       8  0-7
>        8       4  8-11
>       12       2  12-13
>       14       1  14
>
> However, we can implement a 15-byte copy like so:
>
>   offset  amount  bytes copied
>        0       8  0-7
>        7       8  7-14
>
> which can prove more efficient for both space and speed.
>
> In this patch we set MOVE_RATIO low to avoid using move_by_pieces, and
> implement the movmem pattern name to expand small block copy cases.
> Note that this optimization does not apply for -mstrict-align targets,
> which must continue copying byte-by-byte.
>
> Setting MOVE_RATIO low in this way causes two tests to begin failing;
> both are documented in the test cases as expected to fail for low
> MOVE_RATIO targets, which do not allow certain tree-level optimizations.
>
> Bootstrapped on aarch64-unknown-linux-gnu with no issues.
>
> OK for trunk?
>
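A rough C illustration of the two schemes quoted above (the function names
are made up, and __builtin_memcpy is used only to show which bytes each
move touches; the expander itself of course works at the RTL level):

  /* move_by_pieces style: 8 + 4 + 2 + 1 for a 15-byte copy.  */
  void
  copy15_by_pieces (char *d, const char *s)
  {
    __builtin_memcpy (d, s, 8);            /* bytes 0-7    */
    __builtin_memcpy (d + 8, s + 8, 4);    /* bytes 8-11   */
    __builtin_memcpy (d + 12, s + 12, 2);  /* bytes 12-13  */
    __builtin_memcpy (d + 14, s + 14, 1);  /* byte  14     */
  }

  /* Overlapping style: two 8-byte moves, the second starting at byte 7.  */
  void
  copy15_overlapping (char *d, const char *s)
  {
    __builtin_memcpy (d, s, 8);            /* bytes 0-7    */
    __builtin_memcpy (d + 7, s + 7, 8);    /* bytes 7-14   */
  }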
This is OK.

Hmm, I notice sra-12 fails on ARM as well.  Is that for the same reason?
If so, perhaps you could prepare a patch for that as well (consider it
pre-approved).

R.

> Thanks,
> James
>
> ---
> gcc/
>
> 2014-06-06  James Greenhalgh  <james.greenha...@arm.com>
>
>     * config/aarch64/aarch64-protos.h (aarch64_expand_movmem): New.
>     * config/aarch64/aarch64.c (aarch64_move_pointer): New.
>     (aarch64_progress_pointer): Likewise.
>     (aarch64_copy_one_block_and_progress_pointers): Likewise.
>     (aarch64_expand_movmem): Likewise.
>     * config/aarch64/aarch64.h (MOVE_RATIO): Set low.
>     * config/aarch64/aarch64.md (movmem<mode>): New.
>
> gcc/testsuite/
>
> 2014-06-06  James Greenhalgh  <james.greenha...@arm.com>
>
>     * gcc.dg/tree-ssa/pr42585.c: Skip for AArch64.
>     * gcc.dg/tree-ssa/sra-12.c: Likewise.
>
>
> 0001-AArch64-Implement-movmem-for-the-benefit-of-inline-m.patch
>
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 68d488d..c4f75b3 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -180,6 +180,7 @@ bool aarch64_cannot_change_mode_class (enum machine_mode,
>  enum aarch64_symbol_type
>  aarch64_classify_symbolic_expression (rtx, enum aarch64_symbol_context);
>  bool aarch64_constant_address_p (rtx);
> +bool aarch64_expand_movmem (rtx *);
>  bool aarch64_float_const_zero_rtx_p (rtx);
>  bool aarch64_function_arg_regno_p (unsigned);
>  bool aarch64_gen_movmemqi (rtx *);
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index b26e5f5..0ae21cd 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -9434,6 +9434,164 @@ aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
>    return false;
>  }
>  
> +/* Return a new RTX holding the result of moving POINTER forward by
> +   AMOUNT bytes.  */
> +
> +static rtx
> +aarch64_move_pointer (rtx pointer, int amount)
> +{
> +  rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
> +
> +  return adjust_automodify_address (pointer, GET_MODE (pointer),
> +                                    next, amount);
> +}
> +
> +/* Return a new RTX holding the result of moving POINTER forward by the
> +   size of the mode it points to.  */
> +
> +static rtx
> +aarch64_progress_pointer (rtx pointer)
> +{
> +  HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
> +
> +  return aarch64_move_pointer (pointer, amount);
> +}
> +
> +/* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
> +   MODE bytes.  */
> +
> +static void
> +aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
> +                                              enum machine_mode mode)
> +{
> +  rtx reg = gen_reg_rtx (mode);
> +
> +  /* "Cast" the pointers to the correct mode.  */
> +  *src = adjust_address (*src, mode, 0);
> +  *dst = adjust_address (*dst, mode, 0);
> +  /* Emit the memcpy.  */
> +  emit_move_insn (reg, *src);
> +  emit_move_insn (*dst, reg);
> +  /* Move the pointers forward.  */
> +  *src = aarch64_progress_pointer (*src);
> +  *dst = aarch64_progress_pointer (*dst);
> +}
> +
> +/* Expand movmem, as if from a __builtin_memcpy.  Return true if
> +   we succeed, otherwise return false.  */
> +
> +bool
> +aarch64_expand_movmem (rtx *operands)
> +{
> +  unsigned int n;
> +  rtx dst = operands[0];
> +  rtx src = operands[1];
> +  rtx base;
> +  bool speed_p = !optimize_function_for_size_p (cfun);
> +
> +  /* When optimizing for size, give a better estimate of the length of a
> +     memcpy call, but use the default otherwise.  */
> +  unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
> +
> +  /* We can't do anything smart if the amount to copy is not constant.  */
> +  if (!CONST_INT_P (operands[2]))
> +    return false;
> +
> +  n = UINTVAL (operands[2]);
> +
> +  /* Try to keep the number of instructions low.  For cases below 16 bytes we
> +     need to make at most two moves.  For cases above 16 bytes it will be one
> +     move for each 16 byte chunk, then at most two additional moves.  */
> +  if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
> +    return false;
> +
> +  base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
> +  dst = adjust_automodify_address (dst, VOIDmode, base, 0);
> +
> +  base = copy_to_mode_reg (Pmode, XEXP (src, 0));
> +  src = adjust_automodify_address (src, VOIDmode, base, 0);
> +
> +  /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
> +     1-byte chunk.  */
> +  if (n < 4)
> +    {
> +      if (n >= 2)
> +        {
> +          aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
> +          n -= 2;
> +        }
> +
> +      if (n == 1)
> +        aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
> +
> +      return true;
> +    }
> +
> +  /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
> +     4-byte chunk, partially overlapping with the previously copied chunk.  */
> +  if (n < 8)
> +    {
> +      aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
> +      n -= 4;
> +      if (n > 0)
> +        {
> +          int move = n - 4;
> +
> +          src = aarch64_move_pointer (src, move);
> +          dst = aarch64_move_pointer (dst, move);
> +          aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
> +        }
> +      return true;
> +    }
> +
> +  /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
> +     them, then (if applicable) an 8-byte chunk.  */
> +  while (n >= 8)
> +    {
> +      if (n / 16)
> +        {
> +          aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
> +          n -= 16;
> +        }
> +      else
> +        {
> +          aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
> +          n -= 8;
> +        }
> +    }
> +
> +  /* Finish the final bytes of the copy.  We can always do this in one
> +     instruction.  We either copy the exact amount we need, or partially
> +     overlap with the previous chunk we copied and copy 8-bytes.  */
> +  if (n == 0)
> +    return true;
> +  else if (n == 1)
> +    aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
> +  else if (n == 2)
> +    aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
> +  else if (n == 4)
> +    aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
> +  else
> +    {
> +      if (n == 3)
> +        {
> +          src = aarch64_move_pointer (src, -1);
> +          dst = aarch64_move_pointer (dst, -1);
> +          aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
> +        }
> +      else
> +        {
> +          int move = n - 8;
> +
> +          src = aarch64_move_pointer (src, move);
> +          dst = aarch64_move_pointer (dst, move);
> +          aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
> +        }
> +    }
> +
> +  return true;
> +}
> +
>  #undef TARGET_ADDRESS_COST
>  #define TARGET_ADDRESS_COST aarch64_address_cost
>  
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index ced5a5e..c5d144e 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -659,12 +659,14 @@ do {                                                 \
>  /* The base cost overhead of a memcpy call, for MOVE_RATIO and friends.  */
>  #define AARCH64_CALL_RATIO 8
>  
> -/* When optimizing for size, give a better estimate of the length of a memcpy
> -   call, but use the default otherwise.  But move_by_pieces_ninsns() counts
> -   memory-to-memory moves, and we'll have to generate a load & store for each,
> -   so halve the value to take that into account.  */
> +/* MOVE_RATIO dictates when we will use the move_by_pieces infrastructure.
> +   move_by_pieces will continually copy the largest safe chunks.  So a
> +   7-byte copy is a 4-byte + 2-byte + byte copy.  This proves inefficient
> +   for both size and speed of copy, so we will instead use the "movmem"
> +   standard name to implement the copy.  This logic does not apply when
> +   targeting -mstrict-align, so keep a sensible default in that case.  */
>  #define MOVE_RATIO(speed) \
> -  (((speed) ? 15 : AARCH64_CALL_RATIO) / 2)
> +  (!STRICT_ALIGNMENT ? 2 : (((speed) ? 15 : AARCH64_CALL_RATIO) / 2))
>  
>  /* For CLEAR_RATIO, when optimizing for size, give a better estimate
>     of the length of a memset call, but use the default otherwise.  */
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 6e605c19f0acbe88d53f460cb513d24dde6d658f..661d784b93e60fd2f636f5b5f03c10c6d53493dd 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -883,6 +883,24 @@ (define_split
>  }
>  )
>  
> +;; 0 is dst
> +;; 1 is src
> +;; 2 is size of move in bytes
> +;; 3 is alignment
> +
> +(define_expand "movmemdi"
> +  [(match_operand:BLK 0 "memory_operand")
> +   (match_operand:BLK 1 "memory_operand")
> +   (match_operand:DI 2 "immediate_operand")
> +   (match_operand:DI 3 "immediate_operand")]
> +   "!STRICT_ALIGNMENT"
> +{
> +  if (aarch64_expand_movmem (operands))
> +    DONE;
> +  FAIL;
> +}
> +)
> +
>  ;; Operands 1 and 3 are tied together by the final condition; so we allow
>  ;; fairly lax checking on the second memory operation.
>  (define_insn "load_pair<mode>"
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> index a970c85..07f575d 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr42585.c
> @@ -35,6 +35,6 @@ Cyc_string_ungetc (int ignore, struct _fat_ptr *sptr)
>  /* Whether the structs are totally scalarized or not depends on the
>     MOVE_RATIO macro definition in the back end.  The scalarization will
>     not take place when using small values for MOVE_RATIO.  */
> -/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> -/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "struct _fat_ptr _ans" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "struct _fat_ptr _T2" 0 "optimized" { target { ! "aarch64*-*-* arm*-*-* avr-*-* nds32*-*-* powerpc*-*-* s390*-*-* sh*-*-*" } } } } */
>  /* { dg-final { cleanup-tree-dump "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> index 59e5e6a..45aa963 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/sra-12.c
> @@ -21,5 +21,5 @@ int foo (struct S *p)
>    *p = l;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "avr*-*-* nds32*-*-*" } } } } */
> +/* { dg-final { scan-tree-dump-times "l;" 0 "release_ssa" { target { ! "aarch64*-*-* avr*-*-* nds32*-*-*" } } } } */
>  /* { dg-final { cleanup-tree-dump "release_ssa" } } */
>