On 20/09/2023 14:50, Wilco Dijkstra wrote:
> 
> The cpymemdi/setmemdi implementation doesn't fully support strict alignment.
> Block the expansion if the alignment is less than 16 with STRICT_ALIGNMENT.
> Clean up the condition for when to use MOPS.
>     
> Passes regress/bootstrap, OK for commit?
>     
> gcc/ChangeLog:
>         PR target/103100
>         * config/aarch64/aarch64.md (cpymemdi): Remove pattern condition.

Shouldn't this be a separate patch?  It's not immediately obvious that this is 
a necessary part of this change.

>         (setmemdi): Likewise.
>         * config/aarch64/aarch64.cc (aarch64_expand_cpymem): Support
>         strict-align.  Clean up condition for using MOPS.
>         (aarch64_expand_setmem): Likewise.
> 
> ---
> 
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index dd6874d13a75f20d10a244578afc355b25c73da2..8f3bfb91c0f4ec43f37fe9289a66092a29a47e4d 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -25261,27 +25261,23 @@ aarch64_expand_cpymem (rtx *operands)
>    int mode_bits;
>    rtx dst = operands[0];
>    rtx src = operands[1];
> +  unsigned align = INTVAL (operands[3]);

This should read the value with UINTVAL.  Given the useful range of the 
alignment, it should be OK that we're not using unsigned HWI.
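
I.e. something like (just a sketch):

  unsigned align = UINTVAL (operands[3]);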

>    rtx base;
>    machine_mode cur_mode = BLKmode;
> +  bool size_p = optimize_function_for_size_p (cfun);
>  
> -  /* Variable-sized memcpy can go through the MOPS expansion if available.  */
> -  if (!CONST_INT_P (operands[2]))
> +  /* Variable-sized or strict-align copies may use the MOPS expansion.  */
> +  if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
>      return aarch64_expand_cpymem_mops (operands);

So what about align=4 and copying, for example, 8 or 12 bytes; wouldn't we want 
a sequence of LDR/STR in that case?  Doesn't this fall back to MOPS too eagerly?
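
E.g. for something like

  void
  f (int *p, int *q)
  {
    __builtin_memcpy (p, q, 12);
  }

under -mstrict-align the alignment is 4, so with this change we'd take
the MOPS/libcall path even though three LDR/STR word pairs would do.
(Illustrative example only, I haven't run it.)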


>  
>    unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
>  
> -  /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
> -  unsigned HOST_WIDE_INT max_copy_size
> -    = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
> -
> -  bool size_p = optimize_function_for_size_p (cfun);
> +  /* Try to inline up to 256 bytes.  */
> +  unsigned max_copy_size = 256;
> +  unsigned max_mops_size = aarch64_mops_memcpy_size_threshold;

I find this name slightly confusing.  Surely it's min_mops_size (since above 
that we want to use MOPS rather than inlined loads/stores).  But why not just 
use aarch64_mops_memcpy_size_threshold directly in the one place it's used?
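
I.e. drop the local and write (untested):

  /* Large copies use MOPS when available or a library call.  */
  if (size > max_copy_size
      || (TARGET_MOPS && size > aarch64_mops_memcpy_size_threshold))
    return aarch64_expand_cpymem_mops (operands);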

>  
> -  /* Large constant-sized cpymem should go through MOPS when possible.
> -     It should be a win even for size optimization in the general case.
> -     For speed optimization the choice between MOPS and the SIMD sequence
> -     depends on the size of the copy, rather than number of instructions,
> -     alignment etc.  */
> -  if (size > max_copy_size)
> +  /* Large copies use MOPS when available or a library call.  */
> +  if (size > max_copy_size || (TARGET_MOPS && size > max_mops_size))
>      return aarch64_expand_cpymem_mops (operands);
>  
>    int copy_bits = 256;
> @@ -25445,12 +25441,13 @@ aarch64_expand_setmem (rtx *operands)

Similar comments apply to this code as well.

>    unsigned HOST_WIDE_INT len;
>    rtx dst = operands[0];
>    rtx val = operands[2], src;
> +  unsigned align = INTVAL (operands[3]);
>    rtx base;
>    machine_mode cur_mode = BLKmode, next_mode;
>  
> -  /* If we don't have SIMD registers or the size is variable use the MOPS
> -     inlined sequence if possible.  */
> -  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
> +  /* Variable-sized or strict-align memset may use the MOPS expansion.  */
> +  if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
> +      || (STRICT_ALIGNMENT && align < 16))
>      return aarch64_expand_setmem_mops (operands);
>  
>    bool size_p = optimize_function_for_size_p (cfun);
> @@ -25458,10 +25455,13 @@ aarch64_expand_setmem (rtx *operands)

And here.

>    /* Default the maximum to 256-bytes when considering only libcall vs
>       SIMD broadcast sequence.  */
>    unsigned max_set_size = 256;
> +  unsigned max_mops_size = aarch64_mops_memset_size_threshold;
>  
>    len = INTVAL (operands[1]);
> -  if (len > max_set_size && !TARGET_MOPS)
> -    return false;
> +
> +  /* Large memset uses MOPS when available or a library call.  */
> +  if (len > max_set_size || (TARGET_MOPS && len > max_mops_size))
> +    return aarch64_expand_setmem_mops (operands);
>  
>    int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
>    /* The MOPS sequence takes:
> @@ -25474,12 +25474,6 @@ aarch64_expand_setmem (rtx *operands)
>       the arguments + 1 for the call.  */
>    unsigned libcall_cost = 4;
>  
> -  /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
> -     when available.  */
> -  if (TARGET_MOPS
> -      && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
> -    return aarch64_expand_setmem_mops (operands);
> -
>    /* Attempt a sequence with a vector broadcast followed by stores.
>       Count the number of operations involved to see if it's worth it
>       against the alternatives.  A simple counter simd_ops on the
> @@ -25521,10 +25515,8 @@ aarch64_expand_setmem (rtx *operands)
>        simd_ops++;
>        n -= mode_bits;
>  
> -      /* Do certain trailing copies as overlapping if it's going to be
> -      cheaper.  i.e. less instructions to do so.  For instance doing a 15
> -      byte copy it's more efficient to do two overlapping 8 byte copies than
> -      8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
> +      /* Emit trailing writes using overlapping unaligned accesses
> +     (when !STRICT_ALIGNMENT) - this is smaller and faster.  */
>        if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
>       {
>         next_mode = smallest_mode_for_size (n, MODE_INT);
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 6d0f072a9dd6d094e8764a513222a9129d8296fa..96508a2580876d1fdbdfa6c67d1a3d02608c1d24 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1627,7 +1627,7 @@ (define_expand "cpymemdi"
>     (match_operand:BLK 1 "memory_operand")
>     (match_operand:DI 2 "general_operand")
>     (match_operand:DI 3 "immediate_operand")]
> -   "!STRICT_ALIGNMENT || TARGET_MOPS"
> +   ""
>  {
>    if (aarch64_expand_cpymem (operands))
>      DONE;
> @@ -1724,7 +1724,7 @@ (define_expand "setmemdi"
>          (match_operand:QI  2 "nonmemory_operand")) ;; Value
>     (use (match_operand:DI  1 "general_operand")) ;; Length
>     (match_operand          3 "immediate_operand")] ;; Align
> - "TARGET_SIMD || TARGET_MOPS"
> + ""
>   {
>    if (aarch64_expand_setmem (operands))
>      DONE;
> 
> 

Are there any additional tests for this?
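Something along these lines would seem to be the minimum (entirely
hypothetical sketch, function name and options made up):

  /* { dg-do compile } */
  /* { dg-options "-O2 -mstrict-align" } */

  void
  copy12 (char *dst, char *src)
  {
    __builtin_memcpy (dst, src, 12);
  }

so we can see what gets emitted on the strict-align paths, both with
and without +mops.
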
R.
