Alex Coplan <alex.cop...@arm.com> writes:
> This patch generalises the TFmode load/store pair patterns to TImode and
> TDmode.  This brings them in line with the DXmode patterns, and uses the
> same technique with separate mode iterators (TX and TX2) to allow for
> distinct modes in each arm of the load/store pair.
>
> For example, in combination with the post-RA load/store pair fusion pass
> in the following patch, this improves the codegen for the following
> varargs testcase involving TImode stores:
>
> void g(void *);
> int foo(int x, ...)
> {
>     __builtin_va_list ap;
>     __builtin_va_start (ap, x);
>     g(&ap);
>     __builtin_va_end (ap);
> }
>
> from:
>
> foo:
> .LFB0:
>       stp     x29, x30, [sp, -240]!
> .LCFI0:
>       mov     w9, -56
>       mov     w8, -128
>       mov     x29, sp
>       add     x10, sp, 176
>       stp     x1, x2, [sp, 184]
>       add     x1, sp, 240
>       add     x0, sp, 16
>       stp     x1, x1, [sp, 16]
>       str     x10, [sp, 32]
>       stp     w9, w8, [sp, 40]
>       str     q0, [sp, 48]
>       str     q1, [sp, 64]
>       str     q2, [sp, 80]
>       str     q3, [sp, 96]
>       str     q4, [sp, 112]
>       str     q5, [sp, 128]
>       str     q6, [sp, 144]
>       str     q7, [sp, 160]
>       stp     x3, x4, [sp, 200]
>       stp     x5, x6, [sp, 216]
>       str     x7, [sp, 232]
>       bl      g
>       ldp     x29, x30, [sp], 240
> .LCFI1:
>       ret
>
> to:
>
> foo:
> .LFB0:
>       stp     x29, x30, [sp, -240]!
> .LCFI0:
>       mov     w9, -56
>       mov     w8, -128
>       mov     x29, sp
>       add     x10, sp, 176
>       stp     x1, x2, [sp, 1bd4971b7c71e70a637a1dq84]
>       add     x1, sp, 240
>       add     x0, sp, 16
>       stp     x1, x1, [sp, 16]
>       str     x10, [sp, 32]
>       stp     w9, w8, [sp, 40]
>       stp     q0, q1, [sp, 48]
>       stp     q2, q3, [sp, 80]
>       stp     q4, q5, [sp, 112]
>       stp     q6, q7, [sp, 144]
>       stp     x3, x4, [sp, 200]
>       stp     x5, x6, [sp, 216]
>       str     x7, [sp, 232]
>       bl      g
>       ldp     x29, x30, [sp], 240
> .LCFI1:
>       ret
>
> Note that this patch isn't needed if we only use the mode
> canonicalization approach in the new ldp fusion pass (since we
> canonicalize T{I,F,D}mode to V16QImode), but we seem to get slightly
> better performance with mode canonicalization disabled (see
> --param=aarch64-ldp-canonicalize-modes in the following patch).
>
> Bootstrapped/regtested as a series on aarch64-linux-gnu, OK for trunk?
>
> gcc/ChangeLog:
>
>       * config/aarch64/aarch64.md (load_pair_dw_tftf): Rename to ...
>       (load_pair_dw_<TX:mode><TX2:mode>): ... this.
>       (store_pair_dw_tftf): Rename to ...
>       (store_pair_dw_<TX:mode><TX2:mode>): ... this.
>       * config/aarch64/iterators.md (TX2): New.

OK, thanks.  It would be nice to investigate & fix the reasons for
the regressions with canonicalised modes, but I agree that this patch
is a strict improvement, since it fixes a hole in the current scheme.

Richard

> ---
>  gcc/config/aarch64/aarch64.md   | 22 +++++++++++-----------
>  gcc/config/aarch64/iterators.md |  3 +++
>  2 files changed, 14 insertions(+), 11 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 32c7adc8928..e6af09c2e8b 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1757,16 +1757,16 @@ (define_insn "load_pair_dw_<DX:mode><DX2:mode>"
>    }
>  )
>  
> -(define_insn "load_pair_dw_tftf"
> -  [(set (match_operand:TF 0 "register_operand" "=w")
> -     (match_operand:TF 1 "aarch64_mem_pair_operand" "Ump"))
> -   (set (match_operand:TF 2 "register_operand" "=w")
> -     (match_operand:TF 3 "memory_operand" "m"))]
> +(define_insn "load_pair_dw_<TX:mode><TX2:mode>"
> +  [(set (match_operand:TX 0 "register_operand" "=w")
> +     (match_operand:TX 1 "aarch64_mem_pair_operand" "Ump"))
> +   (set (match_operand:TX2 2 "register_operand" "=w")
> +     (match_operand:TX2 3 "memory_operand" "m"))]
>     "TARGET_SIMD
>      && rtx_equal_p (XEXP (operands[3], 0),
>                   plus_constant (Pmode,
>                                  XEXP (operands[1], 0),
> -                                GET_MODE_SIZE (TFmode)))"
> +                                GET_MODE_SIZE (<TX:MODE>mode)))"
>    "ldp\\t%q0, %q2, %z1"
>    [(set_attr "type" "neon_ldp_q")
>     (set_attr "fp" "yes")]
> @@ -1805,11 +1805,11 @@ (define_insn "store_pair_dw_<DX:mode><DX2:mode>"
>    }
>  )
>  
> -(define_insn "store_pair_dw_tftf"
> -  [(set (match_operand:TF 0 "aarch64_mem_pair_operand" "=Ump")
> -     (match_operand:TF 1 "register_operand" "w"))
> -   (set (match_operand:TF 2 "memory_operand" "=m")
> -     (match_operand:TF 3 "register_operand" "w"))]
> +(define_insn "store_pair_dw_<TX:mode><TX2:mode>"
> +  [(set (match_operand:TX 0 "aarch64_mem_pair_operand" "=Ump")
> +     (match_operand:TX 1 "register_operand" "w"))
> +   (set (match_operand:TX2 2 "memory_operand" "=m")
> +     (match_operand:TX2 3 "register_operand" "w"))]
>     "TARGET_SIMD &&
>      rtx_equal_p (XEXP (operands[2], 0),
>                plus_constant (Pmode,
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 2451d8c2cd8..f9e2210095e 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -319,6 +319,9 @@ (define_mode_iterator VS [V2SI V4SI])
>  
>  (define_mode_iterator TX [TI TF TD])
>  
> +;; Duplicate of the above
> +(define_mode_iterator TX2 [TI TF TD])
> +
>  (define_mode_iterator VTX [TI TF TD V16QI V8HI V4SI V2DI V8HF V4SF V2DF 
> V8BF])
>  
>  ;; Advanced SIMD opaque structure modes.

Reply via email to