*ping*

Thanks,
James
On Mon, Jun 12, 2017 at 02:44:40PM +0100, James Greenhalgh wrote:
> [Sorry for the re-send.  I spotted that the attributes were not right for the
> new pattern I was adding.  The change between this and the first version was:
>
> +  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
> +   (set_attr "length" "4,4,4,12")]
> ]
>
> ---
>
> Hi,
>
> In this testcase, all argument registers and the return register
> will be general purpose registers:
>
> long long
> foo (long long a, long long b, long long c)
> {
>   return ((a ^ b) & c) ^ b;
> }
>
> However, due to the implementation of aarch64_simd_bsl<mode>_internal,
> we'll match that pattern and emit a BSL, necessitating moving all those
> arguments and results to the Advanced SIMD registers:
>
>         fmov    d2, x0
>         fmov    d0, x2
>         fmov    d1, x1
>         bsl     v0.8b, v2.8b, v1.8b
>         fmov    x0, d0
>
> To fix this, we turn aarch64_simd_bsldi_internal into an insn_and_split that
> knows to split back to integer operations if the register allocation
> falls that way.
>
> We could have used an unspec, but then we would lose some of the nice
> simplifications that can be made from explicitly spelling out the semantics
> of BSL.
>
> Bootstrapped on aarch64-none-linux-gnu.
>
> OK?
>
> Thanks,
> James
>
> ---
> gcc/
>
> 2017-06-12  James Greenhalgh  <james.greenha...@arm.com>
>
>         * config/aarch64/aarch64-simd.md
>         (aarch64_simd_bsl<mode>_internal): Remove DImode.
>         (*aarch64_simd_bsl<mode>_alt): Likewise.
>         (aarch64_simd_bsldi_internal): New.
>
> gcc/testsuite/
>
> 2017-06-12  James Greenhalgh  <james.greenha...@arm.com>
>
>         * gcc.target/aarch64/no-dimode-bsl.c: New.
>         * gcc.target/aarch64/dimode-bsl.c: New.
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index c5a86ff..7b6b12f 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -2256,13 +2256,13 @@
>  ;; in *aarch64_simd_bsl<mode>_alt.
>
>  (define_insn "aarch64_simd_bsl<mode>_internal"
> -  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
> -      (xor:VSDQ_I_DI
> -        (and:VSDQ_I_DI
> -          (xor:VSDQ_I_DI
> +  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
> +      (xor:VDQ_I
> +        (and:VDQ_I
> +          (xor:VDQ_I
>              (match_operand:<V_cmp_result> 3 "register_operand" "w,0,w")
> -            (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
> -          (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
> +            (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
> +          (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
>          (match_dup:<V_cmp_result> 3)
>        ))]
>    "TARGET_SIMD"
> @@ -2280,14 +2280,14 @@
>  ;; permutations of commutative operations, we have to have a separate pattern.
>
>  (define_insn "*aarch64_simd_bsl<mode>_alt"
> -  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
> -      (xor:VSDQ_I_DI
> -        (and:VSDQ_I_DI
> -          (xor:VSDQ_I_DI
> -            (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
> -            (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
> -          (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
> -      (match_dup:VSDQ_I_DI 2)))]
> +  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
> +      (xor:VDQ_I
> +        (and:VDQ_I
> +          (xor:VDQ_I
> +            (match_operand:VDQ_I 3 "register_operand" "w,w,0")
> +            (match_operand:VDQ_I 2 "register_operand" "w,0,w"))
> +          (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
> +      (match_dup:VDQ_I 2)))]
>    "TARGET_SIMD"
>    "@
>    bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
> @@ -2296,6 +2296,45 @@
>    [(set_attr "type" "neon_bsl<q>")]
>  )
>
> +;; DImode is special, we want to avoid computing operations which are
> +;; more naturally computed in general purpose registers in the vector
> +;; registers.  If we do that, we need to move all three operands from general
> +;; purpose registers to vector registers, then back again.  However, we
> +;; don't want to make this pattern an UNSPEC as we'd lose scope for
> +;; optimizations based on the component operations of a BSL.
> +;;
> +;; That means we need a splitter back to the individual operations, if they
> +;; would be better calculated on the integer side.
> +
> +(define_insn_and_split "aarch64_simd_bsldi_internal"
> +  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
> +      (xor:DI
> +        (and:DI
> +          (xor:DI
> +            (match_operand:DI 3 "register_operand" "w,0,w,r")
> +            (match_operand:DI 2 "register_operand" "w,w,0,r"))
> +          (match_operand:DI 1 "register_operand" "0,w,w,r"))
> +        (match_dup:DI 3)
> +      ))]
> +  "TARGET_SIMD"
> +  "@
> +  bsl\\t%0.8b, %2.8b, %3.8b
> +  bit\\t%0.8b, %2.8b, %1.8b
> +  bif\\t%0.8b, %3.8b, %1.8b
> +  #"
> +  "&& GP_REGNUM_P (REGNO (operands[0]))"
> +  [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
> +{
> +  /* Split back to individual operations.  */
> +  emit_insn (gen_xordi3 (operands[0], operands[2], operands[3]));
> +  emit_insn (gen_anddi3 (operands[0], operands[0], operands[1]));
> +  emit_insn (gen_xordi3 (operands[0], operands[0], operands[3]));
> +  DONE;
> +}
> +  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
> +   (set_attr "length" "4,4,4,12")]
> +)
> +
>  (define_expand "aarch64_simd_bsl<mode>"
>    [(match_operand:VALLDIF 0 "register_operand")
>     (match_operand:<V_cmp_result> 1 "register_operand")
> diff --git a/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c
> new file mode 100644
> index 0000000..4e63511
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +/* Test that we can generate DImode BSL when we are using
> +   copysign.  */
> +
> +double
> +foo (double a, double b)
> +{
> +  return __builtin_copysign (a, b);
> +}
> +
> +/* { dg-final { scan-assembler "bsl\tv\[0-9\]" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c
> new file mode 100644
> index 0000000..67dfda0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +/* Test that we don't combine to BSL when in DImode, avoiding register
> +   moves in the general case.
> +
> +   We want:
> +        eor     x0, x0, x1
> +        and     x0, x0, x2
> +        eor     x0, x0, x1
> +        ret
> +
> +   Rather than:
> +        fmov    d2, x0
> +        fmov    d0, x2
> +        fmov    d1, x1
> +        bsl     v0.8b, v2.8b, v1.8b
> +        fmov    x0, d0
> +        ret  */
> +
> +long long
> +foo (long long a, long long b, long long c)
> +{
> +  return ((a ^ b) & c) ^ b;
> +}
> +
> +/* { dg-final { scan-assembler-not "bsl\tv\[0-9\]" } } */
> +/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */
> +/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */
> +/* { dg-final { scan-assembler-not "fmov\td\[0-9\]" } } */
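Two notes in case they help review.

First, the reason the integer expression in the testcase matches the BSL
pattern at all is the usual bit-select identity: for each bit position,
((a ^ b) & c) ^ b yields a's bit where c is 1 and b's bit where c is 0,
which is exactly what BSL computes with c as the selection mask.  A minimal
standalone sketch checking that identity (my own illustration, not part of
the patch; bsl_reference is a made-up name):

#include <assert.h>
#include <stdint.h>

/* Reference bit-select with BSL semantics: take each bit from A where
   MASK is 1, and from B where MASK is 0.  */
static uint64_t
bsl_reference (uint64_t mask, uint64_t a, uint64_t b)
{
  return (a & mask) | (b & ~mask);
}

int
main (void)
{
  uint64_t a = 0x0123456789abcdefULL;
  uint64_t b = 0xfedcba9876543210ULL;
  uint64_t c = 0x00ff00ff00ff00ffULL;

  /* The testcase expression: the inner XOR finds the bits where A and B
     differ, the AND keeps only those differing bits that C selects, and
     the outer XOR flips exactly those bits of B, giving A's bit where C
     is 1 and B's bit elsewhere.  */
  assert ((((a ^ b) & c) ^ b) == bsl_reference (c, a, b));
  return 0;
}

As an aside on the new pattern itself, the early-clobber "&r" on operand 0
in the general-register alternative looks necessary: the split sequence
writes operands[0] before its last reads of operands[1] and operands[3],
so operand 0 must not overlap the inputs.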
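Second, the dimode-bsl.c test expects a BSL for copysign because
copysign (a, b) is itself a single DImode bit-select: the sign bit comes
from b and the remaining 63 bits from a, and the double arguments already
live in the vector registers, so there is no fmov traffic to avoid.  A
sketch of that reduction (again only illustrative; copysign_via_bsl is a
made-up helper, and the real lowering goes through the target's copysign
expansion rather than this C):

#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <string.h>

/* copysign (a, b) as a DImode bit-select: B's sign bit, A's other bits.
   The mask selecting from A is ~(1 << 63), i.e. everything except the
   sign bit.  */
static double
copysign_via_bsl (double a, double b)
{
  uint64_t ia, ib, ir, sign = UINT64_C (1) << 63;
  double r;

  memcpy (&ia, &a, sizeof ia);
  memcpy (&ib, &b, sizeof ib);
  /* Same ((x ^ y) & mask) ^ y shape as the insn pattern, with the
     non-sign bits as the mask.  */
  ir = ((ia ^ ib) & ~sign) ^ ib;
  memcpy (&r, &ir, sizeof r);
  return r;
}

int
main (void)
{
  assert (copysign_via_bsl (1.5, -0.0) == copysign (1.5, -0.0));
  assert (copysign_via_bsl (-2.0, 3.0) == copysign (-2.0, 3.0));
  return 0;
}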