On Mon, Jun 12, 2017 at 02:44:40PM +0100, James Greenhalgh wrote:
> [Sorry for the re-send.  I spotted that the attributes were not right for
> the new pattern I was adding.  The change between this and the first
> version was:
>
>   + [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
>   +  (set_attr "length" "4,4,4,12")]
> ]
>
> ---
>
> Hi,
>
> In this testcase, all argument registers and the return register
> will be general purpose registers:
>
>   long long
>   foo (long long a, long long b, long long c)
>   {
>     return ((a ^ b) & c) ^ b;
>   }
>
> However, due to the implementation of aarch64_simd_bsl<mode>_internal
> we'll match that pattern and emit a BSL, necessitating moving all those
> arguments and results to the Advanced SIMD registers:
>
>   fmov    d2, x0
>   fmov    d0, x2
>   fmov    d1, x1
>   bsl     v0.8b, v2.8b, v1.8b
>   fmov    x0, d0
>
> To fix this, we turn aarch64_simd_bsldi_internal into an insn_and_split
> that knows to split back to integer operations if the register
> allocation falls that way.
>
> We could have used an unspec, but then we lose some of the nice
> simplifications that can be made from explicitly spelling out the
> semantics of BSL.
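To spell out the identity at play here: BSL performs a bitwise select,
taking bits from one source where the mask (held in the destination
register) is set and from the other where it is clear, and the
xor/and/xor form above is the usual branch-free spelling of that select.
A quick standalone check of the equivalence (the helper names are mine,
purely for illustration, not part of the patch):

  #include <assert.h>

  /* Reference bitwise select: take A's bits where C is set and B's bits
     where C is clear -- the operation BSL implements.  */
  static unsigned long long
  bsl_ref (unsigned long long c, unsigned long long a, unsigned long long b)
  {
    return (a & c) | (b & ~c);
  }

  /* The xor/and/xor idiom from the testcase: ((a ^ b) & c) ^ b flips
     b's bits to a's exactly where c is set, so it computes the same
     select.  */
  static unsigned long long
  bsl_idiom (unsigned long long c, unsigned long long a, unsigned long long b)
  {
    return ((a ^ b) & c) ^ b;
  }

  int
  main (void)
  {
    unsigned long long a = 0x0123456789abcdefULL;
    unsigned long long b = 0xfedcba9876543210ULL;
    unsigned long long c = 0xaabbccddeeff7777ULL;
    assert (bsl_ref (c, a, b) == bsl_idiom (c, a, b));
    return 0;
  }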
Off list, Richard and I found considerable issues with this patch: idioms
failing to match in the testcase, subtle register allocation bugs, and
potential for suboptimal code generation.  That's not good!

This spin of the patch corrects those issues by adding a second split
pattern, by allocating a temporary register if we're permitted to (or
properly using tied output operands if we're not), and by generally
playing things a bit safer around the possibility of register overlaps.

The testcase is expanded to an execute test, hopefully giving a little
more assurance that we're doing the right thing: we now test the BSL
idiom generation in both general-purpose and floating-point registers and
compare the results.  Hopefully we're now a bit more robust!

Bootstrapped and tested on aarch64-none-linux-gnu and cross-tested on
aarch64-none-elf with no issues.

OK for trunk?

Thanks,
James

---

gcc/

2017-07-27  James Greenhalgh  <james.greenha...@arm.com>

	* config/aarch64/aarch64-simd.md (aarch64_simd_bsl<mode>_internal):
	Remove DImode.
	(*aarch64_simd_bsl<mode>_alt): Likewise.
	(aarch64_simd_bsldi_internal): New.
	(aarch64_simd_bsldi_alt): Likewise.

gcc/testsuite/

2017-07-27  James Greenhalgh  <james.greenha...@arm.com>

	* gcc.target/aarch64/bsl-idiom.c: New.
	* gcc.target/aarch64/copysign-bsl.c: New.
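One note that may help when reading the DI patterns in the diff: because
XOR is commutative, combine can fold the trailing XOR of the idiom with
either input, and the two resulting forms select in opposite directions.
That is why both an _internal and an _alt DI pattern are needed below.  A
small sketch of the two permutations (function names are mine, purely for
exposition):

  /* The two permutations of the idiom the DI patterns must catch.
     Folding the final XOR with b keeps a's bits where c is set;
     folding it with a keeps b's bits where c is set.  */

  unsigned long long
  sel_a_where_c (unsigned long long a, unsigned long long b,
		 unsigned long long c)
  {
    return ((a ^ b) & c) ^ b;	/* == (a & c) | (b & ~c) */
  }

  unsigned long long
  sel_b_where_c (unsigned long long a, unsigned long long b,
		 unsigned long long c)
  {
    return ((a ^ b) & c) ^ a;	/* == (b & c) | (a & ~c) */
  }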
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 011fcec0..a186eae 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2280,13 +2280,13 @@
 ;; in *aarch64_simd_bsl<mode>_alt.
 
 (define_insn "aarch64_simd_bsl<mode>_internal"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
 	       (match_operand:<V_cmp_result> 3 "register_operand" "w,0,w")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
-	     (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
+	       (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
+	     (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
 	  (match_dup:<V_cmp_result> 3)
 	))]
   "TARGET_SIMD"
@@ -2304,14 +2304,14 @@
 ;; permutations of commutative operations, we have to have a separate pattern.
 
 (define_insn "*aarch64_simd_bsl<mode>_alt"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
-	       (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
-	     (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
-	  (match_dup:VSDQ_I_DI 2)))]
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
+	       (match_operand:VDQ_I 3 "register_operand" "w,w,0")
+	       (match_operand:VDQ_I 2 "register_operand" "w,0,w"))
+	     (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
+	  (match_dup:VDQ_I 2)))]
   "TARGET_SIMD"
   "@
   bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
@@ -2320,6 +2320,100 @@
   [(set_attr "type" "neon_bsl<q>")]
 )
 
+;; DImode is special: we want to avoid computing, in the vector
+;; registers, operations that are more naturally computed in general
+;; purpose registers.  If we do that, we need to move all three operands
+;; from general purpose registers to vector registers, then back again.
+;; However, we don't want to make this pattern an UNSPEC as we'd lose
+;; scope for optimizations based on the component operations of a BSL.
+;;
+;; That means we need a splitter back to the individual operations if they
+;; would be better calculated on the integer side.
+
+(define_insn_and_split "aarch64_simd_bsldi_internal"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+	(xor:DI
+	   (and:DI
+	     (xor:DI
+	       (match_operand:DI 3 "register_operand" "w,0,w,r")
+	       (match_operand:DI 2 "register_operand" "w,w,0,r"))
+	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
+	  (match_dup:DI 3)
+	))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %2.8b, %3.8b
+  bit\\t%0.8b, %2.8b, %1.8b
+  bif\\t%0.8b, %3.8b, %1.8b
+  #"
+  "&& GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 1) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  If we're before reload, and
+     able to create a temporary register, do so.  If we're after reload,
+     we've got an early-clobber destination register, so use that.
+     Otherwise, we can't create pseudos and we can't yet guarantee that
+     operands[0] is safe to write, so FAIL to split.  */
+
+  rtx scratch;
+  if (reload_completed)
+    scratch = operands[0];
+  else if (can_create_pseudo_p ())
+    scratch = gen_reg_rtx (DImode);
+  else
+    FAIL;
+
+  emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
+  emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
+  emit_insn (gen_xordi3 (operands[0], scratch, operands[3]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
+(define_insn_and_split "aarch64_simd_bsldi_alt"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+	(xor:DI
+	   (and:DI
+	     (xor:DI
+	       (match_operand:DI 3 "register_operand" "w,w,0,r")
+	       (match_operand:DI 2 "register_operand" "w,0,w,r"))
+	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
+	  (match_dup:DI 2)
+	))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %3.8b, %2.8b
+  bit\\t%0.8b, %3.8b, %1.8b
+  bif\\t%0.8b, %2.8b, %1.8b
+  #"
+  "&& GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  If we're before reload, and
+     able to create a temporary register, do so.  If we're after reload,
+     we've got an early-clobber destination register, so use that.
+     Otherwise, we can't create pseudos and we can't yet guarantee that
+     operands[0] is safe to write, so FAIL to split.  */
+
+  rtx scratch;
+  if (reload_completed)
+    scratch = operands[0];
+  else if (can_create_pseudo_p ())
+    scratch = gen_reg_rtx (DImode);
+  else
+    FAIL;
+
+  emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
+  emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
+  emit_insn (gen_xordi3 (operands[0], scratch, operands[2]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
 (define_expand "aarch64_simd_bsl<mode>"
   [(match_operand:VALLDIF 0 "register_operand")
    (match_operand:<V_cmp_result> 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/bsl-idiom.c b/gcc/testsuite/gcc.target/aarch64/bsl-idiom.c
new file mode 100644
index 0000000..8151387
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bsl-idiom.c
@@ -0,0 +1,88 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-rtl-combine --save-temps" } */
+
+/* Test that we don't generate BSL for DImode values in integer
+   registers, and do generate it where we have values in floating-point
+   registers.  This is useful, as it allows us to avoid register moves
+   in the general case.
+
+   We want:
+	eor	x0, x0, x1
+	and	x0, x0, x2
+	eor	x0, x0, x1
+	ret
+
+   Rather than:
+	fmov	d2, x0
+	fmov	d0, x2
+	fmov	d1, x1
+	bsl	v0.8b, v2.8b, v1.8b
+	fmov	x0, d0
+	ret  */
+
+extern void abort (void);
+
+unsigned long long __attribute__ ((noinline))
+foo (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  return ((a ^ b) & c) ^ b;
+}
+
+unsigned long long __attribute__ ((noinline))
+foo2 (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  return ((a ^ b) & c) ^ a;
+}
+
+#define force_simd(V1)   asm volatile ("mov %d0, %1.d[0]"	\
+	   : "=w"(V1)						\
+	   : "w"(V1)						\
+	   : /* No clobbers */);
+
+unsigned long long __attribute__ ((noinline))
+bar (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  force_simd (a);
+  force_simd (b);
+  force_simd (c);
+  c = ((a ^ b) & c) ^ b;
+  force_simd (c);
+  return c;
+}
+
+unsigned long long __attribute__ ((noinline))
+bar2 (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  force_simd (a);
+  force_simd (b);
+  force_simd (c);
+  c = ((a ^ b) & c) ^ a;
+  force_simd (c);
+  return c;
+}
+
+int
+main (int argc, char** argv)
+{
+  unsigned long long a = 0x0123456789abcdefULL;
+  unsigned long long b = 0xfedcba9876543210ULL;
+  unsigned long long c = 0xaabbccddeeff7777ULL;
+  if (foo (a, b, c) != bar (a, b, c))
+    abort ();
+  if (foo2 (a, b, c) != bar2 (a, b, c))
+    abort ();
+  return 0;
+}
+
+/* 2 BSL, 6 FMOV (to floating-point registers), and 2 FMOV (to general
+   purpose registers) for the "bar" tests, which should still use BSL.  */
+/* { dg-final { scan-assembler-times "bsl\tv\[0-9\]" 2 } } */
+/* { dg-final { scan-assembler-times "fmov\td\[0-9\]" 6 } } */
+/* { dg-final { scan-assembler-times "fmov\tx\[0-9\]" 2 } } */
+
+/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */
+
+/* We always match the idiom during combine.  */
+/* { dg-final { scan-rtl-dump-times "aarch64_simd_bsldi_internal" 2 "combine" } } */
+/* { dg-final { scan-rtl-dump-times "aarch64_simd_bsldi_alt" 2 "combine" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/copysign-bsl.c b/gcc/testsuite/gcc.target/aarch64/copysign-bsl.c
new file mode 100644
index 0000000..4e63511
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/copysign-bsl.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* Test that we can generate DImode BSL when we are using
+   copysign.  */
+
+double
+foo (double a, double b)
+{
+  return __builtin_copysign (a, b);
+}
+
+/* { dg-final { scan-assembler "bsl\tv\[0-9\]" } } */
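A note on why copysign-bsl.c expects a BSL: copysign is exactly a DImode
bit-select with a constant mask, keeping the magnitude bits of one input
and the sign bit of the other, so it reduces to the same xor/and/xor
idiom.  A rough illustration of the bit manipulation (my sketch, not part
of the testsuite; the function name is made up):

  #include <assert.h>
  #include <string.h>

  /* copysign (a, b) as a bit-select: with a mask covering the magnitude
     bits, ((x ^ y) & mask) ^ y takes x's bits under the mask and y's
     sign bit -- the idiom the DI BSL patterns match.  */
  static double
  copysign_bits (double a, double b)
  {
    unsigned long long x, y, r;
    const unsigned long long mask = ~(1ULL << 63);  /* All but the sign bit.  */
    double d;

    memcpy (&x, &a, sizeof x);
    memcpy (&y, &b, sizeof y);
    r = ((x ^ y) & mask) ^ y;
    memcpy (&d, &r, sizeof d);
    return d;
  }

  int
  main (void)
  {
    assert (copysign_bits (-2.5, 1.0) == 2.5);
    assert (copysign_bits (2.5, -1.0) == -2.5);
    return 0;
  }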