[Sorry for the re-send. I spotted that the attributes were not right for the new pattern I was adding. The change between this and the first version was:
+ [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple") + (set_attr "length" "4,4,4,12")] ] --- Hi, In this testcase, all argument registers and the return register will be general purpose registers: long long foo (long long a, long long b, long long c) { return ((a ^ b) & c) ^ b; } However, due to the implementation of aarch64_simd_bsl<mode>_internal we'll match that pattern and emit a BSL, necessitating moving all those arguments and results to the Advanced SIMD registers: fmov d2, x0 fmov d0, x2 fmov d1, x1 bsl v0.8b, v2.8b, v1.8b fmov x0, d0 To fix this, we turn aarch64_simd_bsldi_internal in to an insn_and_split that knows to split back to integer operations if the register allocation falls that way. We could have used an unspec, but then we lose some of the nice simplifications that can be made from explicitly spelling out the semantics of BSL. Bootstrapped on aarch64-none-linux-gnu. OK? Thanks, James --- gcc/ 2017-06-12 James Greenhalgh <james.greenha...@arm.com> * config/aarch64/aarch64-simd.md (aarch64_simd_bsl<mode>_internal): Remove DImode. (*aarch64_simd_bsl<mode>_alt): Likewise. (aarch64_simd_bsldi_internal): New. gcc/testsuite/ 2017-06-12 James Greenhalgh <james.greenha...@arm.com> * gcc.target/aarch64/no-dimode-bsl.c: New. * gcc.target/aarch64/dimode-bsl.c: New.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c5a86ff..7b6b12f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2256,13 +2256,13 @@
 ;; in *aarch64_simd_bsl<mode>_alt.
 
 (define_insn "aarch64_simd_bsl<mode>_internal"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
 	       (match_operand:<V_cmp_result> 3 "register_operand" "w,0,w")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
-	     (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
+	       (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
+	     (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
 	   (match_dup:<V_cmp_result> 3)
 	))]
   "TARGET_SIMD"
@@ -2280,14 +2280,14 @@
 ;; permutations of commutative operations, we have to have a separate pattern.
 
 (define_insn "*aarch64_simd_bsl<mode>_alt"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
-	       (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
-	     (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
-	   (match_dup:VSDQ_I_DI 2)))]
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
+	       (match_operand:VDQ_I 3 "register_operand" "w,w,0")
+	       (match_operand:VDQ_I 2 "register_operand" "w,0,w"))
+	     (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
+	   (match_dup:VDQ_I 2)))]
   "TARGET_SIMD"
   "@
   bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
@@ -2296,6 +2296,45 @@
   [(set_attr "type" "neon_bsl<q>")]
 )
 
+;; DImode is special, we want to avoid computing operations which are
+;; more naturally computed in general purpose registers in the vector
+;; registers.  If we do that, we need to move all three operands from general
+;; purpose registers to vector registers, then back again.  However, we
+;; don't want to make this pattern an UNSPEC as we'd lose scope for
+;; optimizations based on the component operations of a BSL.
+;;
+;; That means we need a splitter back to the individual operations, if they
+;; would be better calculated on the integer side.
+
+(define_insn_and_split "aarch64_simd_bsldi_internal"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+	(xor:DI
+	   (and:DI
+	     (xor:DI
+	       (match_operand:DI 3 "register_operand" "w,0,w,r")
+	       (match_operand:DI 2 "register_operand" "w,w,0,r"))
+	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
+	   (match_dup:DI 3)
+	))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %2.8b, %3.8b
+  bit\\t%0.8b, %2.8b, %1.8b
+  bif\\t%0.8b, %3.8b, %1.8b
+  #"
+  "&& GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  */
+  emit_insn (gen_xordi3 (operands[0], operands[2], operands[3]));
+  emit_insn (gen_anddi3 (operands[0], operands[0], operands[1]));
+  emit_insn (gen_xordi3 (operands[0], operands[0], operands[3]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
 (define_expand "aarch64_simd_bsl<mode>"
   [(match_operand:VALLDIF 0 "register_operand")
    (match_operand:<V_cmp_result> 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c
new file mode 100644
index 0000000..4e63511
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* Test that we can generate DImode BSL when we are using
+   copysign.  */
+
+double
+foo (double a, double b)
+{
+  return __builtin_copysign (a, b);
+}
+
+/* { dg-final { scan-assembler "bsl\tv\[0-9\]" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c
new file mode 100644
index 0000000..67dfda0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* Test that we don't combine to BSL when in DImode, avoiding register
+   moves in the general case.
+
+   We want:
+	eor	x0, x0, x1
+	and	x0, x0, x2
+	eor	x0, x0, x1
+	ret
+
+   Rather than:
+	fmov	d2, x0
+	fmov	d0, x2
+	fmov	d1, x1
+	bsl	v0.8b, v2.8b, v1.8b
+	fmov	x0, d0
+	ret  */
+
+long long
+foo (long long a, long long b, long long c)
+{
+  return ((a ^ b) & c) ^ b;
+}
+
+/* { dg-final { scan-assembler-not "bsl\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "fmov\td\[0-9\]" } } */
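One closing note on dimode-bsl.c, the positive half of the test pair:
copysign is the motivating DImode use of BSL because it is a bit select
whose mask is just the sign bit, and its operands already live in the
FP registers, so no cross-file moves are needed. A hedged C sketch of
that equivalence follows; the function name is hypothetical and not
part of the patch:

  #include <assert.h>
  #include <string.h>

  /* copysign (a, b) as a DImode bit select: with the sign bit as the
     mask, take that one bit from B and all remaining bits from A.
     Note the ((x ^ y) & mask) ^ y shape -- the same one the insn
     pattern matches.  */
  static double
  copysign_via_bsl (double a, double b)
  {
    unsigned long long ia, ib;
    const unsigned long long sign = 0x8000000000000000ULL;
    memcpy (&ia, &a, sizeof ia);
    memcpy (&ib, &b, sizeof ib);
    ia = ((ib ^ ia) & sign) ^ ia;
    memcpy (&a, &ia, sizeof a);
    return a;
  }

  int
  main (void)
  {
    assert (copysign_via_bsl (1.5, -2.0) == -1.5);
    assert (copysign_via_bsl (-1.5, 2.0) == 1.5);
    return 0;
  }

In that testcase operand 0 is allocated to a vector register, so the
GP_REGNUM_P split condition is false and the BSL form survives, which
is what the scan-assembler directive checks.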