From 9e67b44d9ff111b0f280d7f3fe2c197aa7dabc94 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov <ktkachov@nvidia.com>
Date: Thu, 3 Jul 2025 09:45:02 -0700
Subject: [PATCH 3/7] aarch64: Handle DImode BCAX operations

To handle DImode BCAX operations we want to do them on the SIMD side only if
the incoming arguments don't require a cross-bank move.
This means we need to split back the combination to separate GP BIC+EOR
instructions if the operands are expected to be in GP regs through reload.
The split happens pre-reload if we already know that the destination will be
a GP reg.  Otherwise if reload descides to use the "=r,r" alternative we ensure
operand 0 is early-clobber.
This scheme is similar to how we handle the BSL operations elsewhere in
aarch64-simd.md.

Thus, for the functions:
uint64_t bcax_d_gp (uint64_t a, uint64_t b, uint64_t c) { return BCAX (a, b, c); }
uint64x1_t bcax_d (uint64x1_t a, uint64x1_t b, uint64x1_t c) { return BCAX (a, b, c); }

we now generate the desired:
bcax_d_gp:
        bic     x1, x1, x2
        eor     x0, x1, x0
        ret

bcax_d:
        bcax    v0.16b, v0.16b, v1.16b, v2.16b
        ret

When the inputs are in SIMD regs we use BCAX and when they are in GP regs we
don't force them to SIMD with extra moves.

Bootstrapped and tested on aarch64-none-linux-gnu.

Signed-off-by: Kyrylo Tkachov <ktkachov@nvidia.com>

gcc/

	* config/aarch64/aarch64-simd.md (*bcaxqdi4): New
	define_insn_and_split.

gcc/testsuite/

	* gcc.target/aarch64/simd/bcax_d.c: Add tests for DImode arguments.
---
 gcc/config/aarch64/aarch64-simd.md            | 29 +++++++++++++++++++
 .../gcc.target/aarch64/simd/bcax_d.c          |  6 +++-
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 4493e55603d..270cb2ff3a1 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -9252,6 +9252,35 @@
   [(set_attr "type" "crypto_sha3")]
 )
 
+(define_insn_and_split "*bcaxqdi4"
+  [(set (match_operand:DI 0 "register_operand")
+	(xor:DI
+	  (and:DI
+	    (not:DI (match_operand:DI 3 "register_operand"))
+	    (match_operand:DI 2 "register_operand"))
+	  (match_operand:DI 1 "register_operand")))]
+  "TARGET_SHA3"
+  {@ [ cons: =0, 1, 2 , 3  ; attrs: type ]
+     [ w       , w, w , w  ; crypto_sha3 ] bcax\t%0.16b, %1.16b, %2.16b, %3.16b
+     [ &r      , r, r0, r0 ; multiple    ] #
+  }
+  "&& REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+  [(set (match_dup 4)
+	(and:DI (not:DI (match_dup 3))
+		(match_dup 2)))
+   (set (match_dup 0)
+	(xor:DI (match_dup 4)
+		(match_dup 1)))]
+  {
+    if (reload_completed)
+      operands[4] = operands[0];
+    else if (can_create_pseudo_p ())
+      operands[4] = gen_reg_rtx (DImode);
+    else
+      FAIL;
+  }
+)
+
 ;; SM3
 
 (define_insn "aarch64_sm3ss1qv4si"
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/bcax_d.c b/gcc/testsuite/gcc.target/aarch64/simd/bcax_d.c
index d68f0e102bf..a7640c3f6f1 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/bcax_d.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/bcax_d.c
@@ -7,9 +7,13 @@
 
 #define BCAX(x,y,z)  ((x) ^ ((y) & ~(z)))
 
+/* When the inputs come from GP regs don't form a BCAX.  */
+uint64_t bcax_d_gp (uint64_t a, uint64_t b, uint64_t c) { return BCAX (a, b, c); }
+
+uint64x1_t bcax_d (uint64x1_t a, uint64x1_t b, uint64x1_t c) { return BCAX (a, b, c); }
 uint32x2_t bcax_s (uint32x2_t a, uint32x2_t b, uint32x2_t c) { return BCAX (a, b, c); }
 uint16x4_t bcax_h (uint16x4_t a, uint16x4_t b, uint16x4_t c) { return BCAX (a, b, c); }
 uint8x8_t bcax_b (uint8x8_t a, uint8x8_t b, uint8x8_t c) { return BCAX (a, b, c); }
 
-/* { dg-final { scan-assembler-times {bcax\tv0.16b, v0.16b, v1.16b, v2.16b} 3 } } */
+/* { dg-final { scan-assembler-times {bcax\tv0.16b, v0.16b, v1.16b, v2.16b} 4 } } */
 
-- 
2.44.0

