From 8412e2a2c73700d709c483459902b02eeac50c21 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov <ktkachov@nvidia.com>
Date: Thu, 3 Jul 2025 11:06:08 -0700
Subject: [PATCH 5/7] aarch64: Use SVE2 NBSL for DImode arguments

Similar to the BCAX and EOR3 patterns from TARGET_SHA3 we can use the
SVE2 NBSL instruction for DImode arguments when they come in SIMD registers.
Again, this is accomplished with a new splitter for the GP case.  I noticed
that the split has a side-effect of producing a GP EON instruction where it
wasn't getting generated before because the BSL insn-and-split got in the way.
So for the inputs:

uint64_t nbsl_gp(uint64_t a, uint64_t b, uint64_t c) { return NBSL (a, b, c); }
uint64x1_t nbsl_d (uint64x1_t a, uint64x1_t b, uint64x1_t c) { return NBSL (a, b, c); }

We now generate:
nbsl_gp:
        eor     x0, x0, x1
        and     x0, x0, x2
        eon     x0, x0, x1
        ret

nbsl_d:
        nbsl    z0.d, z0.d, z1.d, z2.d
        ret

instead of:
nbsl_gp:
        eor     x0, x1, x0
        and     x0, x0, x2
        eor     x0, x0, x1
        mvn     x0, x0
        ret

nbsl_d:
        bif     v0.8b, v1.8b, v2.8b
        mvn     v0.8b, v0.8b
        ret

Bootstrapped and tested on aarch64-none-linux-gnu.

Signed-off-by: Kyrylo Tkachov <ktkachov@nvidia.com>

gcc/

	* config/aarch64/aarch64-sve2.md (*aarch64_sve2_nbsl_unpreddi): New
	define_insn_and_split.

gcc/testsuite/

	* gcc.target/aarch64/sve2/nbsl_d.c: New test.
---
 gcc/config/aarch64/aarch64-sve2.md            | 30 +++++++++++++++++++
 .../gcc.target/aarch64/sve2/nbsl_d.c          | 23 ++++++++++++++
 2 files changed, 53 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/nbsl_d.c

diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index 62524f36de6..ef44b956dae 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -1750,6 +1750,36 @@
   }
 )
 
+(define_insn_and_split "*aarch64_sve2_nbsl_unpreddi"
+  [(set (match_operand:DI 0 "register_operand")
+	(not:DI
+	  (xor:DI
+	    (and:DI
+	      (xor:DI
+		(match_operand:DI 1 "register_operand")
+		(match_operand:DI 2 "register_operand"))
+	      (match_operand:DI 3 "register_operand"))
+	    (match_dup BSL_DUP))))]
+  "TARGET_SVE2"
+  {@ [ cons: =0 , 1         , 2         , 3 ; attrs: movprfx ]
+     [ w        , <bsl_1st> , <bsl_2nd> , w ; *              ] nbsl\t%Z0.d, %Z0.d, %Z<bsl_dup>.d, %Z3.d
+     [ ?&w      , w         , w         , w ; yes            ] movprfx\t%Z0, %Z<bsl_mov>\;nbsl\t%Z0.d, %Z0.d, %Z<bsl_dup>.d, %Z3.d
+     [ &r       , r         , r         , r ; *              ] #
+  }
+  "&& REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+  [(set (match_dup 4) (xor:DI (match_dup 1) (match_dup 2)))
+   (set (match_dup 4) (and:DI (match_dup 4) (match_dup 3)))
+   (set (match_dup 0) (not:DI (xor:DI (match_dup 4) (match_dup BSL_DUP))))]
+  {
+    if (reload_completed)
+      operands[4] = operands[0];
+    else if (can_create_pseudo_p ())
+      operands[4] = gen_reg_rtx (DImode);
+    else
+      FAIL;
+  }
+)
+
 ;; Unpredicated bitwise select with inverted first operand.
 ;; (op3 ? ~bsl_mov : bsl_dup) == ((~(bsl_mov ^ bsl_dup) & op3) ^ bsl_dup)
 (define_expand "@aarch64_sve2_bsl1n<mode>"
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/nbsl_d.c b/gcc/testsuite/gcc.target/aarch64/sve2/nbsl_d.c
new file mode 100644
index 00000000000..e18ca9a849f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/nbsl_d.c
@@ -0,0 +1,23 @@
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+#define NBSL(x,y,z)  (~(((x) & (z)) | ( (y) & ~(z))))
+
+/*
+** nbsl_gp:
+** 	eor	x0, x0, x1
+** 	and	x0, x0, x2
+** 	eon	x0, x0, x1
+** 	ret
+*/
+uint64_t nbsl_gp(uint64_t a, uint64_t b, uint64_t c) { return NBSL (a, b, c); }
+
+/*
+** nbsl_d:
+** 	nbsl	z0\.d, z0\.d, z1\.d, z2\.d
+** 	ret
+*/
+uint64x1_t nbsl_d (uint64x1_t a, uint64x1_t b, uint64x1_t c) { return NBSL (a, b, c); }
+
-- 
2.44.0

