From c32116ff754debeface1844820081be7c10fbecf Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov <ktkachov@nvidia.com>
Date: Fri, 4 Jul 2025 05:30:29 -0700
Subject: [PATCH 7/7] aarch64: Use BSL2N for DImode operands

The intent of this patch is similar to that of the previous patches in the
series: make more use of BSL2N when we have DImode operands in SIMD registers,
but still use the GP instructions when that's where the operands are.
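
For reference, BSL2N computes the bitwise "select with complement" operation
that the new testcase expresses as a C macro; a scalar C equivalent (the
function name is illustrative) is:

uint64_t bsl2n (uint64_t x, uint64_t y, uint64_t z)
{
  /* Bits of x where z is set, complemented bits of y where z is clear.  */
  return (x & z) | (~y & ~z);
}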
Compared to the previous patches, there are a couple of complications:
* The operands are a bit more complex and get rejected by the RTX cost checks
during combine.  This is fixed by adding some costing logic to
aarch64_rtx_costs.

* The GP split sequence requires two temporaries instead of just one
(sketched below).  I've marked operand 1 as an input/output earlyclobber
operand so that it provides the second temporary alongside the earlyclobber
operand 0.  This means that operand is marked with "+" even for the "w"
alternatives, as the modifier is global, but I don't see another way around
this.  Suggestions welcome.
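
The need for two temporaries follows from the form of the split itself: by
De Morgan, (x & z) | (~y & ~z) equals (x & z) | ~(y | z), so the GP sequence
is an ORR, an AND and a final ORN, and both intermediate results must be
live at the ORN.  A scalar sketch (the temporary names are illustrative):

t0 = y | z;          /* orr */
t1 = x & z;          /* and */
result = t1 | ~t0;   /* orn */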

With these fixed, for the testcase we now generate:
bsl2n_gp: // unchanged scalar output
        orr     x1, x2, x1
        and     x0, x0, x2
        orn     x0, x0, x1
        ret

bsl2n_d:
        bsl2n   z0.d, z0.d, z1.d, z2.d
        ret

compared to the code generated previously:
bsl2n_gp:
        orr     x1, x2, x1
        and     x0, x0, x2
        orn     x0, x0, x1
        ret

bsl2n_d:
        orr     v1.8b, v2.8b, v1.8b
        and     v0.8b, v2.8b, v0.8b
        orn     v0.8b, v0.8b, v1.8b
        ret

Bootstrapped and tested on aarch64-none-linux-gnu.

Signed-off-by: Kyrylo Tkachov <ktkachov@nvidia.com>

gcc/

	* config/aarch64/aarch64-sve2.md (*aarch64_sve2_bsl2n_unpreddi): New
	define_insn_and_split.
	* config/aarch64/aarch64.cc (aarch64_bsl2n_rtx_form_p): Define.
	(aarch64_rtx_costs): Use the above.  Cost BSL2N ops.

gcc/testsuite/

	* gcc.target/aarch64/sve2/bsl2n_d.c: New test.
---
 gcc/config/aarch64/aarch64-sve2.md            | 37 +++++++++++
 gcc/config/aarch64/aarch64.cc                 | 61 +++++++++++++++++++
 .../gcc.target/aarch64/sve2/bsl2n_d.c         | 23 +++++++
 3 files changed, 121 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve2/bsl2n_d.c

diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index dae5fee25e0..15714712d3b 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -1978,6 +1978,43 @@
   }
 )
 
+;; The GP regs split sequence needs two temporaries so mark operand 1 as an
+;; inout earlyclobber operand as well.
+(define_insn_and_split "*aarch64_sve2_bsl2n_unpreddi"
+  [(set (match_operand:DI 0 "register_operand")
+	(ior:DI
+	  (and:DI
+	    (match_operand:DI 1 "register_operand")
+	    (match_operand:DI 2 "register_operand"))
+	  (and:DI
+	    (not:DI (match_dup BSL_DUP))
+	    (not:DI (match_operand:DI 3 "register_operand")))))]
+  "TARGET_SVE2"
+  {@ [ cons: =0 , +1         , 2        , 3 ; attrs: movprfx ]
+     [ w        , <bsl_1st> , <bsl_2nd> , w ; *              ] bsl2n\t%Z0.d, %Z0.d, %Z3.d, %Z<bsl_dup>.d
+     [ ?&w      , w         , w         , w ; yes            ] movprfx\t%Z0, %Z<bsl_mov>\;bsl2n\t%Z0.d, %Z0.d, %Z3.d, %Z<bsl_dup>.d
+     [ &r       , &r        , r         , r ; *              ] #
+  }
+  "&& REG_P (operands[0]) && GP_REGNUM_P (REGNO (operands[0]))"
+  [(set (match_dup 5) (ior:DI (match_dup 2) (match_dup 3)))
+   (set (match_dup 4) (and:DI (match_dup 1) (match_dup 2)))
+   (set (match_dup 0) (ior:DI (not:DI (match_dup 5)) (match_dup 4)))]
+  {
+    if (reload_completed)
+      {
+	operands[5] = operands[0];
+	operands[4] = operands[1];
+      }
+    else if (can_create_pseudo_p ())
+      {
+	operands[5] = gen_reg_rtx (DImode);
+	operands[4] = gen_reg_rtx (DImode);
+      }
+    else
+      FAIL;
+  }
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] Shift-and-accumulate operations
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f3ce3a15b09..bb0659ee96b 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -14250,6 +14250,44 @@ aarch64_frint_unspec_p (unsigned int u)
     }
 }
 
+/* Return true iff LHS and RHS represent the arms of a BSL2N operation joined
+   by a top-level IOR.  This function doesn't check that the right duplicate
+   sub-rtx'es appear in each sub-arm, just that the form of the RTX tree looks
+   like a BSL2N.  */
+static bool
+aarch64_bsl2n_rtx_form_p (rtx lhs, rtx rhs)
+{
+  auto both_reg_or_subreg = [](rtx a, rtx b) -> bool
+    {
+      if (SUBREG_P (a) && subreg_lowpart_p (a))
+	a = XEXP (a, 0);
+      if (SUBREG_P (b) && subreg_lowpart_p (b))
+	b = XEXP (b, 0);
+      return REG_P (a) && REG_P (b);
+    };
+
+  auto and_arm = [&both_reg_or_subreg](rtx x) -> bool
+    {
+      if (GET_CODE (x) != AND)
+	return false;
+      return both_reg_or_subreg (XEXP (x, 0), XEXP (x, 1));
+    };
+
+  auto and_not_arm = [&both_reg_or_subreg](rtx x) -> bool
+    {
+      if (GET_CODE (x) != AND)
+	return false;
+      rtx x0 = XEXP (x, 0);
+      rtx x1 = XEXP (x, 1);
+      if (GET_CODE (x0) != NOT || GET_CODE (x1) != NOT)
+	return false;
+      return both_reg_or_subreg (XEXP (x0, 0), XEXP (x1, 0));
+    };
+
+  return (and_arm (lhs) && and_not_arm (rhs))
+	 || (and_arm (rhs) && and_not_arm (lhs));
+}
+
 /* Return true iff X is an rtx that will match an extr instruction
    i.e. as described in the *extr<mode>5_insn family of patterns.
    OP0 and OP1 will be set to the operands of the shifts involved
@@ -15210,6 +15248,29 @@ cost_plus:
 
           return true;
         }
+
+      if (aarch64_bsl2n_rtx_form_p (XEXP (x, 0), XEXP (x, 1)))
+	{
+	  /* Vector forms of BSL2N use that one vector instruction.  */
+	  if (VECTOR_MODE_P (mode))
+	    {
+	      *cost = COSTS_N_INSNS (1);
+	      if (speed)
+		*cost += extra_cost->vect.alu;
+	    }
+	  /* DImode BSL2N can be split to 3 GP logical operations in the worst
+	     case.  */
+	  else if (mode == DImode)
+	    {
+	      *cost = COSTS_N_INSNS (3);
+	      if (speed)
+		*cost += extra_cost->alu.logical;
+	    }
+	  else
+	    return false;
+
+	  return true;
+	}
     /* Fall through.  */
     case XOR:
     case AND:
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/bsl2n_d.c b/gcc/testsuite/gcc.target/aarch64/sve2/bsl2n_d.c
new file mode 100644
index 00000000000..7004e2dbeb8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/bsl2n_d.c
@@ -0,0 +1,23 @@
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+#define BSL2N(x,y,z) (  ((x) & (z)) | (~(y) & ~(z)))
+
+/*
+** bsl2n_gp:
+** 	orr	x1, x2, x1
+** 	and	x0, x0, x2
+** 	orn	x0, x0, x1
+** 	ret
+*/
+uint64_t bsl2n_gp(uint64_t a, uint64_t b, uint64_t c) { return BSL2N (a, b, c); }
+
+/*
+** bsl2n_d:
+** 	bsl2n	z0\.d, z0\.d, z1\.d, z2\.d
+** 	ret
+*/
+uint64x1_t bsl2n_d (uint64x1_t a, uint64x1_t b, uint64x1_t c) { return BSL2N (a, b, c); }
+
-- 
2.44.0

