From d4955aea9158ddc39d06015e33de871d20535017 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov <ktkachov@nvidia.com>
Date: Fri, 4 Jul 2025 06:49:15 -0700
Subject: [PATCH] aarch64: Improve popcountti2 with SVE

The TImode popcount sequence can be slightly improved with SVE.
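For the h128 function from the new popcnt13.c test added in this patch:

unsigned h128 (const unsigned __int128 *a) {
  return __builtin_popcountg (a[0]);
}

with SVE we can generate: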
h128:
        ldr     q31, [x0]
        ptrue   p7.b, vl16
        cnt     z31.d, p7/m, z31.d
        addp    d31, v31.2d
        fmov    x0, d31
        ret

instead of:
h128:
        ldr     q31, [x0]
        cnt     v31.16b, v31.16b
        addv    b31, v31.16b
        fmov    w0, s31
        ret

The new sequence uses the ADDP instruction for the reduction, which AFAIK is
cheaper on all CPUs, as it is a single 64-bit addition rather than the tree
of additions that ADDV requires.
On a CPU like Grace, for example, ADDP has a latency/throughput of 2/4
compared with 4/1 for ADDV.
We do generate one more instruction because the PTRUE predicate has to be
materialised, but that instruction is itself cheap and can be scheduled away
from the critical path, or even CSE'd with other PTRUE constants, as
sketched below.
As the new sequence is larger in code size it is avoided when optimizing
for size (-Os).
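
As a hypothetical illustration of the PTRUE CSE opportunity (h128_2 is not
part of this patch), two TImode popcounts in the same function should be
able to share a single materialised predicate:

unsigned h128_2 (const unsigned __int128 *a) {
  /* Both SVE CNT instructions can execute under the same ptrue pN.b, vl16
     predicate, so after CSE only one PTRUE materialisation should remain.  */
  return __builtin_popcountg (a[0]) + __builtin_popcountg (a[1]);
}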

Bootstrapped and tested on aarch64-none-linux-gnu.

Signed-off-by: Kyrylo Tkachov <ktkachov@nvidia.com>

gcc/

	* config/aarch64/aarch64.md (popcountti2): Add TARGET_SVE path.

gcc/testsuite/

	* gcc.target/aarch64/popcnt13.c: New test.
	* gcc.target/aarch64/popcnt9.c: Add +nosve to target pragma.
---
 gcc/config/aarch64/aarch64.md               | 13 +++++++++++
 gcc/testsuite/gcc.target/aarch64/popcnt13.c | 24 +++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/popcnt9.c  |  2 +-
 3 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt13.c

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 509ef4c0f2f..27efc9155dc 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5771,6 +5771,19 @@
    (match_operand:TI 1 "register_operand")]
   "TARGET_SIMD && !TARGET_CSSC"
 {
+  /* For SVE we can do popcount on DImode chunks of the TImode argument
+     and then use a cheap ADDP reduction.  The SVE CNT instruction requires
+     materializing a PTRUE so don't do this if optimizing for size.  */
+  if (TARGET_SVE && !optimize_function_for_size_p (cfun))
+    {
+      rtx v = gen_reg_rtx (V2DImode);
+      rtx v1 = gen_reg_rtx (V2DImode);
+      emit_move_insn (v, gen_lowpart (V2DImode, operands[1]));
+      rtx p = aarch64_ptrue_reg (VNx2BImode, 16);
+      emit_insn (gen_aarch64_pred_popcountv2di (v1, p, v));
+      emit_insn (gen_reduc_plus_scal_v2di (operands[0], v1));
+      DONE;
+    }
   rtx v = gen_reg_rtx (V16QImode);
   rtx v1 = gen_reg_rtx (V16QImode);
   emit_move_insn (v, gen_lowpart (V16QImode, operands[1]));
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt13.c b/gcc/testsuite/gcc.target/aarch64/popcnt13.c
new file mode 100644
index 00000000000..2a30e984332
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt13.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#pragma GCC target "+nocssc+sve"
+
+/*
+** h128:
+**	ldr	q([0-9]+), \[x0\]
+**	ptrue	p([0-9]+).b, vl16
+**	cnt	z([0-9]+).d, p\2/m, z\1.d
+**	addp	d([0-9]+), v\3.2d
+**	fmov	x0, d\4
+**	ret
+*/
+
+unsigned h128 (const unsigned __int128 *a) {
+	  return __builtin_popcountg (a[0]);
+}
+
+/* There should be only one POPCOUNT. */
+/* { dg-final { scan-tree-dump-times "POPCOUNT " 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-not " __builtin_popcount"  "optimized" } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt9.c b/gcc/testsuite/gcc.target/aarch64/popcnt9.c
index c778fc7f420..cfed8c58b7e 100644
--- a/gcc/testsuite/gcc.target/aarch64/popcnt9.c
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt9.c
@@ -3,7 +3,7 @@
 /* { dg-final { check-function-bodies "**" "" } } */
 /* PR target/113042 */
 
-#pragma GCC target "+nocssc"
+#pragma GCC target "+nocssc+nosve"
 
 /*
 ** h128:
-- 
2.44.0

