On Sat, Mar 21, 2026, at 10:14 PM, John Naylor wrote:
> On Sat, Mar 21, 2026 at 11:56 PM Greg Burd <[email protected]> wrote:
>> Attached is a small patch that enables hardware popcount on RISC-V when 
>> available and also sets the arch flag to 'rv64gc_zbb' when appropriate.
>
> I have to ask what the point is -- isn't that like putting a 4-inch
> exhaust tip on a go-kart?

Hey John,

The point is to go fast, right? And to look cool (with awesome 4-inch exhaust 
tips) if possible! ;-P

gburd@rv:~/ws/postgres$ gcc -O2 -o popcnt-wo-zbb riscv-popcnt.c
gburd@rv:~/ws/postgres$ gcc -O2 -march=rv64gc_zbb -o popcnt-zbb riscv-popcnt.c
gburd@rv:~/ws/postgres$ ./popcnt-wo-zbb && ./popcnt-zbb
sw popcount:    0.196 sec  (    510.08 MB/s)
hw popcount:    0.293 sec  (    341.48 MB/s)

diff: 0.67x
match: 406261900 bits counted
sw popcount:    0.182 sec  (    548.86 MB/s)
hw popcount:    0.044 sec  (   2279.89 MB/s)

diff: 4.15x
match: 406261900 bits counted

But my first email/patch was incomplete/rushed; I should have followed the
pattern used for similar ARM-specific logic.  v2 is attached, along with a
test program.

> --
> John Naylor
> Amazon Web Services

best.

-greg
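/*
 * riscv-popcnt.c
 *
 * Micro-benchmark for the RISC-V Zbb popcount optimization: compares a
 * software popcount against __builtin_popcountll().  Build it twice, without
 * and with Zbb enabled, and compare the reported timings:
 *
 *   gcc -O2 -o popcnt-wo-zbb riscv-popcnt.c
 *   gcc -O2 -march=rv64gc_zbb -o popcnt-zbb riscv-popcnt.c
 */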

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>

#define TEST_SIZE (1024 * 1024)  /* 1 MB */
#define ITERATIONS 100

/*
 * Software popcount (algorithm taken from pg_bitutils.h).
 *
 * Counts the set bits of x without any special instruction: adjacent bit
 * fields are summed in parallel (1-bit, then 2-bit, then 4-bit wide), leaving
 * a per-byte count in every byte.  Multiplying by 0x01...01 adds all eight
 * byte counts into the most significant byte, which the final shift extracts.
 *
 * Returns the number of 1 bits in x (0..64).
 */
static int
popcount_sw(uint64_t x)
{
	const uint64_t every_other_bit = 0x5555555555555555ULL;
	const uint64_t every_other_pair = 0x3333333333333333ULL;
	const uint64_t low_nibbles = 0x0F0F0F0F0F0F0F0FULL;
	const uint64_t byte_ones = 0x0101010101010101ULL;

	/* 2-bit fields each hold the count of their two bits */
	x = (x & every_other_bit) + ((x >> 1) & every_other_bit);
	/* 4-bit fields each hold the count of their four bits */
	x = (x & every_other_pair) + ((x >> 2) & every_other_pair);
	/* bytes each hold the count of their eight bits */
	x = (x & low_nibbles) + ((x >> 4) & low_nibbles);

	/* horizontal sum of the eight byte counts lands in the top byte */
	return (x * byte_ones) >> 56;
}

/*
 * "Hardware" popcount: defers entirely to the compiler builtin.  The
 * expectation is that the compiler emits the cpop instruction for it when
 * the build targets Zbb; otherwise whatever the compiler generates is used.
 *
 * Returns the number of 1 bits in x.
 */
static int
popcount_hw(uint64_t x)
{
	const unsigned long long word = x;

	return __builtin_popcountll(word);
}

/*
 * Read CLOCK_MONOTONIC and return it as seconds in a double (whole seconds
 * plus the nanosecond part scaled by 1e9).  Callers subtract two readings to
 * get an elapsed time.
 */
static double
now(void)
{
	struct timespec tp;
	double		seconds;

	clock_gettime(CLOCK_MONOTONIC, &tp);

	seconds = tp.tv_nsec / 1e9;
	seconds = tp.tv_sec + seconds;

	return seconds;
}

/*
 * Benchmark driver.
 *
 * Fills a TEST_SIZE-byte buffer with pseudo-random 64-bit words (fixed seed),
 * then times ITERATIONS passes of popcount_sw() and ITERATIONS passes of
 * popcount_hw() over that buffer.  For each it prints the elapsed time and
 * throughput, followed by the sw/hw time ratio and whether the two bit totals
 * agree.
 *
 * Returns EXIT_FAILURE if the buffer cannot be allocated.  Otherwise returns
 * 0; a mismatch between the two totals is reported on stdout but does not
 * change the exit status.
 */
int
main(void)
{
	/* loop-invariant quantities, computed once */
	const size_t nwords = TEST_SIZE / sizeof(uint64_t);
	const double total_mb = TEST_SIZE * ITERATIONS / (1024.0 * 1024.0);

	uint64_t *data;
	uint64_t count_sw = 0, count_hw = 0;
	double start, elapsed_sw, elapsed_hw;
	double mb_per_sec;
	size_t i;

	data = malloc(TEST_SIZE);
	if (data == NULL)
	{
		/* without this check the fill loop below would write through NULL */
		fprintf(stderr, "could not allocate %zu bytes\n", (size_t) TEST_SIZE);
		return EXIT_FAILURE;
	}

	srand(42);

	for (i = 0; i < nwords; i++)
	{
		/*
		 * Two rand() calls per word.  They are drawn in separate statements
		 * so the order of the calls is well defined rather than left to the
		 * compiler's choice of operand evaluation order.
		 */
		uint64_t hi = (uint64_t) rand();
		uint64_t lo = (uint64_t) rand();

		data[i] = (hi << 32) | lo;
	}

	/* time the software implementation */
	start = now();
	for (int iter = 0; iter < ITERATIONS; iter++)
	{
		for (i = 0; i < nwords; i++)
			count_sw += popcount_sw(data[i]);
	}
	elapsed_sw = now() - start;
	mb_per_sec = total_mb / elapsed_sw;
	printf("sw popcount: %8.3f sec  (%10.2f MB/s)\n",
	       elapsed_sw, mb_per_sec);

	/* time the builtin-based implementation over the same data */
	start = now();
	for (int iter = 0; iter < ITERATIONS; iter++)
	{
		for (i = 0; i < nwords; i++)
			count_hw += popcount_hw(data[i]);
	}
	elapsed_hw = now() - start;
	mb_per_sec = total_mb / elapsed_hw;
	printf("hw popcount: %8.3f sec  (%10.2f MB/s)\n",
	       elapsed_hw, mb_per_sec);

	/* values above 1 mean the builtin version was faster */
	printf("\ndiff: %.2fx\n", elapsed_sw / elapsed_hw);

	/* both implementations saw the same data, so the totals must agree */
	if (count_sw != count_hw)
	{
		printf("\n[ERROR] Results don't match!\n");
		printf("\tsw: %llu\n", (unsigned long long)count_sw);
		printf("\thw: %llu\n", (unsigned long long)count_hw);
	}
	else
	{
		printf("match: %llu bits counted\n", (unsigned long long)count_sw);
	}

	free(data);
	return 0;
}
From 9cd6714dc4ad03a34fddf6cd568b9ee7e700ef16 Mon Sep 17 00:00:00 2001
From: Greg Burd <[email protected]>
Date: Sun, 22 Mar 2026 11:15:41 -0400
Subject: [PATCH v2] Add RISC-V Zbb popcount optimization

Implement hardware popcount support for RISC-V using the Zbb (basic bit
manipulation) extension. The Zbb extension provides the 'cpop'
instruction which GCC and Clang emit from __builtin_popcountll() when
compiling with -march=rv64gc_zbb.

This patch adds:
- Build-time detection of Zbb support (configure.ac, meson.build)
- Runtime detection using __riscv_hwprobe() on Linux
- Optimized popcount implementation using cpop instruction

The implementation follows PostgreSQL's established pattern for hardware
acceleration (similar to x86 POPCNT and ARM SVE). Zbb-optimized code is
compiled separately with -march=rv64gc_zbb, while the main binary
remains portable across all RISC-V 64-bit systems.
---
 configure.ac                   |  29 ++++++
 meson.build                    |  32 +++++++
 src/include/port/pg_bitutils.h |   2 +-
 src/port/meson.build           |   7 +-
 src/port/pg_bitutils.c         |   5 +-
 src/port/pg_popcount_riscv.c   | 159 +++++++++++++++++++++++++++++++++
 6 files changed, 229 insertions(+), 5 deletions(-)
 create mode 100644 src/port/pg_popcount_riscv.c

diff --git a/configure.ac b/configure.ac
index f8327a7020a..abb8c957906 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2156,6 +2156,35 @@ if test x"$host_cpu" = x"aarch64"; then
   fi
 fi
 
+# Check for RISC-V Zbb bitmanip extension (provides 'cpop' for popcount).
+#
+# The Zbb extension provides the 'cpop' instruction for hardware popcount.
+# GCC/Clang emit the cpop instruction from __builtin_popcountll() when
+# -march=rv64gc_zbb is used.  We test compilation with this flag, then
+# restore CFLAGS to avoid global march flags (for binary portability).
+# We define USE_RISCV_ZBB_WITH_RUNTIME_CHECK and use __riscv_hwprobe()
+# for runtime detection.  We compile src/port/pg_popcount_riscv.c with
+# -march=rv64gc_zbb separately (like ARM SVE and x86 POPCNT).
+AC_MSG_CHECKING([for RISC-V Zbb extension (cpop/popcount)])
+if test x"$host_cpu" = x"riscv64"; then
+  pgac_save_CFLAGS_zbb="$CFLAGS"
+  CFLAGS="$CFLAGS -march=rv64gc_zbb"
+  AC_COMPILE_IFELSE(
+    [AC_LANG_PROGRAM(
+      [/* Test that the compiler will emit cpop from __builtin_popcountll */
+       static inline int test_cpop(unsigned long long x)
+       { return __builtin_popcountll(x); }],
+      [volatile int r = test_cpop(0xdeadbeefULL); (void) r;])],
+    [AC_DEFINE(USE_RISCV_ZBB_WITH_RUNTIME_CHECK, 1,
+      [Define to 1 to use RISC-V Zbb popcount with runtime detection.])
+     CFLAGS="$pgac_save_CFLAGS_zbb"
+     AC_MSG_RESULT([yes, with runtime check])],
+    [CFLAGS="$pgac_save_CFLAGS_zbb"
+     AC_MSG_RESULT([no])])
+else
+  AC_MSG_RESULT([not on RISC-V])
+fi
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 PGAC_SSE42_CRC32_INTRINSICS()
diff --git a/meson.build b/meson.build
index 0a181909fab..6d02f280e08 100644
--- a/meson.build
+++ b/meson.build
@@ -2529,6 +2529,38 @@ int main(void)
 
 endif
 
+# ---------------------------------------------------------------------------
+# Check for RISC-V Zbb bitmanip extension (provides 'cpop' for popcount).
+#
+# The Zbb extension provides the 'cpop' instruction for hardware popcount.
+# GCC/Clang emit the cpop instruction from __builtin_popcountll() when
+# -march=rv64gc_zbb is used.  We test compilation with this flag, but
+# do NOT add it globally (for binary portability).  Instead, we define
+# USE_RISCV_ZBB_WITH_RUNTIME_CHECK and compile src/port/pg_popcount_riscv.c
+# with -march=rv64gc_zbb separately (like ARM SVE and x86 POPCNT).
+# Runtime detection uses __riscv_hwprobe().
+# ---------------------------------------------------------------------------
+zbb_test_code = '''
+static inline int test_cpop(unsigned long long x)
+{ return __builtin_popcountll(x); }
+int main(void) {
+  volatile int r = test_cpop(0xdeadbeefULL);
+  (void) r;
+  return 0;
+}
+'''
+
+cflags_zbb = []
+if host_cpu == 'riscv64'
+  if cc.compiles(zbb_test_code,
+                 args: ['-march=rv64gc_zbb'],
+                 name: 'RISC-V Zbb cpop')
+    cdata.set('USE_RISCV_ZBB_WITH_RUNTIME_CHECK', 1)
+    # Flag will be added only to pg_popcount_riscv.c in src/port/meson.build
+    cflags_zbb = ['-march=rv64gc_zbb']
+  endif
+endif
+
 
 ###############################################################
 # Select CRC-32C implementation.
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 0bca559caaa..8db645c4a42 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -279,7 +279,7 @@ pg_ceil_log2_64(uint64 num)
 extern uint64 pg_popcount_portable(const char *buf, int bytes);
 extern uint64 pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask);
 
-#if defined(HAVE_X86_64_POPCNTQ) || defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK)
+#if defined(HAVE_X86_64_POPCNTQ) || defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK) || defined(USE_RISCV_ZBB_WITH_RUNTIME_CHECK)
 /*
  * Attempt to use specialized CPU instructions, but perform a runtime check
  * first.
diff --git a/src/port/meson.build b/src/port/meson.build
index 7296f8e3c03..9d0bb59aca0 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -98,12 +98,15 @@ replace_funcs_pos = [
   # loongarch
   ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'],
 
+  # riscv
+  ['pg_popcount_riscv', 'USE_RISCV_ZBB_WITH_RUNTIME_CHECK', 'zbb'],
+
   # generic fallback
   ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
 ]
 
-pgport_cflags = {'crc': cflags_crc}
-pgport_sources_cflags = {'crc': []}
+pgport_cflags = {'crc': cflags_crc, 'zbb': cflags_zbb}
+pgport_sources_cflags = {'crc': [], 'zbb': []}
 
 foreach f : replace_funcs_neg
   func = f.get(0)
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 49b130f1306..699ae89129f 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -162,7 +162,7 @@ pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask)
 	return popcnt;
 }
 
-#if !defined(HAVE_X86_64_POPCNTQ) && !defined(USE_NEON)
+#if !defined(HAVE_X86_64_POPCNTQ) && !defined(USE_NEON) && !defined(USE_RISCV_ZBB_WITH_RUNTIME_CHECK)
 
 /*
  * When special CPU instructions are not available, there's no point in using
@@ -191,4 +191,5 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
 	return pg_popcount_masked_portable(buf, bytes, mask);
 }
 
-#endif							/* ! HAVE_X86_64_POPCNTQ && ! USE_NEON */
+#endif							/* ! HAVE_X86_64_POPCNTQ && ! USE_NEON && !
+								 * USE_RISCV_ZBB_WITH_RUNTIME_CHECK */
diff --git a/src/port/pg_popcount_riscv.c b/src/port/pg_popcount_riscv.c
new file mode 100644
index 00000000000..a3b1dda4bac
--- /dev/null
+++ b/src/port/pg_popcount_riscv.c
@@ -0,0 +1,159 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_riscv.c
+ *	  Holds the RISC-V Zbb popcount implementations.
+ *
+ * Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_popcount_riscv.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#ifdef USE_RISCV_ZBB_WITH_RUNTIME_CHECK
+
+#if defined(__linux__)
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <asm/hwprobe.h>
+#endif
+
+#include "port/pg_bitutils.h"
+
+/*
+ * Hardware implementation using RISC-V Zbb cpop instruction.
+ */
+static uint64 pg_popcount_zbb(const char *buf, int bytes);
+static uint64 pg_popcount_masked_zbb(const char *buf, int bytes, bits8 mask);
+
+/*
+ * The function pointers are initially set to "choose" functions.  These
+ * functions will first set the pointers to the right implementations (based on
+ * what the current CPU supports) and then will call the pointer to fulfill the
+ * caller's request.
+ */
+static uint64 pg_popcount_choose(const char *buf, int bytes);
+static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
+uint64		(*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
+uint64		(*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
+
+static inline bool
+pg_popcount_zbb_available(void)
+{
+#if defined(__linux__) && defined(__NR_riscv_hwprobe)
+	struct riscv_hwprobe pair = {.key = RISCV_HWPROBE_KEY_IMA_EXT_0};
+
+	if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0) != 0)
+		return false;
+
+	return (pair.value & RISCV_HWPROBE_EXT_ZBB) != 0;
+#else
+	return false;
+#endif
+}
+
+static inline void
+choose_popcount_functions(void)
+{
+	if (pg_popcount_zbb_available())
+	{
+		pg_popcount_optimized = pg_popcount_zbb;
+		pg_popcount_masked_optimized = pg_popcount_masked_zbb;
+	}
+	else
+	{
+		pg_popcount_optimized = pg_popcount_portable;
+		pg_popcount_masked_optimized = pg_popcount_masked_portable;
+	}
+}
+
+static uint64
+pg_popcount_choose(const char *buf, int bytes)
+{
+	choose_popcount_functions();
+	return pg_popcount_optimized(buf, bytes);
+}
+
+static uint64
+pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
+{
+	choose_popcount_functions();
+	return pg_popcount_masked_optimized(buf, bytes, mask);
+}
+
+/*
+ * pg_popcount64_zbb
+ *		Return the number of 1 bits set in word
+ *
+ * Uses the RISC-V Zbb 'cpop' (count population) instruction via
+ * __builtin_popcountll().  When compiled with -march=rv64gc_zbb, GCC and
+ * Clang will emit the cpop instruction for this builtin.
+ */
+pg_attribute_target("arch=rv64gc_zbb")
+static inline int
+pg_popcount64_zbb(uint64 word)
+{
+	return __builtin_popcountll(word);
+}
+
+/*
+ * pg_popcount_zbb
+ *		Returns number of 1 bits in buf
+ *
+ * Similar approach to x86 SSE4.2 POPCNT: process data in 8-byte chunks using
+ * the cpop instruction, with byte-by-byte fallback for remaining data.
+ */
+pg_attribute_target("arch=rv64gc_zbb")
+static uint64
+pg_popcount_zbb(const char *buf, int bytes)
+{
+	uint64		popcnt = 0;
+	const uint64 *words = (const uint64 *) buf;
+
+	/* Process 8-byte chunks */
+	while (bytes >= 8)
+	{
+		popcnt += pg_popcount64_zbb(*words++);
+		bytes -= 8;
+	}
+
+	buf = (const char *) words;
+
+	/* Process any remaining bytes */
+	while (bytes--)
+		popcnt += pg_number_of_ones[(unsigned char) *buf++];
+
+	return popcnt;
+}
+
+/*
+ * pg_popcount_masked_zbb
+ *		Returns number of 1 bits in buf after applying the mask to each byte
+ */
+pg_attribute_target("arch=rv64gc_zbb")
+static uint64
+pg_popcount_masked_zbb(const char *buf, int bytes, bits8 mask)
+{
+	uint64		popcnt = 0;
+	uint64		maskv = ~UINT64CONST(0) / 0xFF * mask;
+	const uint64 *words = (const uint64 *) buf;
+
+	/* Process 8-byte chunks */
+	while (bytes >= 8)
+	{
+		popcnt += pg_popcount64_zbb(*words++ & maskv);
+		bytes -= 8;
+	}
+
+	buf = (const char *) words;
+
+	/* Process any remaining bytes */
+	while (bytes--)
+		popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+
+	return popcnt;
+}
+
+#endif							/* USE_RISCV_ZBB_WITH_RUNTIME_CHECK */
-- 
2.51.2

Reply via email to