On Sat, Mar 21, 2026, at 10:14 PM, John Naylor wrote:
> On Sat, Mar 21, 2026 at 11:56 PM Greg Burd <[email protected]> wrote:
>> Attached is a small patch that enables hardware popcount on RISC-V when
>> available and also sets the arch flag to 'rv64gc_zbb' flag when appropriate.
>
> I have to ask what the point is -- isn't that like putting a 4-inch
> exhaust tip on a go-kart?
Hey John,
The point is to go fast, right? And to look cool (with awesome 4-inch exhaust
tips) if possible! ;-P
gburd@rv:~/ws/postgres$ gcc -O2 -o popcnt-wo-zbb riscv-popcnt.c
gburd@rv:~/ws/postgres$ gcc -O2 -march=rv64gc_zbb -o popcnt-zbb riscv-popcnt.c
gburd@rv:~/ws/postgres$ ./popcnt-wo-zbb && ./popcnt-zbb
sw popcount: 0.196 sec ( 510.08 MB/s)
hw popcount: 0.293 sec ( 341.48 MB/s)
diff: 0.67x
match: 406261900 bits counted
sw popcount: 0.182 sec ( 548.86 MB/s)
hw popcount: 0.044 sec ( 2279.89 MB/s)
diff: 4.15x
match: 406261900 bits counted
But my first email/patch was incomplete/rushed, I should have followed the
pattern used for similar ARM-specific logic. v2 attached along with a test
program.
> --
> John Naylor
> Amazon Web Services
best.
-greg
/*
* riscv-popcnt.c
*
* RISC-V Zbb popcount optimization
*
* gcc -O2 -o popcnt-wo-zbb riscv-popcnt.c
* gcc -O2 -march=rv64gc_zbb -o popcnt-zbb riscv-popcnt.c
*/
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>
#define TEST_SIZE (1024 * 1024) /* 1 MB */
#define ITERATIONS 100
/*
 * popcount_sw
 *		Portable SWAR (SIMD-within-a-register) popcount, taken from
 *		PostgreSQL's pg_bitutils.h.
 *
 * Sums adjacent bit pairs, then nibbles, then bytes, and finally gathers
 * the per-byte counts into the top byte with a multiply.  Always returns
 * the number of 1 bits in 'x' (0..64).
 */
static int
popcount_sw(uint64_t x)
{
	/* each 2-bit field becomes the count of its two bits */
	uint64_t	pairs = (x & 0x5555555555555555ULL) + ((x >> 1) & 0x5555555555555555ULL);
	/* each 4-bit field becomes the sum of its two 2-bit fields */
	uint64_t	nibbles = (pairs & 0x3333333333333333ULL) + ((pairs >> 2) & 0x3333333333333333ULL);
	/* each byte becomes the sum of its two nibbles */
	uint64_t	bytes = (nibbles & 0x0F0F0F0F0F0F0F0FULL) + ((nibbles >> 4) & 0x0F0F0F0F0F0F0F0FULL);

	/* multiplying by 0x0101... sums all byte counts into the top byte */
	return (bytes * 0x0101010101010101ULL) >> 56;
}
/*
 * popcount_hw
 *		Hardware popcount via the compiler builtin.
 *
 * When compiled with -march=rv64gc_zbb, GCC/Clang are expected to lower
 * __builtin_popcountll() to a single Zbb 'cpop' instruction; without that
 * flag the compiler emits a software expansion instead.  The benchmark in
 * main() compares the two builds.
 */
static int
popcount_hw(uint64_t x)
{
	return __builtin_popcountll(x);
}
/*
 * now
 *		Current monotonic time in seconds, as a double.
 *
 * CLOCK_MONOTONIC is unaffected by wall-clock adjustments, which makes it
 * suitable for measuring elapsed benchmark intervals.
 */
static double
now(void)
{
	struct timespec t;

	clock_gettime(CLOCK_MONOTONIC, &t);
	return t.tv_sec + t.tv_nsec / 1e9;
}
/*
 * Benchmark driver: fill a 1 MB buffer with pseudo-random words, then time
 * ITERATIONS passes of the software and hardware popcount over it, report
 * throughput, and cross-check that both implementations counted the same
 * number of bits.
 *
 * Fixes over v1: the malloc() result is checked before use, and a result
 * mismatch (or allocation failure) now exits non-zero so scripted runs can
 * detect the failure.
 */
int
main(void)
{
	uint64_t   *data;
	uint64_t	count_sw = 0,
				count_hw = 0;
	double		start,
				elapsed_sw,
				elapsed_hw;
	double		mb_per_sec;
	size_t		i;

	data = malloc(TEST_SIZE);
	if (data == NULL)
	{
		fprintf(stderr, "malloc of %d bytes failed\n", TEST_SIZE);
		return EXIT_FAILURE;
	}

	/* fixed seed so both builds of the benchmark see identical data */
	srand(42);
	for (i = 0; i < TEST_SIZE / sizeof(uint64_t); i++)
		data[i] = ((uint64_t) rand() << 32) | rand();

	/* time the portable SWAR implementation */
	start = now();
	for (int iter = 0; iter < ITERATIONS; iter++)
	{
		for (i = 0; i < TEST_SIZE / sizeof(uint64_t); i++)
			count_sw += popcount_sw(data[i]);
	}
	elapsed_sw = now() - start;
	mb_per_sec = (TEST_SIZE * ITERATIONS / (1024.0 * 1024.0)) / elapsed_sw;
	printf("sw popcount: %8.3f sec (%10.2f MB/s)\n",
		   elapsed_sw, mb_per_sec);

	/* time the builtin (cpop when built with -march=rv64gc_zbb) */
	start = now();
	for (int iter = 0; iter < ITERATIONS; iter++)
	{
		for (i = 0; i < TEST_SIZE / sizeof(uint64_t); i++)
			count_hw += popcount_hw(data[i]);
	}
	elapsed_hw = now() - start;
	mb_per_sec = (TEST_SIZE * ITERATIONS / (1024.0 * 1024.0)) / elapsed_hw;
	printf("hw popcount: %8.3f sec (%10.2f MB/s)\n",
		   elapsed_hw, mb_per_sec);

	printf("\ndiff: %.2fx\n", elapsed_sw / elapsed_hw);

	if (count_sw != count_hw)
	{
		/* a mismatch means one implementation is wrong: fail loudly */
		printf("\n[ERROR] Results don't match!\n");
		printf("\tsw: %llu\n", (unsigned long long) count_sw);
		printf("\thw: %llu\n", (unsigned long long) count_hw);
		free(data);
		return EXIT_FAILURE;
	}
	else
	{
		printf("match: %llu bits counted\n", (unsigned long long) count_sw);
	}

	free(data);
	return EXIT_SUCCESS;
}
From 9cd6714dc4ad03a34fddf6cd568b9ee7e700ef16 Mon Sep 17 00:00:00 2001
From: Greg Burd <[email protected]>
Date: Sun, 22 Mar 2026 11:15:41 -0400
Subject: [PATCH v2] Add RISC-V Zbb popcount optimization
Implement hardware popcount support for RISC-V using the Zbb (basic bit
manipulation) extension. The Zbb extension provides the 'cpop'
instruction which GCC and Clang emit from __builtin_popcountll() when
compiling with -march=rv64gc_zbb.
This patch adds:
- Build-time detection of Zbb support (configure.ac, meson.build)
- Runtime detection using __riscv_hwprobe() on Linux
- Optimized popcount implementation using cpop instruction
The implementation follows PostgreSQL's established pattern for hardware
acceleration (similar to x86 POPCNT and ARM SVE). Zbb-optimized code is
compiled separately with -march=rv64gc_zbb, while the main binary
remains portable across all RISC-V 64-bit systems.
---
configure.ac | 29 ++++++
meson.build | 32 +++++++
src/include/port/pg_bitutils.h | 2 +-
src/port/meson.build | 7 +-
src/port/pg_bitutils.c | 5 +-
src/port/pg_popcount_riscv.c | 159 +++++++++++++++++++++++++++++++++
6 files changed, 229 insertions(+), 5 deletions(-)
create mode 100644 src/port/pg_popcount_riscv.c
diff --git a/configure.ac b/configure.ac
index f8327a7020a..abb8c957906 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2156,6 +2156,35 @@ if test x"$host_cpu" = x"aarch64"; then
fi
fi
+# Check for RISC-V Zbb bitmanip extension (provides 'cpop' for popcount).
+#
+# The Zbb extension provides the 'cpop' instruction for hardware popcount.
+# GCC/Clang emit the cpop instruction from __builtin_popcountll() when
+# -march=rv64gc_zbb is used. We test compilation with this flag, then
+# restore CFLAGS to avoid global march flags (for binary portability).
+# We define USE_RISCV_ZBB_WITH_RUNTIME_CHECK and use __riscv_hwprobe()
+# for runtime detection. We compile src/port/pg_popcount_riscv.c with
+# -march=rv64gc_zbb separately (like ARM SVE and x86 POPCNT).
+AC_MSG_CHECKING([for RISC-V Zbb extension (cpop/popcount)])
+if test x"$host_cpu" = x"riscv64"; then
+ pgac_save_CFLAGS_zbb="$CFLAGS"
+ CFLAGS="$CFLAGS -march=rv64gc_zbb"
+ AC_COMPILE_IFELSE(
+ [AC_LANG_PROGRAM(
+ [/* Test that the compiler will emit cpop from __builtin_popcountll */
+ static inline int test_cpop(unsigned long long x)
+ { return __builtin_popcountll(x); }],
+ [volatile int r = test_cpop(0xdeadbeefULL); (void) r;])],
+ [AC_DEFINE(USE_RISCV_ZBB_WITH_RUNTIME_CHECK, 1,
+ [Define to 1 to use RISC-V Zbb popcount with runtime detection.])
+ CFLAGS="$pgac_save_CFLAGS_zbb"
+ AC_MSG_RESULT([yes, with runtime check])],
+ [CFLAGS="$pgac_save_CFLAGS_zbb"
+ AC_MSG_RESULT([no])])
+else
+ AC_MSG_RESULT([not on RISC-V])
+fi
+
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
PGAC_SSE42_CRC32_INTRINSICS()
diff --git a/meson.build b/meson.build
index 0a181909fab..6d02f280e08 100644
--- a/meson.build
+++ b/meson.build
@@ -2529,6 +2529,38 @@ int main(void)
endif
+# ---------------------------------------------------------------------------
+# Check for RISC-V Zbb bitmanip extension (provides 'cpop' for popcount).
+#
+# The Zbb extension provides the 'cpop' instruction for hardware popcount.
+# GCC/Clang emit the cpop instruction from __builtin_popcountll() when
+# -march=rv64gc_zbb is used. We test compilation with this flag, but
+# do NOT add it globally (for binary portability). Instead, we define
+# USE_RISCV_ZBB_WITH_RUNTIME_CHECK and compile src/port/pg_popcount_riscv.c
+# with -march=rv64gc_zbb separately (like ARM SVE and x86 POPCNT).
+# Runtime detection uses __riscv_hwprobe().
+# ---------------------------------------------------------------------------
+zbb_test_code = '''
+static inline int test_cpop(unsigned long long x)
+{ return __builtin_popcountll(x); }
+int main(void) {
+ volatile int r = test_cpop(0xdeadbeefULL);
+ (void) r;
+ return 0;
+}
+'''
+
+cflags_zbb = []
+if host_cpu == 'riscv64'
+ if cc.compiles(zbb_test_code,
+ args: ['-march=rv64gc_zbb'],
+ name: 'RISC-V Zbb cpop')
+ cdata.set('USE_RISCV_ZBB_WITH_RUNTIME_CHECK', 1)
+ # Flag will be added only to pg_popcount_riscv.c in src/port/meson.build
+ cflags_zbb = ['-march=rv64gc_zbb']
+ endif
+endif
+
###############################################################
# Select CRC-32C implementation.
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 0bca559caaa..8db645c4a42 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -279,7 +279,7 @@ pg_ceil_log2_64(uint64 num)
extern uint64 pg_popcount_portable(const char *buf, int bytes);
extern uint64 pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask);
-#if defined(HAVE_X86_64_POPCNTQ) || defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK)
+#if defined(HAVE_X86_64_POPCNTQ) || defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK) || defined(USE_RISCV_ZBB_WITH_RUNTIME_CHECK)
/*
* Attempt to use specialized CPU instructions, but perform a runtime check
* first.
diff --git a/src/port/meson.build b/src/port/meson.build
index 7296f8e3c03..9d0bb59aca0 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -98,12 +98,15 @@ replace_funcs_pos = [
# loongarch
['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'],
+ # riscv
+ ['pg_popcount_riscv', 'USE_RISCV_ZBB_WITH_RUNTIME_CHECK', 'zbb'],
+
# generic fallback
['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
]
-pgport_cflags = {'crc': cflags_crc}
-pgport_sources_cflags = {'crc': []}
+pgport_cflags = {'crc': cflags_crc, 'zbb': cflags_zbb}
+pgport_sources_cflags = {'crc': [], 'zbb': []}
foreach f : replace_funcs_neg
func = f.get(0)
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 49b130f1306..699ae89129f 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -162,7 +162,7 @@ pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask)
return popcnt;
}
-#if !defined(HAVE_X86_64_POPCNTQ) && !defined(USE_NEON)
+#if !defined(HAVE_X86_64_POPCNTQ) && !defined(USE_NEON) && !defined(USE_RISCV_ZBB_WITH_RUNTIME_CHECK)
/*
* When special CPU instructions are not available, there's no point in using
@@ -191,4 +191,5 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
return pg_popcount_masked_portable(buf, bytes, mask);
}
-#endif /* ! HAVE_X86_64_POPCNTQ && ! USE_NEON */
+#endif /* ! HAVE_X86_64_POPCNTQ && ! USE_NEON && !
+ * USE_RISCV_ZBB_WITH_RUNTIME_CHECK */
diff --git a/src/port/pg_popcount_riscv.c b/src/port/pg_popcount_riscv.c
new file mode 100644
index 00000000000..a3b1dda4bac
--- /dev/null
+++ b/src/port/pg_popcount_riscv.c
@@ -0,0 +1,159 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_riscv.c
+ * Holds the RISC-V Zbb popcount implementations.
+ *
+ * Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/port/pg_popcount_riscv.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#ifdef USE_RISCV_ZBB_WITH_RUNTIME_CHECK
+
+#if defined(__linux__)
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <asm/hwprobe.h>
+#endif
+
+#include "port/pg_bitutils.h"
+
+/*
+ * Hardware implementation using RISC-V Zbb cpop instruction.
+ */
+static uint64 pg_popcount_zbb(const char *buf, int bytes);
+static uint64 pg_popcount_masked_zbb(const char *buf, int bytes, bits8 mask);
+
+/*
+ * The function pointers are initially set to "choose" functions. These
+ * functions will first set the pointers to the right implementations (based on
+ * what the current CPU supports) and then will call the pointer to fulfill the
+ * caller's request.
+ */
+static uint64 pg_popcount_choose(const char *buf, int bytes);
+static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
+uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
+uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
+
+static inline bool
+pg_popcount_zbb_available(void)
+{
+#if defined(__linux__) && defined(__NR_riscv_hwprobe)
+ struct riscv_hwprobe pair = {.key = RISCV_HWPROBE_KEY_IMA_EXT_0};
+
+ if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0) != 0)
+ return false;
+
+ return (pair.value & RISCV_HWPROBE_EXT_ZBB) != 0;
+#else
+ return false;
+#endif
+}
+
+static inline void
+choose_popcount_functions(void)
+{
+ if (pg_popcount_zbb_available())
+ {
+ pg_popcount_optimized = pg_popcount_zbb;
+ pg_popcount_masked_optimized = pg_popcount_masked_zbb;
+ }
+ else
+ {
+ pg_popcount_optimized = pg_popcount_portable;
+ pg_popcount_masked_optimized = pg_popcount_masked_portable;
+ }
+}
+
+static uint64
+pg_popcount_choose(const char *buf, int bytes)
+{
+ choose_popcount_functions();
+ return pg_popcount_optimized(buf, bytes);
+}
+
+static uint64
+pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
+{
+ choose_popcount_functions();
+ return pg_popcount_masked_optimized(buf, bytes, mask);
+}
+
+/*
+ * pg_popcount64_zbb
+ * Return the number of 1 bits set in word
+ *
+ * Uses the RISC-V Zbb 'cpop' (count population) instruction via
+ * __builtin_popcountll(). When compiled with -march=rv64gc_zbb, GCC and
+ * Clang will emit the cpop instruction for this builtin.
+ */
+pg_attribute_target("arch=rv64gc_zbb")
+static inline int
+pg_popcount64_zbb(uint64 word)
+{
+ return __builtin_popcountll(word);
+}
+
+/*
+ * pg_popcount_zbb
+ * Returns number of 1 bits in buf
+ *
+ * Similar approach to x86 SSE4.2 POPCNT: process data in 8-byte chunks using
+ * the cpop instruction, with byte-by-byte fallback for remaining data.
+ */
+pg_attribute_target("arch=rv64gc_zbb")
+static uint64
+pg_popcount_zbb(const char *buf, int bytes)
+{
+ uint64 popcnt = 0;
+ const uint64 *words = (const uint64 *) buf;
+
+ /* Process 8-byte chunks */
+ while (bytes >= 8)
+ {
+ popcnt += pg_popcount64_zbb(*words++);
+ bytes -= 8;
+ }
+
+ buf = (const char *) words;
+
+ /* Process any remaining bytes */
+ while (bytes--)
+ popcnt += pg_number_of_ones[(unsigned char) *buf++];
+
+ return popcnt;
+}
+
+/*
+ * pg_popcount_masked_zbb
+ * Returns number of 1 bits in buf after applying the mask to each byte
+ */
+pg_attribute_target("arch=rv64gc_zbb")
+static uint64
+pg_popcount_masked_zbb(const char *buf, int bytes, bits8 mask)
+{
+ uint64 popcnt = 0;
+ uint64 maskv = ~UINT64CONST(0) / 0xFF * mask;
+ const uint64 *words = (const uint64 *) buf;
+
+ /* Process 8-byte chunks */
+ while (bytes >= 8)
+ {
+ popcnt += pg_popcount64_zbb(*words++ & maskv);
+ bytes -= 8;
+ }
+
+ buf = (const char *) words;
+
+ /* Process any remaining bytes */
+ while (bytes--)
+ popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+
+ return popcnt;
+}
+
+#endif /* USE_RISCV_ZBB_WITH_RUNTIME_CHECK */
--
2.51.2