From 5fb72a44d29b1ea3d7f8475aaa45714db7c0aa67 Mon Sep 17 00:00:00 2001
From: "xiang.gao" <xiang.gao@arm.com>
Date: Wed, 13 Sep 2023 15:13:37 +0800
Subject: [PATCH] PostgreSQL: CRC32C optimization

CRC32C parallel computation optimization.
The algorithm comes from the Intel whitepaper: crc-iscsi-polynomial-crc32-instruction-paper
Input data is processed in 1024-byte chunks. Each chunk starts with three
equal-sized blocks (crc0, crc1, crc2) that are computed in parallel; one block is
42 (BLOCK_LEN) * 8 bytes. The remaining 16 bytes of the chunk are processed serially.

CRC32C unit test: https://gist.github.com/gaoxyt/138fd53ca1eead8102eeb9204067f7e4
Crc32c benchmark: https://gist.github.com/gaoxyt/4506c10fc06b3501445e32c4257113e9
It gets ~2x speedup compared to linear Arm crc32c instructions.

Signed-off-by: xiang.gao <xiang.gao@arm.com>
Change-Id: If876bbca5bbc3940946a7d72e14fe9fdf54682c1
---
 config/c-compiler.m4              | 25 ++++++++
 configure                         | 59 ++++++++++++++++++-
 configure.ac                      | 22 +++++++-
 src/include/pg_config.h.in        |  3 +
 src/include/port/pg_crc32c.h      | 19 ++++---
 src/port/pg_crc32c_armv8.c        | 94 +++++++++++++++++++++++++++++++
 src/port/pg_crc32c_armv8_choose.c | 49 +++++++++++++++-
 7 files changed, 259 insertions(+), 12 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 5db02b2ab7..483d4724d1 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -662,6 +662,31 @@ fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_ARMV8_CRC32C_INTRINSICS
 
+# PGAC_ARMV8_VMULL_INTRINSICS
+# ----------------------------
+# Check if the compiler supports the vmull_p64 intrinsic function,
+# by linking a small test program that calls it. This instruction
+# was first introduced in the ARMv8 Crypto Extension.
+#
+# An optional compiler flag can be passed as argument (e.g.
+# -march=armv8-a+crypto). If the intrinsic is supported, sets
+# pgac_armv8_vmull_intrinsics to yes; no CFLAGS variable is set.
+AC_DEFUN([PGAC_ARMV8_VMULL_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_armv8_vmull_intrinsics_$1])])dnl
+AC_CACHE_CHECK([for vmull_p64 with CFLAGS=$1], [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_neon.h>],
+  [return ((uint64_t)vmull_p64(0x12345678, 0x9abcde01) == 0x8860e9abc170678);])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_armv8_vmull_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_ARMV8_VMULL_INTRINSICS
+
 # PGAC_LOONGARCH_CRC32C_INTRINSICS
 # ---------------------------
 # Check if the compiler supports the LoongArch CRCC instructions, using
diff --git a/configure b/configure
index d47e0f8b26..b7f60cae87 100755
--- a/configure
+++ b/configure
@@ -18033,6 +18033,44 @@ fi
 
 
 
+# Check for ARMv8 VMULL intrinsics to do polynomial multiplication
+#
+# Check if vmull_p64 intrinsics can be used with the compiler
+# flag -march=armv8-a+crypto.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for vmull_p64 with CFLAGS=-march=armv8-a+crypto" >&5
+$as_echo_n "checking for vmull_p64 with CFLAGS=-march=armv8-a+crypto... " >&6; }
+if ${pgac_cv_armv8_vmull_intrinsics__march_armv8_apcrypto+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS -march=armv8-a+crypto"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_neon.h>
+int
+main ()
+{
+return ((uint64_t)vmull_p64(0x12345678, 0x9abcde01) == 0x8860e9abc170678);
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_armv8_vmull_intrinsics__march_armv8_apcrypto=yes
+else
+  pgac_cv_armv8_vmull_intrinsics__march_armv8_apcrypto=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_armv8_vmull_intrinsics__march_armv8_apcrypto" >&5
+$as_echo "$pgac_cv_armv8_vmull_intrinsics__march_armv8_apcrypto" >&6; }
+if test x"$pgac_cv_armv8_vmull_intrinsics__march_armv8_apcrypto" = x"yes"; then
+  pgac_armv8_vmull_intrinsics=yes
+fi
+
+
 # Select CRC-32C implementation.
 #
 # If we are targeting a processor that has Intel SSE 4.2 instructions, we can
@@ -18084,6 +18122,13 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
   fi
 fi
 
+# Use ARM VMULL if available and the ARM CRC32C intrinsics are available too.
+if test x"$USE_ARMV8_VMULL" = x"" && (test x"$USE_ARMV8_CRC32C" = x"1" || test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"); then
+  if test x"$pgac_armv8_vmull_intrinsics" = x"yes"; then
+    USE_ARMV8_VMULL=1
+  fi
+fi
+
 # Set PG_CRC32C_OBJS appropriately depending on the selected implementation.
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking which CRC-32C implementation to use" >&5
 $as_echo_n "checking which CRC-32C implementation to use... " >&6; }
@@ -18107,7 +18152,7 @@ $as_echo "SSE 4.2 with runtime check" >&6; }
 
 $as_echo "#define USE_ARMV8_CRC32C 1" >>confdefs.h
 
-      PG_CRC32C_OBJS="pg_crc32c_armv8.o"
+      PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
       { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5
 $as_echo "ARMv8 CRC instructions" >&6; }
     else
@@ -18140,6 +18185,18 @@ $as_echo "slicing-by-8" >&6; }
 fi
 
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to use ARM VMULL intrinsic" >&5
+$as_echo_n "checking whether to use ARM VMULL intrinsic... " >&6; }
+if test x"$USE_ARMV8_VMULL" = x"1"; then
+
+$as_echo "#define USE_ARMV8_VMULL 1" >>confdefs.h
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
 
 # Select semaphore implementation type.
 if test "$PORTNAME" != "win32"; then
diff --git a/configure.ac b/configure.ac
index 440b08d113..de33e326a2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2105,6 +2105,12 @@ PGAC_LOONGARCH_CRC32C_INTRINSICS()
 
 AC_SUBST(CFLAGS_CRC)
 
+# Check for ARMv8 VMULL intrinsics to do polynomial multiplication
+#
+# Check if vmull_p64 intrinsics can be used with the compiler
+# flag -march=armv8-a+crypto.
+PGAC_ARMV8_VMULL_INTRINSICS([-march=armv8-a+crypto])
+
 # Select CRC-32C implementation.
 #
 # If we are targeting a processor that has Intel SSE 4.2 instructions, we can
@@ -2156,6 +2162,13 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
   fi
 fi
 
+# Use ARM VMULL if available and the ARM CRC32C intrinsics are available too.
+if test x"$USE_ARMV8_VMULL" = x"" && (test x"$USE_ARMV8_CRC32C" = x"1" || test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"); then
+  if test x"$pgac_armv8_vmull_intrinsics" = x"yes"; then
+    USE_ARMV8_VMULL=1
+  fi
+fi
+
 # Set PG_CRC32C_OBJS appropriately depending on the selected implementation.
 AC_MSG_CHECKING([which CRC-32C implementation to use])
 if test x"$USE_SSE42_CRC32C" = x"1"; then
@@ -2170,7 +2183,7 @@ else
   else
     if test x"$USE_ARMV8_CRC32C" = x"1"; then
       AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.])
-      PG_CRC32C_OBJS="pg_crc32c_armv8.o"
+      PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
       AC_MSG_RESULT(ARMv8 CRC instructions)
     else
       if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
@@ -2193,6 +2206,13 @@ else
 fi
 AC_SUBST(PG_CRC32C_OBJS)
 
+AC_MSG_CHECKING([whether to use ARM VMULL intrinsic])
+if test x"$USE_ARMV8_VMULL" = x"1"; then
+  AC_DEFINE(USE_ARMV8_VMULL, 1, [Define to 1 to use ARMv8 VMULL Extension.])
+  AC_MSG_RESULT(yes)
+else
+  AC_MSG_RESULT(no)
+fi
 
 # Select semaphore implementation type.
 if test "$PORTNAME" != "win32"; then
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index d8a2985567..65cd43e156 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -689,6 +689,9 @@
 /* Define to 1 to use ARMv8 CRC Extension with a runtime check. */
 #undef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK
 
+/* Define to 1 to use ARMv8 VMULL Extension. */
+#undef USE_ARMV8_VMULL
+
 /* Define to 1 to build with assertion checks. (--enable-cassert) */
 #undef USE_ASSERT_CHECKING
 
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index d085f1dc00..35eb689a3b 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -49,14 +49,20 @@ typedef uint32 pg_crc32c;
 
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
 
-#elif defined(USE_ARMV8_CRC32C)
+#elif defined(USE_ARMV8_CRC32C) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 /* Use ARMv8 CRC Extension instructions. */
-
 #define COMP_CRC32C(crc, data, len)							\
-	((crc) = pg_comp_crc32c_armv8((crc), (data), (len)))
+	((crc) = pg_comp_crc32c((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
+extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
+extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
+
+#if defined(USE_ARMV8_VMULL)
+#include <arm_neon.h>
+extern pg_crc32c pg_comp_crc32c_with_vmull_armv8(pg_crc32c crc, const void *data, size_t len);
+#endif							/* USE_ARMV8_VMULL */
 
 #elif defined(USE_LOONGARCH_CRC32C)
 /* Use LoongArch CRCC instructions. */
@@ -67,10 +73,10 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le
 
 extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
 
-#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
+#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
 
 /*
- * Use Intel SSE 4.2 or ARMv8 instructions, but perform a runtime check first
+ * Use Intel SSE 4.2 instructions, but perform a runtime check first
  * to check that they are available.
  */
 #define COMP_CRC32C(crc, data, len) \
@@ -83,9 +89,6 @@ extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len)
 #ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
 #endif
-#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK
-extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
-#endif
 
 #else
 /*
diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c
index d8fae510cf..672a4e417b 100644
--- a/src/port/pg_crc32c_armv8.c
+++ b/src/port/pg_crc32c_armv8.c
@@ -2,6 +2,7 @@
  *
  * pg_crc32c_armv8.c
  *	  Compute CRC-32C checksum using ARMv8 CRC Extension instructions
+ *	  with or without ARMv8 VMULL Extension instructions
  *
  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -18,6 +19,99 @@
 
 #include "port/pg_crc32c.h"
 
+#if defined(USE_ARMV8_VMULL)
+#include <arm_neon.h>
+/*
+ * Compute CRC-32C with the ARMv8 CRC instructions, processing 1024-byte
+ * chunks as three interleaved streams that are merged using vmull_p64.
+ */
+__attribute__((target("+crypto")))
+pg_crc32c
+pg_comp_crc32c_with_vmull_armv8(pg_crc32c crc, const void *data, size_t len)
+{
+	const unsigned char *p = data;
+	const unsigned char *pend = p + len;
+
+	/*
+	 * ARMv8 doesn't require alignment, but aligned memory access is
+	 * significantly faster. Process leading bytes so that the loop below
+	 * starts with a pointer aligned to eight bytes.
+	 */
+	if (!PointerIsAligned(p, uint16) && p + 1 <= pend)
+	{
+		crc = __crc32cb(crc, *p);
+		p += 1;
+	}
+	if (!PointerIsAligned(p, uint32) && p + 2 <= pend)
+	{
+		crc = __crc32ch(crc, *(uint16 *) p);
+		p += 2;
+	}
+	if (!PointerIsAligned(p, uint64) && p + 4 <= pend)
+	{
+		crc = __crc32cw(crc, *(uint32 *) p);
+		p += 4;
+	}
+
+	/*
+	 * Parallel computation: each 1024-byte chunk starts with three blocks of
+	 * BLOCK_LEN (42) 8-byte words whose CRCs (crc0, crc1, crc2) are computed
+	 * in an interleaved loop, then folded together with vmull_p64.  The last
+	 * two 8-byte words of the chunk are appended to the folded CRC serially.
+	 */
+	while (p + 1024 <= pend)
+	{
+#define BLOCK_LEN 42
+		const uint64 *in64 = (const uint64 *) p;
+		uint32		crc0 = crc,
+					crc1 = 0,
+					crc2 = 0;
+
+		for (int i = 0; i < BLOCK_LEN; i++, in64++)
+		{
+			crc0 = __crc32cd(crc0, *(in64));
+			crc1 = __crc32cd(crc1, *(in64 + BLOCK_LEN));
+			crc2 = __crc32cd(crc2, *(in64 + BLOCK_LEN * 2));
+		}
+		in64 += BLOCK_LEN * 2;
+		crc0 = __crc32cd(0, (uint64) vmull_p64(crc0, 0xcec3662e));
+		crc1 = __crc32cd(0, (uint64) vmull_p64(crc1, 0xa60ce07b));
+		crc = crc0 ^ crc1 ^ crc2;
+
+		crc = __crc32cd(crc, *in64++);
+		crc = __crc32cd(crc, *in64++);
+
+		p += 1024;
+#undef BLOCK_LEN
+	}
+
+	/* Process eight bytes at a time, as far as we can. */
+	while (p + 8 <= pend)
+	{
+		crc = __crc32cd(crc, *(uint64 *) p);
+		p += 8;
+	}
+
+	/* Process remaining 0-7 bytes. */
+	if (p + 4 <= pend)
+	{
+		crc = __crc32cw(crc, *(uint32 *) p);
+		p += 4;
+	}
+	if (p + 2 <= pend)
+	{
+		crc = __crc32ch(crc, *(uint16 *) p);
+		p += 2;
+	}
+	if (p < pend)
+	{
+		crc = __crc32cb(crc, *p);
+	}
+
+	return crc;
+}
+#endif
+
 pg_crc32c
 pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 {
diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c
index 0fdddccaf7..2a3b8ba907 100644
--- a/src/port/pg_crc32c_armv8_choose.c
+++ b/src/port/pg_crc32c_armv8_choose.c
@@ -4,8 +4,8 @@
  *	  Choose between ARMv8 and software CRC-32C implementation.
  *
  * On first call, checks if the CPU we're running on supports the ARMv8
- * CRC Extension. If it does, use the special instructions for CRC-32C
- * computation. Otherwise, fall back to the pure software implementation
+ * CRC Extension, and if so whether vmull_p64 is usable too. With CRC support,
+ * use the special instructions; otherwise use the pure software implementation
  * (slicing-by-8).
  *
  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
@@ -77,6 +77,36 @@ pg_crc32c_armv8_available(void)
 	return (result > 0);
 }
 
+#if defined(USE_ARMV8_VMULL)
+/*
+ * Probe whether vmull_p64 can be executed on this CPU.  The instruction is
+ * run with illegal_instruction_handler installed for SIGILL (reset to SIG_DFL
+ * afterwards); true means it ran without trapping and gave the expected value.
+ */
+__attribute__((target("+crypto")))
+static bool
+pg_vmull_armv8_available(void)
+{
+	int			vmull_ok;
+
+	pqsignal(SIGILL, illegal_instruction_handler);
+	if (sigsetjmp(illegal_instruction_jump, 1) != 0)
+		vmull_ok = -1;			/* We got the SIGILL trap */
+	else
+		vmull_ok = ((uint64_t) vmull_p64(0x12345678, 0x9abcde01) == 0x8860e9abc170678);
+	pqsignal(SIGILL, SIG_DFL);
+
+#ifndef FRONTEND
+	/* We don't expect this case, so complain loudly */
+	if (vmull_ok == 0)
+		elog(ERROR, "vmull_p64 hardware results error");
+
+	elog(DEBUG1, "using armv8 vmull_p64 hardware = %d", (vmull_ok > 0));
+#endif
+	return (vmull_ok > 0);
+}
+#endif
+
 /*
  * This gets called on the first call. It replaces the function pointer
  * so that subsequent calls are routed directly to the chosen implementation.
@@ -85,9 +115,24 @@ static pg_crc32c
 pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 {
 	if (pg_crc32c_armv8_available())
+	{
+#if defined(USE_ARMV8_VMULL)
+		if (pg_vmull_armv8_available())
+		{
+			pg_comp_crc32c = pg_comp_crc32c_with_vmull_armv8;
+		}
+		else
+		{
+			pg_comp_crc32c = pg_comp_crc32c_armv8;
+		}
+#else
 		pg_comp_crc32c = pg_comp_crc32c_armv8;
+#endif
+	}
 	else
+	{
 		pg_comp_crc32c = pg_comp_crc32c_sb8;
+	}
 
 	return pg_comp_crc32c(crc, data, len);
 }
-- 
2.34.1

