diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 5db02b2ab7..a5a3246199 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -694,3 +694,36 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_LOONGARCH_CRC32C_INTRINSICS
+
+# PGAC_AVX512_POPCNT_INTRINSICS
+# ---------------------------
+# Check if the compiler supports the x86_64 AVX512 POPCNT instructions using
+# intrinsics used in CPUID features AVX512F and AVX512VPOPCNTDQ.
+#
+# Optional compiler flags can be passed as argument (e.g. -mavx512vpopcntdq).
+# If the intrinsics are supported then pgac_avx512_popcnt_intrinsics and
+# CFLAGS_AVX512_POPCNT are set.
+AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl
+AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>
+#include <stdint.h>],
+  [__m512i tmp __attribute__((aligned(64)));
+   __m512i input = _mm512_setzero_si512();
+   __m512i output = _mm512_popcnt_epi64(input);
+   uint64_t cnt = 999;
+   _mm512_store_si512(&tmp, output);
+   cnt = _mm512_reduce_add_epi64(tmp);
+   /* return computed value, to prevent the above being optimized away */
+   return cnt == 0;])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+  CFLAGS_AVX512_POPCNT="$1"
+  pgac_avx512_popcnt_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX512_POPCNT_INTRINSICS
diff --git a/configure b/configure
index 6b87e5c9a8..0252dab6d5 100755
--- a/configure
+++ b/configure
@@ -647,6 +647,7 @@ MSGFMT_FLAGS
 MSGFMT
 PG_CRC32C_OBJS
 CFLAGS_CRC
+CFLAGS_AVX512_POPCNT
 LIBOBJS
 OPENSSL
 ZSTD
@@ -17708,6 +17709,41 @@ $as_echo "#define HAVE__GET_CPUID 1" >>confdefs.h
 
 fi
 
+# Check for x86 cpuid_count instruction
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid_count" >&5
+$as_echo_n "checking for __get_cpuid_count... " >&6; }
+if ${pgac_cv__get_cpuid_count+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <cpuid.h>
+int
+main ()
+{
+unsigned int exx[4] = {0, 0, 0, 0};
+  __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv__get_cpuid_count="yes"
+else
+  pgac_cv__get_cpuid_count="no"
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__get_cpuid_count" >&5
+$as_echo "$pgac_cv__get_cpuid_count" >&6; }
+if test x"$pgac_cv__get_cpuid_count" = x"yes"; then
+
+$as_echo "#define HAVE__GET_CPUID_COUNT 1" >>confdefs.h
+
+fi
+
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuid" >&5
 $as_echo_n "checking for __cpuid... " >&6; }
 if ${pgac_cv__cpuid+:} false; then :
@@ -17742,6 +17778,164 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h
 
 fi
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5
+$as_echo_n "checking for __cpuidex... " >&6; }
+if ${pgac_cv__cpuidex+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <intrin.h>
+int
+main ()
+{
+unsigned int exx[4] = {0, 0, 0, 0};
+  __get_cpuidex(exx[0], 7, 0);
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv__cpuidex="yes"
+else
+  pgac_cv__cpuidex="no"
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuidex" >&5
+$as_echo "$pgac_cv__cpuidex" >&6; }
+if test x"$pgac_cv__cpuidex" = x"yes"; then
+
+$as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __immintrin" >&5
+$as_echo_n "checking for __immintrin... " >&6; }
+if ${pgac_cv__immintrin+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+int
+main ()
+{
+/* Don't exclude code so added return. */
+    return 1701;
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv__immintrin="yes"
+else
+  pgac_cv__immintrin="no"
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__immintrin" >&5
+$as_echo "$pgac_cv__immintrin" >&6; }
+if test x"$pgac_cv__immintrin" = x"yes"; then
+
+$as_echo "#define HAVE__IMMINTRIN 1" >>confdefs.h
+
+fi
+
+# Check for Intel AVX512 intrinsics to do POPCNT calculations.
+#
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=" >&5
+$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=... " >&6; }
+if ${pgac_cv_avx512_popcnt_intrinsics_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS "
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+#include <stdint.h>
+int
+main ()
+{
+__m512i tmp __attribute__((aligned(64)));
+   __m512i input = _mm512_setzero_si512();
+   __m512i output = _mm512_popcnt_epi64(input);
+   uint64_t cnt = 999;
+   _mm512_store_si512(&tmp, output);
+   cnt = _mm512_reduce_add_epi64(tmp);
+   /* return computed value, to prevent the above being optimized away */
+   return cnt == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx512_popcnt_intrinsics_=yes
+else
+  pgac_cv_avx512_popcnt_intrinsics_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics_" >&5
+$as_echo "$pgac_cv_avx512_popcnt_intrinsics_" >&6; }
+if test x"$pgac_cv_avx512_popcnt_intrinsics_" = x"yes"; then
+  CFLAGS_AVX512_POPCNT=""
+  pgac_avx512_popcnt_intrinsics=yes
+fi
+
+if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq -mavx512f" >&5
+$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq -mavx512f... " >&6; }
+if ${pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS -mavx512vpopcntdq -mavx512f"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+#include <stdint.h>
+int
+main ()
+{
+__m512i tmp __attribute__((aligned(64)));
+   __m512i input = _mm512_setzero_si512();
+   __m512i output = _mm512_popcnt_epi64(input);
+   uint64_t cnt = 999;
+   _mm512_store_si512(&tmp, output);
+   cnt = _mm512_reduce_add_epi64(tmp);
+   /* return computed value, to prevent the above being optimized away */
+   return cnt == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f=yes
+else
+  pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f" >&5
+$as_echo "$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f" >&6; }
+if test x"$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f" = x"yes"; then
+  CFLAGS_AVX512_POPCNT="-mavx512vpopcntdq -mavx512f"
+  pgac_avx512_popcnt_intrinsics=yes
+fi
+
+fi
+
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
diff --git a/configure.ac b/configure.ac
index 6e64ece11d..8fcf635b08 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2068,6 +2068,18 @@ if test x"$pgac_cv__get_cpuid" = x"yes"; then
   AC_DEFINE(HAVE__GET_CPUID, 1, [Define to 1 if you have __get_cpuid.])
 fi
 
+# Check for x86 cpuid_count instruction
+AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <cpuid.h>],
+  [[unsigned int exx[4] = {0, 0, 0, 0};
+  __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+  ]])],
+  [pgac_cv__get_cpuid_count="yes"],
+  [pgac_cv__get_cpuid_count="no"])])
+if test x"$pgac_cv__get_cpuid_count" = x"yes"; then
+  AC_DEFINE(HAVE__GET_CPUID_COUNT, 1, [Define to 1 if you have __get_cpuid.])
+fi
+
 AC_CACHE_CHECK([for __cpuid], [pgac_cv__cpuid],
 [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <intrin.h>],
   [[unsigned int exx[4] = {0, 0, 0, 0};
@@ -2079,6 +2091,36 @@ if test x"$pgac_cv__cpuid" = x"yes"; then
   AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.])
 fi
 
+AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <intrin.h>],
+  [[unsigned int exx[4] = {0, 0, 0, 0};
+  __get_cpuidex(exx[0], 7, 0);
+  ]])],
+  [pgac_cv__cpuidex="yes"],
+  [pgac_cv__cpuidex="no"])])
+if test x"$pgac_cv__cpuidex" = x"yes"; then
+  AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.])
+fi
+
+AC_CACHE_CHECK([for __immintrin], [pgac_cv__immintrin],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
+  [[/* Don't exclude code so added return. */
+    return 1701;
+  ]])],
+  [pgac_cv__immintrin="yes"],
+  [pgac_cv__immintrin="no"])])
+if test x"$pgac_cv__immintrin" = x"yes"; then
+  AC_DEFINE(HAVE__IMMINTRIN, 1, [Define to 1 if you have immintrin.])
+fi
+
+# Check for Intel AVX512 intrinsics to do POPCNT calculations.
+#
+PGAC_AVX512_POPCNT_INTRINSICS([])
+if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
+  PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq -mavx512f])
+fi
+AC_SUBST(CFLAGS_AVX512_POPCNT)
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
diff --git a/meson.build b/meson.build
index 8ed51b6aae..bd297d9fa9 100644
--- a/meson.build
+++ b/meson.build
@@ -1773,6 +1773,37 @@ elif cc.links('''
 endif
 
 
+if cc.links('''
+    #include <cpuid.h>
+    int main(int arg, char **argv)
+    {
+        unsigned int exx[4] = {0, 0, 0, 0};
+        __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+    }
+    ''', name: '__get_cpuid_count',
+    args: test_c_args)
+  cdata.set('HAVE__GET_CPUID_COUNT', 1)
+elif cc.links('''
+    #include <intrin.h>
+    int main(int arg, char **argv)
+    {
+        unsigned int exx[4] = {0, 0, 0, 0};
+        __cpuidex(exx, 7, 0);
+    }
+    ''', name: '__cpuidex',
+    args: test_c_args)
+  cdata.set('HAVE__CPUIDEX', 1)
+endif
+
+
+# Check for header immintrin.h
+if cc.has_header('immintrin.h',
+    include_directories: postgres_inc, args: test_c_args)
+  cdata.set('HAVE__IMMINTRIN', 1,
+            description: 'Define to 1 if you have the immintrin.h header file.')
+endif
+
+
 # Defend against clang being used on x86-32 without SSE2 enabled.  As current
 # versions of clang do not understand -fexcess-precision=standard, the use of
 # x87 floating point operations leads to problems like isinf possibly returning
@@ -2146,6 +2177,43 @@ elif host_cpu == 'ppc' or host_cpu == 'ppc64'
   endif
 endif
 
+###############################################################
+# AVX 512 POPCNT Intrinsic check
+###############################################################
+have_avx512_popcnt = false
+cflags_avx512_popcnt = []
+if host_cpu == 'x86_64'
+  test_flags = ['-mavx512vpopcntdq', '-mavx512f']
+  if host_system == 'windows'
+    test_flags = ['/arch:AVX512']
+  endif
+  prog = '''
+      #include <immintrin.h>
+      #include <stdint.h>
+      void main(void)
+      {
+        __m512i tmp __attribute__((aligned(64)));
+        __m512i input = _mm512_setzero_si512();
+        __m512i output = _mm512_popcnt_epi64(input);
+        uint64_t cnt = 999;
+        _mm512_store_si512(&tmp, output);
+        cnt = _mm512_reduce_add_epi64(tmp);
+        /* return computed value, to prevent the above being optimized away */
+        return cnt == 0;
+      }'''
+  if cc.links(prog, name: '_mm512_* methods with -mavx512vpopcntdq -mavx512f',
+              args: test_c_args + test_flags)
+    have_avx512_popcnt = true
+    cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1)
+    cdata.set('HAVE__AVX512_POPCNT', 1)
+    cflags_avx512_popcnt = test_flags
+  else
+    have_avx512_popcnt = false
+    cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1)
+    cflags_avx512_popcnt = []
+  endif # compile/link test
+endif # host_cpu check
+
 
 
 ###############################################################
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 8b3f8c24e0..089f49b7f3 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -263,6 +263,7 @@ CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@
 CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@
 CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@
 CFLAGS_CRC = @CFLAGS_CRC@
+CFLAGS_AVX512_POPCNT = @CFLAGS_AVX512_POPCNT@
 PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@
 CXXFLAGS = @CXXFLAGS@
 
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 07e73567dc..20e14c6499 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -555,6 +555,12 @@
 /* Define to 1 if you have __get_cpuid. */
 #undef HAVE__GET_CPUID
 
+/* Define to 1 if you have __get_cpuid_count. */
+#undef HAVE__GET_CPUID_COUNT
+
+/* Define to 1 if you have  immintrin. */
+#undef HAVE__IMMINTRIN
+
 /* Define to 1 if your compiler understands _Static_assert. */
 #undef HAVE__STATIC_ASSERT
 
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 799f70d052..caca78d805 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -303,16 +303,23 @@ pg_ceil_log2_64(uint64 num)
 extern int	(*pg_popcount32) (uint32 word);
 extern int	(*pg_popcount64) (uint64 word);
 
+#if defined(_MSC_VER)
+extern uint64 pg_popcount(const char *buf, int bytes);
+extern uint64 (*pg_popcount_indirect)(const char *buf, int bytes);
+#else
+extern uint64 (*pg_popcount)(const char *buf, int bytes);
+#endif
+
 #else
 /* Use a portable implementation -- no need for a function pointer. */
 extern int	pg_popcount32(uint32 word);
 extern int	pg_popcount64(uint64 word);
 
-#endif							/* TRY_POPCNT_FAST */
-
 /* Count the number of one-bits in a byte array */
 extern uint64 pg_popcount(const char *buf, int bytes);
 
+#endif							/* TRY_POPCNT_FAST */
+
 /*
  * Rotate the bits of "word" to the right/left by n bits.
  */
diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build
index b0f4178b3d..ee3647282e 100644
--- a/src/makefiles/meson.build
+++ b/src/makefiles/meson.build
@@ -100,6 +100,7 @@ pgxs_kv = {
     ' '.join(cflags_no_decl_after_statement),
 
   'CFLAGS_CRC': ' '.join(cflags_crc),
+  'CFLAGS_AVX512_POPCNT': ' '.join(cflags_avx512_popcnt),
   'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags),
   'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags),
 
diff --git a/src/port/Makefile b/src/port/Makefile
index dcc8737e68..ef6c02a6bf 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -43,6 +43,8 @@ OBJS = \
 	inet_net_ntop.o \
 	noblock.o \
 	path.o \
+	pg_popcnt_choose.o \
+	pg_popcnt_x86_64_accel.o \
 	pg_bitutils.o \
 	pg_strong_random.o \
 	pgcheckdir.o \
@@ -87,6 +89,11 @@ pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC)
 
+# Newer Intel processors can use AVX-512 POPCNT Capabilities (01/30/2024)
+pg_popcnt_x86_64_accel.o: CFLAGS+=$(CFLAGS_AVX512_POPCNT)
+pg_popcnt_x86_64_accel_shlib.o: CFLAGS+=$(CFLAGS_AVX512_POPCNT)
+pg_popcnt_x86_64_accel_srv.o: CFLAGS+=$(CFLAGS_AVX512_POPCNT)
+
 # all versions of pg_crc32c_armv8.o need CFLAGS_CRC
 pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
diff --git a/src/port/meson.build b/src/port/meson.build
index 92b593e6ef..d7930672cb 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -7,6 +7,7 @@ pgport_sources = [
   'noblock.c',
   'path.c',
   'pg_bitutils.c',
+  'pg_popcnt_choose.c',
   'pg_strong_random.c',
   'pgcheckdir.c',
   'pgmkdirp.c',
@@ -84,6 +85,7 @@ replace_funcs_pos = [
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
   ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
+  ['pg_popcnt_x86_64_accel', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'avx512'],
 
   # arm / aarch64
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
@@ -98,8 +100,8 @@ replace_funcs_pos = [
   ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
 ]
 
-pgport_cflags = {'crc': cflags_crc}
-pgport_sources_cflags = {'crc': []}
+pgport_cflags = {'crc': cflags_crc, 'avx512': cflags_avx512_popcnt}
+pgport_sources_cflags = {'crc': [], 'avx512': []}
 
 foreach f : replace_funcs_neg
   func = f.get(0)
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 640a89561a..942e396141 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -12,16 +12,8 @@
  */
 #include "c.h"
 
-#ifdef HAVE__GET_CPUID
-#include <cpuid.h>
-#endif
-#ifdef HAVE__CPUID
-#include <intrin.h>
-#endif
-
 #include "port/pg_bitutils.h"
 
-
 /*
  * Array giving the position of the left-most set bit for each possible
  * byte value.  We count the right-most position as the 0th bit, and the
@@ -78,6 +70,7 @@ const uint8 pg_rightmost_one_pos[256] = {
 	4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
 };
 
+
 /*
  * Array giving the number of 1-bits in each possible byte value.
  *
@@ -103,123 +96,35 @@ const uint8 pg_number_of_ones[256] = {
 	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 };
 
-static int	pg_popcount32_slow(uint32 word);
-static int	pg_popcount64_slow(uint64 word);
+int	pg_popcount32_slow(uint32 word);
+int	pg_popcount64_slow(uint64 word);
+uint64 pg_popcount_slow(const char *buf, int bytes);
 
 #ifdef TRY_POPCNT_FAST
-static bool pg_popcount_available(void);
-static int	pg_popcount32_choose(uint32 word);
-static int	pg_popcount64_choose(uint64 word);
-static int	pg_popcount32_fast(uint32 word);
-static int	pg_popcount64_fast(uint64 word);
+extern int	pg_popcount32_choose(uint32 word);
+extern int	pg_popcount64_choose(uint64 word);
+extern uint64 pg_popcount_choose(const char *buf, int bytes);
 
 int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
-#endif							/* TRY_POPCNT_FAST */
-
-#ifdef TRY_POPCNT_FAST
-
-/*
- * Return true if CPUID indicates that the POPCNT instruction is available.
- */
-static bool
-pg_popcount_available(void)
-{
-	unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID)
-	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUID)
-	__cpuid(exx, 1);
-#else
-#error cpuid instruction not available
-#endif
-
-	return (exx[2] & (1 << 23)) != 0;	/* POPCNT */
-}
-
-/*
- * These functions get called on the first call to pg_popcount32 etc.
- * They detect whether we can use the asm implementations, and replace
- * the function pointers so that subsequent calls are routed directly to
- * the chosen implementation.
- */
-static int
-pg_popcount32_choose(uint32 word)
-{
-	if (pg_popcount_available())
-	{
-		pg_popcount32 = pg_popcount32_fast;
-		pg_popcount64 = pg_popcount64_fast;
-	}
-	else
-	{
-		pg_popcount32 = pg_popcount32_slow;
-		pg_popcount64 = pg_popcount64_slow;
-	}
-
-	return pg_popcount32(word);
-}
-
-static int
-pg_popcount64_choose(uint64 word)
-{
-	if (pg_popcount_available())
-	{
-		pg_popcount32 = pg_popcount32_fast;
-		pg_popcount64 = pg_popcount64_fast;
-	}
-	else
-	{
-		pg_popcount32 = pg_popcount32_slow;
-		pg_popcount64 = pg_popcount64_slow;
-	}
-
-	return pg_popcount64(word);
-}
-
-/*
- * pg_popcount32_fast
- *		Return the number of 1 bits set in word
- */
-static int
-pg_popcount32_fast(uint32 word)
+#if defined(_MSC_VER)
+uint64 (*pg_popcount_indirect)(const char *buf, int bytes) = pg_popcount_choose;
+uint64 pg_popcount(const char *buf, int bytes)
 {
-#ifdef _MSC_VER
-	return __popcnt(word);
-#else
-	uint32		res;
-
-__asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc");
-	return (int) res;
-#endif
+	return pg_popcount_indirect(buf, bytes);
 }
-
-/*
- * pg_popcount64_fast
- *		Return the number of 1 bits set in word
- */
-static int
-pg_popcount64_fast(uint64 word)
-{
-#ifdef _MSC_VER
-	return __popcnt64(word);
 #else
-	uint64		res;
-
-__asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
-	return (int) res;
+uint64		(*pg_popcount) (const char *buf, int bytes) = pg_popcount_choose;
 #endif
-}
-
-#endif							/* TRY_POPCNT_FAST */
-
+#else								/* TRY_POPCNT_FAST */
+uint64 pg_popcount(const char *buf, int bytes);
+#endif 								/* TRY_POPCNT_FAST */
 
 /*
  * pg_popcount32_slow
  *		Return the number of 1 bits set in word
  */
-static int
+int
 pg_popcount32_slow(uint32 word)
 {
 #ifdef HAVE__BUILTIN_POPCOUNT
@@ -241,7 +146,7 @@ pg_popcount32_slow(uint32 word)
  * pg_popcount64_slow
  *		Return the number of 1 bits set in word
  */
-static int
+int
 pg_popcount64_slow(uint64 word)
 {
 #ifdef HAVE__BUILTIN_POPCOUNT
@@ -286,22 +191,29 @@ pg_popcount64(uint64 word)
 	return pg_popcount64_slow(word);
 }
 
+uint64
+pg_popcount(const char *buf, int bytes)
+{
+	return pg_popcount_slow(buf, bytes);
+}
+
 #endif							/* !TRY_POPCNT_FAST */
 
 /*
  * pg_popcount
- *		Returns the number of 1-bits in buf
+ *		Returns the number of 1-bits in buf using either 32 or 64 bit loops
+ *      or fallback to __builtin_* or pure software.
  */
 uint64
-pg_popcount(const char *buf, int bytes)
+pg_popcount_slow(const char *buf, int bytes)
 {
 	uint64		popcnt = 0;
 
-#if SIZEOF_VOID_P >= 8
+#if SIZEOF_VOID_P == 8
 	/* Process in 64-bit chunks if the buffer is aligned. */
-	if (buf == (const char *) TYPEALIGN(8, buf))
+	if (buf == (const char *)TYPEALIGN(8, buf))
 	{
-		const uint64 *words = (const uint64 *) buf;
+		const uint64 *words = (const uint64 *)buf;
 
 		while (bytes >= 8)
 		{
@@ -309,9 +221,9 @@ pg_popcount(const char *buf, int bytes)
 			bytes -= 8;
 		}
 
-		buf = (const char *) words;
+		buf = (const char *)words;
 	}
-#else
+#elif SIZEOF_VOID_P == 4
 	/* Process in 32-bit chunks if the buffer is aligned. */
 	if (buf == (const char *) TYPEALIGN(4, buf))
 	{
diff --git a/src/port/pg_popcnt_choose.c b/src/port/pg_popcnt_choose.c
new file mode 100644
index 0000000000..e170e16ff9
--- /dev/null
+++ b/src/port/pg_popcnt_choose.c
@@ -0,0 +1,168 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcnt_x86_64_choose.c
+ *	  Miscellaneous functions for bit-wise operations.
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_popcnt_x86_64_choose.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "port/pg_bitutils.h"
+
+#ifdef TRY_POPCNT_FAST
+
+#ifdef HAVE__GET_CPUID
+#include <cpuid.h>
+#endif
+
+#ifdef HAVE__CPUID
+#include <intrin.h>
+#endif
+
+static bool pg_popcount_available(void);
+int pg_popcount32_choose(uint32 word);
+int pg_popcount64_choose(uint64 word);
+uint64 pg_popcount_choose(const char *buf, int bytes);
+
+extern int pg_popcount32_fast(uint32 word);
+extern int pg_popcount64_fast(uint64 word);
+extern int pg_popcount32_slow(uint32 word);
+extern int pg_popcount64_slow(uint64 word);
+extern uint64 pg_popcount512_fast(const char *buf, int bytes);
+extern uint64 pg_popcount_slow(const char *buf, int bytes);
+extern uint64 (*pg_popcount_indirect)(const char *buf, int bytes);
+
+extern int (*pg_popcount32)(uint32 word);
+extern int (*pg_popcount64)(uint64 word);
+
+/*
+ * Return true if CPUID indicates that the POPCNT instruction is available.
+ */
+static bool
+pg_popcount_available(void)
+{
+    unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+    __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+    __cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif
+
+    return (exx[2] & (1 << 23)) != 0; /* POPCNT */
+}
+
+/*
+ * Return true if CPUID indicates that the AVX512_POPCNT instruction is
+ * available. This is similar to the method above; see
+ * https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features
+ *
+ * Finally, we make sure the xgetbv result is consistent with the CPUID
+ * results.
+ */
+static bool
+pg_popcount512_available(void)
+{
+    unsigned int exx[4] = {0, 0, 0, 0};
+
+    /* Check for AVX512VPOPCNTDQ and AVX512F */
+#if defined(HAVE__GET_CPUID_COUNT)
+    __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUIDEX)
+    __cpuidex(exx, 7, 0);
+#endif
+
+    if ((exx[2] & (0x00004000)) != 0 && (exx[1] & (0x00010000)) != 0)
+    {
+        /*
+         * CPUID succeeded, does the current running OS support the
+         * ZMM registers which are required for AVX512? This check is
+         * required to make sure an old OS on a new CPU is correctly
+         * checked or a VM hypervisor is not excluding AVX512 ZMM
+         * support in the VM; see "5.1.9 Detection of AVX Instructions"
+         * https://www.intel.com/content/www/us/en/content-details/671488/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html
+         */
+        uint64 xcr = 0;
+#ifdef _MSC_VER
+        uint64 highlow = _xgetbv(xcr);
+
+        return (highlow & 0xE0) != 0;
+#else
+        uint32 high;
+        uint32 low;
+        
+        __asm__ __volatile__("xgetbv\t\n" : "=a"(low), "=d"(high) : "c"(xcr));
+        return (low & 0xE0) != 0;
+#endif
+    } /* POPCNT 512 */
+    return false;
+}
+
+/*
+ * These functions get called on the first call to pg_popcount32 etc.
+ * They detect whether we can use the asm implementations, and replace
+ * the function pointers so that subsequent calls are routed directly to
+ * the chosen implementation.
+ */
+static void set_up_function_pointers()
+{
+    if (pg_popcount512_available())
+    {
+#if defined(_MSC_VER)
+        pg_popcount_indirect = pg_popcount512_fast;
+#else
+        pg_popcount = pg_popcount512_fast;
+#endif
+    }
+    else
+    {
+#if defined(_MSC_VER)
+        pg_popcount_indirect = pg_popcount_slow;
+#else
+        pg_popcount = pg_popcount_slow;
+#endif
+    }
+    if (pg_popcount_available())
+    {
+        pg_popcount32 = pg_popcount32_fast;
+        pg_popcount64 = pg_popcount64_fast;
+    }
+    else
+    {
+        pg_popcount32 = pg_popcount32_slow;
+        pg_popcount64 = pg_popcount64_slow;
+    }
+}
+
+int pg_popcount32_choose(uint32 word)
+{
+    set_up_function_pointers();
+    return pg_popcount32(word);
+}
+
+int
+pg_popcount64_choose(uint64 word)
+{
+    set_up_function_pointers();
+    return pg_popcount64(word);
+}
+
+uint64
+pg_popcount_choose(const char *buf, int bytes)
+{
+    set_up_function_pointers();
+#if defined(_MSC_VER)
+    return pg_popcount_indirect(buf, bytes);
+#else
+    return pg_popcount(buf, bytes);
+#endif
+}
+
+#endif                                          /* TRY_POPCNT_FAST */
diff --git a/src/port/pg_popcnt_x86_64_accel.c b/src/port/pg_popcnt_x86_64_accel.c
new file mode 100644
index 0000000000..aef32c1174
--- /dev/null
+++ b/src/port/pg_popcnt_x86_64_accel.c
@@ -0,0 +1,93 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcnt_x86_64_accel.c
+ *	  Miscellaneous functions for bit-wise operations.
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_popcnt_x86_64_accel.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#if defined(HAVE__IMMINTRIN)
+#include <immintrin.h>
+#endif
+
+#include "port/pg_bitutils.h"
+
+#ifdef TRY_POPCNT_FAST
+extern const uint8 pg_number_of_ones[256];
+extern uint64 pg_popcount_slow(const char *buf, int bytes);
+uint64 pg_popcount512_fast(const char *buf, int bytes);
+int pg_popcount32_fast(uint32 word);
+int pg_popcount64_fast(uint64 word);
+
+/*
+ * pg_popcount32_fast
+ *		Return the number of 1 bits set in word
+ */
+int pg_popcount32_fast(uint32 word)
+{
+#ifdef _MSC_VER
+    return __popcnt(word);
+#else
+    uint32 res;
+    
+    __asm__ __volatile__(" popcntl %1,%0\n" : "=q"(res) : "rm"(word) : "cc");
+    return (int)res;
+#endif
+}
+
+/*
+ * pg_popcount64_fast
+ *		Return the number of 1 bits set in word
+ */
+int
+pg_popcount64_fast(uint64 word)
+{
+#ifdef _MSC_VER
+    return __popcnt64(word);
+#else
+    uint64 res;
+
+    __asm__ __volatile__(" popcntq %1,%0\n" : "=q"(res) : "rm"(word) : "cc");
+    return (int)res;
+#endif
+}
+
+/*
+ * Use AVX-512 Intrinsics for supported Intel CPUs or fall back the the software
+ * loop in pg_bunutils.c and use the best 32 or 64 bit fast methods. If no fast
+ * methods are used this will fall back to __builtin_* or pure software.
+ */
+uint64
+pg_popcount512_fast(const char *buf, int bytes)
+{
+#if defined(HAVE__IMMINTRIN) && HAVE__AVX512_POPCNT == 1
+    uint64 popcnt = 0;
+    __m512i accumulator = _mm512_setzero_si512();
+
+    while (bytes >= 64)
+    {
+        const __m512i v = _mm512_loadu_si512((const __m512i *)buf);
+        const __m512i p = _mm512_popcnt_epi64(v);
+
+        accumulator = _mm512_add_epi64(accumulator, p);
+        bytes -= 64;
+        buf += 64;
+    }
+
+    popcnt = _mm512_reduce_add_epi64(accumulator);
+
+    /* Process any remaining bytes */
+    while (bytes--)
+        popcnt += pg_number_of_ones[(unsigned char)*buf++];
+    return popcnt;
+#else
+    return pg_popcount_slow(buf, bytes);
+#endif /* USE_AVX512_CODE */
+}
+#endif                              /* TRY_POPCNT_FAST */
