libstdc++-v3/ChangeLog:

        * include/bits/simd_details.h: New file.
        * include/bits/simd_x86.h: New file.
        * include/bits/vec_ops.h: New file.

Signed-off-by: Matthias Kretz <[email protected]>
---
 libstdc++-v3/include/bits/simd_details.h | 1443 ++++++++++++++++++++++
 libstdc++-v3/include/bits/simd_x86.h     |  953 ++++++++++++++
 libstdc++-v3/include/bits/vec_ops.h      |  592 +++++++++
 3 files changed, 2988 insertions(+)
 create mode 100644 libstdc++-v3/include/bits/simd_details.h
 create mode 100644 libstdc++-v3/include/bits/simd_x86.h
 create mode 100644 libstdc++-v3/include/bits/vec_ops.h


--
──────────────────────────────────────────────────────────────────────────
 Dr. Matthias Kretz                           https://mattkretz.github.io
 GSI Helmholtz Center for Heavy Ion Research               https://gsi.de
 std::simd
──────────────────────────────────────────────────────────────────────────
diff --git a/libstdc++-v3/include/bits/simd_details.h b/libstdc++-v3/include/bits/simd_details.h
new file mode 100644
index 00000000000..f9a793d3a18
--- /dev/null
+++ b/libstdc++-v3/include/bits/simd_details.h
@@ -0,0 +1,1443 @@
+/* SPDX-License-Identifier: GPL-3.0-or-later WITH GCC-exception-3.1 */
+/* Copyright © 2025      GSI Helmholtzzentrum fuer Schwerionenforschung GmbH
+ *                       Matthias Kretz <[email protected]>
+ */
+
+#ifndef _GLIBCXX_SIMD_BASE_H
+#define _GLIBCXX_SIMD_BASE_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include <bit>
+#include <concepts>
+#include <limits>
+#include <complex>
+
+#include <bits/c++config.h>
+#include <bits/ranges_base.h>
+#include <bits/utility.h> // integer_sequence, etc.
+
+#if __CHAR_BIT__ != 8
+// There are simply too many constants and bit operators that currently depend on CHAR_BIT == 8.
+// Generalization to CHAR_BIT != 8 does not make sense without testability (i.e. a test target).
+#error "<simd> is not supported for CHAR_BIT != 8"
+#endif
+
+// psabi warnings are bogus because the ABI of the internal types never leaks into user code
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpsabi"
+
+// Work around _GLIBCXX_CLANG not being defined with older libstdc++ when compiling with Clang
+#if __GLIBCXX__ < 20250922 and defined __clang__ and __GNUC_MINOR__ == 2 and not defined _GLIBCXX_CLANG
+#define _GLIBCXX_CLANG __clang__
+#endif
+
+#if defined __x86_64__ && !__SSE2__
+#error "Use of SSE2 is required on x86-64"
+#endif
+
+#if defined __x86_64__ or defined __i386__
+#define _GLIBCXX_X86 1
+#else
+#define _GLIBCXX_X86 0
+#endif
+
+#if !_GLIBCXX_X86
+#error "Not implemented yet. Only supported on x86 for now."
+#endif
+
+#ifndef _GLIBCXX_SIMD_NOEXCEPT
+/** @internal
+ * For unit-testing preconditions, pre-define this macro to expand to nothing, removing noexcept.
+ */
+#define _GLIBCXX_SIMD_NOEXCEPT noexcept
+#endif
+
+#if __cpp_deleted_function >= 202403L
+#define _GLIBCXX_DELETE_MSG(msg) delete(msg)
+#else
+#define _GLIBCXX_DELETE_MSG(msg) delete
+#endif
+
+#define _GLIBCXX_SIMD_TOSTRING_IMPL(x) #x
+#define _GLIBCXX_SIMD_TOSTRING(x) _GLIBCXX_SIMD_TOSTRING_IMPL(x)
+#define _GLIBCXX_SIMD_LOC __FILE__ ":" _GLIBCXX_SIMD_TOSTRING(__LINE__) ": "
+
+#if not IFNDR_SIMD_PRECONDITIONS
+#define __glibcxx_simd_precondition(expr, msg, ...)                                                \
+  do {                                                                                             \
+    if (__builtin_expect(!bool(expr), false))                                                      \
+      std::simd::__invoke_ub(                                                                      \
+        _GLIBCXX_SIMD_LOC "precondition failure in '%s':\n" msg " ('" #expr "' does not hold)",    \
+        __PRETTY_FUNCTION__ __VA_OPT__(,) __VA_ARGS__);                                            \
+  } while(false)
+#else
+#define __glibcxx_simd_precondition(expr, msg, ...)                                                \
+  do {                                                                                             \
+    const bool __precondition_result = !bool(expr);                                                \
+    if (__builtin_constant_p(__precondition_result) && __precondition_result)                      \
+      []() __attribute__((__noinline__, __noipa__, __error__("precondition failure."               \
+        "\n" _GLIBCXX_SIMD_LOC "note: " msg " (precondition '" #expr "' does not hold)")))         \
+      { __builtin_unreachable(); }();                                                              \
+    else if (__builtin_expect(__precondition_result, false))                                       \
+      std::simd::__invoke_ub(                                                                      \
+        _GLIBCXX_SIMD_LOC "precondition failure in '%s':\n" msg " ('" #expr "' does not hold)",    \
+        __PRETTY_FUNCTION__ __VA_OPT__(,) __VA_ARGS__);                                            \
+  } while(false)
+#endif
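+
+// Illustrative usage (not part of the interface; the index check is a made-up example):
+//   __glibcxx_simd_precondition(__i >= 0 and __i < size(), "invalid index %d", __i);
+// On violation, _GLIBCXX_ASSERTIONS makes __invoke_ub print the location and message and abort;
+// otherwise the call is __builtin_unreachable() (or __builtin_trap() with
+// _GLIBCXX_SIMD_TRAP_ON_UB). With IFNDR_SIMD_PRECONDITIONS, violations provable at compile time
+// are turned into hard errors via the __error__ attribute.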
+
+namespace std::simd
+{
+  template <typename... _Args>
+    [[noreturn, __gnu__::__always_inline__]]
+    inline void
+    __invoke_ub([[maybe_unused]] const char* __msg, [[maybe_unused]] const _Args&... __args)
+    {
+#ifdef _GLIBCXX_ASSERTIONS
+      __builtin_fprintf(stderr, __msg, __args...);
+      __builtin_fprintf(stderr, "\n");
+      __builtin_abort();
+#elif _GLIBCXX_SIMD_TRAP_ON_UB
+      __builtin_trap();
+#else
+      __builtin_unreachable();
+#endif
+    }
+
+  template <typename _Tp>
+    inline constexpr _Tp
+    __iota = [] { static_assert(false, "invalid __iota specialization"); }();
+
+#if __has_builtin(__integer_pack)
+  template <typename _Tp, std::size_t _Np>
+    inline constexpr type_identity_t<_Tp[_Np]>
+    __iota<_Tp[_Np]> = {__integer_pack(_Tp(_Np))...};
+#else
+  template<typename _Tp, typename>
+    struct __iota_array;
+
+  template<typename _Tp, _Tp... _Is>
+    struct __iota_array<_Tp, integer_sequence<_Tp, _Is...>>
+    { static constexpr _Tp _S_data[sizeof...(_Is)] = {_Is...}; };
+
+  template <typename _Tp, std::size_t _Np>
+    inline constexpr auto&
+    __iota<_Tp[_Np]> = __iota_array<_Tp, make_integer_sequence<_Tp, _Np>>::_S_data;
+#endif
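+
+  // Illustrative: either branch above makes __iota<int[4]> denote the array {0, 1, 2, 3}; only the
+  // mechanism (__integer_pack vs. make_integer_sequence) differs.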
+
+  // [simd.general] vectorizable types
+  template <typename _Cp, auto __re, auto __im, typename _Tp = typename _Cp::value_type>
+    constexpr _Cp __complex_object = _Cp {_Tp(__re), _Tp(__im)};
+
+  template <typename _Tp>
+    struct _Arr2
+    { _Tp _M_data[2]; };
+
+  template <typename _Tp>
+    concept __complex_like_impl
+      = requires(_Tp __x) {
+        typename _Tp::value_type;
+        { __x.real() } -> same_as<typename _Tp::value_type>;
+        { __x.imag() } -> same_as<typename _Tp::value_type>;
+        { real(__x) } -> same_as<typename _Tp::value_type>;
+        { imag(__x) } -> same_as<typename _Tp::value_type>;
+        { +__x } -> same_as<_Tp>;
+        { -__x } -> same_as<_Tp>;
+        { __x + __x } -> same_as<_Tp>;
+        { __x - __x } -> same_as<_Tp>;
+        { __x * __x } -> same_as<_Tp>;
+        { __x / __x } -> same_as<_Tp>;
+        { __x += __x } -> same_as<_Tp&>;
+        { __x -= __x } -> same_as<_Tp&>;
+        { __x *= __x } -> same_as<_Tp&>;
+        { __x /= __x } -> same_as<_Tp&>;
+        { abs(__x) } -> same_as<typename _Tp::value_type>;
+        { arg(__x) } -> same_as<typename _Tp::value_type>;
+        { norm(__x) } -> same_as<typename _Tp::value_type>;
+        { conj(__x) } -> same_as<_Tp>;
+        { proj(__x) } -> same_as<_Tp>;
+      }
+          and (__complex_object<_Tp, 1, 2> + _Tp {} == __complex_object<_Tp, 1, 2>)
+          and (__complex_object<_Tp, -1, 5> - __complex_object<_Tp, -1, 5> == _Tp {})
+          and (__complex_object<_Tp, 2, 3> * __complex_object<_Tp, 1, 1>
+                 == __complex_object<_Tp, -1, 5>)
+          and (__complex_object<_Tp, 5, 5> / __complex_object<_Tp, 1, 2>
+                 == __complex_object<_Tp, 3, -1>)
+          and (conj(__complex_object<_Tp, 5, 3>) == __complex_object<_Tp, 5, -3>)
+          // not constexpr: and (abs(__complex_object<_Tp, 3, 4>) == typename _Tp::value_type(5))
+          and (norm(__complex_object<_Tp, 5, 5>) == typename _Tp::value_type(50))
+          and (2 * sizeof(typename _Tp::value_type) == sizeof(_Tp))
+          and (__builtin_bit_cast(_Arr2<typename _Tp::value_type>, __complex_object<_Tp, 1, 2>)
+                 ._M_data[0] == 1);
+
+  /** @internal
+   * Satisfied if @p _Tp implements the std::complex interface.
+   */
+  template <typename _Tp>
+    concept __complex_like = __complex_like_impl<remove_cvref_t<_Tp>>;
+
+  template <typename _Tp>
+    concept __vectorizable_scalar
+      = same_as<remove_cv_t<_Tp>, _Tp>
+          and ((integral<_Tp> and sizeof(_Tp) <= sizeof(0ULL) and not same_as<_Tp, bool>)
+                 or (floating_point<_Tp> and sizeof(_Tp) <= sizeof(double)));
+
+  // [simd.general] p2
+  template <typename _Tp>
+    concept __vectorizable
+      = __vectorizable_scalar<_Tp>
+          or (__complex_like_impl<_Tp> and __vectorizable_scalar<typename _Tp::value_type>
+                and floating_point<typename _Tp::value_type>);
+
+  /** @internal
+   * Describes variants of _Abi.
+   */
+  enum _AbiVariant : unsigned long long
+  {
+    _VecMask = 1 << 0, // default uses vector masks
+    _BitMask = 1 << 1, // switch to bit-masks (AVX512)
+    _MaskVariants = _VecMask | _BitMask,
+    _CxIleav = 1 << 5, // store complex components interleaved (ririri...)
+    _CxCtgus = 1 << 6, // ... or store complex components contiguously (rrrr iiii)
+    _CxVariants = _CxIleav | _CxCtgus,
+  };
+
+  /** @internal
+   * Return true iff all bits of @p __x are set in @p __flags.
+   */
+  consteval bool
+  __flags_test(_AbiVariant __flags, _AbiVariant __x)
+  { return (__flags | __x) == __flags; }
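+
+  // For example, __flags_test(_CxVariants, _CxIleav) is true (the bit is contained), while
+  // __flags_test(_CxIleav, _CxCtgus) is false.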
+
+  /** @internal
+   * Type used whenever no valid integer/value type exists.
+   */
+  struct _InvalidInteger
+  {};
+
+  /** @internal
+   * Alias for a signed integer type T such that sizeof(T) equals _Bytes.
+   *
+   * C++26 [simd.expos.defn]
+   */
+  template <size_t _Bytes>
+    using __integer_from
+      = decltype([] {
+          if constexpr (sizeof(signed char) == _Bytes)
+            return static_cast<signed char>(0);
+          else if constexpr (sizeof(signed short) == _Bytes)
+            return static_cast<signed short>(0);
+          else if constexpr (sizeof(signed int) == _Bytes)
+            return static_cast<signed int>(0);
+          else if constexpr (sizeof(signed long long) == _Bytes)
+            return static_cast<signed long long>(0);
+          else
+            return _InvalidInteger();
+        }());
+
+  /** @internal
+   * Alias for an unsigned integer type T such that sizeof(T) equals _Bytes.
+   */
+  template <size_t _Bytes>
+    using _UInt = make_unsigned_t<__integer_from<_Bytes>>;
+
+  /** @internal
+   * Divide @p __x by @p __y while rounding up instead of down.
+   *
+   * Preconditions: __x >= 0 and __y > 0.
+   */
+  template <typename _Tp>
+    constexpr _Tp
+    __div_ceil(_Tp __x, _Tp __y)
+    { return (__x + __y - 1) / __y; }
+
+  /** @internal
+   * Alias for an unsigned integer type that can store at least @p _NBits bits.
+   */
+  template <int _NBits>
+    requires (_NBits > 0 and _NBits <= 64)
+    using _Bitmask = _UInt<__div_ceil(__bit_ceil(unsigned(_NBits)), unsigned(__CHAR_BIT__))>;
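+
+  // Illustrative: _Bitmask<3> is unsigned char, _Bitmask<9> is a 16-bit unsigned integer, and
+  // _Bitmask<33> is a 64-bit unsigned integer (the bit count is rounded up to the next power of
+  // two before converting to bytes).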
+
+  /** @internal
+   * Map a given type @p _Tp to an equivalent type.
+   *
+   * This helps with reducing the necessary branches and casts in the implementation as well as
+   * reducing the number of template instantiations.
+   */
+  template <typename _Tp>
+    struct __canonical_vec_type
+    { using type = _Tp; };
+
+  template <typename _Tp>
+    using __canonical_vec_type_t = typename __canonical_vec_type<_Tp>::type;
+
+  template <std::same_as<long> _Tp>
+    requires (sizeof(_Tp) == sizeof(int))
+    struct __canonical_vec_type<_Tp>
+    { using type = int; };
+
+  template <std::same_as<long> _Tp>
+    requires (sizeof(_Tp) == sizeof(long long))
+    struct __canonical_vec_type<_Tp>
+    { using type = long long; };
+
+  template <std::same_as<unsigned long> _Tp>
+    requires (sizeof(_Tp) == sizeof(unsigned int))
+    struct __canonical_vec_type<_Tp>
+    { using type = unsigned int; };
+
+  template <std::same_as<unsigned long> _Tp>
+    requires (sizeof(_Tp) == sizeof(unsigned long long))
+    struct __canonical_vec_type<_Tp>
+    { using type = unsigned long long; };
+
+  template <typename _Tp>
+    requires std::is_enum_v<_Tp>
+    struct __canonical_vec_type<_Tp>
+    { using type = __canonical_vec_type<std::underlying_type_t<_Tp>>::type; };
+
+  template <>
+    struct __canonical_vec_type<char>
+    { using type = std::conditional_t<std::is_signed_v<char>, signed char, unsigned char>; };
+
+  template <>
+    struct __canonical_vec_type<char8_t>
+    { using type = unsigned char; };
+
+  template <>
+    struct __canonical_vec_type<char16_t>
+    { using type = uint_least16_t; };
+
+  template <>
+    struct __canonical_vec_type<char32_t>
+    { using type = uint_least32_t; };
+
+  template <>
+    struct __canonical_vec_type<wchar_t>
+    {
+      using type = std::conditional_t<std::is_signed_v<wchar_t>,
+                                      simd::__integer_from<sizeof(wchar_t)>,
+                                      simd::_UInt<sizeof(wchar_t)>>;
+    };
+
+  template <>
+    struct __canonical_vec_type<_Float64>
+    { using type = double; };
+
+  template <>
+    struct __canonical_vec_type<_Float32>
+    { using type = float; };
+
+  /** @internal
+   * This ABI tag describes basic_vec objects that store one element per data member and basic_mask
+   * objects that store one bool per data member.
+   *
+   * @tparam _Np   The number of elements, which also matches the number of data members in
+   *               basic_vec and basic_mask.
+   */
+  template <int _Np = 1>
+    struct _ScalarAbi
+    {
+      static constexpr int _S_size = _Np;
+
+      static constexpr int _S_nreg = _Np;
+
+      static constexpr _AbiVariant _S_variant = {};
+
+      template <typename _Tp>
+        using _DataType = __canonical_vec_type_t<_Tp>;
+
+      static constexpr bool _S_is_cx_ileav = false;
+
+      template <size_t>
+        using _MaskDataType = bool;
+
+      template <int _N2, int _Nreg2 = _N2>
+        consteval _ScalarAbi<_N2>
+        _M_resize() const
+        {
+          static_assert(_N2 == _Nreg2);
+          return {};
+        }
+    };
+
+  /** @internal
+   * This ABI tag describes basic_vec objects that store one or more objects declared with the
+   * [[gnu::vector_size(N)]] attribute.
+   * Applied to basic_mask objects, this ABI tag describes either corresponding vector-mask objects
+   * or bit-mask objects. Which one is used is determined via @p _Var.
+   *
+   * @tparam _Np    The number of elements.
+   * @tparam _Nreg  The number of registers needed to store @p _Np elements.
+   * @tparam _Var   Determines how complex value-types are laid out and whether mask types use
+   *                bit-masks or vector-masks.
+   */
+  template <int _Np, int _Nreg, underlying_type_t<_AbiVariant> _Var
+#ifdef __AVX512F__
+              = _AbiVariant::_BitMask
+#else
+              = _AbiVariant::_VecMask
+#endif
+           >
+    struct _Abi
+    {
+      static constexpr int _S_size = _Np;
+
+      /**\internal
+       * The number of registers needed to represent one basic_vec for the element type that was
+       * used for ABI deduction.
+       *
+       * For _CxCtgus the value applies twice, once for the reals and once for the imaginaries.
+       *
+       * Examples:
+       * - '_Abi< 8, 2>' for 'int' is 2x 128-bit
+       * - '_Abi< 9, 3>' for 'int' is 2x 128-bit and 1x 32-bit
+       * - '_Abi<10, 3>' for 'int' is 2x 128-bit and 1x 64-bit
+       * - '_Abi<10, 1>' for 'int' is 1x 512-bit
+       * - '_Abi<10, 2>' for 'int' is 1x 256-bit and 1x 64-bit
+       * - '_Abi< 8, 2, _CxIleav>' for 'complex<float>' is 2x 256-bit
+       * - '_Abi< 9, 2, _CxIleav>' for 'complex<float>' is 1x 512-bit and 1x 64-bit
+       * - '_Abi< 8, 1, _CxCtgus>' for 'complex<float>' is 2x 256-bit
+       */
+      static constexpr int _S_nreg = _Nreg;
+
+      static constexpr _AbiVariant _S_variant = static_cast<_AbiVariant>(_Var);
+
+      template <typename _Tp>
+        using _DataType = decltype([] {
+                            static_assert(_S_nreg == 1);
+                            static_assert(not __flags_test(_S_variant, _AbiVariant::_CxIleav));
+                            static_assert(not __flags_test(_S_variant, _AbiVariant::_CxCtgus));
+                            constexpr int __n = __bit_ceil(unsigned(_S_size));
+                            using _Vp [[__gnu__::__vector_size__(sizeof(_Tp) * __n)]]
+                              = __canonical_vec_type_t<_Tp>;
+                            return _Vp();
+                          }());
+
+      static constexpr bool _S_is_cx_ileav = __flags_test(_S_variant, _AbiVariant::_CxIleav);
+
+      template <size_t _Bytes>
+        using _MaskDataType
+          = decltype([] {
+              static_assert(not _S_is_cx_ileav);
+              if constexpr (__flags_test(_S_variant, _AbiVariant::_BitMask))
+                {
+                  if constexpr (_Nreg > 1)
+                    return _InvalidInteger();
+                  else
+                    return _Bitmask<_S_size>();
+                }
+              else
+                {
+                  constexpr unsigned __vbytes = _Bytes * __bit_ceil(unsigned(_S_size));
+                  using _Vp [[__gnu__::__vector_size__(__vbytes)]] = __integer_from<_Bytes>;
+                  return _Vp();
+                }
+            }());
+
+      template <int _N2, int _Nreg2 = __div_ceil(_N2, _S_size)>
+        consteval auto
+        _M_resize() const
+        {
+          if constexpr (_N2 == 1 and not __flags_test(_S_variant, _AbiVariant::_CxIleav))
+            return _ScalarAbi<1>();
+          else
+            return _Abi<_N2, _Nreg2, _Var>();
+        }
+    };
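+
+  // Illustrative (assuming 32-bit int): _Abi<4, 1>::_DataType<int> is a 16-byte vector of int, and
+  // _Abi<3, 1>::_DataType<int> is also 16 bytes because the element count is rounded up to a power
+  // of two. _Abi<4, 1, _VecMask>::_MaskDataType<4> is a 16-byte vector of 32-bit integers, whereas
+  // _Abi<4, 1, _BitMask>::_MaskDataType<4> is _Bitmask<4>, i.e. unsigned char.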
+
+  /** @internal
+   * This type is used whenever ABI tag deduction can't give a useful answer.
+   */
+  struct _InvalidAbi
+  { static constexpr int _S_size = 0; };
+
+  /** @internal
+   * Satisfied if @p _Tp is a valid simd ABI tag. This is a necessary but not sufficient condition
+   * for an enabled basic_vec/basic_mask specialization.
+   */
+  template <typename _Tp>
+    concept __abi_tag
+      = same_as<decltype(_Tp::_S_variant), const _AbiVariant>
+          and (_Tp::_S_size >= _Tp::_S_nreg) and (_Tp::_S_nreg >= 1)
+          and requires(_Tp __x) {
+            { __x.template _M_resize<_Tp::_S_size, _Tp::_S_nreg>() } -> same_as<_Tp>;
+          };
+
+  // Determine if math functions must *raise* floating-point exceptions.
+  // math_errhandling may expand to an extern symbol, in which case we must assume fp exceptions
+  // need to be considered.
+  template <int = 0>
+    requires requires { typename bool_constant<0 != (math_errhandling & MATH_ERREXCEPT)>; }
+    consteval bool
+    __handle_fpexcept_impl(int)
+    { return 0 != (math_errhandling & MATH_ERREXCEPT); }
+
+  // Fallback if math_errhandling doesn't work: implement correct exception behavior.
+  consteval bool
+  __handle_fpexcept_impl(float)
+  { return true; }
+
+  /** @internal
+   * This type can be used as a template parameter for avoiding ODR violations, where code needs to
+   * differ depending on optimization flags (mostly fp-math related).
+   */
+  struct _OptTraits
+  {
+    consteval bool
+    _M_test(int __bit) const
+    { return ((_M_build_flags >> __bit) & 1) == 1; }
+
+    // true iff floating-point operations can signal an exception (allow non-default handler)
+    consteval bool
+    _M_fp_may_signal() const
+    { return _M_test(0); }
+
+    // true iff floating-point operations can raise an exception flag
+    consteval bool
+    _M_fp_may_raise() const
+    { return _M_test(12); }
+
+    consteval bool
+    _M_fast_math() const
+    { return _M_test(1); }
+
+    consteval bool
+    _M_finite_math_only() const
+    { return _M_test(2); }
+
+    consteval bool
+    _M_no_signed_zeros() const
+    { return _M_test(3); }
+
+    consteval bool
+    _M_signed_zeros() const
+    { return not _M_test(3); }
+
+    consteval bool
+    _M_reciprocal_math() const
+    { return _M_test(4); }
+
+    consteval bool
+    _M_no_math_errno() const
+    { return _M_test(5); }
+
+    consteval bool
+    _M_math_errno() const
+    { return not _M_test(5); }
+
+    consteval bool
+    _M_associative_math() const
+    { return _M_test(6); }
+
+    consteval bool
+    _M_conforming_to_STDC_annex_G() const
+    { return _M_test(10) and not _M_finite_math_only(); }
+
+    consteval bool
+    _M_support_snan() const
+    { return _M_test(11); }
+
+    __UINT64_TYPE__ _M_build_flags
+      = 0
+#if not __NO_TRAPPING_MATH__
+          + (1 << 0)
+#endif
+          + (__handle_fpexcept_impl(0) << 12)
+#if __FAST_MATH__
+          + (1 << 1)
+#endif
+#if __FINITE_MATH_ONLY__
+          + (1 << 2)
+#endif
+#if __NO_SIGNED_ZEROS__
+          + (1 << 3)
+#endif
+#if __RECIPROCAL_MATH__
+          + (1 << 4)
+#endif
+#if __NO_MATH_ERRNO__
+          + (1 << 5)
+#endif
+#if __ASSOCIATIVE_MATH__
+          + (1 << 6)
+#endif
+        // bits 7, 8, and 9 reserved for __FLT_EVAL_METHOD__
+#if __FLT_EVAL_METHOD__ == 1
+          + (1 << 7)
+#elif __FLT_EVAL_METHOD__ == 2
+          + (2 << 7)
+#elif __FLT_EVAL_METHOD__ != 0
+          + (3 << 7)
+#endif
+
+        // C Annex G defines the behavior of complex<T> where T is IEC60559 floating-point. If
+        // __STDC_IEC_60559_COMPLEX__ is defined then Annex G is implemented - and simd<complex>
+        // will do so as well. However, Clang never defines the macro.
+#if defined __STDC_IEC_60559_COMPLEX__ or defined __STDC_IEC_559_COMPLEX__ or defined _GLIBCXX_CLANG
+          + (1 << 10)
+#endif
+#if __SUPPORT_SNAN__
+          + (1 << 11)
+#endif
+        ;
+  };
+
+  /** @internal
+   * Return true iff @p __s equals "1".
+   */
+  consteval bool
+  __streq_to_1(const char* __s)
+  { return __s != nullptr and __s[0] == '1' and __s[1] == '\0'; }
+
+  /** @internal
+   * If the macro given as @p feat is defined to 1, this expands to a bit set at position @p off.
+   * Otherwise, it expands to zero.
+   */
+#define _GLIBCXX_SIMD_ARCH_FLAG(off, feat) \
+  (static_cast<__UINT64_TYPE__>(std::simd::__streq_to_1(_GLIBCXX_SIMD_TOSTRING_IMPL(feat))) << off)
+
+#if _GLIBCXX_X86
+
+#define _GLIBCXX_SIMD_ARCH_TRAITS_INIT {                      \
+  _GLIBCXX_SIMD_ARCH_FLAG(0, __MMX__)                         \
+    | _GLIBCXX_SIMD_ARCH_FLAG( 1, __SSE__)                    \
+    | _GLIBCXX_SIMD_ARCH_FLAG( 2, __SSE2__)                   \
+    | _GLIBCXX_SIMD_ARCH_FLAG( 3, __SSE3__)                   \
+    | _GLIBCXX_SIMD_ARCH_FLAG( 4, __SSSE3__)                  \
+    | _GLIBCXX_SIMD_ARCH_FLAG( 5, __SSE4_1__)                 \
+    | _GLIBCXX_SIMD_ARCH_FLAG( 6, __SSE4_2__)                 \
+    | _GLIBCXX_SIMD_ARCH_FLAG( 7, __POPCNT__)                 \
+    | _GLIBCXX_SIMD_ARCH_FLAG( 8, __AVX__)                    \
+    | _GLIBCXX_SIMD_ARCH_FLAG( 9, __F16C__)                   \
+    | _GLIBCXX_SIMD_ARCH_FLAG(10, __BMI__)                    \
+    | _GLIBCXX_SIMD_ARCH_FLAG(11, __BMI2__)                   \
+    | _GLIBCXX_SIMD_ARCH_FLAG(12, __LZCNT__)                  \
+    | _GLIBCXX_SIMD_ARCH_FLAG(13, __AVX2__)                   \
+    | _GLIBCXX_SIMD_ARCH_FLAG(14, __FMA__)                    \
+    | _GLIBCXX_SIMD_ARCH_FLAG(15, __AVX512F__)                \
+    | _GLIBCXX_SIMD_ARCH_FLAG(16, __AVX512CD__)               \
+    | _GLIBCXX_SIMD_ARCH_FLAG(17, __AVX512DQ__)               \
+    | _GLIBCXX_SIMD_ARCH_FLAG(18, __AVX512BW__)               \
+    | _GLIBCXX_SIMD_ARCH_FLAG(19, __AVX512VL__)               \
+    | _GLIBCXX_SIMD_ARCH_FLAG(20, __AVX512BITALG__)           \
+    | _GLIBCXX_SIMD_ARCH_FLAG(21, __AVX512VBMI__)             \
+    | _GLIBCXX_SIMD_ARCH_FLAG(22, __AVX512VBMI2__)            \
+    | _GLIBCXX_SIMD_ARCH_FLAG(23, __AVX512IFMA__)             \
+    | _GLIBCXX_SIMD_ARCH_FLAG(24, __AVX512VNNI__)             \
+    | _GLIBCXX_SIMD_ARCH_FLAG(25, __AVX512VPOPCNTDQ__)        \
+    | _GLIBCXX_SIMD_ARCH_FLAG(26, __AVX512FP16__)             \
+    | _GLIBCXX_SIMD_ARCH_FLAG(27, __AVX512BF16__)             \
+    | _GLIBCXX_SIMD_ARCH_FLAG(28, __AVXIFMA__)                \
+    | _GLIBCXX_SIMD_ARCH_FLAG(29, __AVXNECONVERT__)           \
+    | _GLIBCXX_SIMD_ARCH_FLAG(30, __AVXVNNI__)                \
+    | _GLIBCXX_SIMD_ARCH_FLAG(31, __AVXVNNIINT8__)            \
+    | _GLIBCXX_SIMD_ARCH_FLAG(32, __AVXVNNIINT16__)           \
+    | _GLIBCXX_SIMD_ARCH_FLAG(33, __AVX10_1__)                \
+    | _GLIBCXX_SIMD_ARCH_FLAG(34, __AVX10_2__)                \
+    | _GLIBCXX_SIMD_ARCH_FLAG(35, __AVX512VP2INTERSECT__)     \
+    | _GLIBCXX_SIMD_ARCH_FLAG(36, __SSE4A__)                  \
+    | _GLIBCXX_SIMD_ARCH_FLAG(37, __FMA4__)                   \
+    | _GLIBCXX_SIMD_ARCH_FLAG(38, __XOP__)                    \
+  }
+  // Should this include __APX_F__? I don't think it's relevant for use in constexpr-if branches =>
+  // no ODR issue? The same could be said about several other flags above that are not checked
+  // anywhere.
+
+  struct _ArchTraits
+  {
+    __UINT64_TYPE__ _M_flags = _GLIBCXX_SIMD_ARCH_TRAITS_INIT;
+
+    consteval bool
+    _M_test(int __bit) const
+    { return ((_M_flags >> __bit) & 1) == 1; }
+
+    consteval bool
+    _M_have_mmx() const
+    { return _M_test(0); }
+
+    consteval bool
+    _M_have_sse() const
+    { return _M_test(1); }
+
+    consteval bool
+    _M_have_sse2() const
+    { return _M_test(2); }
+
+    consteval bool
+    _M_have_sse3() const
+    { return _M_test(3); }
+
+    consteval bool
+    _M_have_ssse3() const
+    { return _M_test(4); }
+
+    consteval bool
+    _M_have_sse4_1() const
+    { return _M_test(5); }
+
+    consteval bool
+    _M_have_sse4_2() const
+    { return _M_test(6); }
+
+    consteval bool
+    _M_have_popcnt() const
+    { return _M_test(7); }
+
+    consteval bool
+    _M_have_avx() const
+    { return _M_test(8); }
+
+    consteval bool
+    _M_have_f16c() const
+    { return _M_test(9); }
+
+    consteval bool
+    _M_have_bmi() const
+    { return _M_test(10); }
+
+    consteval bool
+    _M_have_bmi2() const
+    { return _M_test(11); }
+
+    consteval bool
+    _M_have_lzcnt() const
+    { return _M_test(12); }
+
+    consteval bool
+    _M_have_avx2() const
+    { return _M_test(13); }
+
+    consteval bool
+    _M_have_fma() const
+    { return _M_test(14); }
+
+    consteval bool
+    _M_have_avx512f() const
+    { return _M_test(15); }
+
+    consteval bool
+    _M_have_avx512cd() const
+    { return _M_test(16); }
+
+    consteval bool
+    _M_have_avx512dq() const
+    { return _M_test(17); }
+
+    consteval bool
+    _M_have_avx512bw() const
+    { return _M_test(18); }
+
+    consteval bool
+    _M_have_avx512vl() const
+    { return _M_test(19); }
+
+    consteval bool
+    _M_have_avx512bitalg() const
+    { return _M_test(20); }
+
+    consteval bool
+    _M_have_avx512vbmi() const
+    { return _M_test(21); }
+
+    consteval bool
+    _M_have_avx512vbmi2() const
+    { return _M_test(22); }
+
+    consteval bool
+    _M_have_avx512ifma() const
+    { return _M_test(23); }
+
+    consteval bool
+    _M_have_avx512vnni() const
+    { return _M_test(24); }
+
+    consteval bool
+    _M_have_avx512vpopcntdq() const
+    { return _M_test(25); }
+
+    consteval bool
+    _M_have_avx512fp16() const
+    { return _M_test(26); }
+
+    consteval bool
+    _M_have_avx512bf16() const
+    { return _M_test(27); }
+
+    consteval bool
+    _M_have_avxifma() const
+    { return _M_test(28); }
+
+    consteval bool
+    _M_have_avxneconvert() const
+    { return _M_test(29); }
+
+    consteval bool
+    _M_have_avxvnni() const
+    { return _M_test(30); }
+
+    consteval bool
+    _M_have_avxvnniint8() const
+    { return _M_test(31); }
+
+    consteval bool
+    _M_have_avxvnniint16() const
+    { return _M_test(32); }
+
+    consteval bool
+    _M_have_avx10_1() const
+    { return _M_test(33); }
+
+    consteval bool
+    _M_have_avx10_2() const
+    { return _M_test(34); }
+
+    consteval bool
+    _M_have_avx512vp2intersect() const
+    { return _M_test(35); }
+
+    consteval bool
+    _M_have_sse4a() const
+    { return _M_test(36); }
+
+    consteval bool
+    _M_have_fma4() const
+    { return _M_test(37); }
+
+    consteval bool
+    _M_have_xop() const
+    { return _M_test(38); }
+
+    template <typename _Tp>
+      consteval bool
+      _M_eval_as_f32() const
+      { return is_same_v<_Tp, _Float16> and not _M_have_avx512fp16(); }
+  };
+
+  template <typename _Tp, _ArchTraits _Traits = {}>
+    consteval auto
+    __native_abi()
+    {
+      constexpr int __adj_sizeof = sizeof(_Tp) * (1 + is_same_v<_Tp, _Float16>);
+      if constexpr (not __vectorizable<_Tp>)
+        return _InvalidAbi();
+      else if constexpr (__complex_like<_Tp>)
+        {
+          constexpr auto __underlying = __native_abi<typename _Tp::value_type>();
+          if constexpr (__underlying._S_size == 1)
+            return _ScalarAbi<1>();
+          else
+            return _Abi<__underlying._S_size / 2, 1,
+                        __underlying._S_variant | _AbiVariant::_CxIleav>();
+        }
+      else if constexpr (_Traits._M_have_avx512fp16())
+        return _Abi<64 / sizeof(_Tp), 1, _AbiVariant::_BitMask>();
+      else if constexpr (_Traits._M_have_avx512f())
+        return _Abi<64 / __adj_sizeof, 1, _AbiVariant::_BitMask>();
+      else if constexpr (is_same_v<_Tp, _Float16> and not _Traits._M_have_f16c())
+        return _ScalarAbi<1>();
+      else if constexpr (_Traits._M_have_avx2())
+        return _Abi<32 / __adj_sizeof, 1, _AbiVariant::_VecMask>();
+      else if constexpr (_Traits._M_have_avx() and is_floating_point_v<_Tp>)
+        return _Abi<32 / __adj_sizeof, 1, _AbiVariant::_VecMask>();
+      else if constexpr (_Traits._M_have_sse2())
+        return _Abi<16 / __adj_sizeof, 1, _AbiVariant::_VecMask>();
+      else if constexpr (_Traits._M_have_sse() and is_floating_point_v<_Tp>
+                           and sizeof(_Tp) == sizeof(float))
+        return _Abi<16 / __adj_sizeof, 1, _AbiVariant::_VecMask>();
+      else
+        return _ScalarAbi<1>();
+    }
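+
+  // Illustrative (the result depends on the active target flags): with AVX2 but no AVX-512,
+  // __native_abi<float>() is _Abi<8, 1, _VecMask>; with AVX-512F it is _Abi<16, 1, _BitMask>; and
+  // __native_abi<complex<float>>() with AVX2 is _Abi<4, 1, _VecMask | _CxIleav>.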
+
+#else
+
+  // scalar fallback
+  // TODO: add more targets
+  struct _ArchTraits
+  {
+    __UINT64_TYPE__ _M_flags = 0;
+
+    constexpr bool
+    _M_test(int __bit) const
+    { return ((_M_flags >> __bit) & 1) == 1; }
+  };
+
+  template <typename _Tp>
+    consteval auto
+    __native_abi()
+    {
+      if constexpr (not __vectorizable<_Tp>)
+        return _InvalidAbi();
+      else
+        return _ScalarAbi<1>();
+    }
+
+#endif
+
+  /** @internal
+   * You must use this type as a template argument to function templates that are not declared
+   * always_inline (to avoid issues when linking code compiled with different compiler flags).
+   */
+  struct _TargetTraits
+  : _ArchTraits, _OptTraits
+  {};
+
+  /** @internal
+   * Alias for an ABI tag such that basic_vec<_Tp, __native_abi_t<_Tp>> stores one SIMD register of
+   * optimal width.
+   *
+   * @tparam _Tp  A vectorizable type.
+   *
+   * C++26 [simd.expos.abi]
+   */
+  template <typename _Tp>
+    using __native_abi_t = decltype(__native_abi<_Tp>());
+
+  template <typename _Tp, int _Np, _TargetTraits _Target = {}>
+    consteval auto
+    __deduce_abi()
+    {
+      constexpr auto __native = __native_abi<_Tp>();
+      if constexpr (0 == __native._S_size or _Np <= 0)
+        return _InvalidAbi();
+      else if constexpr (_Np == __native._S_size)
+        return __native;
+      else
+        return __native.template _M_resize<_Np>();
+    }
+
+  /** @internal
+   * Alias for an ABI tag @c A such that <tt>basic_vec<_Tp, A></tt> stores @p _Np elements.
+   *
+   * C++26 [simd.expos.abi]
+   */
+  template <typename _Tp, int _Np>
+    using __deduce_abi_t = decltype(__deduce_abi<_Tp, _Np>());
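+
+  // Illustrative (with AVX2 as the baseline): __deduce_abi_t<float, 4> is the native
+  // _Abi<8, 1, _VecMask> resized via _M_resize<4>, i.e. _Abi<4, 1, _VecMask>, and
+  // __deduce_abi_t<float, 1> collapses to _ScalarAbi<1>.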
+
+  /** @internal
+   * @c rebind implementation detail for basic_vec, and for basic_mask when the destination
+   * value-type is known.
+   */
+  template <typename _Tp, int _Np, __abi_tag _A0, _ArchTraits = {}>
+    consteval auto
+    __abi_rebind()
+    {
+      if constexpr (_Np <= 0 or not __vectorizable<_Tp>)
+        return _InvalidAbi();
+      else
+        {
+          constexpr auto __native = __native_abi<_Tp>();
+          static_assert(0 != __native._S_size);
+          constexpr int __nreg = __div_ceil(_Np, __native._S_size);
+
+          if constexpr (is_same_v<_A0, _ScalarAbi<_A0::_S_size>>)
+            return __deduce_abi<_Tp, _Np>();
+
+          else if constexpr (__complex_like<_Tp>
+                               and __flags_test(_A0::_S_variant, _AbiVariant::_CxCtgus)
+                               and __flags_test(__native._S_variant, _AbiVariant::_CxIleav))
+            // we need half the number of registers since the number applies twice, to reals and
+            // imaginaries.
+            return _Abi<_Np, __nreg / 2, _A0::_S_variant>();
+
+          else if constexpr (__complex_like<_Tp>
+                               and __flags_test(_A0::_S_variant, _AbiVariant::_CxIleav)
+                               and __flags_test(__native._S_variant, _AbiVariant::_CxCtgus))
+            return _Abi<_Np, __nreg * 2, _A0::_S_variant>();
+
+          else if constexpr (__complex_like<_Tp>)
+            return _Abi<_Np, __nreg, _A0::_S_variant | _AbiVariant::_CxIleav>();
+
+          else if constexpr (_Np == __nreg)
+            return _ScalarAbi<_Np>();
+
+          else
+            return _Abi<_Np, __nreg, _A0::_S_variant & _AbiVariant::_MaskVariants>();
+        }
+    }
+
+  /** @internal
+   * @c rebind implementation detail for basic_mask.
+   *
+   * The important difference here is that we have no information about the actual value-type other
+   * than its @c sizeof. So <tt>_Bytes == 8</tt> could mean <tt>complex<float></tt>, @c double, or
+   * @c int64_t. E.g., <tt>_Np == 4</tt> with AVX but without AVX2 could be <tt>vector(4) int</tt>,
+   * <tt>vector(4) long long</tt>, or <tt>2x vector(2) long long</tt>.
+   * That's why this overload has the additional @p _IsOnlyResize parameter, which tells us that the
+   * value-type doesn't change.
+   */
+  template <size_t _Bytes, int _Np, __abi_tag _A0, bool _IsOnlyResize, _ArchTraits _Traits = {}>
+    consteval auto
+    __abi_rebind()
+    {
+      constexpr bool __from_cx = __flags_test(_A0::_S_variant, _AbiVariant::_CxCtgus)
+                                   or __flags_test(_A0::_S_variant, _AbiVariant::_CxIleav);
+
+      if constexpr (_Bytes == 0 or _Np <= 0)
+        return _InvalidAbi();
+
+      // If _Bytes is sizeof(complex<double>) we can be certain it's a mask<complex<double>, _Np>.
+      else if constexpr (_Bytes == sizeof(double) * 2)
+        return __abi_rebind<complex<double>, _Np, _A0>();
+
+      else if constexpr (is_same_v<_A0, _ScalarAbi<_A0::_S_size>>)
+        {
+          if constexpr (_IsOnlyResize)
+            // stick to _ScalarAbi (likely _Float16 without hardware support)
+            return _ScalarAbi<_Np>();
+          else
+            // otherwise, fresh start via __deduce_abi_t using __integer_from
+            return __deduce_abi<__integer_from<_Bytes>, _Np>();
+        }
+
+      // If the source ABI is complex, _Bytes == sizeof(complex<float>) or
+      // sizeof(complex<float16_t>), and _IsOnlyResize is true, then it's a mask<complex<float>,
+      // _Np>
+      else if constexpr (__from_cx and _IsOnlyResize and _Bytes == 2 * sizeof(float))
+        return __abi_rebind<complex<float>, _Np, _A0>();
+      else if constexpr (__from_cx and _IsOnlyResize and _Bytes == 2 * sizeof(_Float16))
+        return __abi_rebind<complex<_Float16>, _Np, _A0>();
+
+#if _GLIBCXX_X86
+      // AVX w/o AVX2:
+      // e.g. resize_t<8, mask<float, Whatever>> needs to be _Abi<8, 1> not _Abi<8, 2>
+      // We determine whether _A0 identifies an AVX vector by looking at the size of a native
+      // register. If it's 32, it's a YMM register, otherwise it's 16 or less.
+      else if constexpr (_IsOnlyResize
+                           and _Traits._M_have_avx() and not _Traits._M_have_avx2()
+                           and __bit_ceil(__div_ceil<unsigned>(
+                                            _A0::_S_size, _A0::_S_nreg)) * _Bytes == 32)
+        {
+          if constexpr (_Bytes == sizeof(double))
+            return __abi_rebind<double, _Np, _A0>();
+          else if constexpr (_Bytes == sizeof(float))
+            return __abi_rebind<float, _Np, _A0>();
+          else if constexpr (_Traits._M_have_f16c() and _Bytes == sizeof(_Float16))
+            return __abi_rebind<_Float16, _Np, _A0>();
+          else // impossible
+            static_assert(false);
+        }
+#endif
+
+      else
+        return __abi_rebind<__integer_from<_Bytes>, _Np, _A0>();
+    }
+
+  /** @internal
+   * Returns true unless _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION is defined.
+   *
+   * On IvyBridge, (vec<float> == 0.f) == (rebind_t<int, vec<float>> == 0) does not compile. It does
+   * compile on basically every other target, though. This is due to the difference in ABI tag:
+   * _Abi<8, 1, 1> vs. _Abi<8, 2, 1>. I know how to define this function for libstdc++ to avoid
+   * interconvertible masks. The question is whether we can specify this in general for C++29.
+   */
+  template <typename _To, typename _From>
+    consteval bool
+    __is_mask_conversion_explicit(size_t __b0, size_t __b1)
+    {
+      constexpr int __n = _To::_S_size;
+      static_assert(__n == _From::_S_size);
+#ifndef _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION
+      /// C++26 [simd.mask.ctor] uses unconditional explicit
+      return true;
+#else
+      if (__b0 != __b1)
+        return true;
+
+      // everything is better than _ScalarAbi, except when converting to a single bool
+      if constexpr (is_same_v<_To, _ScalarAbi<__n>>)
+        return __n > 1;
+      else if constexpr (is_same_v<_From, _ScalarAbi<__n>>)
+        return true;
+
+      else
+        {
+          constexpr _AbiVariant __f0 = _To::_S_variant;
+          constexpr _AbiVariant __f1 = _From::_S_variant;
+
+          // converting to a bit-mask is better
+          if constexpr ((__f0 & _AbiVariant::_MaskVariants) != (__f1 & _AbiVariant::_MaskVariants))
+            return __flags_test(__f0, _AbiVariant::_VecMask); // to _VecMask is explicit
+
+          // with vec-masks, fewer registers is better
+          else if constexpr (_From::_S_nreg != _To::_S_nreg)
+            return _From::_S_nreg < _To::_S_nreg;
+
+          // differ only on _Cx flags
+          // interleaved complex is worse
+          else if constexpr (__flags_test(__f0, _AbiVariant::_CxIleav))
+            return true;
+          else if constexpr (__flags_test(__f1, _AbiVariant::_CxIleav))
+            return false;
+
+          // prefer non-_Cx over _CxCtgus
+          else if constexpr (__flags_test(__f0, _AbiVariant::_CxCtgus))
+            return true;
+          else
+            __builtin_unreachable();
+        }
+#endif
+    }
+
+  /** @internal
+   * An alias for a signed integer type.
+   *
+   * libstdc++ unconditionally uses @c int here, since it matches the return type of
+   * 'Bit Operation Builtins' in GCC.
+   *
+   * C++26 [simd.expos.defn]
+   */
+  using __simd_size_type = int;
+
+  /** @internal
+   * The width of <tt>basic_vec<T, Abi></tt> if the specialization <tt>basic_vec<T, Abi></tt> is
+   * enabled, or @c 0 otherwise.
+   *
+   * C++26 [simd.expos.defn]
+   */
+  template <typename _Tp, typename _Abi>
+    constexpr __simd_size_type __simd_size_v = 0;
+
+  template <__vectorizable _Tp, __abi_tag _Abi>
+    constexpr __simd_size_type __simd_size_v<_Tp, _Abi> = _Abi::_S_size;
+
+  // integral_constant shortcut
+  template <__simd_size_type _Xp>
+    inline constexpr integral_constant<__simd_size_type, _Xp> __simd_size_constant = {};
+
+  // [simd.syn]
+  template <typename _Tp, typename _Abi = __native_abi_t<_Tp>>
+    class basic_vec;
+
+  template <typename _Tp, __simd_size_type _Np = __simd_size_v<_Tp, __native_abi_t<_Tp>>>
+    using vec = basic_vec<_Tp, __deduce_abi_t<_Tp, _Np>>;
+
+  template <size_t _Bytes, typename _Abi = __native_abi_t<__integer_from<_Bytes>>>
+    class basic_mask;
+
+  template <typename _Tp, __simd_size_type _Np = __simd_size_v<_Tp, __native_abi_t<_Tp>>>
+    using mask = basic_mask<sizeof(_Tp), __deduce_abi_t<_Tp, _Np>>;
+
+  /** @internal
+   * Satisfied if @p _Tp is a data-parallel type.
+   *
+   * C++26 [simd.general]
+   */
+  template <typename _Tp>
+    concept __data_parallel_type
+      = __vectorizable<typename _Tp::value_type>
+          and __abi_tag<typename _Tp::abi_type>
+          and _Tp::size() >= 1;
+
+  // [simd.ctor] load constructor constraints
+#ifdef __clang__
+  template <typename _Tp>
+    static constexpr remove_cvref_t<_Tp> __static_sized_range_obj = {};
+#endif
+
+  template <typename _Tp, size_t _Np = 0>
+    concept __static_sized_range
+      = ranges::contiguous_range<_Tp> and ranges::sized_range<_Tp>
+          and requires(_Tp&& __r) {
+#if 1 // PR117849
+            typename integral_constant<size_t, ranges::size(__r)>;
+#else
+            requires (decltype(std::span(__r))::extent != dynamic_extent);
+#endif
+#ifdef __clang__
+            requires (_Np == 0 or ranges::size(__static_sized_range_obj<_Tp>) == _Np);
+#else
+            requires (_Np == 0 or ranges::size(__r) == _Np);
+#endif
+          };
+
+  // [simd.general] value-preserving
+  template <typename _From, typename _To>
+    concept __arithmetic_only_value_preserving_convertible_to
+      = convertible_to<_From, _To> and is_arithmetic_v<_From> and is_arithmetic_v<_To>
+          and not (is_signed_v<_From> and is_unsigned_v<_To>)
+          and numeric_limits<_From>::digits <= numeric_limits<_To>::digits
+          and numeric_limits<_From>::max() <= numeric_limits<_To>::max()
+          and numeric_limits<_From>::lowest() >= numeric_limits<_To>::lowest();
+
+  /** @internal
+   * Satisfied if the conversion from @p _From to @p _To is a value-preserving conversion.
+   *
+   * C++26 [simd.general]
+   */
+  template <typename _From, typename _To>
+    concept __value_preserving_convertible_to
+      = __arithmetic_only_value_preserving_convertible_to<_From, _To>
+          or (__complex_like<_To> and __arithmetic_only_value_preserving_convertible_to<
+                                        _From, typename _To::value_type>);
+
+  /** @internal
+   * The value of the @c _Bytes template argument to a @c basic_mask specialization.
+   *
+   * C++26 [simd.expos.defn]
+   */
+  template <typename _Tp>
+    constexpr size_t __mask_element_size = 0;
+
+  /** @internal
+   * C++26 [simd.expos]
+   */
+  template<typename _Tp>
+    concept __constexpr_wrapper_like
+      = convertible_to<_Tp, decltype(_Tp::value)>
+          and equality_comparable_with<_Tp, decltype(_Tp::value)>
+          and bool_constant<_Tp() == _Tp::value>::value
+          and bool_constant<static_cast<decltype(_Tp::value)>(_Tp()) == _Tp::value>::value;
+
+  // [simd.ctor] explicit(...) of broadcast ctor
+  template <typename _From, typename _To>
+    concept __non_narrowing_constexpr_conversion
+      = __constexpr_wrapper_like<_From> and convertible_to<_From, _To>
+          and requires { { _From::value } -> std::convertible_to<_To>; }
+          and static_cast<decltype(_From::value)>(_To(_From::value)) == _From::value
+          and not (std::unsigned_integral<_To> and _From::value < decltype(_From::value)())
+          and _From::value <= std::numeric_limits<_To>::max()
+          and _From::value >= std::numeric_limits<_To>::lowest();
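+
+  // Illustrative: __non_narrowing_constexpr_conversion<integral_constant<int, 3>, unsigned char>
+  // is satisfied, whereas integral_constant<int, 500> or integral_constant<int, -1> as _From is
+  // rejected for a _To of unsigned char.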
+
+  // [simd.ctor] p4
+  template <typename _From, typename _To>
+    concept __broadcast_constructible
+      = convertible_to<_From, _To> // 4
+          and ((not is_arithmetic_v<remove_cvref_t<_From>>
+                  and not __constexpr_wrapper_like<remove_cvref_t<_From>>) // 4.1
+                 or __value_preserving_convertible_to<remove_cvref_t<_From>, _To> // 4.2
+                 or __non_narrowing_constexpr_conversion<remove_cvref_t<_From>, _To>); // 4.3
+
+  // __higher_floating_point_rank_than<_From, _To> (_From has floating-point rank higher than or
+  // equal to that of _To)
+  template <typename _From, typename _To>
+    concept __higher_floating_point_rank_than
+      = floating_point<_From> && floating_point<_To>
+          && same_as<common_type_t<_From, _To>, _From>;
+
+  // __higher_integer_rank_than<_From, _To> (_From has integer rank higher than or equal to that of
+  // _To)
+  template <typename _From, typename _To>
+    concept __higher_integer_rank_than
+      = integral<_From> && integral<_To>
+          && (sizeof(_From) > sizeof(_To) || same_as<common_type_t<_From, _To>, _From>);
+
+  template <typename _From, typename _To>
+    concept __higher_rank_than
+      = __higher_floating_point_rank_than<_From, _To> || __higher_integer_rank_than<_From, _To>;
+
+  struct __convert_flag;
+
+  template <typename _From, typename _To, typename... _Traits>
+    concept __loadstore_convertible_to
+      = same_as<_From, _To>
+          or (__vectorizable<_From> and __vectorizable<_To>
+                and (__value_preserving_convertible_to<_From, _To>
+                       or (std::convertible_to<_From, _To>
+                             and (std::same_as<_Traits, __convert_flag> or ...))));
+
+  template <typename _From, typename _To>
+    concept __simd_generator_convertible_to
+      = std::convertible_to<_From, _To>
+          and (not is_arithmetic_v<_From> or __value_preserving_convertible_to<_From, _To>);
+
+  template <typename _Fp, typename _Tp, __simd_size_type... _Is>
+    requires (__simd_generator_convertible_to<
+                decltype(declval<_Fp>()(__simd_size_constant<_Is>)), _Tp> and ...)
+    constexpr void
+    __simd_generator_invokable_impl(integer_sequence<__simd_size_type, _Is...>);
+
+  template <typename _Fp, typename _Tp, __simd_size_type _Np>
+    concept __simd_generator_invokable = requires {
+      __simd_generator_invokable_impl<_Fp, _Tp>(make_integer_sequence<__simd_size_type, _Np>());
+    };
+
+  template <typename _Fp, typename _Tp, __simd_size_type... _Is>
+    requires (not __simd_generator_convertible_to<
+                    decltype(declval<_Fp>()(__simd_size_constant<_Is>)), _Tp>
+                or ...)
+    constexpr void
+    __almost_simd_generator_invokable_impl(integer_sequence<__simd_size_type, _Is...>);
+
+  template <typename _Fp, typename _Tp, __simd_size_type _Np>
+    concept __almost_simd_generator_invokable = requires(_Fp&& __gen) {
+      __gen(__simd_size_constant<0>);
+      __almost_simd_generator_invokable_impl<_Fp, _Tp>(
+        make_integer_sequence<__simd_size_type, _Np>());
+    };
+
+  template <typename _Fp>
+    concept __index_permutation_function_nosize = requires(_Fp const& __f)
+      {
+        { __f(0) } -> std::integral;
+      };
+
+  template <typename _Fp, typename _Simd>
+    concept __index_permutation_function_size = requires(_Fp const& __f)
+      {
+        { __f(0, 0) } -> std::integral;
+      };
+
+  template <typename _Fp, typename _Simd>
+    concept __index_permutation_function
+      = __index_permutation_function_size<_Fp, _Simd> or __index_permutation_function_nosize<_Fp>;
+
+  // [simd.expos]
+  template <size_t _Bytes, __abi_tag _Abi>
+    constexpr size_t __mask_element_size<basic_mask<_Bytes, _Abi>> = _Bytes;
+
+  template <typename _Vp>
+    concept __simd_vec_type
+      = same_as<_Vp, basic_vec<typename _Vp::value_type, typename _Vp::abi_type>>
+          and is_default_constructible_v<_Vp>;
+
+  template <typename _Vp>
+    concept __simd_mask_type
+      = same_as<_Vp, basic_mask<__mask_element_size<_Vp>, typename _Vp::abi_type>>
+        and is_default_constructible_v<_Vp>;
+
+  template <typename _Vp>
+    concept __simd_vec_or_mask_type = __simd_vec_type<_Vp> or __simd_mask_type<_Vp>;
+
+  template <typename _Vp>
+    concept __simd_floating_point
+      = __simd_vec_type<_Vp> and floating_point<typename _Vp::value_type>;
+
+  template <typename _Vp>
+    concept __simd_integral
+      = __simd_vec_type<_Vp> and integral<typename _Vp::value_type>;
+
+  template <typename _Vp>
+    using __simd_complex_value_type = typename _Vp::value_type::value_type;
+
+  template <typename _Vp>
+    concept __simd_complex
+      = __simd_vec_type<_Vp> and __complex_like_impl<typename _Vp::value_type>;
+
+  template <typename _Tp>
+    using __deduced_vec_t
+      = decltype([] {
+          using _Up = decltype(declval<const _Tp&>() + declval<const _Tp&>());
+          if constexpr (__data_parallel_type<_Up>)
+            return _Up();
+      }());
+
+  static_assert(is_same_v<__deduced_vec_t<int>, void>);
+
+  template <typename _Vp, typename _Tp>
+    using __make_compatible_simd_t
+      = decltype([] {
+          using _Up = decltype(declval<const _Tp&>() + declval<const _Tp&>());
+          if constexpr (__simd_vec_type<_Up>)
+            return _Up();
+          else
+            return vec<_Up, _Vp::size()>();
+      }());
+
+  template <typename... _Ts>
+    concept __math_floating_point = (__simd_floating_point<__deduced_vec_t<_Ts>> or ...);
+
+  template <typename...>
+    struct __math_common_simd_impl;
+
+  template <typename... _Ts>
+    requires __math_floating_point<_Ts...>
+    using __math_common_simd_t = typename __math_common_simd_impl<_Ts...>::type;
+
+  template <typename _T0>
+    struct __math_common_simd_impl<_T0>
+    { using type = __deduced_vec_t<_T0>; };
+
+  template <typename _T0, typename _T1>
+    struct __math_common_simd_impl<_T0, _T1>
+    {
+      using type = decltype([] {
+                     if constexpr (__math_floating_point<_T0> and __math_floating_point<_T1>)
+                       return common_type_t<__deduced_vec_t<_T0>, __deduced_vec_t<_T1>>();
+                     else if constexpr (__math_floating_point<_T0>)
+                       return common_type_t<__deduced_vec_t<_T0>, _T1>();
+                     else if constexpr (__math_floating_point<_T1>)
+                       return common_type_t<_T0, __deduced_vec_t<_T1>>();
+                     // else void
+                   }());
+    };
+
+  template <typename _T0, typename _T1, typename... _TRest>
+    struct __math_common_simd_impl<_T0, _T1, _TRest...>
+    { using type = common_type_t<__math_common_simd_t<_T0, _T1>, _TRest...>; };
+
+  template <typename _T0, typename _T1, typename... _TRest>
+    requires (sizeof...(_TRest) > 0) and is_void_v<__math_common_simd_t<_T0, _T1>>
+    struct __math_common_simd_impl<_T0, _T1, _TRest...>
+    { using type = common_type_t<__math_common_simd_t<_TRest...>, _T0, _T1>; };
+
+  template <typename _BinaryOperation, typename _Tp>
+    concept __reduction_binary_operation
+      = requires (const _BinaryOperation __binary_op, const vec<_Tp, 1> __v) {
+        { __binary_op(__v, __v) } -> same_as<vec<_Tp, 1>>;
+      };
+
+  /** @internal
+   * Returns the lowest index @c i where <tt>(__bits >> i) & 1</tt> equals @c 1.
+   */
+  [[__gnu__::__always_inline__]]
+  constexpr __simd_size_type
+  __lowest_bit(std::integral auto __bits)
+  {
+    if constexpr (sizeof(__bits) <= sizeof(int))
+      return __builtin_ctz(__bits);
+    else if constexpr (sizeof(__bits) <= sizeof(long))
+      return __builtin_ctzl(__bits);
+    else if constexpr (sizeof(__bits) <= sizeof(long long))
+      return __builtin_ctzll(__bits);
+    else
+      static_assert(false);
+  }
+
+  /** @internal
+   * Returns the highest index @c i where <tt>(__bits >> i) & 1</tt> equals @c 1.
+   */
+  [[__gnu__::__always_inline__]]
+  constexpr __simd_size_type
+  __highest_bit(std::integral auto __bits)
+  {
+    if constexpr (sizeof(__bits) <= sizeof(int))
+      return sizeof(int) * __CHAR_BIT__ - 1 - __builtin_clz(__bits);
+    else if constexpr (sizeof(__bits) <= sizeof(long))
+      return sizeof(long) * __CHAR_BIT__ - 1 - __builtin_clzl(__bits);
+    else if constexpr (sizeof(__bits) <= sizeof(long long))
+      return sizeof(long long) * __CHAR_BIT__ - 1 - __builtin_clzll(__bits);
+    else
+      static_assert(false);
+  }
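+
+  // Illustrative: __lowest_bit(0b0110) is 1 and __highest_bit(0b0110) is 2. At least one bit must
+  // be set; __bits == 0 is undefined (ctz/clz of zero).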
+
+  template <__vectorizable _Tp, __simd_size_type _Np, __abi_tag _Ap>
+    using __similar_mask = basic_mask<sizeof(_Tp), decltype(__abi_rebind<_Tp, _Np, _Ap>())>;
+
+  // Allow _Tp to be _InvalidInteger for __integer_from<16>
+  template <typename _Tp, __simd_size_type _Np, __abi_tag _Ap>
+    using __similar_vec = basic_vec<_Tp, decltype(__abi_rebind<_Tp, _Np, _Ap>())>;
+
+  // LWG???? [simd.expos]
+  template <size_t _Bytes, typename _Ap>
+    using __simd_vec_from_mask_t = __similar_vec<__integer_from<_Bytes>, _Ap::_S_size, _Ap>;
+
+  template <typename _From, typename _To>
+    concept __simd_vec_bcast = constructible_from<_To, _From>;
+
+  /** @internal
+   * std::pair is not trivially copyable; this one is.
+   */
+  template <typename _T0, typename _T1>
+    struct __trivial_pair
+    {
+      _T0 _M_first;
+      _T1 _M_second;
+    };
+}
+
+#pragma GCC diagnostic pop
+#endif // C++26
+#endif // _GLIBCXX_SIMD_BASE_H
diff --git a/libstdc++-v3/include/bits/simd_x86.h b/libstdc++-v3/include/bits/simd_x86.h
new file mode 100644
index 00000000000..b04c2d04f92
--- /dev/null
+++ b/libstdc++-v3/include/bits/simd_x86.h
@@ -0,0 +1,953 @@
+/* SPDX-License-Identifier: GPL-3.0-or-later WITH GCC-exception-3.1 */
+/* Copyright © 2025      GSI Helmholtzzentrum fuer Schwerionenforschung GmbH
+ *                       Matthias Kretz <[email protected]>
+ */
+
+#ifndef _GLIBCXX_SIMD_X86_H
+#define _GLIBCXX_SIMD_X86_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include "vec_ops.h"
+
+#if not _GLIBCXX_X86
+#error "wrong include for this target"
+#endif
+
+#pragma GCC push_options
+// ensure GCC knows about the __builtin_ia32_* calls
+#pragma GCC target("avx2,bmi,bmi2,avx512vl,avx512bw,avx512dq,avx10.2")
+#pragma GCC pop_options
+
+// psabi warnings are bogus because the ABI of the internal types never leaks into user code
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpsabi"
+
+namespace std::simd
+{
+  static constexpr size_t __x86_max_general_register_size
+#ifdef __x86_64__
+    = 8;
+#else
+    = 4;
+#endif
+
+  /** \internal
+   * Return a bit-mask for the given vector-mask.
+   *
+   * Caveats:
+   * 1. The bit-mask of 2-Byte vector-masks has duplicated entries (because of a missing instruction)
+   * 2. The return type internally is 'int', but that fails on conversion to uint64 if the MSB of a
+   * YMM 1/2-Byte vector-mask is set (sign extension). Therefore these helper functions return
+   * unsigned instead.
+   * 3. ZMM inputs are not supported.
+   */
+  [[__gnu__::__always_inline__]]
+  inline unsigned
+  __x86_movmsk(__vec_builtin_type_bytes<__integer_from<8>, 16> __x)
+  { return __builtin_ia32_movmskpd(__vec_bit_cast<double>(__x)); }
+
+  [[__gnu__::__always_inline__]]
+  inline unsigned
+  __x86_movmsk(__vec_builtin_type_bytes<__integer_from<8>, 32> __x)
+  { return __builtin_ia32_movmskpd256(__vec_bit_cast<double>(__x)); }
+
+  [[__gnu__::__always_inline__]]
+  inline unsigned
+  __x86_movmsk(__vec_builtin_type_bytes<__integer_from<4>, 16> __x)
+  { return __builtin_ia32_movmskps(__vec_bit_cast<float>(__x)); }
+
+  template <_ArchTraits _Traits = {}>
+    [[__gnu__::__always_inline__]]
+    inline unsigned
+    __x86_movmsk(__vec_builtin_type_bytes<__integer_from<4>, 8> __x)
+    {
+#if __has_builtin(__builtin_ia32_pext_di)
+      if constexpr (_Traits._M_have_bmi2())
+        return __builtin_ia32_pext_di(__builtin_bit_cast(unsigned long long, __x),
+                                      0x80000000'80000000ULL);
+#endif
+      return __x86_movmsk(__vec_zero_pad_to_16(__x));
+    }
+
+  [[__gnu__::__always_inline__]]
+  inline unsigned
+  __x86_movmsk(__vec_builtin_type_bytes<__integer_from<4>, 32> __x)
+  { return __builtin_ia32_movmskps256(__vec_bit_cast<float>(__x)); }
+
+  template <__vec_builtin _TV, auto _Traits = _ArchTraits()>
+    requires (sizeof(__vec_value_type<_TV>) <= 2)
+    [[__gnu__::__always_inline__]]
+    inline unsigned
+    __x86_movmsk(_TV __x)
+    {
+      static_assert(__width_of<_TV> > 1);
+      if constexpr (sizeof(__x) == 32)
+        return __builtin_ia32_pmovmskb256(__vec_bit_cast<char>(__x));
+      else if constexpr (sizeof(__x) == 16)
+        return __builtin_ia32_pmovmskb128(__vec_bit_cast<char>(__x));
+      else if constexpr (sizeof(__x) == 8)
+        {
+#if __has_builtin(__builtin_ia32_pext_di)
+          if constexpr (_Traits._M_have_bmi2())
+            return __builtin_ia32_pext_di(__builtin_bit_cast(unsigned long long, __x),
+                                          0x8080'8080'8080'8080ULL);
+#endif
+          return __x86_movmsk(__vec_zero_pad_to_16(__x));
+        }
+      else if constexpr (sizeof(__x) == 4)
+        {
+#if __has_builtin(__builtin_ia32_pext_si)
+          if constexpr (_Traits._M_have_bmi2())
+            return __builtin_ia32_pext_si(__builtin_bit_cast(unsigned int, __x), 0x80808080u);
+#endif
+          return __x86_movmsk(__vec_zero_pad_to_16(__x));
+        }
+      else if constexpr (sizeof(__x) == 2)
+        {
+          auto __bits = __builtin_bit_cast(unsigned short, __x);
+#if __has_builtin(__builtin_ia32_pext_si)
+          if constexpr (_Traits._M_have_bmi2())
+            return __builtin_ia32_pext_si(__bits, 0x00008080u);
+#endif
+          return ((__bits >> 7) & 1) | ((__bits & 0x8000) >> 14);
+        }
+      else
+        static_assert(false);
+    }
+
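+  /** \internal
+   * Return true iff \p __a is all-zero. Note: the pre-SSE4.1 fallback only inspects the sign bits
+   * (via __x86_movmsk), so it assumes \p __a is a vector-mask (every element is 0 or -1).
+   */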
+  template <__vec_builtin _TV, _ArchTraits _Traits = {}>
+    [[__gnu__::__always_inline__]]
+    inline bool
+    __x86_vec_is_zero(_TV __a)
+    {
+      using _Tp = __vec_value_type<_TV>;
+      static_assert(is_integral_v<_Tp>);
+      if constexpr (sizeof(_TV) <= __x86_max_general_register_size)
+        return __builtin_bit_cast(__integer_from<sizeof(_TV)>, __a) == 0;
+      else if constexpr (_Traits._M_have_avx())
+        {
+          if constexpr (sizeof(_TV) == 32)
+            return __builtin_ia32_ptestz256(__vec_bit_cast<long long>(__a),
+                                            __vec_bit_cast<long long>(__a));
+          else if constexpr (sizeof(_TV) == 16)
+            return __builtin_ia32_ptestz128(__vec_bit_cast<long long>(__a),
+                                            __vec_bit_cast<long long>(__a));
+          else if constexpr (sizeof(_TV) < 16)
+            return __x86_vec_is_zero(__vec_zero_pad_to_16(__a));
+          else
+            static_assert(false);
+        }
+      else if constexpr (_Traits._M_have_sse4_1())
+        {
+          if constexpr (sizeof(_TV) == 16)
+            return __builtin_ia32_ptestz128(__vec_bit_cast<long long>(__a),
+                                            __vec_bit_cast<long long>(__a));
+          else if constexpr (sizeof(_TV) < 16)
+            return __x86_vec_is_zero(__vec_zero_pad_to_16(__a));
+          else
+            static_assert(false);
+        }
+      else
+        return __x86_movmsk(__a) == 0;
+    }
+
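+  // Wrapper around [v]ptest: returns non-zero iff (__a & __b) == 0 (the ZF result).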
+  template <__vec_builtin _TV, _ArchTraits _Traits = {}>
+    [[__gnu__::__always_inline__]]
+    inline int
+    __x86_vec_testz(_TV __a, _TV __b)
+    {
+      static_assert(sizeof(_TV) == 16 or sizeof(_TV) == 32);
+      static_assert(_Traits._M_have_sse4_1());
+      if constexpr (sizeof(_TV) == 32)
+        return __builtin_ia32_ptestz256(__vec_bit_cast<long long>(__a),
+                                        __vec_bit_cast<long long>(__b));
+      else
+        return __builtin_ia32_ptestz128(__vec_bit_cast<long long>(__a),
+                                        __vec_bit_cast<long long>(__b));
+    }
+
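+  // Wrapper around [v]ptest: returns non-zero iff (~__a & __b) == 0, i.e. the set bits of __b are
+  // a subset of the set bits of __a (the CF result).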
+  template <__vec_builtin _TV, _ArchTraits _Traits = {}>
+    [[__gnu__::__always_inline__]]
+    inline int
+    __x86_vec_testc(_TV __a, _TV __b)
+    {
+      static_assert(sizeof(_TV) == 16 or sizeof(_TV) == 32);
+      static_assert(_Traits._M_have_sse4_1());
+      if constexpr (sizeof(_TV) == 32)
+        return __builtin_ia32_ptestc256(__vec_bit_cast<long long>(__a),
+                                        __vec_bit_cast<long long>(__b));
+      else
+        return __builtin_ia32_ptestc128(__vec_bit_cast<long long>(__a),
+                                        __vec_bit_cast<long long>(__b));
+    }
+
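+  /** \internal
+   * __x86_vecmask_{all,any,none}: return whether all/any/none of the first _Np elements of the
+   * vector-mask \p __k are set (i.e. -1). Elements at indices >= _Np are ignored.
+   */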
+  template <int _Np, __vec_builtin _TV, _ArchTraits _Traits = {}>
+    [[__gnu__::__always_inline__]]
+    inline bool
+    __x86_vecmask_all(_TV __k)
+    {
+      using _Tp = __vec_value_type<_TV>;
+      static_assert(is_integral_v<_Tp> and is_signed_v<_Tp>);
+      constexpr int __width = __width_of<_TV>;
+      static_assert(sizeof(__k) <= 32);
+      if constexpr (_Np == __width)
+        {
+          if constexpr (sizeof(__k) <= __x86_max_general_register_size)
+            {
+              using _Ip = __integer_from<sizeof(__k)>;
+              return __builtin_bit_cast(_Ip, __k) == ~_Ip();
+            }
+          else if constexpr (not _Traits._M_have_sse4_1())
+            {
+              constexpr unsigned __valid_bits = (1u << (sizeof(_Tp) == 2 ? _Np * 2 : _Np)) - 1;
+              return __x86_movmsk(__k) == __valid_bits;
+            }
+          else if constexpr (sizeof(__k) < 16)
+            return __x86_vecmask_all<_Np>(__vec_zero_pad_to_16(__k));
+          else
+            return 0 != __x86_vec_testc(__k, ~_TV());
+        }
+      else if constexpr (sizeof(__k) <= __x86_max_general_register_size)
+        {
+          using _Ip = __integer_from<sizeof(__k)>;
+          constexpr _Ip __valid_bits = (_Ip(1) << (_Np * sizeof(_Tp) * __CHAR_BIT__)) - 1;
+          return (__builtin_bit_cast(_Ip, __k) & __valid_bits) == __valid_bits;
+        }
+      else if constexpr (not _Traits._M_have_sse4_1())
+        {
+          constexpr unsigned __valid_bits = (1u << (sizeof(_Tp) == 2 ? _Np * 2 : _Np)) - 1;
+          return (__x86_movmsk(__k) & __valid_bits) == __valid_bits;
+        }
+      else if constexpr (sizeof(__k) < 16)
+        return __x86_vecmask_all<_Np>(__vec_zero_pad_to_16(__k));
+      else
+        return 0 != __x86_vec_testc(__k, _S_vec_implicit_mask<_Np, _TV>);
+    }
+
+  template <int _Np, __vec_builtin _TV, _ArchTraits _Traits = {}>
+    [[__gnu__::__always_inline__]]
+    inline bool
+    __x86_vecmask_any(_TV __k)
+    {
+      using _Tp = __vec_value_type<_TV>;
+      static_assert(is_integral_v<_Tp> and is_signed_v<_Tp>);
+      constexpr int __width = __width_of<_TV>;
+      static_assert(sizeof(__k) <= 32);
+      if constexpr (_Np == __width)
+        return not __x86_vec_is_zero(__k);
+      else if constexpr (sizeof(__k) <= __x86_max_general_register_size)
+        {
+          using _Ip = __integer_from<sizeof(__k)>;
+          constexpr _Ip __valid_bits = (_Ip(1) << (_Np * sizeof(_Tp) * __CHAR_BIT__)) - 1;
+          return (__builtin_bit_cast(_Ip, __k) & __valid_bits) != _Ip();
+        }
+      else if constexpr (not _Traits._M_have_sse4_1())
+        {
+          constexpr unsigned __valid_bits = (1u << (sizeof(_Tp) == 2 ? _Np * 2 : _Np)) - 1;
+          return (__x86_movmsk(__k) & __valid_bits) != 0;
+        }
+      else if constexpr (sizeof(__k) < 16)
+        return __x86_vecmask_any<_Np>(__vec_zero_pad_to_16(__k));
+      else
+        return 0 == __x86_vec_testz(__k, _S_vec_implicit_mask<_Np, _TV>);
+    }
+
+  template <int _Np, __vec_builtin _TV, _ArchTraits _Traits = {}>
+    [[__gnu__::__always_inline__]]
+    inline bool
+    __x86_vecmask_none(_TV __k)
+    {
+      using _Tp = __vec_value_type<_TV>;
+      static_assert(is_integral_v<_Tp> and is_signed_v<_Tp>);
+      constexpr int __width = __width_of<_TV>;
+      static_assert(sizeof(__k) <= 32);
+      if constexpr (_Np == __width)
+        return __x86_vec_is_zero(__k);
+      else if constexpr (sizeof(__k) <= __x86_max_general_register_size)
+        {
+          using _Ip = __integer_from<sizeof(__k)>;
+          constexpr _Ip __valid_bits = (_Ip(1) << (_Np * sizeof(_Tp) * __CHAR_BIT__)) - 1;
+          return (__builtin_bit_cast(_Ip, __k) & __valid_bits) == _Ip();
+        }
+      else if constexpr (not _Traits._M_have_sse4_1())
+        {
+          constexpr unsigned __valid_bits = (1u << (sizeof(_Tp) == 2 ? _Np * 2 : _Np)) - 1;
+          return (__x86_movmsk(__k) & __valid_bits) == 0;
+        }
+      else if constexpr (sizeof(__k) < 16)
+        return __x86_vecmask_none<_Np>(__vec_zero_pad_to_16(__k));
+      else
+        return 0 != __x86_vec_testz(__k, _S_vec_implicit_mask<_Np, _TV>);
+    }
+
+  enum class _X86Cmp
+  {
+    _Eq = 0,
+    _Lt = 1,
+    _Le = 2,
+    _Unord = 3,
+    _Neq = 4,
+    _Nlt = 5,
+    _Nle = 6,
+  };
+
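+  /** \internal
+   * Element-wise comparison of \p __x and \p __y with predicate \p _Cmp, returning an AVX-512
+   * bit-mask (bit i is set iff the comparison holds for element i).
+   */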
+  template <_X86Cmp _Cmp, __vec_builtin _TV, _ArchTraits _Traits = {}>
+    requires is_floating_point_v<__vec_value_type<_TV>>
+    [[__gnu__::__always_inline__]]
+    inline auto
+    __x86_bitmask_cmp(_TV __x, _TV __y)
+    {
+      constexpr int __c = int(_Cmp);
+      using _Tp = __vec_value_type<_TV>;
+      if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 8)
+        return __builtin_ia32_cmppd512_mask(__x, __y, __c, -1, 4);
+      else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 4)
+        return __builtin_ia32_cmpps512_mask(__x, __y, __c, -1, 4);
+      else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 8)
+        return __builtin_ia32_cmppd256_mask(__x, __y, __c, -1);
+      else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 4)
+        return __builtin_ia32_cmpps256_mask(__x, __y, __c, -1);
+      else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 8)
+        return __builtin_ia32_cmppd128_mask(__x, __y, __c, -1);
+      else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 4)
+        return __builtin_ia32_cmpps128_mask(__x, __y, __c, -1);
+      else if constexpr (is_same_v<_Tp, _Float16>)
+        {
+          if constexpr (sizeof(_TV) == 64 and _Traits._M_have_avx512fp16())
+            return __builtin_ia32_cmpph512_mask(__x, __y, __c, -1);
+          else if constexpr (sizeof(_TV) == 32 and _Traits._M_have_avx512fp16())
+            return __builtin_ia32_cmpph256_mask(__x, __y, __c, -1);
+          else if constexpr (sizeof(_TV) == 16 and _Traits._M_have_avx512fp16())
+            return __builtin_ia32_cmpph128_mask(__x, __y, __c, -1);
+          else if constexpr (sizeof(_TV) < 16 and _Traits._M_have_avx512fp16())
+            return __x86_bitmask_cmp<_Cmp>(__vec_zero_pad_to_16(__x), __vec_zero_pad_to_16(__y));
+          else
+            {
+              // without AVX512_FP16, float16_t size needs to match float32_t size
+              // (cf. __native_abi())
+              static_assert(sizeof(_TV) <= 32);
+              return __x86_bitmask_cmp<_Cmp>(__vec_cast<float>(__x), __vec_cast<float>(__y));
+            }
+        }
+      else if constexpr (sizeof(_TV) < 16)
+        return __x86_bitmask_cmp<_Cmp>(__vec_zero_pad_to_16(__x), __vec_zero_pad_to_16(__y));
+      else
+        static_assert(false);
+    }
+
+  template <typename _Tp>
+    using __x86_intrin_int
+      = decltype([] {
+          if constexpr (sizeof(_Tp) == 1)
+            return char();
+          else
+            return __integer_from<sizeof(_Tp)>();
+        }());
+
+  template <_X86Cmp _Cmp, __vec_builtin _TV, _ArchTraits _Traits = {}>
+    requires is_integral_v<__vec_value_type<_TV>>
+    [[__gnu__::__always_inline__]]
+    inline auto
+    __x86_bitmask_cmp(_TV __x, _TV __y)
+    {
+      constexpr int __c = int(_Cmp);
+      using _Tp = __vec_value_type<_TV>;
+      if constexpr (sizeof(_TV) < 16)
+        return __x86_bitmask_cmp<_Cmp>(__vec_zero_pad_to_16(__x), __vec_zero_pad_to_16(__y));
+      else if constexpr (is_signed_v<_Tp>)
+        {
+          const auto __xi = __vec_bit_cast<__x86_intrin_int<_Tp>>(__x);
+          const auto __yi = __vec_bit_cast<__x86_intrin_int<_Tp>>(__y);
+          if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 8)
+            return __builtin_ia32_cmpq512_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 4)
+            return __builtin_ia32_cmpd512_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 2)
+            return __builtin_ia32_cmpw512_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 1)
+            return __builtin_ia32_cmpb512_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 8)
+            return __builtin_ia32_cmpq256_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 4)
+            return __builtin_ia32_cmpd256_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 2)
+            return __builtin_ia32_cmpw256_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 1)
+            return __builtin_ia32_cmpb256_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 8)
+            return __builtin_ia32_cmpq128_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 4)
+            return __builtin_ia32_cmpd128_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 2)
+            return __builtin_ia32_cmpw128_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 1)
+            return __builtin_ia32_cmpb128_mask(__xi, __yi, __c, -1);
+          else
+            static_assert(false);
+        }
+      else
+        {
+          const auto __xi = __vec_bit_cast<__x86_intrin_int<_Tp>>(__x);
+          const auto __yi = __vec_bit_cast<__x86_intrin_int<_Tp>>(__y);
+          if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 8)
+            return __builtin_ia32_ucmpq512_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 4)
+            return __builtin_ia32_ucmpd512_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 2)
+            return __builtin_ia32_ucmpw512_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 1)
+            return __builtin_ia32_ucmpb512_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 8)
+            return __builtin_ia32_ucmpq256_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 4)
+            return __builtin_ia32_ucmpd256_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 2)
+            return __builtin_ia32_ucmpw256_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 1)
+            return __builtin_ia32_ucmpb256_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 8)
+            return __builtin_ia32_ucmpq128_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 4)
+            return __builtin_ia32_ucmpd128_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 2)
+            return __builtin_ia32_ucmpw128_mask(__xi, __yi, __c, -1);
+          else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 1)
+            return __builtin_ia32_ucmpb128_mask(__xi, __yi, __c, -1);
+          else
+            static_assert(false);
+        }
+    }
+
+  template <__vec_builtin _TV, _ArchTraits _Traits = {}>
+    [[__gnu__::__always_inline__]]
+    inline auto
+    __x86_bitmask_isinf(_TV __x)
+    {
+      static_assert(_Traits._M_have_avx512dq());
+      using _Tp = __vec_value_type<_TV>;
+      static_assert(is_floating_point_v<_Tp>);
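+      // the immediate 0x18 selects +inf (bit 3, 0x08) and -inf (bit 4, 0x10) for vfpclassp[dsh]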
+      if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 8)
+        return __builtin_ia32_fpclasspd512_mask(__x, 0x18, -1);
+      else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 8)
+        return __builtin_ia32_fpclasspd256_mask(__x, 0x18, -1);
+      else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 8)
+        return __builtin_ia32_fpclasspd128_mask(__x, 0x18, -1);
+      else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 4)
+        return __builtin_ia32_fpclassps512_mask(__x, 0x18, -1);
+      else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 4)
+        return __builtin_ia32_fpclassps256_mask(__x, 0x18, -1);
+      else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 4)
+        return __builtin_ia32_fpclassps128_mask(__x, 0x18, -1);
+      else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 2 and _Traits._M_have_avx512fp16())
+        return __builtin_ia32_fpclassph512_mask(__x, 0x18, -1);
+      else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 2 and _Traits._M_have_avx512fp16())
+        return __builtin_ia32_fpclassph256_mask(__x, 0x18, -1);
+      else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 2 and _Traits._M_have_avx512fp16())
+        return __builtin_ia32_fpclassph128_mask(__x, 0x18, -1);
+      else if constexpr (sizeof(_Tp) == 2 and not _Traits._M_have_avx512fp16())
+        return __x86_bitmask_isinf(__vec_cast<float>(__x));
+      else if constexpr (sizeof(_TV) < 16)
+        return __x86_bitmask_isinf(__vec_zero_pad_to_16(__x));
+      else
+        static_assert(false);
+    }
+
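+  /** \internal
+   * Convert the bit-mask \p __bits into a vector-mask: element i of the result is -1 if bit i of
+   * \p __bits is set and 0 otherwise (AVX-512 vpmovm2* instructions).
+   */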
+  template <__vec_builtin _KV, _ArchTraits _Traits = {}>
+    [[__gnu__::__always_inline__]]
+    inline _KV
+    __x86_bit_to_vecmask(std::integral auto __bits)
+    {
+      using _Kp = __vec_value_type<_KV>;
+      static_assert((sizeof(__bits) * __CHAR_BIT__ == __width_of<_KV>)
+                      or (sizeof(__bits) == 1 and __CHAR_BIT__ > __width_of<_KV>));
+
+      if constexpr (sizeof(_Kp) == 1 and sizeof(_KV) == 64)
+        return __builtin_ia32_cvtmask2b512(__bits);
+      else if constexpr (sizeof(_Kp) == 1 and sizeof(_KV) == 32)
+        return __builtin_ia32_cvtmask2b256(__bits);
+      else if constexpr (sizeof(_Kp) == 1 and sizeof(_KV) == 16)
+        return __builtin_ia32_cvtmask2b128(__bits);
+      else if constexpr (sizeof(_Kp) == 1 and sizeof(_KV) <= 8)
+        return _VecOps<_KV>::_S_extract(__builtin_ia32_cvtmask2b128(__bits));
+
+      else if constexpr (sizeof(_Kp) == 2 and sizeof(_KV) == 64)
+        return __builtin_ia32_cvtmask2w512(__bits);
+      else if constexpr (sizeof(_Kp) == 2 and sizeof(_KV) == 32)
+        return __builtin_ia32_cvtmask2w256(__bits);
+      else if constexpr (sizeof(_Kp) == 2 and sizeof(_KV) == 16)
+        return __builtin_ia32_cvtmask2w128(__bits);
+      else if constexpr (sizeof(_Kp) == 2 and sizeof(_KV) <= 8)
+        return _VecOps<_KV>::_S_extract(__builtin_ia32_cvtmask2w128(__bits));
+
+      else if constexpr (sizeof(_Kp) == 4 and sizeof(_KV) == 64)
+        return __builtin_ia32_cvtmask2d512(__bits);
+      else if constexpr (sizeof(_Kp) == 4 and sizeof(_KV) == 32)
+        return __builtin_ia32_cvtmask2d256(__bits);
+      else if constexpr (sizeof(_Kp) == 4 and sizeof(_KV) <= 16)
+        return _VecOps<_KV>::_S_extract(__builtin_ia32_cvtmask2d128(__bits));
+
+      else if constexpr (sizeof(_Kp) == 8 and sizeof(_KV) == 64)
+        return __builtin_ia32_cvtmask2q512(__bits);
+      else if constexpr (sizeof(_Kp) == 8 and sizeof(_KV) == 32)
+        return __builtin_ia32_cvtmask2q256(__bits);
+      else if constexpr (sizeof(_Kp) == 8 and sizeof(_KV) == 16)
+        return __builtin_ia32_cvtmask2q128(__bits);
+
+      else
+        static_assert(false);
+    }
+
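+  /** \internal
+   * Blend \p __t and \p __f according to the bit-mask \p __k: element i of the result is __t[i]
+   * if bit i of \p __k is set and __f[i] otherwise (AVX-512 blendm instructions).
+   */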
+  template <unsigned_integral _Kp, __vec_builtin _TV, _ArchTraits _Traits = {}>
+    requires is_integral_v<__vec_value_type<_TV>>
+    [[__gnu__::__always_inline__]]
+    constexpr inline _TV
+    __x86_bitmask_blend(_Kp __k, _TV __t, _TV __f)
+    {
+      using _Tp = __vec_value_type<_TV>;
+      using _Ip = __x86_intrin_int<_Tp>;
+      if constexpr (not is_same_v<_Ip, _Tp>)
+        return reinterpret_cast<_TV>(__x86_bitmask_blend(__k, __vec_bit_cast<_Ip>(__t),
+                                                         __vec_bit_cast<_Ip>(__f)));
+      else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 8)
+        return __builtin_ia32_blendmq_512_mask (__f, __t, __k);
+      else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 4)
+        return __builtin_ia32_blendmd_512_mask (__f, __t, __k);
+      else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 2)
+        return __builtin_ia32_blendmw_512_mask (__f, __t, __k);
+      else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 1)
+        return __builtin_ia32_blendmb_512_mask (__f, __t, __k);
+      else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 8)
+        return __builtin_ia32_blendmq_256_mask (__f, __t, __k);
+      else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 4)
+        return __builtin_ia32_blendmd_256_mask (__f, __t, __k);
+      else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 2)
+        return __builtin_ia32_blendmw_256_mask (__f, __t, __k);
+      else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 1)
+        return __builtin_ia32_blendmb_256_mask (__f, __t, __k);
+      else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 8)
+        return __builtin_ia32_blendmq_128_mask (__f, __t, __k);
+      else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 4)
+        return __builtin_ia32_blendmd_128_mask (__f, __t, __k);
+      else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 2)
+        return __builtin_ia32_blendmw_128_mask (__f, __t, __k);
+      else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 1)
+        return __builtin_ia32_blendmb_128_mask (__f, __t, __k);
+      else if constexpr (sizeof(_TV) < 16)
+        return _VecOps<_TV>::_S_extract(__x86_bitmask_blend(__k, __vec_zero_pad_to_16(__t),
+                                                            __vec_zero_pad_to_16(__f)));
+      else
+        static_assert(false);
+    }
+
+  template <unsigned_integral _Kp, __vec_builtin _TV, _ArchTraits _Traits = {}>
+    requires is_floating_point_v<__vec_value_type<_TV>>
+    [[__gnu__::__always_inline__]]
+    constexpr inline _TV
+    __x86_bitmask_blend(_Kp __k, _TV __t, _TV __f)
+    {
+      using _Tp = __vec_value_type<_TV>;
+      if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 8)
+        return __builtin_ia32_blendmpd_512_mask (__f, __t, __k);
+      else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 4)
+        return __builtin_ia32_blendmps_512_mask (__f, __t, __k);
+      else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 8)
+        return __builtin_ia32_blendmpd_256_mask (__f, __t, __k);
+      else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 4)
+        return __builtin_ia32_blendmps_256_mask (__f, __t, __k);
+      else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 8)
+        return __builtin_ia32_blendmpd_128_mask (__f, __t, __k);
+      else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 4)
+        return __builtin_ia32_blendmps_128_mask (__f, __t, __k);
+      else if constexpr (is_same_v<_Tp, _Float16>)
+        {
+          using _Up = __integer_from<sizeof(_Tp)>;
+          return __vec_bit_cast<_Float16>(__x86_bitmask_blend(__k, __vec_bit_cast<_Up>(__t),
+                                                              __vec_bit_cast<_Up>(__f)));
+        }
+      else if constexpr (sizeof(_TV) < 16)
+        return _VecOps<_TV>::_S_extract(__x86_bitmask_blend(__k, __vec_zero_pad_to_16(__t),
+                                                            __vec_zero_pad_to_16(__f)));
+      else
+        static_assert(false);
+    }
+
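+  /** \internal
+   * Compress the bits at even positions of \p __x into the low bits of the result, dropping the
+   * bits at odd positions (e.g. 0b0100'0100 -> 0b1010). This is the inverse of
+   * __duplicate_each_bit and is used e.g. to compensate the bit duplication of __x86_movmsk for
+   * 2-byte vector-masks. Uses PEXT where BMI2 is available.
+   */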
+  template <int _OutputBits = 4, _ArchTraits _Traits = {}>
+    constexpr _Bitmask<1>
+    __bit_extract_even(_UInt<1> __x)
+    {
+      static_assert(_OutputBits <= 4);
+      constexpr _UInt<1> __mask = 0x55u >> ((4 - _OutputBits) * 2);
+#if __has_builtin(__builtin_ia32_pext_si)
+      if constexpr (_Traits._M_have_bmi2())
+        return __builtin_ia32_pext_si(__x, __mask);
+#endif
+      __x &= __mask;
+      __x |= __x >> 1;
+      __x &= 0x33u;
+      __x |= __x >> 2;
+      __x &= 0x0Fu;
+      return __x;
+    }
+
+  template <int _OutputBits = 8, _ArchTraits _Traits = {}>
+    constexpr _Bitmask<1>
+    __bit_extract_even(_UInt<2> __x)
+    {
+      if constexpr (_OutputBits <= 4)
+        return __bit_extract_even<_OutputBits>(_UInt<1>(__x));
+      else
+        {
+          static_assert(_OutputBits <= 8);
+          constexpr _UInt<2> __mask = 0x5555u >> ((8 - _OutputBits) * 2);
+#if __has_builtin(__builtin_ia32_pext_si)
+          if constexpr (_Traits._M_have_bmi2())
+            return __builtin_ia32_pext_si(__x, __mask);
+#endif
+          __x &= __mask;
+          __x |= __x >> 1;
+          __x &= 0x3333u;
+          __x |= __x >> 2;
+          __x &= 0x0F0Fu;
+          __x |= __x >> 4;
+          return __x;
+        }
+    }
+
+  template <int _OutputBits = 16, _ArchTraits _Traits = {}>
+    constexpr _Bitmask<_OutputBits>
+    __bit_extract_even(_UInt<4> __x)
+    {
+      if constexpr (_OutputBits <= 4)
+        return __bit_extract_even<_OutputBits>(_UInt<1>(__x));
+      else if constexpr (_OutputBits <= 8)
+        return __bit_extract_even<_OutputBits>(_UInt<2>(__x));
+      else
+        {
+          static_assert(_OutputBits <= 16);
+          constexpr _UInt<4> __mask = 0x5555'5555u >> ((16 - _OutputBits) * 2);
+#if __has_builtin(__builtin_ia32_pext_si)
+          if constexpr (_Traits._M_have_bmi2())
+            return __builtin_ia32_pext_si(__x, __mask);
+#endif
+          __x &= __mask;
+          __x |= __x >> 1;
+          __x &= 0x3333'3333u;
+          __x |= __x >> 2;
+          __x &= 0x0F0F'0F0Fu;
+          __x |= __x >> 4;
+          __x &= 0x00FF'00FFu;
+          __x |= __x >> 8;
+          return __x;
+        }
+    }
+
+  template <int _OutputBits = 32, _ArchTraits _Traits = {}>
+    constexpr _Bitmask<_OutputBits>
+    __bit_extract_even(_UInt<8> __x)
+    {
+      if constexpr (_OutputBits <= 4)
+        return __bit_extract_even<_OutputBits>(_UInt<1>(__x));
+      else if constexpr (_OutputBits <= 8)
+        return __bit_extract_even<_OutputBits>(_UInt<2>(__x));
+      else if constexpr (_OutputBits <= 16)
+        return __bit_extract_even<_OutputBits>(_UInt<4>(__x));
+      else
+        {
+          static_assert(_OutputBits <= 32);
+          constexpr _UInt<8> __mask = 0x5555'5555'5555'5555ull >> ((32 - _OutputBits) * 2);
+#if __has_builtin(__builtin_ia32_pext_si)
+          if constexpr (_Traits._M_have_bmi2())
+            {
+#if __has_builtin(__builtin_ia32_pext_di)
+              return __builtin_ia32_pext_di(__x, __mask);
+#else
+              return __builtin_ia32_pext_si(__x, static_cast<unsigned>(__mask))
+                       | (__builtin_ia32_pext_si(__x >> 32, __mask >> 32) << 16);
+#endif
+            }
+#endif
+          __x &= __mask;
+          __x |= __x >> 1;
+          __x &= 0x3333'3333'3333'3333ull;
+          __x |= __x >> 2;
+          __x &= 0x0F0F'0F0F'0F0F'0F0Full;
+          __x |= __x >> 4;
+          __x &= 0x00FF'00FF'00FF'00FFull;
+          __x |= __x >> 8;
+          __x &= 0x0000'FFFF'0000'FFFFull;
+          __x |= __x >> 16;
+          return __x;
+        }
+    }
+
+  // Duplicate every bit of the input: bit i appears at bits 2*i and 2*i+1 of the result
+  // (e.g. 0b1010 -> 0b1100'1100). Precondition: all input bits at positions >= _InputBits are 0.
+  template <int _InputBits = -1, _ArchTraits _Traits = {}>
+    constexpr auto
+    __duplicate_each_bit(unsigned_integral auto __x)
+    {
+      constexpr int __input_bits = _InputBits == -1 ? sizeof(__x) * __CHAR_BIT__ : _InputBits;
+      static_assert(__input_bits >= 1);
+      static_assert(sizeof(__x) * __CHAR_BIT__ >= __input_bits);
+      if constexpr (__input_bits <= 8)
+        {
+          constexpr _UInt<2> __mask = 0x5555u >> ((8 - __input_bits) * 2);
+          if constexpr (__input_bits == 1)
+            return _UInt<1>(__x * 3u);
+#if __has_builtin(__builtin_ia32_pdep_si)
+          else if constexpr (_Traits._M_have_bmi2())
+            return _Bitmask<__input_bits * 2>(3u * __builtin_ia32_pdep_si(__x, __mask));
+#endif
+          else if constexpr (__input_bits == 2) // 0000'00BA
+            return _UInt<1>(((__x + 0b0010u) & 0b0101u) * 3u); // 0B?A -> 0B0A -> BBAA
+          else if constexpr (__input_bits <= 4) // 0000'DCBA
+            {
+              __x = ((__x << 2) | __x ) & 0b0011'0011u; // 00DC'??BA -> 00DC'00BA
+              return _UInt<1>(((__x + 0b0010'0010u) & __mask) * 3u);     // -> DDCC'BBAA
+            }
+          else
+            { // HGFE'DCBA
+              _UInt<2> __y = ((__x << 4) | __x) & 0x0F0Fu; // HGFE'0000'DCBA
+              __y |= __y << 2; // 00HG'??FE'00DC'??BA
+              __y &= 0x3333u;  // 00HG'00FE'00DC'00BA
+              __y += 0x2222u;  // 0H?G'0F?E'0D?C'0B?A
+              return _UInt<2>((__y & __mask) * 3u); // HHGG'FFEE'DDCC'BBAA
+            }
+        }
+      else if constexpr (__input_bits <= 16)
+        {
+          constexpr _UInt<4> __mask = 0x5555'5555u >> ((16 - __input_bits) * 2);
+#if __has_builtin(__builtin_ia32_pdep_si)
+          if constexpr (_Traits._M_have_bmi2())
+            return 3u * __builtin_ia32_pdep_si(__x, __mask);
+#endif
+          _UInt<4> __y = ((__x << 8) | __x) & 0x00FF00FFu;
+          __y |= __y << 4;
+          __y &= 0x0F0F'0F0Fu;
+          __y |= __y << 2;
+          __y &= 0x3333'3333u;
+          return ((__y + 0x2222'2222u) & __mask) * 3;
+        }
+      else if constexpr (__input_bits <= 32)
+        {
+          constexpr _UInt<8> __mask = 0x5555'5555'5555'5555u >> ((32 - __input_bits) * 2);
+#if __has_builtin(__builtin_ia32_pdep_si)
+          if constexpr (_Traits._M_have_bmi2())
+            {
+#if __has_builtin(__builtin_ia32_pdep_di)
+              return 3ull * __builtin_ia32_pdep_di(__x, __mask);
+#else
+              const _UInt<8> __hi = 3 * __builtin_ia32_pdep_si(__x >> 16, __mask >> 32);
+              return (3u * __builtin_ia32_pdep_si(__x, static_cast<unsigned>(__mask))) | __hi << 32;
+#endif
+            }
+#endif
+          _UInt<8> __y = ((__x & 0xFFFF'0000ull) << 16) | (__x & 0x0000'FFFFu);
+          __y |= __y << 8;
+          __y &= 0x00FF'00FF'00FF'00FFull;
+          __y |= __y << 4;
+          __y &= 0x0F0F'0F0F'0F0F'0F0Full;
+          __y |= __y << 2;
+          __y &= 0x3333'3333'3333'3333ull;
+          return ((__y + 0x2222'2222'2222'2222ull) & __mask) * 3;
+        }
+      else
+        return __trivial_pair { __duplicate_each_bit(_UInt<4>(__x)),
+                                __duplicate_each_bit<__input_bits - 32>(
+                                  _Bitmask<__input_bits - 32>(__x >> 32)) };
+    }
+
+  template <int _InputBits = -1, typename _U0, typename _U1>
+    constexpr auto
+    __duplicate_each_bit(const __trivial_pair<_U0, _U1>& __x)
+    {
+      static_assert(_InputBits != -1 or is_unsigned_v<_U1>);
+      constexpr int __input_bits = _InputBits == -1 ? (sizeof(_U0) + sizeof(_U1)) * __CHAR_BIT__
+                                                    : _InputBits;
+      constexpr int __in0 = min(int(sizeof(_U0)) * __CHAR_BIT__, __input_bits);
+      constexpr int __in1 = __input_bits - __in0;
+      if constexpr (__in1 == 0)
+        return __duplicate_each_bit<__in0>(__x._M_first);
+      else
+        return __trivial_pair { __duplicate_each_bit<__in0>(__x._M_first),
+                                __duplicate_each_bit<__in1>(__x._M_second) };
+    }
+
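+  /** \internal
+   * Multiply interleaved complex numbers: \p __x and \p __y hold {re0, im0, re1, im1, ...}.
+   *
+   * With a = re(x), b = im(x), c = re(y), d = im(y) per complex element, the identity
+   *   (a + bi)(c + di) = (a*c - b*d) + (a*d + b*c)i
+   * is computed as fmaddsub(dup_even(x), y, dup_odd(x) * swap_neighbors(y)):
+   * even lanes yield a*c - b*d, odd lanes yield a*d + b*c.
+   */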
+  template <__vec_builtin _TV, _ArchTraits _Traits = {}>
+    [[__gnu__::__always_inline__]]
+    inline _TV
+    __x86_complex_multiplies(_TV __x, _TV __y)
+    {
+      using _Tp = __vec_value_type<_TV>;
+      using _VO = _VecOps<_TV>;
+
+      static_assert(_Traits._M_have_fma());
+      static_assert(is_floating_point_v<_Tp>);
+
+      if constexpr (not _Traits._M_have_avx512fp16() and sizeof(_Tp) == 2)
+        return __vec_cast<_Tp>(__x86_complex_multiplies(__vec_cast<float>(__x),
+                                                        __vec_cast<float>(__y)));
+      else if constexpr (sizeof(_TV) < 16)
+        return _VO::_S_extract(__x86_complex_multiplies(__vec_zero_pad_to_16(__x),
+                                                        __vec_zero_pad_to_16(__y)));
+
+      else
+        {
+          _TV __x_real = _VO::_S_dup_even(__x);
+          _TV __x_imag = _VO::_S_dup_odd(__x);
+          _TV __y_swapped = _VO::_S_swap_neighbors(__y);
+
+          if constexpr (sizeof(__x) == 16 and sizeof(_Tp) == 2)
+            return __builtin_ia32_vfmaddsubph128_mask(__x_real, __y, __x_imag * __y_swapped, -1);
+          else if constexpr (sizeof(__x) == 32 and sizeof(_Tp) == 2)
+            return __builtin_ia32_vfmaddsubph256_mask(__x_real, __y, __x_imag * __y_swapped, -1);
+          else if constexpr (sizeof(__x) == 64 and sizeof(_Tp) == 2)
+            return __builtin_ia32_vfmaddsubph512_mask(
+                     __x_real, __y, __x_imag * __y_swapped, -1, 0x04);
+
+          else if constexpr (sizeof(__x) == 16 and sizeof(_Tp) == 4)
+            return __builtin_ia32_vfmaddsubps(__x_real, __y, __x_imag * __y_swapped);
+          else if constexpr (sizeof(__x) == 32 and sizeof(_Tp) == 4)
+            return __builtin_ia32_vfmaddsubps256(__x_real, __y, __x_imag * __y_swapped);
+          else if constexpr (sizeof(__x) == 64 and sizeof(_Tp) == 4)
+            return __builtin_ia32_vfmaddsubps512_mask(
+                     __x_real, __y, __x_imag * __y_swapped, -1, 0x04);
+
+          else if constexpr (sizeof(__x) == 16 and sizeof(_Tp) == 8)
+            return __builtin_ia32_vfmaddsubpd(__x_real, __y, __x_imag * __y_swapped);
+          else if constexpr (sizeof(__x) == 32 and sizeof(_Tp) == 8)
+            return __builtin_ia32_vfmaddsubpd256(__x_real, __y, __x_imag * __y_swapped);
+          else if constexpr (sizeof(__x) == 64 and sizeof(_Tp) == 8)
+            return __builtin_ia32_vfmaddsubpd512_mask(
+                     __x_real, __y, __x_imag * __y_swapped, -1, 0x04);
+
+          else
+            static_assert(false);
+        }
+    }
+
+  // FIXME: Work around PR121688
+  template <__vec_builtin _UV, __vec_builtin _TV>
+    [[__gnu__::__always_inline__]]
+    inline _UV
+    __x86_cvt_f16c(_TV __v)
+    {
+      constexpr bool __from_f16 = is_same_v<__vec_value_type<_TV>, _Float16>;
+      constexpr bool __to_f16 = not __from_f16;
+      if constexpr (__to_f16 and not is_same_v<__vec_value_type<_TV>, float>)
+        return __x86_cvt_f16c<_UV>(__vec_cast<float>(__v));
+      else if constexpr (__from_f16 and not is_same_v<__vec_value_type<_UV>, float>)
+        return __vec_cast<_UV>(__x86_cvt_f16c<__vec_builtin_type<float, __width_of<_TV>>>(__v));
+      else if constexpr (__from_f16)
+        {
+          const auto __vi = __vec_bit_cast<__x86_intrin_int<_Float16>>(__v);
+          if constexpr (sizeof(_TV) == 4)
+            return __vec_split_lo(__builtin_ia32_vcvtph2ps(__vec_zero_pad_to_16(__vi)));
+          else if constexpr (sizeof(_TV) == 8)
+            return __builtin_ia32_vcvtph2ps(__vec_zero_pad_to_16(__vi));
+          else if constexpr (sizeof(_TV) == 16)
+            return __builtin_ia32_vcvtph2ps256(__vi);
+          else if constexpr (sizeof(_TV) == 32)
+            return __builtin_ia32_vcvtph2ps512_mask(__vi, __vec_builtin_type<float, 16>(), -1, 4);
+          else if constexpr (sizeof(_TV) >= 64)
+            return __vec_concat(__x86_cvt_f16c<__half_vec_builtin_t<_UV>>(__vec_split_lo(__v)),
+                                __x86_cvt_f16c<__half_vec_builtin_t<_UV>>(__vec_split_hi(__v)));
+          else
+            static_assert(false);
+        }
+      else if constexpr (sizeof(_TV) == 8)
+        return reinterpret_cast<_UV>(
+                 __vec_split_lo(__vec_split_lo(__builtin_ia32_vcvtps2ph(
+                                                 __vec_zero_pad_to_16(__v), 4))));
+      else if constexpr (sizeof(_TV) == 16)
+        return reinterpret_cast<_UV>(__vec_split_lo(__builtin_ia32_vcvtps2ph(__v, 4)));
+      else if constexpr (sizeof(_TV) == 32)
+        return reinterpret_cast<_UV>(__builtin_ia32_vcvtps2ph256(__v, 4));
+      else if constexpr (sizeof(_TV) == 64)
+        return reinterpret_cast<_UV>(__builtin_ia32_vcvtps2ph512_mask(
+                                       __v, 4, __vec_builtin_type<short, 16>(), -1));
+      else if constexpr (sizeof(_TV) >= 128)
+        return __vec_concat(__x86_cvt_f16c<__half_vec_builtin_t<_UV>>(__vec_split_lo(__v)),
+                            __x86_cvt_f16c<__half_vec_builtin_t<_UV>>(__vec_split_hi(__v)));
+      else
+        static_assert(false);
+    }
+
+  /** \internal
+   * AVX instructions typically work per 128-bit lane. Horizontal operations (e.g. packing) on
+   * 256-bit vectors therefore produce results whose two middle 64-bit quarters are swapped
+   * relative to the expected element order. This function swaps them back as a fix-up step.
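+   * E.g. with 64-bit quarters {a, b, c, d} the result is {a, c, b, d}.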
+   */
+  template <__vec_builtin _TV>
+    [[__gnu__::__always_inline__]]
+    inline _TV
+    __x86_swizzle4x64_acbd(_TV __x)
+    {
+      static_assert(sizeof(_TV) == 32);
+      using _UV = __vec_builtin_type_bytes<long long, 32>;
+      return reinterpret_cast<_TV>(__builtin_shufflevector(reinterpret_cast<_UV>(__x), _UV(),
+                                                           0, 2, 1, 3));
+    }
+
+  /** \internal
+   * Like __builtin_convertvector but with a precondition that input values are either 0 or -1.
+   */
+  template <__vec_builtin _To, __vec_builtin _From>
+    [[__gnu__::__always_inline__]]
+    inline _To
+    __x86_cvt_vecmask(_From __k)
+    {
+      using _T0 = __vec_value_type<_From>;
+      using _T1 = __vec_value_type<_To>;
+      if constexpr (sizeof(_From) > sizeof(_To) and sizeof(_From) < 16)
+        {
+          using _ToPadded = __vec_builtin_type_bytes<_T1, sizeof(_To) * 16 / sizeof(_From)>;
+          return _VecOps<_To>::_S_extract(__x86_cvt_vecmask<_ToPadded>(__vec_zero_pad_to_16(__k)));
+        }
+      else if constexpr (sizeof(_T0) == 2 and sizeof(_T1) == 1) // -> packsswb
+        {
+          if constexpr (sizeof(__k) == 16)
+            return reinterpret_cast<_To>(__vec_split_lo(__builtin_ia32_packsswb128(__k, __k)));
+          else if constexpr (sizeof(__k) == 32)
+            return reinterpret_cast<_To>(
+                     __vec_split_lo(__x86_swizzle4x64_acbd(
+                                      __builtin_ia32_packsswb256(__k, __k))));
+          else
+            static_assert(false);
+        }
+      else
+        static_assert(false, "TODO");
+    }
+
+  /** \internal
+   * Overload that concatenates \p __k0 and \p __k1 while converting.
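+   * The result holds the converted elements of \p __k0 followed by the converted elements of
+   * \p __k1.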
+   */
+  template <__vec_builtin _To, __vec_builtin _From>
+    [[__gnu__::__always_inline__]]
+    inline _To
+    __x86_cvt_vecmask(_From __k0, _From __k1)
+    {
+      using _T0 = __vec_value_type<_From>;
+      using _T1 = __vec_value_type<_To>;
+      static_assert(sizeof(_From) >= 16);
+      if constexpr (sizeof(_T0) == 2 and sizeof(_T1) == 1) // -> packsswb
+        {
+          if constexpr (sizeof(__k0) == 16)
+            return reinterpret_cast<_To>(__builtin_ia32_packsswb128(__k0, __k1));
+          else if constexpr (sizeof(__k0) == 32)
+            return reinterpret_cast<_To>(__x86_swizzle4x64_acbd(
+                                           __builtin_ia32_packsswb256(__k0, __k1)));
+          else
+            static_assert(false);
+        }
+      else
+        static_assert(false, "TODO");
+    }
+}
+
+#pragma GCC diagnostic pop
+#endif // C++26
+#endif // _GLIBCXX_SIMD_X86_H
diff --git a/libstdc++-v3/include/bits/vec_ops.h b/libstdc++-v3/include/bits/vec_ops.h
new file mode 100644
index 00000000000..3efb320b58a
--- /dev/null
+++ b/libstdc++-v3/include/bits/vec_ops.h
@@ -0,0 +1,592 @@
+/* SPDX-License-Identifier: GPL-3.0-or-later WITH GCC-exception-3.1 */
+/* Copyright © 2025      GSI Helmholtzzentrum fuer Schwerionenforschung GmbH
+ *                       Matthias Kretz <[email protected]>
+ */
+
+#ifndef _GLIBCXX_VEC_OPS_H
+#define _GLIBCXX_VEC_OPS_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include "simd_details.h"
+
+#include <bit>
+#include <bits/utility.h>
+
+// psabi warnings are bogus because the ABI of the internal types never leaks into user code
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpsabi"
+
+namespace std::simd
+{
+  template <std::signed_integral _Tp>
+    constexpr bool
+    __signed_has_single_bit(_Tp __x)
+    { return __has_single_bit(make_unsigned_t<_Tp>(__x)); }
+
+  /**
+   * Alias for a vector builtin with given value type and total sizeof.
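+   * E.g. __vec_builtin_type_bytes<float, 16> is a vector of four floats.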
+   */
+  template <__vectorizable _Tp, size_t _Bytes>
+    requires (__has_single_bit(_Bytes))
+    using __vec_builtin_type_bytes [[__gnu__::__vector_size__(_Bytes)]] = _Tp;
+
+  /**
+   * Alias for a vector builtin with given value type \p _Tp and \p _Width.
+   */
+  template <__vectorizable _Tp, __simd_size_type _Width>
+    requires (__signed_has_single_bit(_Width))
+    using __vec_builtin_type = __vec_builtin_type_bytes<_Tp, sizeof(_Tp) * _Width>;
+
+  /**
+   * Constrain to any vector builtin with given value type and optional width.
+   */
+  template <typename _Tp, typename _ValueType,
+            __simd_size_type _Width = sizeof(_Tp) / sizeof(_ValueType)>
+    concept __vec_builtin_of
+      = not is_arithmetic_v<_Tp> and __vectorizable<_ValueType>
+          and _Width >= 1 and sizeof(_Tp) / sizeof(_ValueType) == _Width
+          and same_as<__vec_builtin_type_bytes<_ValueType, sizeof(_Tp)>, _Tp>
+          and requires(_Tp& __v, _ValueType __x) { __v[0] = __x; };
+
+  /**
+   * Constrain to any vector builtin.
+   */
+  template <typename _Tp>
+    concept __vec_builtin
+      = not is_class_v<_Tp> and requires(const _Tp& __x) {
+        requires __vec_builtin_of<_Tp, remove_cvref_t<decltype(__x[0])>>;
+      };
+
+  /**
+   * Alias for the value type of the given __vec_builtin type \p _Tp.
+   */
+  template <__vec_builtin _Tp>
+    using __vec_value_type = remove_cvref_t<decltype(declval<const _Tp>()[0])>;
+
+  /**
+   * The width (number of value_type elements) of the given vector builtin or arithmetic type.
+   */
+  template <typename _Tp>
+    inline constexpr __simd_size_type __width_of = 1;
+
+  template <typename _Tp>
+    requires __vec_builtin<_Tp>
+    inline constexpr __simd_size_type __width_of<_Tp> = sizeof(_Tp) / sizeof(__vec_value_type<_Tp>);
+
+  /**
+   * Alias for a vector builtin with equal value type and new width \p _Np.
+   */
+  template <__simd_size_type _Np, __vec_builtin _TV>
+    using __resize_vec_builtin_t = __vec_builtin_type<__vec_value_type<_TV>, _Np>;
+
+  template <__vec_builtin _TV>
+    requires (__width_of<_TV> > 1)
+    using __half_vec_builtin_t = __resize_vec_builtin_t<__width_of<_TV> / 2, _TV>;
+
+  template <__vec_builtin _TV>
+    using __double_vec_builtin_t = __resize_vec_builtin_t<__width_of<_TV> * 2, _TV>;
+
+  template <typename _Up, __vec_builtin _TV>
+    [[__gnu__::__always_inline__]]
+    constexpr __vec_builtin_type_bytes<_Up, sizeof(_TV)>
+    __vec_bit_cast(_TV __v)
+    { return reinterpret_cast<__vec_builtin_type_bytes<_Up, sizeof(_TV)>>(__v); }
+
+  template <int _Np, __vec_builtin _TV>
+    requires signed_integral<__vec_value_type<_TV>>
+    static constexpr _TV _S_vec_implicit_mask = []<int... _Is> (integer_sequence<int, _Is...>) {
+      return _TV{ (_Is < _Np ? -1 : 0)... };
+    } (make_integer_sequence<int, __width_of<_TV>>());
+
+  /**
+   * Helper function to work around Clang not allowing v[i] in constant expressions.
+   */
+  template <__vec_builtin _TV>
+    [[__gnu__::__always_inline__]]
+    constexpr __vec_value_type<_TV>
+    __vec_get(_TV __v, int __i)
+    {
+#ifdef _GLIBCXX_CLANG
+      if (__builtin_is_constant_evaluated())
+        return __builtin_bit_cast(array<__vec_value_type<_TV>, __width_of<_TV>>, __v)[__i];
+      else
+#endif
+        return __v[__i];
+    }
+
+  /**
+   * Helper function to work around Clang and GCC not allowing assignment to v[i] in constant
+   * expressions.
+   */
+  template <__vec_builtin _TV>
+    [[__gnu__::__always_inline__]]
+    constexpr void
+    __vec_set(_TV& __v, int __i, __vec_value_type<_TV> __x)
+    {
+      if (__builtin_is_constant_evaluated())
+        {
+#ifdef _GLIBCXX_CLANG
+          auto __arr = __builtin_bit_cast(array<__vec_value_type<_TV>, __width_of<_TV>>, __v);
+          __arr[__i] = __x;
+          __v = __builtin_bit_cast(_TV, __arr);
+#else
+          constexpr auto [...__j] = __iota<int[__width_of<_TV>]>;
+          __v = _TV{(__i == __j ? __x : __v[__j])...};
+#endif
+        }
+      else
+        __v[__i] = __x;
+    }
+
+  /**
+   * Return a vector builtin holding the values of \p __a followed by the values of \p __b.
+   */
+  template <__vec_builtin _TV>
+    [[__gnu__::__always_inline__]]
+    constexpr __vec_builtin_type<__vec_value_type<_TV>, __width_of<_TV> * 2>
+    __vec_concat(_TV __a, _TV __b)
+    {
+      constexpr int _N0 = __width_of<_TV>;
+#ifdef _GLIBCXX_CLANG
+      using _RV = __vec_builtin_type<__vec_value_type<_TV>, _N0 * 2>;
+      if constexpr (_N0 == 1)
+        return _RV{__a[0], __b[0]};
+      else if constexpr (_N0 == 2)
+        return _RV{__a[0], __a[1], __b[0], __b[1]};
+      else if constexpr (_N0 == 4)
+        return _RV{__a[0], __a[1], __a[2], __a[3],
+                   __b[0], __b[1], __b[2], __b[3]};
+      else if constexpr (_N0 == 8)
+        return _RV{__a[0], __a[1], __a[2], __a[3], __a[4], __a[5], __a[6], __a[7],
+                   __b[0], __b[1], __b[2], __b[3], __b[4], __b[5], __b[6], __b[7]};
+      else if constexpr (_N0 == 16)
+        return _RV{__a[0], __a[1], __a[2], __a[3], __a[4], __a[5], __a[6], __a[7],
+                   __a[8], __a[9], __a[10], __a[11], __a[12], __a[13], __a[14], __a[15],
+                   __b[0], __b[1], __b[2], __b[3], __b[4], __b[5], __b[6], __b[7],
+                   __b[8], __b[9], __b[10], __b[11], __b[12], __b[13], __b[14], __b[15]};
+      else if constexpr (_N0 == 32)
+        return _RV{__a[0], __a[1], __a[2], __a[3], __a[4], __a[5], __a[6], __a[7],
+                   __a[8], __a[9], __a[10], __a[11], __a[12], __a[13], __a[14], __a[15],
+                   __a[16], __a[17], __a[18], __a[19], __a[20], __a[21], __a[22], __a[23],
+                   __a[24], __a[25], __a[26], __a[27], __a[28], __a[29], __a[30], __a[31],
+                   __b[0], __b[1], __b[2], __b[3], __b[4], __b[5], __b[6], __b[7],
+                   __b[8], __b[9], __b[10], __b[11], __b[12], __b[13], __b[14], __b[15],
+                   __b[16], __b[17], __b[18], __b[19], __b[20], __b[21], __b[22], __b[23],
+                   __b[24], __b[25], __b[26], __b[27], __b[28], __b[29], __b[30], __b[31]};
+      else
+        static_assert(false);
+#elif __has_builtin(__integer_pack)
+      return __builtin_shufflevector(__a, __b, __integer_pack(2 * _N0)...);
+#else
+#error "Neither Clang nor GCC?"
+#endif
+    }
+
+  template <int _N0, int _N1, int... _Ns, __vec_builtin _TV0, __vec_builtin _TV1,
+           __vec_builtin... _TVs>
+    [[__gnu__::__always_inline__]]
+    constexpr __vec_builtin_type<__vec_value_type<_TV0>,
+                                 __bit_ceil(unsigned(_N0 + (_N1 + ... + _Ns)))>
+    __vec_concat_sized(const _TV0& __a, const _TV1& __b, const _TVs&... __rest)
+    {
+      constexpr auto [...__is] = __iota<int[__bit_ceil(unsigned(_N0 + _N1))]>;
+      const auto __ab = __builtin_shufflevector(
+                          __a, __b, (__is < _N0 ? __is
+                                                : __is < _N0 + _N1 ? __is - _N0 + __width_of<_TV0>
+                                                                   : -1)...);
+      if constexpr (sizeof...(__rest) == 0)
+        return __ab;
+      else
+        return __vec_concat_sized<_N0 + _N1, _Ns...>(__ab, __rest...);
+    }
+
+  template <__vec_builtin _TV>
+    [[__gnu__::__always_inline__]]
+    constexpr __half_vec_builtin_t<_TV>
+    __vec_split_lo(_TV __v)
+    { return __builtin_shufflevector(__v, __v, __integer_pack(__width_of<_TV> / 2)...); }
+
+  template <__vec_builtin _TV>
+    [[__gnu__::__always_inline__]]
+    constexpr __half_vec_builtin_t<_TV>
+    __vec_split_hi(_TV __v)
+    {
+      constexpr int __n = __width_of<_TV> / 2;
+      constexpr auto [...__is] = __iota<int[__n]>;
+      return __half_vec_builtin_t<_TV> {__v[(__n + __is)]...};
+    }
+
+  /**
+   * Return a type with sizeof 16. If the input type is smaller, add zero-padding to \p __x.
+   */
+  template <__vec_builtin _TV>
+    [[__gnu__::__always_inline__]]
+    constexpr auto
+    __vec_zero_pad_to_16(_TV __x)
+    {
+      static_assert(sizeof(_TV) < 16);
+      using _Up = _UInt<sizeof(_TV)>;
+      __vec_builtin_type_bytes<_Up, 16> __tmp = {__builtin_bit_cast(_Up, __x)};
+      return __builtin_bit_cast(__vec_builtin_type_bytes<__vec_value_type<_TV>, 16>, __tmp);
+    }
+
+  /// Return \p __x zero-padded to \p _Bytes bytes.
+  template <size_t _Bytes, __vec_builtin _TV>
+    [[__gnu__::__always_inline__]]
+    constexpr auto
+    __vec_zero_pad_to(_TV __x)
+    {
+      static_assert(sizeof(_TV) <= _Bytes);
+      if constexpr (sizeof(_TV) == _Bytes)
+        return __x;
+      else
+        return __vec_zero_pad_to<_Bytes>(__vec_concat(__x, _TV()));
+    }
+
+#if _GLIBCXX_X86
+  template <__vec_builtin _UV, __vec_builtin _TV>
+    inline _UV
+    __x86_cvt_f16c(_TV __v);
+#endif
+
+  /** \internal
+   * Simple wrapper around __builtin_convertvector to provide static_cast-like syntax.
+   *
+   * Works around GCC failing to use the F16C/AVX512F cvtps2ph/cvtph2ps instructions.
+   */
+  template <__vec_builtin _UV, __vec_builtin _TV, _ArchTraits _Traits = {}>
+    [[__gnu__::__always_inline__]]
+    constexpr _UV
+    __vec_cast(_TV __v)
+    {
+      static_assert(__width_of<_UV> == __width_of<_TV>);
+#if _GLIBCXX_X86
+      constexpr bool __to_f16 = is_same_v<__vec_value_type<_UV>, _Float16>;
+      constexpr bool __from_f16 = is_same_v<__vec_value_type<_TV>, _Float16>;
+      constexpr bool __needs_f16c = _Traits._M_have_f16c() and not _Traits._M_have_avx512fp16()
+                                      and (__to_f16 or __from_f16);
+      if (__needs_f16c and not __builtin_is_constant_evaluated() and not __builtin_constant_p(__v))
+        { // Work around PR121688
+          if constexpr (__needs_f16c)
+            return __x86_cvt_f16c<_UV>(__v);
+        }
+#endif
+      return __builtin_convertvector(__v, _UV);
+    }
+
+  /** \internal
+   * Overload of the above cast function that determines the destination vector type from a given
+   * element type \p _Up and the `__width_of` the argument type.
+   *
+   * Calls the above overload.
+   */
+  template <__vectorizable _Up, __vec_builtin _TV>
+    [[__gnu__::__always_inline__]]
+    constexpr __vec_builtin_type<_Up, __width_of<_TV>>
+    __vec_cast(_TV __v)
+    { return __vec_cast<__vec_builtin_type<_Up, __width_of<_TV>>>(__v); }
+
+  /** \internal
+   * As above, but with additional precondition on possible values of the argument.
+   *
+   * Precondition: __k[i] is either 0 or -1 for all i.
+   */
+  template <__vec_builtin _UV, __vec_builtin _TV>
+    [[__gnu__::__always_inline__]]
+    constexpr _UV
+    __vec_mask_cast(_TV __k)
+    {
+      static_assert(signed_integral<__vec_value_type<_UV>>);
+      static_assert(signed_integral<__vec_value_type<_TV>>);
+      // TODO: __builtin_convertvector cannot be optimal here because it doesn't know that input
+      // and output elements can only be 0 or -1.
+      return __builtin_convertvector(__k, _UV);
+    }
+
+  template <__vec_builtin _TV>
+    [[__gnu__::__always_inline__]]
+    constexpr _TV
+    __vec_xor(_TV __a, _TV __b)
+    {
+      using _Tp = __vec_value_type<_TV>;
+      if constexpr (is_floating_point_v<_Tp>)
+        {
+          using _UV = __vec_builtin_type<__integer_from<sizeof(_Tp)>, __width_of<_TV>>;
+          return __builtin_bit_cast(
+                   _TV, __builtin_bit_cast(_UV, __a) ^ __builtin_bit_cast(_UV, __b));
+        }
+      else
+        return __a ^ __b;
+    }
+
+  template <__vec_builtin _TV>
+    [[__gnu__::__always_inline__]]
+    constexpr _TV
+    __vec_or(_TV __a, _TV __b)
+    {
+      using _Tp = __vec_value_type<_TV>;
+      if constexpr (is_floating_point_v<_Tp>)
+        {
+          using _UV = __vec_builtin_type<__integer_from<sizeof(_Tp)>, __width_of<_TV>>;
+          return __builtin_bit_cast(
+                   _TV, __builtin_bit_cast(_UV, __a) | __builtin_bit_cast(_UV, __b));
+        }
+      else
+        return __a | __b;
+    }
+
+  template <__vec_builtin _TV>
+    [[__gnu__::__always_inline__]]
+    constexpr _TV
+    __vec_and(_TV __a, _TV __b)
+    {
+      using _Tp = __vec_value_type<_TV>;
+      if constexpr (is_floating_point_v<_Tp>)
+        {
+          using _UV = __vec_builtin_type<__integer_from<sizeof(_Tp)>, __width_of<_TV>>;
+          return __builtin_bit_cast(
+                   _TV, __builtin_bit_cast(_UV, __a) & __builtin_bit_cast(_UV, __b));
+        }
+      else
+        return __a & __b;
+    }
+
+  template <__vec_builtin _TV>
+    [[__gnu__::__always_inline__]]
+    constexpr _TV
+    __vec_andnot(_TV __a, _TV __b)
+    {
+      using _Tp = __vec_value_type<_TV>;
+      using _UV = __vec_builtin_type<__integer_from<sizeof(_Tp)>, __width_of<_TV>>;
+      return __builtin_bit_cast(
+               _TV, ~__builtin_bit_cast(_UV, __a) & __builtin_bit_cast(_UV, __b));
+    }
+
+  template <__vec_builtin _TV>
+    [[__gnu__::__always_inline__]]
+    constexpr _TV
+    __vec_not(_TV __a)
+    {
+      using _UV = __vec_builtin_type_bytes<unsigned, sizeof(_TV)>;
+      if constexpr (is_floating_point_v<__vec_value_type<_TV>>)
+        return __builtin_bit_cast(_TV, ~__builtin_bit_cast(_UV, __a));
+      else
+        return ~__a;
+    }
+
+  /**
+   * An object of given type where only the sign bits are 1.
+   */
+  template <__vec_builtin _V>
+    requires std::floating_point<__vec_value_type<_V>>
+    constexpr _V _S_signmask = __vec_xor(_V() + 1, _V() - 1);
+
+  // work around __builtin_constant_p returning false unless passed a variable
+  // (__builtin_constant_p(x[0]) is false while __is_constprop(x[0]) is true)
+  [[__gnu__::__always_inline__]]
+  constexpr bool
+  __is_constprop(const auto& __x)
+  { return __builtin_is_constant_evaluated() or __builtin_constant_p(__x); }
+
+  [[__gnu__::__always_inline__]]
+  constexpr bool
+  __is_constprop(const __complex_like auto& __x)
+  {
+    return __builtin_is_constant_evaluated()
+             or (__is_constprop(__x.real()) and __is_constprop(__x.imag()));
+  }
+
+  [[__gnu__::__always_inline__]]
+  constexpr bool
+  __is_constprop_equal_to(const auto& __x, const auto& __expect)
+  { return (__builtin_is_constant_evaluated() or __builtin_constant_p(__x)) and __x == __expect; }
+
+  template <__vec_builtin _TV, int _Np = __width_of<_TV>,
+            typename = make_integer_sequence<int, _Np>>
+    struct _VecOps;
+
+  template <__vec_builtin _TV, int _Np, int... _Is>
+    struct _VecOps<_TV, _Np, integer_sequence<int, _Is...>>
+    {
+      static_assert(_Np <= __width_of<_TV>);
+
+      using _Tp = __vec_value_type<_TV>;
+
+      using _HV = __half_vec_builtin_t<conditional_t<_Np >= 2, _TV, __double_vec_builtin_t<_TV>>>;
+
+      [[__gnu__::__always_inline__]]
+      static constexpr _TV
+      _S_broadcast_to_even(_Tp __init)
+      { return _TV {((_Is & 1) == 0 ? __init : _Tp())...}; }
+
+      [[__gnu__::__always_inline__]]
+      static constexpr _TV
+      _S_broadcast_to_odd(_Tp __init)
+      { return _TV {((_Is & 1) == 1 ? __init : _Tp())...}; }
+
+      [[__gnu__::__always_inline__]]
+      static constexpr bool
+      _S_all_of(_TV __k) noexcept
+      { return (... and (__k[_Is] != 0)); }
+
+      [[__gnu__::__always_inline__]]
+      static constexpr bool
+      _S_any_of(_TV __k) noexcept
+      { return (... or (__k[_Is] != 0)); }
+
+      [[__gnu__::__always_inline__]]
+      static constexpr bool
+      _S_none_of(_TV __k) noexcept
+      { return (... and (__k[_Is] == 0)); }
+
+      template <typename _Offset = integral_constant<int, 0>>
+      [[__gnu__::__always_inline__]]
+      static constexpr _TV
+      _S_extract(__vec_builtin auto __x, _Offset = {})
+      {
+        static_assert(is_same_v<__vec_value_type<_TV>, __vec_value_type<decltype(__x)>>);
+        return __builtin_shufflevector(__x, decltype(__x)(), (_Is + _Offset::value)...);
+      }
+
+      // swap neighboring elements
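+      // e.g. {a, b, c, d} -> {b, a, d, c}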
+      [[__gnu__::__always_inline__]]
+      static constexpr _TV
+      _S_swap_neighbors(_TV __x)
+      { return __builtin_shufflevector(__x, __x, (_Is ^ 1)...); }
+
+      // duplicate even indexed elements, dropping the odd ones
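+      // e.g. {a, b, c, d} -> {a, a, c, c}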
+      [[__gnu__::__always_inline__]]
+      static constexpr _TV
+      _S_dup_even(_TV __x)
+      { return __builtin_shufflevector(__x, __x, (_Is & ~1)...); }
+
+      // duplicate odd indexed elements, dropping the even ones
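+      // e.g. {a, b, c, d} -> {b, b, d, d}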
+      [[__gnu__::__always_inline__]]
+      static constexpr _TV
+      _S_dup_odd(_TV __x)
+      { return __builtin_shufflevector(__x, __x, (_Is | 1)...); }
+
+      [[__gnu__::__always_inline__]]
+      static constexpr void
+      _S_overwrite_even_elements(_TV& __x, _HV __y) requires (_Np > 1)
+      {
+        constexpr __simd_size_type __n = __width_of<_TV>;
+        __x = __builtin_shufflevector(__x,
+#ifdef _GLIBCXX_CLANG
+                                      __vec_concat(__y, __y),
+#else
+                                      __y,
+#endif
+                                      ((_Is & 1) == 0 ? __n + _Is / 2 : _Is)...);
+      }
+
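+      // overwrite the even-indexed elements of the concatenation of __xl and __xh with the
+      // consecutive elements of __y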
+      [[__gnu__::__always_inline__]]
+      static constexpr void
+      _S_overwrite_even_elements(_TV& __xl, _TV& __xh, _TV __y)
+      {
+        constexpr __simd_size_type __nl = __width_of<_TV>;
+        constexpr __simd_size_type __nh = __nl * 3 / 2;
+        __xl = __builtin_shufflevector(__xl, __y, ((_Is & 1) == 0 ? __nl + _Is / 2 : _Is)...);
+        __xh = __builtin_shufflevector(__xh, __y, ((_Is & 1) == 0 ? __nh + _Is / 2 : _Is)...);
+      }
+
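+      // overwrite the odd-indexed elements of __x with the elements of the half-width vector __y
+      // (for Clang, __y is first widened to the width of __x by self-concatenation)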
+      [[__gnu__::__always_inline__]]
+      static constexpr void
+      _S_overwrite_odd_elements(_TV& __x, _HV __y) requires (_Np > 1)
+      {
+        constexpr __simd_size_type __n = __width_of<_TV>;
+        __x = __builtin_shufflevector(__x,
+#ifdef _GLIBCXX_CLANG
+                                      __vec_concat(__y, __y),
+#else
+                                      __y,
+#endif
+                                      ((_Is & 1) == 1 ? __n + _Is / 2 : _Is)...);
+      }
+
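+      // overwrite the odd-indexed elements of the concatenation of __xl and __xh with the
+      // consecutive elements of __y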
+      [[__gnu__::__always_inline__]]
+      static constexpr void
+      _S_overwrite_odd_elements(_TV& __xl, _TV& __xh, _TV __y)
+      {
+        constexpr __simd_size_type __nl = __width_of<_TV>;
+        constexpr __simd_size_type __nh = __nl * 3 / 2;
+        __xl = __builtin_shufflevector(__xl, __y, ((_Is & 1) == 1 ? __nl + _Is / 2 : _Is)...);
+        __xh = __builtin_shufflevector(__xh, __y, ((_Is & 1) == 1 ? __nh + _Is / 2 : _Is)...);
+      }
+
+      // negate every even element (real part of interleaved complex)
+      [[__gnu__::__always_inline__]]
+      static constexpr _TV
+      _S_complex_negate_real(_TV __x)
+      { return __vec_xor(_S_broadcast_to_even(_S_signmask<_TV>[0]), __x); }
+
+      // negate every odd element (imaginary part of interleaved complex)
+      [[__gnu__::__always_inline__]]
+      static constexpr _TV
+      _S_complex_negate_imag(_TV __x)
+      { return __vec_xor(_S_broadcast_to_odd(_S_signmask<_TV>[0]), __x); }
+
+      // Subtract elements with even index, add elements with odd index.
+      [[__gnu__::__always_inline__]]
+      static constexpr _TV
+      _S_addsub(_TV __x, _TV __y)
+      {
+#if 0
+        return __x + _S_complex_negate_real(__y);
+#else
+        // GCC recognizes this pattern as addsub
+        return __builtin_shufflevector(__x - __y, __x + __y,
+                                       (_Is + (_Is & 1) * __width_of<_TV>)...);
+#endif
+      }
+
+      // True iff all elements are known at compile time to be equal to __ref.
+      [[__gnu__::__always_inline__]]
+      static constexpr bool
+      _S_is_constprop_equal_to(_TV __x, _Tp __ref)
+      { return (__is_constprop_equal_to(__x[_Is], __ref) and ...); }
+
+      // True iff all elements at even indexes (the real parts of interleaved complex values) are
+      // known to be zero at compile time. Negative zero counts as zero only when -fno-signed-zeros
+      // is in effect.
+      template <_OptTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        static constexpr bool
+        _S_complex_real_is_constprop_zero(_TV __x)
+        {
+          if constexpr (_Traits._M_conforming_to_STDC_annex_G())
+            {
+              using _Up = _UInt<sizeof(_Tp)>;
+              return (((_Is & 1) == 1 or __is_constprop_equal_to(__builtin_bit_cast(_Up, __x[_Is]),
+                                                                 _Up())) and ...);
+            }
+          else
+            return (((_Is & 1) == 1 or __is_constprop_equal_to(__x[_Is], _Tp())) and ...);
+        }
+
+      // True iff all elements at odd indexes (the imaginary parts of interleaved complex values)
+      // are known to be zero at compile time. Negative zero counts as zero only when
+      // -fno-signed-zeros is in effect.
+      template <_OptTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        static constexpr bool
+        _S_complex_imag_is_constprop_zero(_TV __x)
+        {
+          if constexpr (_Traits._M_conforming_to_STDC_annex_G())
+            {
+              using _Up = _UInt<sizeof(_Tp)>;
+              return (((_Is & 1) == 0 or __is_constprop_equal_to(__builtin_bit_cast(_Up, __x[_Is]),
+                                                                 _Up())) and ...);
+            }
+          else
+            return (((_Is & 1) == 0 or __is_constprop_equal_to(__x[_Is], _Tp())) and ...);
+        }
+    };
+}
+
+#pragma GCC diagnostic pop
+#endif // C++26
+#endif // _GLIBCXX_VEC_OPS_H
