libstdc++-v3/ChangeLog:
* include/bits/simd_details.h: New file.
* include/bits/simd_x86.h: New file.
* include/bits/vec_ops.h: New file.
Signed-off-by: Matthias Kretz <[email protected]>
---
libstdc++-v3/include/bits/simd_details.h | 1443 ++++++++++++++++++++++
libstdc++-v3/include/bits/simd_x86.h | 953 ++++++++++++++
libstdc++-v3/include/bits/vec_ops.h | 592 +++++++++
3 files changed, 2988 insertions(+)
create mode 100644 libstdc++-v3/include/bits/simd_details.h
create mode 100644 libstdc++-v3/include/bits/simd_x86.h
create mode 100644 libstdc++-v3/include/bits/vec_ops.h
--
──────────────────────────────────────────────────────────────────────────
Dr. Matthias Kretz https://mattkretz.github.io
GSI Helmholtz Center for Heavy Ion Research https://gsi.de
std::simd
──────────────────────────────────────────────────────────────────────────
diff --git a/libstdc++-v3/include/bits/simd_details.h b/libstdc++-v3/include/bits/simd_details.h
new file mode 100644
index 00000000000..f9a793d3a18
--- /dev/null
+++ b/libstdc++-v3/include/bits/simd_details.h
@@ -0,0 +1,1443 @@
+/* SPDX-License-Identifier: GPL-3.0-or-later WITH GCC-exception-3.1 */
+/* Copyright © 2025 GSI Helmholtzzentrum fuer Schwerionenforschung GmbH
+ * Matthias Kretz <[email protected]>
+ */
+
+#ifndef _GLIBCXX_SIMD_BASE_H
+#define _GLIBCXX_SIMD_BASE_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include <bit>
+#include <concepts>
+#include <limits>
+#include <complex>
+
+#include <bits/c++config.h>
+#include <bits/ranges_base.h>
+#include <bits/utility.h> // integer_sequence, etc.
+
+#if __CHAR_BIT__ != 8
+// There are simply too many constants and bit operators that currently depend on CHAR_BIT == 8.
+// Generalization to CHAR_BIT != 8 does not make sense without testability (i.e. a test target).
+#error "<simd> is not supported for CHAR_BIT != 8"
+#endif
+
+// psabi warnings are bogus because the ABI of the internal types never leaks into user code
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpsabi"
+
+// Work around _GLIBCXX_CLANG not being defined with older libstdc++ when compiling with Clang
+#if __GLIBCXX__ < 20250922 and defined __clang__ and __GNUC_MINOR__ == 2 and not defined _GLIBCXX_CLANG
+#define _GLIBCXX_CLANG __clang__
+#endif
+
+#if defined __x86_64__ && !__SSE2__
+#error "Use of SSE2 is required on x86-64"
+#endif
+
+#if defined __x86_64__ or defined __i386__
+#define _GLIBCXX_X86 1
+#else
+#define _GLIBCXX_X86 0
+#endif
+
+#if !_GLIBCXX_X86
+#error "Not implemented yet. Only supported on x86 for now."
+#endif
+
+#ifndef _GLIBCXX_SIMD_NOEXCEPT
+/** @internal
+ * For unit-testing preconditions, use this macro to remove noexcept.
+ */
+#define _GLIBCXX_SIMD_NOEXCEPT noexcept
+#endif
+
+#if __cpp_deleted_function >= 202403L
+#define _GLIBCXX_DELETE_MSG(msg) delete(msg)
+#else
+#define _GLIBCXX_DELETE_MSG(msg) delete
+#endif
+
+#define _GLIBCXX_SIMD_TOSTRING_IMPL(x) #x
+#define _GLIBCXX_SIMD_TOSTRING(x) _GLIBCXX_SIMD_TOSTRING_IMPL(x)
+#define _GLIBCXX_SIMD_LOC __FILE__ ":" _GLIBCXX_SIMD_TOSTRING(__LINE__) ": "
+
+#if not IFNDR_SIMD_PRECONDITIONS
+#define __glibcxx_simd_precondition(expr, msg, ...) \
+ do { \
+ if (__builtin_expect(!bool(expr), false)) \
+ std::simd::__invoke_ub( \
+ _GLIBCXX_SIMD_LOC "precondition failure in '%s':\n" msg " ('" #expr "' does not hold)", \
+ __PRETTY_FUNCTION__ __VA_OPT__(,) __VA_ARGS__); \
+ } while(false)
+#else
+#define __glibcxx_simd_precondition(expr, msg, ...) \
+ do { \
+ const bool __precondition_result = !bool(expr); \
+ if (__builtin_constant_p(__precondition_result) && __precondition_result) \
+ []() __attribute__((__noinline__, __noipa__, __error__("precondition failure." \
+ "\n" _GLIBCXX_SIMD_LOC "note: " msg " (precondition '" #expr "' does not hold)"))) \
+ { __builtin_unreachable(); }(); \
+ else if (__builtin_expect(__precondition_result, false)) \
+ std::simd::__invoke_ub( \
+ _GLIBCXX_SIMD_LOC "precondition failure in '%s':\n" msg " ('" #expr "' does not hold)", \
+ __PRETTY_FUNCTION__ __VA_OPT__(,) __VA_ARGS__); \
+ } while(false)
+#endif
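+
+// Usage illustration only (hypothetical call, not part of this header): a check such as
+//   __glibcxx_simd_precondition(__i >= 0 and __i < _Np, "index %d is out of range", int(__i));
+// prints the message and aborts under _GLIBCXX_ASSERTIONS, traps if _GLIBCXX_SIMD_TRAP_ON_UB is
+// set, and is otherwise assumed unreachable (see __invoke_ub below).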
+
+namespace std::simd
+{
+ template <typename... _Args>
+ [[noreturn, __gnu__::__always_inline__]]
+ inline void
+ __invoke_ub([[maybe_unused]] const char* __msg, [[maybe_unused]] const _Args&... __args)
+ {
+#ifdef _GLIBCXX_ASSERTIONS
+ __builtin_fprintf(stderr, __msg, __args...);
+ __builtin_fprintf(stderr, "\n");
+ __builtin_abort();
+#elif _GLIBCXX_SIMD_TRAP_ON_UB
+ __builtin_trap();
+#else
+ __builtin_unreachable();
+#endif
+ }
+
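+ /** @internal
+ * Constant holding the sequence {0, 1, ..., _Np-1}; only the array form __iota<_Tp[_Np]> is
+ * specialized (the primary template is intentionally ill-formed).
+ * For illustration: __iota<int[4]> is the array {0, 1, 2, 3}.
+ */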
+ template <typename _Tp>
+ inline constexpr _Tp
+ __iota = [] { static_assert(false, "invalid __iota specialization"); }();
+
+#if __has_builtin(__integer_pack)
+ template <typename _Tp, std::size_t _Np>
+ inline constexpr type_identity_t<_Tp[_Np]>
+ __iota<_Tp[_Np]> = {__integer_pack(_Tp(_Np))...};
+#else
+ template<typename _Tp, typename>
+ struct __iota_array;
+
+ template<typename _Tp, _Tp... _Is>
+ struct __iota_array<_Tp, integer_sequence<_Tp, _Is...>>
+ { static constexpr _Tp _S_data[sizeof...(_Is)] = {_Is...}; };
+
+ template <typename _Tp, std::size_t _Np>
+ inline constexpr auto&
+ __iota<_Tp[_Np]> = __iota_array<_Tp, make_integer_sequence<_Tp, _Np>>::_S_data;
+#endif
+
+ // [simd.general] vectorizable types
+ template <typename _Cp, auto __re, auto __im, typename _Tp = typename _Cp::value_type>
+ constexpr _Cp __complex_object = _Cp {_Tp(__re), _Tp(__im)};
+
+ template <typename _Tp>
+ struct _Arr2
+ { _Tp _M_data[2]; };
+
+ template <typename _Tp>
+ concept __complex_like_impl
+ = requires(_Tp __x) {
+ typename _Tp::value_type;
+ { __x.real() } -> same_as<typename _Tp::value_type>;
+ { __x.imag() } -> same_as<typename _Tp::value_type>;
+ { real(__x) } -> same_as<typename _Tp::value_type>;
+ { imag(__x) } -> same_as<typename _Tp::value_type>;
+ { +__x } -> same_as<_Tp>;
+ { -__x } -> same_as<_Tp>;
+ { __x + __x } -> same_as<_Tp>;
+ { __x - __x } -> same_as<_Tp>;
+ { __x * __x } -> same_as<_Tp>;
+ { __x / __x } -> same_as<_Tp>;
+ { __x += __x } -> same_as<_Tp&>;
+ { __x -= __x } -> same_as<_Tp&>;
+ { __x *= __x } -> same_as<_Tp&>;
+ { __x /= __x } -> same_as<_Tp&>;
+ { abs(__x) } -> same_as<typename _Tp::value_type>;
+ { arg(__x) } -> same_as<typename _Tp::value_type>;
+ { norm(__x) } -> same_as<typename _Tp::value_type>;
+ { conj(__x) } -> same_as<_Tp>;
+ { proj(__x) } -> same_as<_Tp>;
+ }
+ and (__complex_object<_Tp, 1, 2> + _Tp {} == __complex_object<_Tp, 1, 2>)
+ and (__complex_object<_Tp, -1, 5> - __complex_object<_Tp, -1, 5> == _Tp {})
+ and (__complex_object<_Tp, 2, 3> * __complex_object<_Tp, 1, 1>
+ == __complex_object<_Tp, -1, 5>)
+ and (__complex_object<_Tp, 5, 5> / __complex_object<_Tp, 1, 2>
+ == __complex_object<_Tp, 3, -1>)
+ and (conj(__complex_object<_Tp, 5, 3>) == __complex_object<_Tp, 5, -3>)
+ // not constexpr: and (abs(__complex_object<_Tp, 3, 4>) == typename _Tp::value_type(5))
+ and (norm(__complex_object<_Tp, 5, 5>) == typename _Tp::value_type(50))
+ and (2 * sizeof(typename _Tp::value_type) == sizeof(_Tp))
+ and (__builtin_bit_cast(_Arr2<typename _Tp::value_type>, __complex_object<_Tp, 1, 2>)
+ ._M_data[0] == 1);
+
+ /** @internal
+ * Satisfied if @p _Tp implements the std::complex interface.
+ */
+ template <typename _Tp>
+ concept __complex_like = __complex_like_impl<remove_cvref_t<_Tp>>;
+
+ template <typename _Tp>
+ concept __vectorizable_scalar
+ = same_as<remove_cv_t<_Tp>, _Tp>
+ and ((integral<_Tp> and sizeof(_Tp) <= sizeof(0ULL) and not same_as<_Tp, bool>)
+ or (floating_point<_Tp> and sizeof(_Tp) <= sizeof(double)));
+
+ // [simd.general] p2
+ template <typename _Tp>
+ concept __vectorizable
+ = __vectorizable_scalar<_Tp>
+ or (__complex_like_impl<_Tp> and __vectorizable_scalar<typename _Tp::value_type>
+ and floating_point<typename _Tp::value_type>);
+
+ /** @internal
+ * Describes variants of _Abi.
+ */
+ enum _AbiVariant : unsigned long long
+ {
+ _VecMask = 1 << 0, // default uses vector masks
+ _BitMask = 1 << 1, // switch to bit-masks (AVX512)
+ _MaskVariants = _VecMask | _BitMask,
+ _CxIleav = 1 << 5, // store complex components interleaved (ririri...)
+ _CxCtgus = 1 << 6, // ... or store complex components contiguously (rrrr iiii)
+ _CxVariants = _CxIleav | _CxCtgus,
+ };
+
+ /** @internal
+ * Return true iff all bits of @p __x are set in @p __flags.
+ */
+ consteval bool
+ __flags_test(_AbiVariant __flags, _AbiVariant __x)
+ { return (__flags | __x) == __flags; }
+
+ /** @internal
+ * Type used whenever no valid integer/value type exists.
+ */
+ struct _InvalidInteger
+ {};
+
+ /** @internal
+ * Alias for a signed integer type T such that sizeof(T) equals _Bytes.
+ *
+ * C++26 [simd.expos.defn]
+ */
+ template <size_t _Bytes>
+ using __integer_from
+ = decltype([] {
+ if constexpr (sizeof(signed char) == _Bytes)
+ return static_cast<signed char>(0);
+ else if constexpr (sizeof(signed short) == _Bytes)
+ return static_cast<signed short>(0);
+ else if constexpr (sizeof(signed int) == _Bytes)
+ return static_cast<signed int>(0);
+ else if constexpr (sizeof(signed long long) == _Bytes)
+ return static_cast<signed long long>(0);
+ else
+ return _InvalidInteger();
+ }());
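+ // For illustration (assuming the usual ILP32/LP64 type sizes): __integer_from<4> is 'int',
+ // __integer_from<8> is 'signed long long', and sizes without a matching signed integer type
+ // (e.g. __integer_from<16>) yield _InvalidInteger.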
+
+ /** @internal
+ * Alias for an unsigned integer type T such that sizeof(T) equals _Bytes.
+ */
+ template <size_t _Bytes>
+ using _UInt = make_unsigned_t<__integer_from<_Bytes>>;
+
+ /** @internal
+ * Divide @p __x by @p __y while rounding up instead of down.
+ *
+ * Preconditions: __x >= 0 and __y > 0.
+ */
+ template <typename _Tp>
+ constexpr _Tp
+ __div_ceil(_Tp __x, _Tp __y)
+ { return (__x + __y - 1) / __y; }
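+ // e.g. __div_ceil(10, 4) == 3 and __div_ceil(8, 4) == 2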
+
+ /** @internal
+ * Alias for an unsigned integer type that can store at least @p _NBits bits.
+ */
+ template <int _NBits>
+ requires (_NBits > 0 and _NBits <= 64)
+ using _Bitmask = _UInt<__div_ceil(__bit_ceil(unsigned(_NBits)), unsigned(__CHAR_BIT__))>;
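+ // e.g. _Bitmask<3> and _Bitmask<8> are 1-byte unsigned integers, _Bitmask<9> is 2 bytes wide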
+
+ /** @internal
+ * Map a given type @p _Tp to an equivalent type.
+ *
+ * This helps reduce the number of branches and casts in the implementation as well as the
+ * number of template instantiations.
+ */
+ template <typename _Tp>
+ struct __canonical_vec_type
+ { using type = _Tp; };
+
+ template <typename _Tp>
+ using __canonical_vec_type_t = typename __canonical_vec_type<_Tp>::type;
+
+ template <std::same_as<long> _Tp>
+ requires (sizeof(_Tp) == sizeof(int))
+ struct __canonical_vec_type<_Tp>
+ { using type = int; };
+
+ template <std::same_as<long> _Tp>
+ requires (sizeof(_Tp) == sizeof(long long))
+ struct __canonical_vec_type<_Tp>
+ { using type = long long; };
+
+ template <std::same_as<unsigned long> _Tp>
+ requires (sizeof(_Tp) == sizeof(unsigned int))
+ struct __canonical_vec_type<_Tp>
+ { using type = unsigned int; };
+
+ template <std::same_as<unsigned long> _Tp>
+ requires (sizeof(_Tp) == sizeof(unsigned long long))
+ struct __canonical_vec_type<_Tp>
+ { using type = unsigned long long; };
+
+ template <typename _Tp>
+ requires std::is_enum_v<_Tp>
+ struct __canonical_vec_type<_Tp>
+ { using type = __canonical_vec_type<std::underlying_type_t<_Tp>>::type; };
+
+ template <>
+ struct __canonical_vec_type<char>
+ { using type = std::conditional_t<std::is_signed_v<char>, signed char, unsigned char>; };
+
+ template <>
+ struct __canonical_vec_type<char8_t>
+ { using type = unsigned char; };
+
+ template <>
+ struct __canonical_vec_type<char16_t>
+ { using type = uint_least16_t; };
+
+ template <>
+ struct __canonical_vec_type<char32_t>
+ { using type = uint_least32_t; };
+
+ template <>
+ struct __canonical_vec_type<wchar_t>
+ {
+ using type = std::conditional_t<std::is_signed_v<wchar_t>,
+ simd::__integer_from<sizeof(wchar_t)>,
+ simd::_UInt<sizeof(wchar_t)>>;
+ };
+
+ template <>
+ struct __canonical_vec_type<_Float64>
+ { using type = double; };
+
+ template <>
+ struct __canonical_vec_type<_Float32>
+ { using type = float; };
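+
+ // For illustration (assuming an LP64 target): __canonical_vec_type_t<long> is 'long long',
+ // __canonical_vec_type_t<char8_t> is 'unsigned char', and types without a specialization
+ // (e.g. 'int' or 'double') map to themselves.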
+
+ /** @internal
+ * This ABI tag describes basic_vec objects that store one element per data member and basic_mask
+ * objects that store one bool data member per element.
+ *
+ * @tparam _Np The number of elements, which also matches the number of data members in
+ * basic_vec and basic_mask.
+ */
+ template <int _Np = 1>
+ struct _ScalarAbi
+ {
+ static constexpr int _S_size = _Np;
+
+ static constexpr int _S_nreg = _Np;
+
+ static constexpr _AbiVariant _S_variant = {};
+
+ template <typename _Tp>
+ using _DataType = __canonical_vec_type_t<_Tp>;
+
+ static constexpr bool _S_is_cx_ileav = false;
+
+ template <size_t>
+ using _MaskDataType = bool;
+
+ template <int _N2, int _Nreg2 = _N2>
+ consteval _ScalarAbi<_N2>
+ _M_resize() const
+ {
+ static_assert(_N2 == _Nreg2);
+ return {};
+ }
+ };
+
+ /** @internal
+ * This ABI tag describes basic_vec objects that store one or more objects declared with the
+ * [[gnu::vector_size(N)]] attribute.
+ * Applied to basic_mask objects, this ABI tag either describes corresponding vector-mask objects
+ * or bit-mask objects. Which one is used is determined via @p _Var.
+ *
+ * @tparam _Np The number of elements.
+ * @tparam _Nreg The number of registers needed to store @p _Np elements.
+ * @tparam _Var Determines how complex value-types are laid out and whether mask types use
+ * bit-masks or vector-masks.
+ */
+ template <int _Np, int _Nreg, underlying_type_t<_AbiVariant> _Var
+#ifdef __AVX512F__
+ = _AbiVariant::_BitMask
+#else
+ = _AbiVariant::_VecMask
+#endif
+ >
+ struct _Abi
+ {
+ static constexpr int _S_size = _Np;
+
+ /** @internal
+ * The number of registers needed to represent one basic_vec for the element type that was
+ * used during ABI deduction.
+ *
+ * For _CxCtgus the value applies twice, once for the reals and once for the imaginaries.
+ *
+ * Examples:
+ * - '_Abi< 8, 2>' for 'int' is 2x 128-bit
+ * - '_Abi< 9, 3>' for 'int' is 2x 128-bit and 1x 32-bit
+ * - '_Abi<10, 3>' for 'int' is 2x 128-bit and 1x 64-bit
+ * - '_Abi<10, 1>' for 'int' is 1x 512-bit
+ * - '_Abi<10, 2>' for 'int' is 1x 256-bit and 1x 64-bit
+ * - '_Abi< 8, 2, _CxIleav>' for 'complex<float>' is 2x 256-bit
+ * - '_Abi< 9, 2, _CxIleav>' for 'complex<float>' is 1x 512-bit and 1x 64-bit
+ * - '_Abi< 8, 1, _CxCtgus>' for 'complex<float>' is 2x 256-bit
+ */
+ static constexpr int _S_nreg = _Nreg;
+
+ static constexpr _AbiVariant _S_variant = static_cast<_AbiVariant>(_Var);
+
+ template <typename _Tp>
+ using _DataType = decltype([] {
+ static_assert(_S_nreg == 1);
+ static_assert(not __flags_test(_S_variant, _AbiVariant::_CxIleav));
+ static_assert(not __flags_test(_S_variant, _AbiVariant::_CxCtgus));
+ constexpr int __n = __bit_ceil(unsigned(_S_size));
+ using _Vp [[__gnu__::__vector_size__(sizeof(_Tp) * __n)]]
+ = __canonical_vec_type_t<_Tp>;
+ return _Vp();
+ }());
+
+ static constexpr bool _S_is_cx_ileav = __flags_test(_S_variant, _AbiVariant::_CxIleav);
+
+ template <size_t _Bytes>
+ using _MaskDataType
+ = decltype([] {
+ static_assert(not _S_is_cx_ileav);
+ if constexpr (__flags_test(_S_variant, _AbiVariant::_BitMask))
+ {
+ if constexpr (_Nreg > 1)
+ return _InvalidInteger();
+ else
+ return _Bitmask<_S_size>();
+ }
+ else
+ {
+ constexpr unsigned __vbytes = _Bytes * __bit_ceil(unsigned(_S_size));
+ using _Vp [[__gnu__::__vector_size__(__vbytes)]] = __integer_from<_Bytes>;
+ return _Vp();
+ }
+ }());
+
+ template <int _N2, int _Nreg2 = __div_ceil(_N2, _S_size)>
+ consteval auto
+ _M_resize() const
+ {
+ if constexpr (_N2 == 1 and not __flags_test(_S_variant, _AbiVariant::_CxIleav))
+ return _ScalarAbi<1>();
+ else
+ return _Abi<_N2, _Nreg2, _Var>();
+ }
+ };
+
+ /** @internal
+ * This type is used whenever ABI tag deduction can't give a useful answer.
+ */
+ struct _InvalidAbi
+ { static constexpr int _S_size = 0; };
+
+ /** @internal
+ * Satisfied if @p _Tp is a valid simd ABI tag. This is a necessary but not sufficient condition
+ * for an enabled basic_vec/basic_mask specialization.
+ */
+ template <typename _Tp>
+ concept __abi_tag
+ = same_as<decltype(_Tp::_S_variant), const _AbiVariant>
+ and (_Tp::_S_size >= _Tp::_S_nreg) and (_Tp::_S_nreg >= 1)
+ and requires(_Tp __x) {
+ { __x.template _M_resize<_Tp::_S_size, _Tp::_S_nreg>() } -> same_as<_Tp>;
+ };
+
+ // Determine if math functions must *raise* floating-point exceptions.
+ // math_errhandling may expand to an extern symbol, in which case we must assume fp exceptions
+ // need to be considered.
+ template <int = 0>
+ requires requires { typename bool_constant<0 != (math_errhandling & MATH_ERREXCEPT)>; }
+ consteval bool
+ __handle_fpexcept_impl(int)
+ { return 0 != (math_errhandling & MATH_ERREXCEPT); }
+
+ // Fallback if math_errhandling doesn't work: implement correct exception behavior.
+ consteval bool
+ __handle_fpexcept_impl(float)
+ { return true; }
+
+ /** @internal
+ * This type can be used as a template parameter to avoid ODR violations where code needs to
+ * differ depending on optimization flags (mostly fp-math related).
+ */
+ struct _OptTraits
+ {
+ consteval bool
+ _M_test(int __bit) const
+ { return ((_M_build_flags >> __bit) & 1) == 1; }
+
+ // true iff floating-point operations can signal an exception (allow non-default handler)
+ consteval bool
+ _M_fp_may_signal() const
+ { return _M_test(0); }
+
+ // true iff floating-point operations can raise an exception flag
+ consteval bool
+ _M_fp_may_raise() const
+ { return _M_test(12); }
+
+ consteval bool
+ _M_fast_math() const
+ { return _M_test(1); }
+
+ consteval bool
+ _M_finite_math_only() const
+ { return _M_test(2); }
+
+ consteval bool
+ _M_no_signed_zeros() const
+ { return _M_test(3); }
+
+ consteval bool
+ _M_signed_zeros() const
+ { return not _M_test(3); }
+
+ consteval bool
+ _M_reciprocal_math() const
+ { return _M_test(4); }
+
+ consteval bool
+ _M_no_math_errno() const
+ { return _M_test(5); }
+
+ consteval bool
+ _M_math_errno() const
+ { return not _M_test(5); }
+
+ consteval bool
+ _M_associative_math() const
+ { return _M_test(6); }
+
+ consteval bool
+ _M_conforming_to_STDC_annex_G() const
+ { return _M_test(10) and not _M_finite_math_only(); }
+
+ consteval bool
+ _M_support_snan() const
+ { return _M_test(11); }
+
+ __UINT64_TYPE__ _M_build_flags
+ = 0
+#if not __NO_TRAPPING_MATH__
+ + (1 << 0)
+#endif
+ + (__handle_fpexcept_impl(0) << 12)
+#if __FAST_MATH__
+ + (1 << 1)
+#endif
+#if __FINITE_MATH_ONLY__
+ + (1 << 2)
+#endif
+#if __NO_SIGNED_ZEROS__
+ + (1 << 3)
+#endif
+#if __RECIPROCAL_MATH__
+ + (1 << 4)
+#endif
+#if __NO_MATH_ERRNO__
+ + (1 << 5)
+#endif
+#if __ASSOCIATIVE_MATH__
+ + (1 << 6)
+#endif
+ // bits 7, 8, and 9 reserved for __FLT_EVAL_METHOD__
+#if __FLT_EVAL_METHOD__ == 1
+ + (1 << 7)
+#elif __FLT_EVAL_METHOD__ == 2
+ + (2 << 7)
+#elif __FLT_EVAL_METHOD__ != 0
+ + (3 << 7)
+#endif
+
+ // C Annex G defines the behavior of complex<T> where T is IEC 60559 floating-point. If
+ // __STDC_IEC_60559_COMPLEX__ is defined, then Annex G is implemented, and simd<complex>
+ // will follow suit. However, Clang never defines the macro.
+#if defined __STDC_IEC_60559_COMPLEX__ or defined __STDC_IEC_559_COMPLEX__ or defined _GLIBCXX_CLANG
+ + (1 << 10)
+#endif
+#if __SUPPORT_SNAN__
+ + (1 << 11)
+#endif
+ ;
+ };
+
+ /** @internal
+ * Return true iff @p __s equals "1".
+ */
+ consteval bool
+ __streq_to_1(const char* __s)
+ { return __s != nullptr and __s[0] == '1' and __s[1] == '\0'; }
+
+ /** @internal
+ * If the macro given as @p feat is defined to 1, this expands to a bit set at position @p off.
+ * Otherwise, it expands to zero.
+ */
+#define _GLIBCXX_SIMD_ARCH_FLAG(off, feat) \
+ (static_cast<__UINT64_TYPE__>(std::simd::__streq_to_1(_GLIBCXX_SIMD_TOSTRING_IMPL(feat))) << off)
+
+#if _GLIBCXX_X86
+
+#define _GLIBCXX_SIMD_ARCH_TRAITS_INIT { \
+ _GLIBCXX_SIMD_ARCH_FLAG(0, __MMX__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG( 1, __SSE__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG( 2, __SSE2__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG( 3, __SSE3__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG( 4, __SSSE3__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG( 5, __SSE4_1__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG( 6, __SSE4_2__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG( 7, __POPCNT__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG( 8, __AVX__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG( 9, __F16C__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(10, __BMI__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(11, __BMI2__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(12, __LZCNT__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(13, __AVX2__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(14, __FMA__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(15, __AVX512F__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(16, __AVX512CD__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(17, __AVX512DQ__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(18, __AVX512BW__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(19, __AVX512VL__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(20, __AVX512BITALG__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(21, __AVX512VBMI__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(22, __AVX512VBMI2__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(23, __AVX512IFMA__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(24, __AVX512VNNI__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(25, __AVX512VPOPCNTDQ__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(26, __AVX512FP16__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(27, __AVX512BF16__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(28, __AVXIFMA__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(29, __AVXNECONVERT__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(30, __AVXVNNI__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(31, __AVXVNNIINT8__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(32, __AVXVNNIINT16__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(33, __AVX10_1__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(34, __AVX10_2__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(35, __AVX512VP2INTERSECT__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(36, __SSE4A__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(37, __FMA4__) \
+ | _GLIBCXX_SIMD_ARCH_FLAG(38, __XOP__) \
+ }
+ // Should this include __APX_F__? I don't think it's relevant for use in constexpr-if branches =>
+ // no ODR issue? The same could be said about several other flags above that are not checked
+ // anywhere.
+
+ struct _ArchTraits
+ {
+ __UINT64_TYPE__ _M_flags = _GLIBCXX_SIMD_ARCH_TRAITS_INIT;
+
+ consteval bool
+ _M_test(int __bit) const
+ { return ((_M_flags >> __bit) & 1) == 1; }
+
+ consteval bool
+ _M_have_mmx() const
+ { return _M_test(0); }
+
+ consteval bool
+ _M_have_sse() const
+ { return _M_test(1); }
+
+ consteval bool
+ _M_have_sse2() const
+ { return _M_test(2); }
+
+ consteval bool
+ _M_have_sse3() const
+ { return _M_test(3); }
+
+ consteval bool
+ _M_have_ssse3() const
+ { return _M_test(4); }
+
+ consteval bool
+ _M_have_sse4_1() const
+ { return _M_test(5); }
+
+ consteval bool
+ _M_have_sse4_2() const
+ { return _M_test(6); }
+
+ consteval bool
+ _M_have_popcnt() const
+ { return _M_test(7); }
+
+ consteval bool
+ _M_have_avx() const
+ { return _M_test(8); }
+
+ consteval bool
+ _M_have_f16c() const
+ { return _M_test(9); }
+
+ consteval bool
+ _M_have_bmi() const
+ { return _M_test(10); }
+
+ consteval bool
+ _M_have_bmi2() const
+ { return _M_test(11); }
+
+ consteval bool
+ _M_have_lzcnt() const
+ { return _M_test(12); }
+
+ consteval bool
+ _M_have_avx2() const
+ { return _M_test(13); }
+
+ consteval bool
+ _M_have_fma() const
+ { return _M_test(14); }
+
+ consteval bool
+ _M_have_avx512f() const
+ { return _M_test(15); }
+
+ consteval bool
+ _M_have_avx512cd() const
+ { return _M_test(16); }
+
+ consteval bool
+ _M_have_avx512dq() const
+ { return _M_test(17); }
+
+ consteval bool
+ _M_have_avx512bw() const
+ { return _M_test(18); }
+
+ consteval bool
+ _M_have_avx512vl() const
+ { return _M_test(19); }
+
+ consteval bool
+ _M_have_avx512bitalg() const
+ { return _M_test(20); }
+
+ consteval bool
+ _M_have_avx512vbmi() const
+ { return _M_test(21); }
+
+ consteval bool
+ _M_have_avx512vbmi2() const
+ { return _M_test(22); }
+
+ consteval bool
+ _M_have_avx512ifma() const
+ { return _M_test(23); }
+
+ consteval bool
+ _M_have_avx512vnni() const
+ { return _M_test(24); }
+
+ consteval bool
+ _M_have_avx512vpopcntdq() const
+ { return _M_test(25); }
+
+ consteval bool
+ _M_have_avx512fp16() const
+ { return _M_test(26); }
+
+ consteval bool
+ _M_have_avx512bf16() const
+ { return _M_test(27); }
+
+ consteval bool
+ _M_have_avxifma() const
+ { return _M_test(28); }
+
+ consteval bool
+ _M_have_avxneconvert() const
+ { return _M_test(29); }
+
+ consteval bool
+ _M_have_avxvnni() const
+ { return _M_test(30); }
+
+ consteval bool
+ _M_have_avxvnniint8() const
+ { return _M_test(31); }
+
+ consteval bool
+ _M_have_avxvnniint16() const
+ { return _M_test(32); }
+
+ consteval bool
+ _M_have_avx10_1() const
+ { return _M_test(33); }
+
+ consteval bool
+ _M_have_avx10_2() const
+ { return _M_test(34); }
+
+ consteval bool
+ _M_have_avx512vp2intersect() const
+ { return _M_test(35); }
+
+ consteval bool
+ _M_have_sse4a() const
+ { return _M_test(36); }
+
+ consteval bool
+ _M_have_fma4() const
+ { return _M_test(37); }
+
+ consteval bool
+ _M_have_xop() const
+ { return _M_test(38); }
+
+ template <typename _Tp>
+ consteval bool
+ _M_eval_as_f32() const
+ { return is_same_v<_Tp, _Float16> and not _M_have_avx512fp16(); }
+ };
+
+ template <typename _Tp, _ArchTraits _Traits = {}>
+ consteval auto
+ __native_abi()
+ {
+ constexpr int __adj_sizeof = sizeof(_Tp) * (1 + is_same_v<_Tp, _Float16>);
+ if constexpr (not __vectorizable<_Tp>)
+ return _InvalidAbi();
+ else if constexpr (__complex_like<_Tp>)
+ {
+ constexpr auto __underlying = __native_abi<typename _Tp::value_type>();
+ if constexpr (__underlying._S_size == 1)
+ return _ScalarAbi<1>();
+ else
+ return _Abi<__underlying._S_size / 2, 1,
+ __underlying._S_variant | _AbiVariant::_CxIleav>();
+ }
+ else if constexpr (_Traits._M_have_avx512fp16())
+ return _Abi<64 / sizeof(_Tp), 1, _AbiVariant::_BitMask>();
+ else if constexpr (_Traits._M_have_avx512f())
+ return _Abi<64 / __adj_sizeof, 1, _AbiVariant::_BitMask>();
+ else if constexpr (is_same_v<_Tp, _Float16> and not _Traits._M_have_f16c())
+ return _ScalarAbi<1>();
+ else if constexpr (_Traits._M_have_avx2())
+ return _Abi<32 / __adj_sizeof, 1, _AbiVariant::_VecMask>();
+ else if constexpr (_Traits._M_have_avx() and is_floating_point_v<_Tp>)
+ return _Abi<32 / __adj_sizeof, 1, _AbiVariant::_VecMask>();
+ else if constexpr (_Traits._M_have_sse2())
+ return _Abi<16 / __adj_sizeof, 1, _AbiVariant::_VecMask>();
+ else if constexpr (_Traits._M_have_sse() and is_floating_point_v<_Tp>
+ and sizeof(_Tp) == sizeof(float))
+ return _Abi<16 / __adj_sizeof, 1, _AbiVariant::_VecMask>();
+ else
+ return _ScalarAbi<1>();
+ }
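+
+ // For illustration (not exhaustive): with AVX2 but without AVX-512, __native_abi<float>()
+ // yields _Abi<8, 1, _VecMask> (one 256-bit register); with only SSE2 it yields
+ // _Abi<4, 1, _VecMask>; and __native_abi<complex<float>>() with AVX2 yields
+ // _Abi<4, 1, _VecMask | _CxIleav>.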
+
+#else
+
+ // scalar fallback
+ // TODO: add more targets
+ struct _ArchTraits
+ {
+ __UINT64_TYPE__ _M_flags = 0;
+
+ constexpr bool
+ _M_test(int __bit) const
+ { return ((_M_flags >> __bit) & 1) == 1; }
+ };
+
+ template <typename _Tp>
+ consteval auto
+ __native_abi()
+ {
+ if constexpr (not __vectorizable<_Tp>)
+ return _InvalidAbi();
+ else
+ return _ScalarAbi<1>();
+ }
+
+#endif
+
+ /** @internal
+ * You must use this type as a template argument to function templates that are not declared
+ * always_inline (to avoid issues when linking code compiled with different compiler flags).
+ */
+ struct _TargetTraits
+ : _ArchTraits, _OptTraits
+ {};
+
+ /** @internal
+ * Alias for an ABI tag such that basic_vec<_Tp, __native_abi_t<_Tp>> stores one SIMD register of
+ * optimal width.
+ *
+ * @tparam _Tp A vectorizable type.
+ *
+ * C++26 [simd.expos.abi]
+ */
+ template <typename _Tp>
+ using __native_abi_t = decltype(__native_abi<_Tp>());
+
+ template <typename _Tp, int _Np, _TargetTraits _Target = {}>
+ consteval auto
+ __deduce_abi()
+ {
+ constexpr auto __native = __native_abi<_Tp>();
+ if constexpr (0 == __native._S_size or _Np <= 0)
+ return _InvalidAbi();
+ else if constexpr (_Np == __native._S_size)
+ return __native;
+ else
+ return __native.template _M_resize<_Np>();
+ }
+
+ /** @internal
+ * Alias for an ABI tag @c A such that <tt>basic_vec<_Tp, A></tt> stores @p _Np elements.
+ *
+ * C++26 [simd.expos.abi]
+ */
+ template <typename _Tp, int _Np>
+ using __deduce_abi_t = decltype(__deduce_abi<_Tp, _Np>());
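+ // For illustration: with AVX2, __deduce_abi_t<float, 8> is the native _Abi<8, 1, _VecMask>;
+ // with only SSE2 the native 4-wide ABI is resized, yielding _Abi<8, 2, _VecMask>.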
+
+ /** @internal
+ * @c rebind implementation detail for basic_vec, and for basic_mask where the destination
+ * value-type is known.
+ */
+ template <typename _Tp, int _Np, __abi_tag _A0, _ArchTraits = {}>
+ consteval auto
+ __abi_rebind()
+ {
+ if constexpr (_Np <= 0 or not __vectorizable<_Tp>)
+ return _InvalidAbi();
+ else
+ {
+ constexpr auto __native = __native_abi<_Tp>();
+ static_assert(0 != __native._S_size);
+ constexpr int __nreg = __div_ceil(_Np, __native._S_size);
+
+ if constexpr (is_same_v<_A0, _ScalarAbi<_A0::_S_size>>)
+ return __deduce_abi<_Tp, _Np>();
+
+ else if constexpr (__complex_like<_Tp>
+ and __flags_test(_A0::_S_variant, _AbiVariant::_CxCtgus)
+ and __flags_test(__native._S_variant, _AbiVariant::_CxIleav))
+ // we need half the number of registers since the number applies twice, to reals and
+ // imaginaries.
+ return _Abi<_Np, __nreg / 2, _A0::_S_variant>();
+
+ else if constexpr (__complex_like<_Tp>
+ and __flags_test(_A0::_S_variant, _AbiVariant::_CxIleav)
+ and __flags_test(__native._S_variant, _AbiVariant::_CxCtgus))
+ return _Abi<_Np, __nreg * 2, _A0::_S_variant>();
+
+ else if constexpr (__complex_like<_Tp>)
+ return _Abi<_Np, __nreg, _A0::_S_variant | _AbiVariant::_CxIleav>();
+
+ else if constexpr (_Np == __nreg)
+ return _ScalarAbi<_Np>();
+
+ else
+ return _Abi<_Np, __nreg, _A0::_S_variant & _AbiVariant::_MaskVariants>();
+ }
+ }
+
+ /** @internal
+ * @c rebind implementation detail for basic_mask.
+ *
+ * The important difference here is that we have no information about the actual value-type other
+ * than its @c sizeof. So <tt>_Bytes == 8</tt> could mean <tt>complex<float></tt>, @c double, or
+ * @c int64_t. E.g., for <tt>_Np == 4</tt> with AVX but without AVX2, that's <tt>vector(4) int</tt>,
+ * <tt>vector(4) long long</tt>, or <tt>2x vector(2) long long</tt>.
+ * That's why this overload has the additional @p _IsOnlyResize parameter, which tells us that the
+ * value-type doesn't change.
+ */
+ template <size_t _Bytes, int _Np, __abi_tag _A0, bool _IsOnlyResize, _ArchTraits _Traits = {}>
+ consteval auto
+ __abi_rebind()
+ {
+ constexpr bool __from_cx = __flags_test(_A0::_S_variant, _AbiVariant::_CxCtgus)
+ or __flags_test(_A0::_S_variant, _AbiVariant::_CxIleav);
+
+ if constexpr (_Bytes == 0 or _Np <= 0)
+ return _InvalidAbi();
+
+ // If _Bytes is sizeof(complex<double>) we can be certain it's a mask<complex<double>, _Np>.
+ else if constexpr (_Bytes == sizeof(double) * 2)
+ return __abi_rebind<complex<double>, _Np, _A0>();
+
+ else if constexpr (is_same_v<_A0, _ScalarAbi<_A0::_S_size>>)
+ {
+ if constexpr (_IsOnlyResize)
+ // stick to _ScalarAbi (likely _Float16 without hardware support)
+ return _ScalarAbi<_Np>();
+ else
+ // otherwise, fresh start via __deduce_abi_t using __integer_from
+ return __deduce_abi<__integer_from<_Bytes>, _Np>();
+ }
+
+ // If the source ABI is complex, _Bytes == sizeof(complex<float>) or
+ // sizeof(complex<float16_t>), and _IsOnlyResize is true, then it's a mask<complex<float>,
+ // _Np>
+ else if constexpr (__from_cx and _IsOnlyResize and _Bytes == 2 * sizeof(float))
+ return __abi_rebind<complex<float>, _Np, _A0>();
+ else if constexpr (__from_cx and _IsOnlyResize and _Bytes == 2 * sizeof(_Float16))
+ return __abi_rebind<complex<_Float16>, _Np, _A0>();
+
+#if _GLIBCXX_X86
+ // AVX w/o AVX2:
+ // e.g. resize_t<8, mask<float, Whatever>> needs to be _Abi<8, 1> not _Abi<8, 2>
+ // We determine whether _A0 identifies an AVX vector by looking at the size of a native
+ // register. If it's 32, it's a YMM register, otherwise it's 16 or less.
+ else if constexpr (_IsOnlyResize
+ and _Traits._M_have_avx() and not _Traits._M_have_avx2()
+ and __bit_ceil(__div_ceil<unsigned>(
+ _A0::_S_size, _A0::_S_nreg)) * _Bytes == 32)
+ {
+ if constexpr (_Bytes == sizeof(double))
+ return __abi_rebind<double, _Np, _A0>();
+ else if constexpr (_Bytes == sizeof(float))
+ return __abi_rebind<float, _Np, _A0>();
+ else if constexpr (_Traits._M_have_f16c() and _Bytes == sizeof(_Float16))
+ return __abi_rebind<_Float16, _Np, _A0>();
+ else // impossible
+ static_assert(false);
+ }
+#endif
+
+ else
+ return __abi_rebind<__integer_from<_Bytes>, _Np, _A0>();
+ }
+
+ /** @internal
+ * Returns true unless _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION is defined.
+ *
+ * On IvyBridge, (vec<float> == 0.f) == (rebind_t<int, vec<float>> == 0) does not compile. It does
+ * compile on basically every other target, though. This is due to the difference in ABI tag:
+ * _Abi<8, 1, 1> vs. _Abi<8, 2, 1>. I know how to define this function for libstdc++ to avoid
+ * interconvertible masks. The question is whether we can specify this in general for C++29.
+ */
+ template <typename _To, typename _From>
+ consteval bool
+ __is_mask_conversion_explicit(size_t __b0, size_t __b1)
+ {
+ constexpr int __n = _To::_S_size;
+ static_assert(__n == _From::_S_size);
+#ifndef _GLIBCXX_SIMD_COND_EXPLICIT_MASK_CONVERSION
+ /// C++26 [simd.mask.ctor] uses unconditional explicit
+ return true;
+#else
+ if (__b0 != __b1)
+ return true;
+
+ // everything is better than _ScalarAbi, except when converting to a single bool
+ if constexpr (is_same_v<_To, _ScalarAbi<__n>>)
+ return __n > 1;
+ else if constexpr (is_same_v<_From, _ScalarAbi<__n>>)
+ return true;
+
+ else
+ {
+ constexpr _AbiVariant __f0 = _To::_S_variant;
+ constexpr _AbiVariant __f1 = _From::_S_variant;
+
+ // converting to a bit-mask is better
+ if constexpr ((__f0 & _AbiVariant::_MaskVariants) != (__f1 & _AbiVariant::_MaskVariants))
+ return __flags_test(__f0, _AbiVariant::_VecMask); // to _VecMask is explicit
+
+ // with vec-masks, fewer registers is better
+ else if constexpr (_From::_S_nreg != _To::_S_nreg)
+ return _From::_S_nreg < _To::_S_nreg;
+
+ // differ only on _Cx flags
+ // interleaved complex is worse
+ else if constexpr (__flags_test(__f0, _AbiVariant::_CxIleav))
+ return true;
+ else if constexpr (__flags_test(__f1, _AbiVariant::_CxIleav))
+ return false;
+
+ // prefer non-_Cx over _CxCtgus
+ else if constexpr (__flags_test(__f0, _AbiVariant::_CxCtgus))
+ return true;
+ else
+ __builtin_unreachable();
+ }
+#endif
+ }
+
+ /** @internal
+ * An alias for a signed integer type.
+ *
+ * libstdc++ unconditionally uses @c int here, since it matches the return type of
+ * 'Bit Operation Builtins' in GCC.
+ *
+ * C++26 [simd.expos.defn]
+ */
+ using __simd_size_type = int;
+
+ /** @internal
+ * The width of <tt>basic_vec<T, Abi></tt> if the specialization <tt>basic_vec<T, Abi></tt> is
+ * enabled, or @c 0 otherwise.
+ *
+ * C++26 [simd.expos.defn]
+ */
+ template <typename _Tp, typename _Abi>
+ constexpr __simd_size_type __simd_size_v = 0;
+
+ template <__vectorizable _Tp, __abi_tag _Abi>
+ constexpr __simd_size_type __simd_size_v<_Tp, _Abi> = _Abi::_S_size;
+
+ // integral_constant shortcut
+ template <__simd_size_type _Xp>
+ inline constexpr integral_constant<__simd_size_type, _Xp> __simd_size_constant = {};
+
+ // [simd.syn]
+ template <typename _Tp, typename _Abi = __native_abi_t<_Tp>>
+ class basic_vec;
+
+ template <typename _Tp, __simd_size_type _Np = __simd_size_v<_Tp, __native_abi_t<_Tp>>>
+ using vec = basic_vec<_Tp, __deduce_abi_t<_Tp, _Np>>;
+
+ template <size_t _Bytes, typename _Abi = __native_abi_t<__integer_from<_Bytes>>>
+ class basic_mask;
+
+ template <typename _Tp, __simd_size_type _Np = __simd_size_v<_Tp, __native_abi_t<_Tp>>>
+ using mask = basic_mask<sizeof(_Tp), __deduce_abi_t<_Tp, _Np>>;
+
+ /** @internal
+ * Satisfied if @p _Tp is a data-parallel type.
+ *
+ * C++26 [simd.general]
+ */
+ template <typename _Tp>
+ concept __data_parallel_type
+ = __vectorizable<typename _Tp::value_type>
+ and __abi_tag<typename _Tp::abi_type>
+ and _Tp::size() >= 1;
+
+ // [simd.ctor] load constructor constraints
+#ifdef __clang__
+ template <typename _Tp>
+ static constexpr remove_cvref_t<_Tp> __static_sized_range_obj = {};
+#endif
+
+ template <typename _Tp, size_t _Np = 0>
+ concept __static_sized_range
+ = ranges::contiguous_range<_Tp> and ranges::sized_range<_Tp>
+ and requires(_Tp&& __r) {
+#if 1 // PR117849
+ typename integral_constant<size_t, ranges::size(__r)>;
+#else
+ requires (decltype(std::span(__r))::extent != dynamic_extent);
+#endif
+#ifdef __clang__
+ requires (_Np == 0 or ranges::size(__static_sized_range_obj<_Tp>) == _Np);
+#else
+ requires (_Np == 0 or ranges::size(__r) == _Np);
+#endif
+ };
+
+ // [simd.general] value-reserving
+ template <typename _From, typename _To>
+ concept __arithmetic_only_value_preserving_convertible_to
+ = convertible_to<_From, _To> and is_arithmetic_v<_From> and is_arithmetic_v<_To>
+ and not (is_signed_v<_From> and is_unsigned_v<_To>)
+ and numeric_limits<_From>::digits <= numeric_limits<_To>::digits
+ and numeric_limits<_From>::max() <= numeric_limits<_To>::max()
+ and numeric_limits<_From>::lowest() >= numeric_limits<_To>::lowest();
+
+ /** @internal
+ * Satisfied if the conversion from @p _From to @p _To is a value-preserving conversion.
+ *
+ * C++26 [simd.general]
+ */
+ template <typename _From, typename _To>
+ concept __value_preserving_convertible_to
+ = __arithmetic_only_value_preserving_convertible_to<_From, _To>
+ or (__complex_like<_To> and __arithmetic_only_value_preserving_convertible_to<
+ _From, typename _To::value_type>);
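+
+ // For illustration: satisfied for int -> double and float -> double, but not for
+ // int -> float (not enough mantissa digits), int -> unsigned int (signed to unsigned),
+ // or double -> float (narrowing).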
+
+ /** @internal
+ * The value of the @c _Bytes template argument to a @c basic_mask specialization.
+ *
+ * C++26 [simd.expos.defn]
+ */
+ template <typename _Tp>
+ constexpr size_t __mask_element_size = 0;
+
+ /** @internal
+ * C++26 [simd.expos]
+ */
+ template<typename _Tp>
+ concept __constexpr_wrapper_like
+ = convertible_to<_Tp, decltype(_Tp::value)>
+ and equality_comparable_with<_Tp, decltype(_Tp::value)>
+ and bool_constant<_Tp() == _Tp::value>::value
+ and bool_constant<static_cast<decltype(_Tp::value)>(_Tp()) == _Tp::value>::value;
+
+ // [simd.ctor] explicit(...) of broadcast ctor
+ template <typename _From, typename _To>
+ concept __non_narrowing_constexpr_conversion
+ = __constexpr_wrapper_like<_From> and convertible_to<_From, _To>
+ and requires { { _From::value } -> std::convertible_to<_To>; }
+ and static_cast<decltype(_From::value)>(_To(_From::value)) == _From::value
+ and not (std::unsigned_integral<_To> and _From::value < decltype(_From::value)())
+ and _From::value <= std::numeric_limits<_To>::max()
+ and _From::value >= std::numeric_limits<_To>::lowest();
+
+ // [simd.ctor] p4
+ template <typename _From, typename _To>
+ concept __broadcast_constructible
+ = convertible_to<_From, _To> // 4
+ and ((not is_arithmetic_v<remove_cvref_t<_From>>
+ and not __constexpr_wrapper_like<remove_cvref_t<_From>>) // 4.1
+ or __value_preserving_convertible_to<remove_cvref_t<_From>, _To> // 4.2
+ or __non_narrowing_constexpr_conversion<remove_cvref_t<_From>, _To>); // 4.3
+
+ // __higher_floating_point_rank_than<_Tp, U> (_Tp has higher or equal floating point rank than U)
+ template <typename _From, typename _To>
+ concept __higher_floating_point_rank_than
+ = floating_point<_From> && floating_point<_To>
+ && same_as<common_type_t<_From, _To>, _From>;
+
+ // __higher_integer_rank_than<_Tp, U> (_Tp has higher or equal integer rank than U)
+ template <typename _From, typename _To>
+ concept __higher_integer_rank_than
+ = integral<_From> && integral<_To>
+ && (sizeof(_From) > sizeof(_To) || same_as<common_type_t<_From, _To>, _From>);
+
+ template <typename _From, typename _To>
+ concept __higher_rank_than
+ = __higher_floating_point_rank_than<_From, _To> || __higher_integer_rank_than<_From, _To>;
+
+ struct __convert_flag;
+
+ template <typename _From, typename _To, typename... _Traits>
+ concept __loadstore_convertible_to
+ = same_as<_From, _To>
+ or (__vectorizable<_From> and __vectorizable<_To>
+ and (__value_preserving_convertible_to<_From, _To>
+ or (std::convertible_to<_From, _To>
+ and (std::same_as<_Traits, __convert_flag> or ...))));
+
+ template <typename _From, typename _To>
+ concept __simd_generator_convertible_to
+ = std::convertible_to<_From, _To>
+ and (not is_arithmetic_v<_From> or __value_preserving_convertible_to<_From, _To>);
+
+ template <typename _Fp, typename _Tp, __simd_size_type... _Is>
+ requires (__simd_generator_convertible_to<
+ decltype(declval<_Fp>()(__simd_size_constant<_Is>)), _Tp> and ...)
+ constexpr void
+ __simd_generator_invokable_impl(integer_sequence<__simd_size_type, _Is...>);
+
+ template <typename _Fp, typename _Tp, __simd_size_type _Np>
+ concept __simd_generator_invokable = requires {
+ __simd_generator_invokable_impl<_Fp, _Tp>(make_integer_sequence<__simd_size_type, _Np>());
+ };
+
+ template <typename _Fp, typename _Tp, __simd_size_type... _Is>
+ requires (not __simd_generator_convertible_to<
+ decltype(declval<_Fp>()(__simd_size_constant<_Is>)), _Tp>
+ or ...)
+ constexpr void
+ __almost_simd_generator_invokable_impl(integer_sequence<__simd_size_type, _Is...>);
+
+ template <typename _Fp, typename _Tp, __simd_size_type _Np>
+ concept __almost_simd_generator_invokable = requires(_Fp&& __gen) {
+ __gen(__simd_size_constant<0>);
+ __almost_simd_generator_invokable_impl<_Fp, _Tp>(
+ make_integer_sequence<__simd_size_type, _Np>());
+ };
+
+ template <typename _Fp>
+ concept __index_permutation_function_nosize = requires(_Fp const& __f)
+ {
+ { __f(0) } -> std::integral;
+ };
+
+ template <typename _Fp, typename _Simd>
+ concept __index_permutation_function_size = requires(_Fp const& __f)
+ {
+ { __f(0, 0) } -> std::integral;
+ };
+
+ template <typename _Fp, typename _Simd>
+ concept __index_permutation_function
+ = __index_permutation_function_size<_Fp, _Simd> or __index_permutation_function_nosize<_Fp>;
+
+ // [simd.expos]
+ template <size_t _Bytes, __abi_tag _Abi>
+ constexpr size_t __mask_element_size<basic_mask<_Bytes, _Abi>> = _Bytes;
+
+ template <typename _Vp>
+ concept __simd_vec_type
+ = same_as<_Vp, basic_vec<typename _Vp::value_type, typename _Vp::abi_type>>
+ and is_default_constructible_v<_Vp>;
+
+ template <typename _Vp>
+ concept __simd_mask_type
+ = same_as<_Vp, basic_mask<__mask_element_size<_Vp>, typename _Vp::abi_type>>
+ and is_default_constructible_v<_Vp>;
+
+ template <typename _Vp>
+ concept __simd_vec_or_mask_type = __simd_vec_type<_Vp> or __simd_mask_type<_Vp>;
+
+ template <typename _Vp>
+ concept __simd_floating_point
+ = __simd_vec_type<_Vp> and floating_point<typename _Vp::value_type>;
+
+ template <typename _Vp>
+ concept __simd_integral
+ = __simd_vec_type<_Vp> and integral<typename _Vp::value_type>;
+
+ template <typename _Vp>
+ using __simd_complex_value_type = typename _Vp::value_type::value_type;
+
+ template <typename _Vp>
+ concept __simd_complex
+ = __simd_vec_type<_Vp> and __complex_like_impl<typename _Vp::value_type>;
+
+ template <typename _Tp>
+ using __deduced_vec_t
+ = decltype([] {
+ using _Up = decltype(declval<const _Tp&>() + declval<const _Tp&>());
+ if constexpr (__data_parallel_type<_Up>)
+ return _Up();
+ }());
+
+ static_assert(is_same_v<__deduced_vec_t<int>, void>);
+
+ template <typename _Vp, typename _Tp>
+ using __make_compatible_simd_t
+ = decltype([] {
+ using _Up = decltype(declval<const _Tp&>() + declval<const _Tp&>());
+ if constexpr (__simd_vec_type<_Up>)
+ return _Up();
+ else
+ return vec<_Up, _Vp::size()>();
+ }());
+
+ template <typename... _Ts>
+ concept __math_floating_point = (__simd_floating_point<__deduced_vec_t<_Ts>> or ...);
+
+ template <typename...>
+ struct __math_common_simd_impl;
+
+ template <typename... _Ts>
+ requires __math_floating_point<_Ts...>
+ using __math_common_simd_t = typename __math_common_simd_impl<_Ts...>::type;
+
+ template <typename _T0>
+ struct __math_common_simd_impl<_T0>
+ { using type = __deduced_vec_t<_T0>; };
+
+ template <typename _T0, typename _T1>
+ struct __math_common_simd_impl<_T0, _T1>
+ {
+ using type = decltype([] {
+ if constexpr (__math_floating_point<_T0> and __math_floating_point<_T1>)
+ return common_type_t<__deduced_vec_t<_T0>, __deduced_vec_t<_T1>>();
+ else if constexpr (__math_floating_point<_T0>)
+ return common_type_t<__deduced_vec_t<_T0>, _T1>();
+ else if constexpr (__math_floating_point<_T1>)
+ return common_type_t<_T0, __deduced_vec_t<_T1>>();
+ // else void
+ }());
+ };
+
+ template <typename _T0, typename _T1, typename... _TRest>
+ struct __math_common_simd_impl<_T0, _T1, _TRest...>
+ { using type = common_type_t<__math_common_simd_t<_T0, _T1>, _TRest...>; };
+
+ template <typename _T0, typename _T1, typename... _TRest>
+ requires (sizeof...(_TRest) > 0) and is_void_v<__math_common_simd_t<_T0, _T1>>
+ struct __math_common_simd_impl<_T0, _T1, _TRest...>
+ { using type = common_type_t<__math_common_simd_t<_TRest...>, _T0, _T1>; };
+
+ template <typename _BinaryOperation, typename _Tp>
+ concept __reduction_binary_operation
+ = requires (const _BinaryOperation __binary_op, const vec<_Tp, 1> __v) {
+ { __binary_op(__v, __v) } -> same_as<vec<_Tp, 1>>;
+ };
+
+ /** @internal
+ * Returns the lowest index @c i where <tt>(__bits >> i) & 1</tt> equals @c 1.
+ */
+ [[__gnu__::__always_inline__]]
+ constexpr __simd_size_type
+ __lowest_bit(std::integral auto __bits)
+ {
+ if constexpr (sizeof(__bits) <= sizeof(int))
+ return __builtin_ctz(__bits);
+ else if constexpr (sizeof(__bits) <= sizeof(long))
+ return __builtin_ctzl(__bits);
+ else if constexpr (sizeof(__bits) <= sizeof(long long))
+ return __builtin_ctzll(__bits);
+ else
+ static_assert(false);
+ }
+
+ /** @internal
+ * Returns the highest index @c i where <tt>(__bits >> i) & 1</tt> equals @c 1.
+ */
+ [[__gnu__::__always_inline__]]
+ constexpr __simd_size_type
+ __highest_bit(std::integral auto __bits)
+ {
+ if constexpr (sizeof(__bits) <= sizeof(int))
+ return sizeof(int) * __CHAR_BIT__ - 1 - __builtin_clz(__bits);
+ else if constexpr (sizeof(__bits) <= sizeof(long))
+ return sizeof(long) * __CHAR_BIT__ - 1 - __builtin_clzl(__bits);
+ else if constexpr (sizeof(__bits) <= sizeof(long long))
+ return sizeof(long long) * __CHAR_BIT__ - 1 - __builtin_clzll(__bits);
+ else
+ static_assert(false);
+ }
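+
+ // For illustration: __lowest_bit(0b0110) == 1 and __highest_bit(0b0110) == 2.
+ // Both functions require at least one bit of the argument to be set.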
+
+ template <__vectorizable _Tp, __simd_size_type _Np, __abi_tag _Ap>
+ using __similar_mask = basic_mask<sizeof(_Tp), decltype(__abi_rebind<_Tp, _Np, _Ap>())>;
+
+ // Allow _Tp to be _InvalidInteger for __integer_from<16>
+ template <typename _Tp, __simd_size_type _Np, __abi_tag _Ap>
+ using __similar_vec = basic_vec<_Tp, decltype(__abi_rebind<_Tp, _Np, _Ap>())>;
+
+ // LWG???? [simd.expos]
+ template <size_t _Bytes, typename _Ap>
+ using __simd_vec_from_mask_t = __similar_vec<__integer_from<_Bytes>, _Ap::_S_size, _Ap>;
+
+ template <typename _From, typename _To>
+ concept __simd_vec_bcast = constructible_from<_To, _From>;
+
+ /** @internal
+ * std::pair is not trivially copyable; this one is.
+ */
+ template <typename _T0, typename _T1>
+ struct __trivial_pair
+ {
+ _T0 _M_first;
+ _T1 _M_second;
+ };
+}
+
+#pragma GCC diagnostic pop
+#endif // C++26
+#endif // _GLIBCXX_SIMD_BASE_H
diff --git a/libstdc++-v3/include/bits/simd_x86.h b/libstdc++-v3/include/bits/simd_x86.h
new file mode 100644
index 00000000000..b04c2d04f92
--- /dev/null
+++ b/libstdc++-v3/include/bits/simd_x86.h
@@ -0,0 +1,953 @@
+/* SPDX-License-Identifier: GPL-3.0-or-later WITH GCC-exception-3.1 */
+/* Copyright © 2025 GSI Helmholtzzentrum fuer Schwerionenforschung GmbH
+ * Matthias Kretz <[email protected]>
+ */
+
+#ifndef _GLIBCXX_SIMD_X86_H
+#define _GLIBCXX_SIMD_X86_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include "vec_ops.h"
+
+#if not _GLIBCXX_X86
+#error "wrong include for this target"
+#endif
+
+#pragma GCC push_options
+// ensure GCC knows about the __builtin_ia32_* calls
+#pragma GCC target("avx2,bmi,bmi2,avx512vl,avx512bw,avx512dq,avx10.2")
+#pragma GCC pop_options
+
+// psabi warnings are bogus because the ABI of the internal types never leaks into user code
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpsabi"
+
+namespace std::simd
+{
+ static constexpr size_t __x86_max_general_register_size
+#ifdef __x86_64__
+ = 8;
+#else
+ = 4;
+#endif
+
+ /** @internal
+ * Return a bit-mask for the given vector-mask.
+ *
+ * Caveats:
+ * 1. The bit-mask of 2-Byte vector-masks has duplicated entries (because of a missing instruction).
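+ * (for illustration: a 4-element int16_t vector-mask with elements {-1, 0, -1, 0} yields
+ * the bit-mask 0b00110011, i.e. every true element contributes two adjacent set bits)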
+ * 2. The return type internally is 'int', but that fails on conversion to uint64 if the MSB of a
+ * YMM 1/2-Byte vector-mask is set (sign extension). Therefore these helper functions return
+ * unsigned instead.
+ * 3. ZMM inputs are not supported.
+ */
+ [[__gnu__::__always_inline__]]
+ inline unsigned
+ __x86_movmsk(__vec_builtin_type_bytes<__integer_from<8>, 16> __x)
+ { return __builtin_ia32_movmskpd(__vec_bit_cast<double>(__x)); }
+
+ [[__gnu__::__always_inline__]]
+ inline unsigned
+ __x86_movmsk(__vec_builtin_type_bytes<__integer_from<8>, 32> __x)
+ { return __builtin_ia32_movmskpd256(__vec_bit_cast<double>(__x)); }
+
+ [[__gnu__::__always_inline__]]
+ inline unsigned
+ __x86_movmsk(__vec_builtin_type_bytes<__integer_from<4>, 16> __x)
+ { return __builtin_ia32_movmskps(__vec_bit_cast<float>(__x)); }
+
+ template <_ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline unsigned
+ __x86_movmsk(__vec_builtin_type_bytes<__integer_from<4>, 8> __x)
+ {
+#if __has_builtin(__builtin_ia32_pext_di)
+ if constexpr (_Traits._M_have_bmi2())
+ return __builtin_ia32_pext_di(__builtin_bit_cast(unsigned long long, __x),
+ 0x80000000'80000000ULL);
+#endif
+ return __x86_movmsk(__vec_zero_pad_to_16(__x));
+ }
+
+ [[__gnu__::__always_inline__]]
+ inline unsigned
+ __x86_movmsk(__vec_builtin_type_bytes<__integer_from<4>, 32> __x)
+ { return __builtin_ia32_movmskps256(__vec_bit_cast<float>(__x)); }
+
+ template <__vec_builtin _TV, auto _Traits = _ArchTraits()>
+ requires (sizeof(__vec_value_type<_TV>) <= 2)
+ [[__gnu__::__always_inline__]]
+ inline unsigned
+ __x86_movmsk(_TV __x)
+ {
+ static_assert(__width_of<_TV> > 1);
+ if constexpr (sizeof(__x) == 32)
+ return __builtin_ia32_pmovmskb256(__vec_bit_cast<char>(__x));
+ else if constexpr (sizeof(__x) == 16)
+ return __builtin_ia32_pmovmskb128(__vec_bit_cast<char>(__x));
+ else if constexpr (sizeof(__x) == 8)
+ {
+#if __has_builtin(__builtin_ia32_pext_di)
+ if constexpr (_Traits._M_have_bmi2())
+ return __builtin_ia32_pext_di(__builtin_bit_cast(unsigned long long, __x),
+ 0x8080'8080'8080'8080ULL);
+#endif
+ return __x86_movmsk(__vec_zero_pad_to_16(__x));
+ }
+ else if constexpr (sizeof(__x) == 4)
+ {
+#if __has_builtin(__builtin_ia32_pext_si)
+ if constexpr (_Traits._M_have_bmi2())
+ return __builtin_ia32_pext_si(__builtin_bit_cast(unsigned int, __x), 0x80808080u);
+#endif
+ return __x86_movmsk(__vec_zero_pad_to_16(__x));
+ }
+ else if constexpr (sizeof(__x) == 2)
+ {
+ auto __bits = __builtin_bit_cast(unsigned short, __x);
+#if __has_builtin(__builtin_ia32_pext_si)
+ if constexpr (_Traits._M_have_bmi2())
+ return __builtin_ia32_pext_si(__bits, 0x00008080u);
+#endif
+ return ((__bits >> 7) & 1) | ((__bits & 0x8000) >> 14);
+ }
+ else
+ static_assert(false);
+ }
+
+ template <__vec_builtin _TV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline bool
+ __x86_vec_is_zero(_TV __a)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ static_assert(is_integral_v<_Tp>);
+ if constexpr (sizeof(_TV) <= __x86_max_general_register_size)
+ return __builtin_bit_cast(__integer_from<sizeof(_TV)>, __a) == 0;
+ else if constexpr (_Traits._M_have_avx())
+ {
+ if constexpr (sizeof(_TV) == 32)
+ return __builtin_ia32_ptestz256(__vec_bit_cast<long long>(__a),
+ __vec_bit_cast<long long>(__a));
+ else if constexpr (sizeof(_TV) == 16)
+ return __builtin_ia32_ptestz128(__vec_bit_cast<long long>(__a),
+ __vec_bit_cast<long long>(__a));
+ else if constexpr (sizeof(_TV) < 16)
+ return __x86_vec_is_zero(__vec_zero_pad_to_16(__a));
+ else
+ static_assert(false);
+ }
+ else if constexpr (_Traits._M_have_sse4_1())
+ {
+ if constexpr (sizeof(_TV) == 16)
+ return __builtin_ia32_ptestz128(__vec_bit_cast<long long>(__a),
+ __vec_bit_cast<long long>(__a));
+ else if constexpr (sizeof(_TV) < 16)
+ return __x86_vec_is_zero(__vec_zero_pad_to_16(__a));
+ else
+ static_assert(false);
+ }
+ else
+ return __x86_movmsk(__a) == 0;
+ }
+
+ template <__vec_builtin _TV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline int
+ __x86_vec_testz(_TV __a, _TV __b)
+ {
+ static_assert(sizeof(_TV) == 16 or sizeof(_TV) == 32);
+ static_assert(_Traits._M_have_sse4_1());
+ if constexpr (sizeof(_TV) == 32)
+ return __builtin_ia32_ptestz256(__vec_bit_cast<long long>(__a),
+ __vec_bit_cast<long long>(__b));
+ else
+ return __builtin_ia32_ptestz128(__vec_bit_cast<long long>(__a),
+ __vec_bit_cast<long long>(__b));
+ }
+
+ template <__vec_builtin _TV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline int
+ __x86_vec_testc(_TV __a, _TV __b)
+ {
+ static_assert(sizeof(_TV) == 16 or sizeof(_TV) == 32);
+ static_assert(_Traits._M_have_sse4_1());
+ if constexpr (sizeof(_TV) == 32)
+ return __builtin_ia32_ptestc256(__vec_bit_cast<long long>(__a),
+ __vec_bit_cast<long long>(__b));
+ else
+ return __builtin_ia32_ptestc128(__vec_bit_cast<long long>(__a),
+ __vec_bit_cast<long long>(__b));
+ }
+
+ template <int _Np, __vec_builtin _TV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline bool
+ __x86_vecmask_all(_TV __k)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ static_assert(is_integral_v<_Tp> and is_signed_v<_Tp>);
+ constexpr int __width = __width_of<_TV>;
+ static_assert(sizeof(__k) <= 32);
+ if constexpr (_Np == __width)
+ {
+ if constexpr (sizeof(__k) <= __x86_max_general_register_size)
+ {
+ using _Ip = __integer_from<sizeof(__k)>;
+ return __builtin_bit_cast(_Ip, __k) == ~_Ip();
+ }
+ else if constexpr (not _Traits._M_have_sse4_1())
+ {
+ constexpr unsigned __valid_bits = (1u << (sizeof(_Tp) == 2 ? _Np * 2 : _Np)) - 1;
+ return __x86_movmsk(__k) == __valid_bits;
+ }
+ else if constexpr (sizeof(__k) < 16)
+ return __x86_vecmask_all<_Np>(__vec_zero_pad_to_16(__k));
+ else
+ return 0 != __x86_vec_testc(__k, ~_TV());
+ }
+ else if constexpr (sizeof(__k) <= __x86_max_general_register_size)
+ {
+ using _Ip = __integer_from<sizeof(__k)>;
+ constexpr _Ip __valid_bits = (_Ip(1) << (_Np * sizeof(_Tp) * __CHAR_BIT__)) - 1;
+ return (__builtin_bit_cast(_Ip, __k) & __valid_bits) == __valid_bits;
+ }
+ else if constexpr (not _Traits._M_have_sse4_1())
+ {
+ constexpr unsigned __valid_bits = (1u << (sizeof(_Tp) == 2 ? _Np * 2 : _Np)) - 1;
+ return (__x86_movmsk(__k) & __valid_bits) == __valid_bits;
+ }
+ else if constexpr (sizeof(__k) < 16)
+ return __x86_vecmask_all<_Np>(__vec_zero_pad_to_16(__k));
+ else
+ return 0 != __x86_vec_testc(__k, _S_vec_implicit_mask<_Np, _TV>);
+ }
+
+ template <int _Np, __vec_builtin _TV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline bool
+ __x86_vecmask_any(_TV __k)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ static_assert(is_integral_v<_Tp> and is_signed_v<_Tp>);
+ constexpr int __width = __width_of<_TV>;
+ static_assert(sizeof(__k) <= 32);
+ if constexpr (_Np == __width)
+ return not __x86_vec_is_zero(__k);
+ else if constexpr (sizeof(__k) <= __x86_max_general_register_size)
+ {
+ using _Ip = __integer_from<sizeof(__k)>;
+ constexpr _Ip __valid_bits = (_Ip(1) << (_Np * sizeof(_Tp) * __CHAR_BIT__)) - 1;
+ return (__builtin_bit_cast(_Ip, __k) & __valid_bits) != _Ip();
+ }
+ else if constexpr (not _Traits._M_have_sse4_1())
+ {
+ constexpr unsigned __valid_bits = (1u << (sizeof(_Tp) == 2 ? _Np * 2 : _Np)) - 1;
+ return (__x86_movmsk(__k) & __valid_bits) != 0;
+ }
+ else if constexpr (sizeof(__k) < 16)
+ return __x86_vecmask_any<_Np>(__vec_zero_pad_to_16(__k));
+ else
+ return 0 == __x86_vec_testz(__k, _S_vec_implicit_mask<_Np, _TV>);
+ }
+
+ template <int _Np, __vec_builtin _TV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline bool
+ __x86_vecmask_none(_TV __k)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ static_assert(is_integral_v<_Tp> and is_signed_v<_Tp>);
+ constexpr int __width = __width_of<_TV>;
+ static_assert(sizeof(__k) <= 32);
+ if constexpr (_Np == __width)
+ return __x86_vec_is_zero(__k);
+ else if constexpr (sizeof(__k) <= __x86_max_general_register_size)
+ {
+ using _Ip = __integer_from<sizeof(__k)>;
+ constexpr _Ip __valid_bits = (_Ip(1) << (_Np * sizeof(_Tp) * __CHAR_BIT__)) - 1;
+ return (__builtin_bit_cast(_Ip, __k) & __valid_bits) == _Ip();
+ }
+ else if constexpr (not _Traits._M_have_sse4_1())
+ {
+ constexpr unsigned __valid_bits = (1u << (sizeof(_Tp) == 2 ? _Np * 2 : _Np)) - 1;
+ return (__x86_movmsk(__k) & __valid_bits) == 0;
+ }
+ else if constexpr (sizeof(__k) < 16)
+ return __x86_vecmask_none<_Np>(__vec_zero_pad_to_16(__k));
+ else
+ return 0 != __x86_vec_testz(__k, _S_vec_implicit_mask<_Np, _TV>);
+ }
+
+ enum class _X86Cmp
+ {
+ _Eq = 0,
+ _Lt = 1,
+ _Le = 2,
+ _Unord = 3,
+ _Neq = 4,
+ _Nlt = 5,
+ _Nle = 6,
+ };
+
+ template <_X86Cmp _Cmp, __vec_builtin _TV, _ArchTraits _Traits = {}>
+ requires is_floating_point_v<__vec_value_type<_TV>>
+ [[__gnu__::__always_inline__]]
+ inline auto
+ __x86_bitmask_cmp(_TV __x, _TV __y)
+ {
+ constexpr int __c = int(_Cmp);
+ using _Tp = __vec_value_type<_TV>;
+ if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 8)
+ return __builtin_ia32_cmppd512_mask(__x, __y, __c, -1, 4);
+ else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 4)
+ return __builtin_ia32_cmpps512_mask(__x, __y, __c, -1, 4);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 8)
+ return __builtin_ia32_cmppd256_mask(__x, __y, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 4)
+ return __builtin_ia32_cmpps256_mask(__x, __y, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 8)
+ return __builtin_ia32_cmppd128_mask(__x, __y, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 4)
+ return __builtin_ia32_cmpps128_mask(__x, __y, __c, -1);
+ else if constexpr (is_same_v<_Tp, _Float16>)
+ {
+ if constexpr (sizeof(_TV) == 64 and _Traits._M_have_avx512fp16())
+ return __builtin_ia32_cmpph512_mask(__x, __y, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 and _Traits._M_have_avx512fp16())
+ return __builtin_ia32_cmpph256_mask(__x, __y, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 and _Traits._M_have_avx512fp16())
+ return __builtin_ia32_cmpph128_mask(__x, __y, __c, -1);
+ else if constexpr (sizeof(_TV) < 16 and _Traits._M_have_avx512fp16())
+ return __x86_bitmask_cmp<_Cmp>(__vec_zero_pad_to_16(__x), __vec_zero_pad_to_16(__y));
+ else
+ {
+            // without AVX512FP16, __native_abi() gives float16_t the same width (element count)
+            // as float, so sizeof(_TV) <= 32 and the comparison can be done after conversion
+            // to float
+ static_assert(sizeof(_TV) <= 32);
+ return __x86_bitmask_cmp<_Cmp>(__vec_cast<float>(__x), __vec_cast<float>(__y));
+ }
+ }
+ else if constexpr (sizeof(_TV) < 16)
+ return __x86_bitmask_cmp<_Cmp>(__vec_zero_pad_to_16(__x), __vec_zero_pad_to_16(__y));
+ else
+ static_assert(false);
+ }
+
+ template <typename _Tp>
+ using __x86_intrin_int
+ = decltype([] {
+ if constexpr (sizeof(_Tp) == 1)
+ return char();
+ else
+ return __integer_from<sizeof(_Tp)>();
+ }());
+
+ template <_X86Cmp _Cmp, __vec_builtin _TV, _ArchTraits _Traits = {}>
+ requires is_integral_v<__vec_value_type<_TV>>
+ [[__gnu__::__always_inline__]]
+ inline auto
+ __x86_bitmask_cmp(_TV __x, _TV __y)
+ {
+ constexpr int __c = int(_Cmp);
+ using _Tp = __vec_value_type<_TV>;
+ if constexpr (sizeof(_TV) < 16)
+ return __x86_bitmask_cmp<_Cmp>(__vec_zero_pad_to_16(__x), __vec_zero_pad_to_16(__y));
+ else if constexpr (is_signed_v<_Tp>)
+ {
+ const auto __xi = __vec_bit_cast<__x86_intrin_int<_Tp>>(__x);
+ const auto __yi = __vec_bit_cast<__x86_intrin_int<_Tp>>(__y);
+ if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 8)
+ return __builtin_ia32_cmpq512_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 4)
+ return __builtin_ia32_cmpd512_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 2)
+ return __builtin_ia32_cmpw512_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 1)
+ return __builtin_ia32_cmpb512_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 8)
+ return __builtin_ia32_cmpq256_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 4)
+ return __builtin_ia32_cmpd256_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 2)
+ return __builtin_ia32_cmpw256_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 1)
+ return __builtin_ia32_cmpb256_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 8)
+ return __builtin_ia32_cmpq128_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 4)
+ return __builtin_ia32_cmpd128_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 2)
+ return __builtin_ia32_cmpw128_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 1)
+ return __builtin_ia32_cmpb128_mask(__xi, __yi, __c, -1);
+ else
+ static_assert(false);
+ }
+ else
+ {
+ const auto __xi = __vec_bit_cast<__x86_intrin_int<_Tp>>(__x);
+ const auto __yi = __vec_bit_cast<__x86_intrin_int<_Tp>>(__y);
+ if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 8)
+ return __builtin_ia32_ucmpq512_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 4)
+ return __builtin_ia32_ucmpd512_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 2)
+ return __builtin_ia32_ucmpw512_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 1)
+ return __builtin_ia32_ucmpb512_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 8)
+ return __builtin_ia32_ucmpq256_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 4)
+ return __builtin_ia32_ucmpd256_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 2)
+ return __builtin_ia32_ucmpw256_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 1)
+ return __builtin_ia32_ucmpb256_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 8)
+ return __builtin_ia32_ucmpq128_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 4)
+ return __builtin_ia32_ucmpd128_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 2)
+ return __builtin_ia32_ucmpw128_mask(__xi, __yi, __c, -1);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 1)
+ return __builtin_ia32_ucmpb128_mask(__xi, __yi, __c, -1);
+ else
+ static_assert(false);
+ }
+ }
+
+ template <__vec_builtin _TV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline auto
+ __x86_bitmask_isinf(_TV __x)
+ {
+ static_assert(_Traits._M_have_avx512dq());
+ using _Tp = __vec_value_type<_TV>;
+ static_assert(is_floating_point_v<_Tp>);
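+      // the vfpclass immediate 0x18 selects positive (0x08) and negative (0x10) infinity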
+ if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 8)
+ return __builtin_ia32_fpclasspd512_mask(__x, 0x18, -1);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 8)
+ return __builtin_ia32_fpclasspd256_mask(__x, 0x18, -1);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 8)
+ return __builtin_ia32_fpclasspd128_mask(__x, 0x18, -1);
+ else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 4)
+ return __builtin_ia32_fpclassps512_mask(__x, 0x18, -1);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 4)
+ return __builtin_ia32_fpclassps256_mask(__x, 0x18, -1);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 4)
+ return __builtin_ia32_fpclassps128_mask(__x, 0x18, -1);
+ else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 2 and _Traits._M_have_avx512fp16())
+ return __builtin_ia32_fpclassph512_mask(__x, 0x18, -1);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 2 and _Traits._M_have_avx512fp16())
+ return __builtin_ia32_fpclassph256_mask(__x, 0x18, -1);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 2 and _Traits._M_have_avx512fp16())
+ return __builtin_ia32_fpclassph128_mask(__x, 0x18, -1);
+ else if constexpr (sizeof(_Tp) == 2 and not _Traits._M_have_avx512fp16())
+ return __x86_bitmask_isinf(__vec_cast<float>(__x));
+ else if constexpr (sizeof(_TV) < 16)
+ return __x86_bitmask_isinf(__vec_zero_pad_to_16(__x));
+ else
+ static_assert(false);
+ }
+
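+  /** \internal
+   * Expand the bitmask \p __bits into a vector mask: element i of the result is -1 (all bits
+   * set) if bit i of \p __bits is set, and 0 otherwise (AVX-512 vpmovm2[bwdq]; availability is
+   * expected to be checked by the caller).
+   */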
+ template <__vec_builtin _KV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline _KV
+ __x86_bit_to_vecmask(std::integral auto __bits)
+ {
+ using _Kp = __vec_value_type<_KV>;
+ static_assert((sizeof(__bits) * __CHAR_BIT__ == __width_of<_KV>)
+ or (sizeof(__bits) == 1 and __CHAR_BIT__ > __width_of<_KV>));
+
+ if constexpr (sizeof(_Kp) == 1 and sizeof(_KV) == 64)
+ return __builtin_ia32_cvtmask2b512(__bits);
+ else if constexpr (sizeof(_Kp) == 1 and sizeof(_KV) == 32)
+ return __builtin_ia32_cvtmask2b256(__bits);
+ else if constexpr (sizeof(_Kp) == 1 and sizeof(_KV) == 16)
+ return __builtin_ia32_cvtmask2b128(__bits);
+ else if constexpr (sizeof(_Kp) == 1 and sizeof(_KV) <= 8)
+ return _VecOps<_KV>::_S_extract(__builtin_ia32_cvtmask2b128(__bits));
+
+ else if constexpr (sizeof(_Kp) == 2 and sizeof(_KV) == 64)
+ return __builtin_ia32_cvtmask2w512(__bits);
+ else if constexpr (sizeof(_Kp) == 2 and sizeof(_KV) == 32)
+ return __builtin_ia32_cvtmask2w256(__bits);
+ else if constexpr (sizeof(_Kp) == 2 and sizeof(_KV) == 16)
+ return __builtin_ia32_cvtmask2w128(__bits);
+ else if constexpr (sizeof(_Kp) == 2 and sizeof(_KV) <= 8)
+ return _VecOps<_KV>::_S_extract(__builtin_ia32_cvtmask2w128(__bits));
+
+ else if constexpr (sizeof(_Kp) == 4 and sizeof(_KV) == 64)
+ return __builtin_ia32_cvtmask2d512(__bits);
+ else if constexpr (sizeof(_Kp) == 4 and sizeof(_KV) == 32)
+ return __builtin_ia32_cvtmask2d256(__bits);
+ else if constexpr (sizeof(_Kp) == 4 and sizeof(_KV) <= 16)
+ return _VecOps<_KV>::_S_extract(__builtin_ia32_cvtmask2d128(__bits));
+
+ else if constexpr (sizeof(_Kp) == 8 and sizeof(_KV) == 64)
+ return __builtin_ia32_cvtmask2q512(__bits);
+ else if constexpr (sizeof(_Kp) == 8 and sizeof(_KV) == 32)
+ return __builtin_ia32_cvtmask2q256(__bits);
+ else if constexpr (sizeof(_Kp) == 8 and sizeof(_KV) == 16)
+ return __builtin_ia32_cvtmask2q128(__bits);
+
+ else
+ static_assert(false);
+ }
+
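+  /** \internal
+   * Blend \p __t and \p __f according to the bitmask \p __k: element i of the result is __t[i]
+   * if bit i of \p __k is set, and __f[i] otherwise (AVX-512 vpblendm* / vblendm*).
+   */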
+ template <unsigned_integral _Kp, __vec_builtin _TV, _ArchTraits _Traits = {}>
+ requires is_integral_v<__vec_value_type<_TV>>
+ [[__gnu__::__always_inline__]]
+ constexpr inline _TV
+ __x86_bitmask_blend(_Kp __k, _TV __t, _TV __f)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ using _Ip = __x86_intrin_int<_Tp>;
+ if constexpr (not is_same_v<_Ip, _Tp>)
+ return reinterpret_cast<_TV>(__x86_bitmask_blend(__k, __vec_bit_cast<_Ip>(__t),
+ __vec_bit_cast<_Ip>(__f)));
+ else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 8)
+ return __builtin_ia32_blendmq_512_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 4)
+ return __builtin_ia32_blendmd_512_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 2)
+ return __builtin_ia32_blendmw_512_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 1)
+ return __builtin_ia32_blendmb_512_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 8)
+ return __builtin_ia32_blendmq_256_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 4)
+ return __builtin_ia32_blendmd_256_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 2)
+ return __builtin_ia32_blendmw_256_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 1)
+ return __builtin_ia32_blendmb_256_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 8)
+ return __builtin_ia32_blendmq_128_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 4)
+ return __builtin_ia32_blendmd_128_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 2)
+ return __builtin_ia32_blendmw_128_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 1)
+ return __builtin_ia32_blendmb_128_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) < 16)
+ return _VecOps<_TV>::_S_extract(__x86_bitmask_blend(__k, __vec_zero_pad_to_16(__t),
+ __vec_zero_pad_to_16(__f)));
+ else
+ static_assert(false);
+ }
+
+ template <unsigned_integral _Kp, __vec_builtin _TV, _ArchTraits _Traits = {}>
+ requires is_floating_point_v<__vec_value_type<_TV>>
+ [[__gnu__::__always_inline__]]
+ constexpr inline _TV
+ __x86_bitmask_blend(_Kp __k, _TV __t, _TV __f)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 8)
+ return __builtin_ia32_blendmpd_512_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 64 and sizeof(_Tp) == 4)
+ return __builtin_ia32_blendmps_512_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 8)
+ return __builtin_ia32_blendmpd_256_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 32 and sizeof(_Tp) == 4)
+ return __builtin_ia32_blendmps_256_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 8)
+ return __builtin_ia32_blendmpd_128_mask (__f, __t, __k);
+ else if constexpr (sizeof(_TV) == 16 and sizeof(_Tp) == 4)
+ return __builtin_ia32_blendmps_128_mask (__f, __t, __k);
+ else if constexpr (is_same_v<_Tp, _Float16>)
+ {
+ using _Up = __integer_from<sizeof(_Tp)>;
+ return __vec_bit_cast<_Float16>(__x86_bitmask_blend(__k, __vec_bit_cast<_Up>(__t),
+ __vec_bit_cast<_Up>(__f)));
+ }
+ else if constexpr (sizeof(_TV) < 16)
+ return _VecOps<_TV>::_S_extract(__x86_bitmask_blend(__k, __vec_zero_pad_to_16(__t),
+ __vec_zero_pad_to_16(__f)));
+ else
+ static_assert(false);
+ }
+
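+  /** \internal
+   * Extract the bits at even positions (0, 2, 4, ...) of \p __x and pack them into the low
+   * _OutputBits bits of the result, e.g. __bit_extract_even(_UInt<1>(0b0101'0001)) == 0b1101.
+   */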
+ template <int _OutputBits = 4, _ArchTraits _Traits = {}>
+ constexpr _Bitmask<1>
+ __bit_extract_even(_UInt<1> __x)
+ {
+ static_assert(_OutputBits <= 4);
+ constexpr _UInt<1> __mask = 0x55u >> ((4 - _OutputBits) * 2);
+#if __has_builtin(__builtin_ia32_pext_si)
+ if constexpr (_Traits._M_have_bmi2())
+ return __builtin_ia32_pext_si(__x, __mask);
+#endif
+ __x &= __mask;
+ __x |= __x >> 1;
+ __x &= 0x33u;
+ __x |= __x >> 2;
+ __x &= 0x0Fu;
+ return __x;
+ }
+
+ template <int _OutputBits = 8, _ArchTraits _Traits = {}>
+ constexpr _Bitmask<1>
+ __bit_extract_even(_UInt<2> __x)
+ {
+ if constexpr (_OutputBits <= 4)
+ return __bit_extract_even<_OutputBits>(_UInt<1>(__x));
+ else
+ {
+ static_assert(_OutputBits <= 8);
+ constexpr _UInt<2> __mask = 0x5555u >> ((8 - _OutputBits) * 2);
+#if __has_builtin(__builtin_ia32_pext_si)
+ if constexpr (_Traits._M_have_bmi2())
+ return __builtin_ia32_pext_si(__x, __mask);
+#endif
+ __x &= __mask;
+ __x |= __x >> 1;
+ __x &= 0x3333u;
+ __x |= __x >> 2;
+ __x &= 0x0F0Fu;
+ __x |= __x >> 4;
+ return __x;
+ }
+ }
+
+ template <int _OutputBits = 16, _ArchTraits _Traits = {}>
+ constexpr _Bitmask<_OutputBits>
+ __bit_extract_even(_UInt<4> __x)
+ {
+ if constexpr (_OutputBits <= 4)
+ return __bit_extract_even<_OutputBits>(_UInt<1>(__x));
+ else if constexpr (_OutputBits <= 8)
+ return __bit_extract_even<_OutputBits>(_UInt<2>(__x));
+ else
+ {
+ static_assert(_OutputBits <= 16);
+ constexpr _UInt<4> __mask = 0x5555'5555u >> ((16 - _OutputBits) * 2);
+#if __has_builtin(__builtin_ia32_pext_si)
+ if constexpr (_Traits._M_have_bmi2())
+ return __builtin_ia32_pext_si(__x, __mask);
+#endif
+ __x &= __mask;
+ __x |= __x >> 1;
+ __x &= 0x3333'3333u;
+ __x |= __x >> 2;
+ __x &= 0x0F0F'0F0Fu;
+ __x |= __x >> 4;
+ __x &= 0x00FF'00FFu;
+ __x |= __x >> 8;
+ return __x;
+ }
+ }
+
+ template <int _OutputBits = 32, _ArchTraits _Traits = {}>
+ constexpr _Bitmask<_OutputBits>
+ __bit_extract_even(_UInt<8> __x)
+ {
+ if constexpr (_OutputBits <= 4)
+ return __bit_extract_even<_OutputBits>(_UInt<1>(__x));
+ else if constexpr (_OutputBits <= 8)
+ return __bit_extract_even<_OutputBits>(_UInt<2>(__x));
+ else if constexpr (_OutputBits <= 16)
+ return __bit_extract_even<_OutputBits>(_UInt<4>(__x));
+ else
+ {
+ static_assert(_OutputBits <= 32);
+ constexpr _UInt<8> __mask = 0x5555'5555'5555'5555ull >> ((32 - _OutputBits) * 2);
+#if __has_builtin(__builtin_ia32_pext_si)
+ if constexpr (_Traits._M_have_bmi2())
+ {
+#if __has_builtin(__builtin_ia32_pext_di)
+ return __builtin_ia32_pext_di(__x, __mask);
+#else
+ return __builtin_ia32_pext_si(__x, static_cast<unsigned>(__mask))
+ | (__builtin_ia32_pext_si(__x >> 32, __mask >> 32) << 16);
+#endif
+ }
+#endif
+ __x &= __mask;
+ __x |= __x >> 1;
+ __x &= 0x3333'3333'3333'3333ull;
+ __x |= __x >> 2;
+ __x &= 0x0F0F'0F0F'0F0F'0F0Full;
+ __x |= __x >> 4;
+ __x &= 0x00FF'00FF'00FF'00FFull;
+ __x |= __x >> 8;
+ __x &= 0x0000'FFFF'0000'FFFFull;
+ __x |= __x >> 16;
+ return __x;
+ }
+ }
+
+  // precondition: all input bits at positions >= _InputBits must be 0
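+  // e.g. __duplicate_each_bit<3>(0b101u) == 0b11'00'11 (each input bit expands into two bits)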
+ template <int _InputBits = -1, _ArchTraits _Traits = {}>
+ constexpr auto
+ __duplicate_each_bit(unsigned_integral auto __x)
+ {
+ constexpr int __input_bits = _InputBits == -1 ? sizeof(__x) * __CHAR_BIT__ : _InputBits;
+ static_assert(__input_bits >= 1);
+ static_assert(sizeof(__x) * __CHAR_BIT__ >= __input_bits);
+ if constexpr (__input_bits <= 8)
+ {
+ constexpr _UInt<2> __mask = 0x5555u >> ((8 - __input_bits) * 2);
+ if constexpr (__input_bits == 1)
+ return _UInt<1>(__x * 3u);
+#if __has_builtin(__builtin_ia32_pdep_si)
+ else if constexpr (_Traits._M_have_bmi2())
+ return _Bitmask<__input_bits * 2>(3u * __builtin_ia32_pdep_si(__x, __mask));
+#endif
+ else if constexpr (__input_bits == 2) // 0000'00BA
+ return _UInt<1>(((__x + 0b0010u) & 0b0101u) * 3u); // 0B?A -> 0B0A -> BBAA
+ else if constexpr (__input_bits <= 4) // 0000'DCBA
+ {
+ __x = ((__x << 2) | __x ) & 0b0011'0011u; // 00DC'??BA -> 00DC'00BA
+ return _UInt<1>(((__x + 0b0010'0010u) & __mask) * 3u); // -> DDCC'BBAA
+ }
+ else
+ { // HGFE'DCBA
+ _UInt<2> __y = ((__x << 4) | __x) & 0x0F0Fu; // HGFE'0000'DCBA
+ __y |= __y << 2; // 00HG'??FE'00DC'??BA
+ __y &= 0x3333u; // 00HG'00FE'00DC'00BA
+ __y += 0x2222u; // 0H?G'0F?E'0D?C'0B?A
+ return _UInt<2>((__y & __mask) * 3u); // HHGG'FFEE'DDCC'BBAA
+ }
+ }
+ else if constexpr (__input_bits <= 16)
+ {
+ constexpr _UInt<4> __mask = 0x5555'5555u >> ((16 - __input_bits) * 2);
+#if __has_builtin(__builtin_ia32_pdep_si)
+ if constexpr (_Traits._M_have_bmi2())
+ return 3u * __builtin_ia32_pdep_si(__x, __mask);
+#endif
+ _UInt<4> __y = ((__x << 8) | __x) & 0x00FF00FFu;
+ __y |= __y << 4;
+ __y &= 0x0F0F'0F0Fu;
+ __y |= __y << 2;
+ __y &= 0x3333'3333u;
+ return ((__y + 0x2222'2222u) & __mask) * 3;
+ }
+ else if constexpr (__input_bits <= 32)
+ {
+ constexpr _UInt<8> __mask = 0x5555'5555'5555'5555u >> ((32 - __input_bits) * 2);
+#if __has_builtin(__builtin_ia32_pdep_si)
+ if constexpr (_Traits._M_have_bmi2())
+ {
+#if __has_builtin(__builtin_ia32_pdep_di)
+ return 3ull * __builtin_ia32_pdep_di(__x, __mask);
+#else
+ const _UInt<8> __hi = 3 * __builtin_ia32_pdep_si(__x >> 16, __mask >> 32);
+ return (3u * __builtin_ia32_pdep_si(__x, static_cast<unsigned>(__mask))) | __hi << 32;
+#endif
+ }
+#endif
+ _UInt<8> __y = ((__x & 0xFFFF'0000ull) << 16) | (__x & 0x0000'FFFFu);
+ __y |= __y << 8;
+ __y &= 0x00FF'00FF'00FF'00FFull;
+ __y |= __y << 4;
+ __y &= 0x0F0F'0F0F'0F0F'0F0Full;
+ __y |= __y << 2;
+ __y &= 0x3333'3333'3333'3333ull;
+ return ((__y + 0x2222'2222'2222'2222ull) & __mask) * 3;
+ }
+ else
+ return __trivial_pair { __duplicate_each_bit(_UInt<4>(__x)),
+ __duplicate_each_bit<__input_bits - 32>(
+ _Bitmask<__input_bits - 32>(__x >> 32)) };
+ }
+
+ template <int _InputBits = -1, typename _U0, typename _U1>
+ constexpr auto
+ __duplicate_each_bit(const __trivial_pair<_U0, _U1>& __x)
+ {
+ static_assert(_InputBits != -1 or is_unsigned_v<_U1>);
+ constexpr int __input_bits = _InputBits == -1 ? (sizeof(_U0) + sizeof(_U1)) * __CHAR_BIT__
+ : _InputBits;
+ constexpr int __in0 = min(int(sizeof(_U0)) * __CHAR_BIT__, __input_bits);
+ constexpr int __in1 = __input_bits - __in0;
+ if constexpr (__in1 == 0)
+ return __duplicate_each_bit<__in0>(__x._M_first);
+ else
+ return __trivial_pair { __duplicate_each_bit<__in0>(__x._M_first),
+ __duplicate_each_bit<__in1>(__x._M_second) };
+ }
+
+ template <__vec_builtin _TV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ inline _TV
+ __x86_complex_multiplies(_TV __x, _TV __y)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ using _VO = _VecOps<_TV>;
+
+ static_assert(_Traits._M_have_fma());
+ static_assert(is_floating_point_v<_Tp>);
+
+ if constexpr (not _Traits._M_have_avx512fp16() and sizeof(_Tp) == 2)
+ return __vec_cast<_Tp>(__x86_complex_multiplies(__vec_cast<float>(__x),
+ __vec_cast<float>(__y)));
+ else if constexpr (sizeof(_TV) < 16)
+ return _VO::_S_extract(__x86_complex_multiplies(__vec_zero_pad_to_16(__x),
+ __vec_zero_pad_to_16(__y)));
+
+ else
+ {
+ _TV __x_real = _VO::_S_dup_even(__x);
+ _TV __x_imag = _VO::_S_dup_odd(__x);
+ _TV __y_swapped = _VO::_S_swap_neighbors(__y);
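+          // vfmaddsub computes a*b - c in even lanes and a*b + c in odd lanes. With the
+          // interleaved (real, imag) layout this yields the complex product:
+          //   even lanes: x_re*y_re - x_im*y_im,  odd lanes: x_re*y_im + x_im*y_re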
+
+ if constexpr (sizeof(__x) == 16 and sizeof(_Tp) == 2)
+ return __builtin_ia32_vfmaddsubph128_mask(__x_real, __y, __x_imag * __y_swapped, -1);
+ else if constexpr (sizeof(__x) == 32 and sizeof(_Tp) == 2)
+ return __builtin_ia32_vfmaddsubph256_mask(__x_real, __y, __x_imag * __y_swapped, -1);
+ else if constexpr (sizeof(__x) == 64 and sizeof(_Tp) == 2)
+ return __builtin_ia32_vfmaddsubph512_mask(
+ __x_real, __y, __x_imag * __y_swapped, -1, 0x04);
+
+ else if constexpr (sizeof(__x) == 16 and sizeof(_Tp) == 4)
+ return __builtin_ia32_vfmaddsubps(__x_real, __y, __x_imag * __y_swapped);
+ else if constexpr (sizeof(__x) == 32 and sizeof(_Tp) == 4)
+ return __builtin_ia32_vfmaddsubps256(__x_real, __y, __x_imag * __y_swapped);
+ else if constexpr (sizeof(__x) == 64 and sizeof(_Tp) == 4)
+ return __builtin_ia32_vfmaddsubps512_mask(
+ __x_real, __y, __x_imag * __y_swapped, -1, 0x04);
+
+ else if constexpr (sizeof(__x) == 16 and sizeof(_Tp) == 8)
+ return __builtin_ia32_vfmaddsubpd(__x_real, __y, __x_imag * __y_swapped);
+ else if constexpr (sizeof(__x) == 32 and sizeof(_Tp) == 8)
+ return __builtin_ia32_vfmaddsubpd256(__x_real, __y, __x_imag * __y_swapped);
+ else if constexpr (sizeof(__x) == 64 and sizeof(_Tp) == 8)
+ return __builtin_ia32_vfmaddsubpd512_mask(
+ __x_real, __y, __x_imag * __y_swapped, -1, 0x04);
+
+ else
+ static_assert(false);
+ }
+ }
+
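+  /** \internal
+   * Convert between _Float16 vectors and other floating-point vectors via the F16C
+   * vcvtph2ps/vcvtps2ph instructions (used by __vec_cast when only F16C is available).
+   */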
+ // FIXME: Work around PR121688
+ template <__vec_builtin _UV, __vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ inline _UV
+ __x86_cvt_f16c(_TV __v)
+ {
+ constexpr bool __from_f16 = is_same_v<__vec_value_type<_TV>, _Float16>;
+ constexpr bool __to_f16 = not __from_f16;
+ if constexpr (__to_f16 and not is_same_v<__vec_value_type<_TV>, float>)
+ return __x86_cvt_f16c<_UV>(__vec_cast<float>(__v));
+ else if constexpr (__from_f16 and not is_same_v<__vec_value_type<_UV>, float>)
+ return __vec_cast<_UV>(__x86_cvt_f16c<__vec_builtin_type<float, __width_of<_TV>>>(__v));
+ else if constexpr (__from_f16)
+ {
+ const auto __vi = __vec_bit_cast<__x86_intrin_int<_Float16>>(__v);
+ if constexpr (sizeof(_TV) == 4)
+ return __vec_split_lo(__builtin_ia32_vcvtph2ps(__vec_zero_pad_to_16(__vi)));
+ else if constexpr (sizeof(_TV) == 8)
+ return __builtin_ia32_vcvtph2ps(__vec_zero_pad_to_16(__vi));
+ else if constexpr (sizeof(_TV) == 16)
+ return __builtin_ia32_vcvtph2ps256(__vi);
+ else if constexpr (sizeof(_TV) == 32)
+ return __builtin_ia32_vcvtph2ps512_mask(__vi, __vec_builtin_type<float, 16>(), -1, 4);
+ else if constexpr (sizeof(_TV) >= 64)
+ return __vec_concat(__x86_cvt_f16c<__half_vec_builtin_t<_UV>>(__vec_split_lo(__v)),
+ __x86_cvt_f16c<__half_vec_builtin_t<_UV>>(__vec_split_hi(__v)));
+ else
+ static_assert(false);
+ }
+ else if constexpr (sizeof(_TV) == 8)
+ return reinterpret_cast<_UV>(
+ __vec_split_lo(__vec_split_lo(__builtin_ia32_vcvtps2ph(
+ __vec_zero_pad_to_16(__v), 4))));
+ else if constexpr (sizeof(_TV) == 16)
+ return reinterpret_cast<_UV>(__vec_split_lo(__builtin_ia32_vcvtps2ph(__v, 4)));
+ else if constexpr (sizeof(_TV) == 32)
+ return reinterpret_cast<_UV>(__builtin_ia32_vcvtps2ph256(__v, 4));
+ else if constexpr (sizeof(_TV) == 64)
+ return reinterpret_cast<_UV>(__builtin_ia32_vcvtps2ph512_mask(
+ __v, 4, __vec_builtin_type<short, 16>(), -1));
+ else if constexpr (sizeof(_TV) >= 128)
+ return __vec_concat(__x86_cvt_f16c<__half_vec_builtin_t<_UV>>(__vec_split_lo(__v)),
+ __x86_cvt_f16c<__half_vec_builtin_t<_UV>>(__vec_split_hi(__v)));
+ else
+ static_assert(false);
+ }
+
+  /** \internal
+   * AVX instructions typically operate per 128-bit lane. Horizontal operations (e.g. pack) thus
+   * produce 256-bit vectors whose two middle 64-bit chunks are swapped relative to the expected
+   * order. This function is the fix-up step: it permutes the 64-bit chunks from a,b,c,d to
+   * a,c,b,d order.
+   */
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ inline _TV
+ __x86_swizzle4x64_acbd(_TV __x)
+ {
+ static_assert(sizeof(_TV) == 32);
+ using _UV = __vec_builtin_type_bytes<long long, 32>;
+ return reinterpret_cast<_TV>(__builtin_shufflevector(reinterpret_cast<_UV>(__x), _UV(),
+ 0, 2, 1, 3));
+ }
+
+ /** \internal
+ * Like __builtin_convertvector but with a precondition that input values are either 0 or -1.
+ */
+ template <__vec_builtin _To, __vec_builtin _From>
+ [[__gnu__::__always_inline__]]
+ inline _To
+ __x86_cvt_vecmask(_From __k)
+ {
+ using _T0 = __vec_value_type<_From>;
+ using _T1 = __vec_value_type<_To>;
+ if constexpr (sizeof(_From) > sizeof(_To) and sizeof(_From) < 16)
+ {
+ using _ToPadded = __vec_builtin_type_bytes<_T1, sizeof(_To) * 16 / sizeof(_From)>;
+ return _VecOps<_To>::_S_extract(__x86_cvt_vecmask<_ToPadded>(__vec_zero_pad_to_16(__k)));
+ }
+ else if constexpr (sizeof(_T0) == 2 and sizeof(_T1) == 1) // -> packsswb
+ {
+ if constexpr (sizeof(__k) == 16)
+ return reinterpret_cast<_To>(__vec_split_lo(__builtin_ia32_packsswb128(__k, __k)));
+ else if constexpr (sizeof(__k) == 32)
+ return reinterpret_cast<_To>(
+ __vec_split_lo(__x86_swizzle4x64_acbd(
+ __builtin_ia32_packsswb256(__k, __k))));
+ else
+ static_assert(false);
+ }
+ else
+ static_assert(false, "TODO");
+ }
+
+ /** \internal
+ * Overload that concatenates \p __k0 and \p __k1 while converting.
+ */
+ template <__vec_builtin _To, __vec_builtin _From>
+ [[__gnu__::__always_inline__]]
+ inline _To
+ __x86_cvt_vecmask(_From __k0, _From __k1)
+ {
+ using _T0 = __vec_value_type<_From>;
+ using _T1 = __vec_value_type<_To>;
+ static_assert(sizeof(_From) >= 16);
+ if constexpr (sizeof(_T0) == 2 and sizeof(_T1) == 1) // -> packsswb
+ {
+ if constexpr (sizeof(__k0) == 16)
+ return reinterpret_cast<_To>(__builtin_ia32_packsswb128(__k0, __k1));
+ else if constexpr (sizeof(__k0) == 32)
+ return reinterpret_cast<_To>(__x86_swizzle4x64_acbd(
+ __builtin_ia32_packsswb256(__k0, __k1)));
+ else
+ static_assert(false);
+ }
+ else
+ static_assert(false, "TODO");
+ }
+}
+
+#pragma GCC diagnostic pop
+#endif // C++26
+#endif // _GLIBCXX_SIMD_X86_H
diff --git a/libstdc++-v3/include/bits/vec_ops.h b/libstdc++-v3/include/bits/vec_ops.h
new file mode 100644
index 00000000000..3efb320b58a
--- /dev/null
+++ b/libstdc++-v3/include/bits/vec_ops.h
@@ -0,0 +1,592 @@
+/* SPDX-License-Identifier: GPL-3.0-or-later WITH GCC-exception-3.1 */
+/* Copyright © 2025 GSI Helmholtzzentrum fuer Schwerionenforschung GmbH
+ * Matthias Kretz <[email protected]>
+ */
+
+#ifndef _GLIBCXX_VEC_OPS_H
+#define _GLIBCXX_VEC_OPS_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include "simd_details.h"
+
+#include <bit>
+#include <bits/utility.h>
+
+// psabi warnings are bogus because the ABI of the internal types never leaks into user code
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpsabi"
+
+namespace std::simd
+{
+ template <std::signed_integral _Tp>
+ constexpr bool
+ __signed_has_single_bit(_Tp __x)
+ { return __has_single_bit(make_unsigned_t<_Tp>(__x)); }
+
+ /**
+ * Alias for a vector builtin with given value type and total sizeof.
+ */
+ template <__vectorizable _Tp, size_t _Bytes>
+ requires (__has_single_bit(_Bytes))
+ using __vec_builtin_type_bytes [[__gnu__::__vector_size__(_Bytes)]] = _Tp;
+
+ /**
+ * Alias for a vector builtin with given value type \p _Tp and \p _Width.
+ */
+ template <__vectorizable _Tp, __simd_size_type _Width>
+ requires (__signed_has_single_bit(_Width))
+ using __vec_builtin_type = __vec_builtin_type_bytes<_Tp, sizeof(_Tp) * _Width>;
+
+ /**
+ * Constrain to any vector builtin with given value type and optional width.
+ */
+ template <typename _Tp, typename _ValueType,
+ __simd_size_type _Width = sizeof(_Tp) / sizeof(_ValueType)>
+ concept __vec_builtin_of
+ = not is_arithmetic_v<_Tp> and __vectorizable<_ValueType>
+ and _Width >= 1 and sizeof(_Tp) / sizeof(_ValueType) == _Width
+ and same_as<__vec_builtin_type_bytes<_ValueType, sizeof(_Tp)>, _Tp>
+ and requires(_Tp& __v, _ValueType __x) { __v[0] = __x; };
+
+ /**
+ * Constrain to any vector builtin.
+ */
+ template <typename _Tp>
+ concept __vec_builtin
+ = not is_class_v<_Tp> and requires(const _Tp& __x) {
+ requires __vec_builtin_of<_Tp, remove_cvref_t<decltype(__x[0])>>;
+ };
+
+ /**
+ * Alias for the value type of the given __vec_builtin type \p _Tp.
+ */
+ template <__vec_builtin _Tp>
+ using __vec_value_type = remove_cvref_t<decltype(declval<const _Tp>()[0])>;
+
+ /**
+ * The width (number of value_type elements) of the given vector builtin or arithmetic type.
+ */
+ template <typename _Tp>
+ inline constexpr __simd_size_type __width_of = 1;
+
+ template <typename _Tp>
+ requires __vec_builtin<_Tp>
+ inline constexpr __simd_size_type __width_of<_Tp> = sizeof(_Tp) / sizeof(__vec_value_type<_Tp>);
+
+ /**
+   * Alias for a vector builtin with the same value type and a new width of \p _Np.
+ */
+ template <__simd_size_type _Np, __vec_builtin _TV>
+ using __resize_vec_builtin_t = __vec_builtin_type<__vec_value_type<_TV>, _Np>;
+
+ template <__vec_builtin _TV>
+ requires (__width_of<_TV> > 1)
+ using __half_vec_builtin_t = __resize_vec_builtin_t<__width_of<_TV> / 2, _TV>;
+
+ template <__vec_builtin _TV>
+ using __double_vec_builtin_t = __resize_vec_builtin_t<__width_of<_TV> * 2, _TV>;
+
+ template <typename _Up, __vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_builtin_type_bytes<_Up, sizeof(_TV)>
+ __vec_bit_cast(_TV __v)
+ { return reinterpret_cast<__vec_builtin_type_bytes<_Up, sizeof(_TV)>>(__v); }
+
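+  /**
+   * Vector constant with the first _Np elements set to -1 (all bits set) and the remaining
+   * elements set to 0, e.g. _S_vec_implicit_mask<3, __vec_builtin_type<int, 4>> is {-1, -1, -1, 0}.
+   */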
+ template <int _Np, __vec_builtin _TV>
+ requires signed_integral<__vec_value_type<_TV>>
+ static constexpr _TV _S_vec_implicit_mask = []<int... _Is> (integer_sequence<int, _Is...>) {
+ return _TV{ (_Is < _Np ? -1 : 0)... };
+ } (make_integer_sequence<int, __width_of<_TV>>());
+
+ /**
+ * Helper function to work around Clang not allowing v[i] in constant expressions.
+ */
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_value_type<_TV>
+ __vec_get(_TV __v, int __i)
+ {
+#ifdef _GLIBCXX_CLANG
+ if (__builtin_is_constant_evaluated())
+ return __builtin_bit_cast(array<__vec_value_type<_TV>, __width_of<_TV>>, __v)[__i];
+ else
+#endif
+ return __v[__i];
+ }
+
+ /**
+ * Helper function to work around Clang and GCC not allowing assignment to v[i] in constant
+ * expressions.
+ */
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr void
+ __vec_set(_TV& __v, int __i, __vec_value_type<_TV> __x)
+ {
+ if (__builtin_is_constant_evaluated())
+ {
+#ifdef _GLIBCXX_CLANG
+ auto __arr = __builtin_bit_cast(array<__vec_value_type<_TV>, __width_of<_TV>>, __v);
+ __arr[__i] = __x;
+ __v = __builtin_bit_cast(_TV, __arr);
+#else
+ constexpr auto [...__j] = __iota<int[__width_of<_TV>]>;
+ __v = _TV{(__i == __j ? __x : __v[__j])...};
+#endif
+ }
+ else
+ __v[__i] = __x;
+ }
+
+ /**
+   * Return a vector builtin containing the elements of \p __a followed by those of \p __b.
+ */
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_builtin_type<__vec_value_type<_TV>, __width_of<_TV> * 2>
+ __vec_concat(_TV __a, _TV __b)
+ {
+ constexpr int _N0 = __width_of<_TV>;
+#ifdef _GLIBCXX_CLANG
+ using _RV = __vec_builtin_type<__vec_value_type<_TV>, _N0 * 2>;
+ if constexpr (_N0 == 1)
+ return _RV{__a[0], __b[0]};
+ else if constexpr (_N0 == 2)
+ return _RV{__a[0], __a[1], __b[0], __b[1]};
+ else if constexpr (_N0 == 4)
+ return _RV{__a[0], __a[1], __a[2], __a[3],
+ __b[0], __b[1], __b[2], __b[3]};
+ else if constexpr (_N0 == 8)
+ return _RV{__a[0], __a[1], __a[2], __a[3], __a[4], __a[5], __a[6], __a[7],
+ __b[0], __b[1], __b[2], __b[3], __b[4], __b[5], __b[6], __b[7]};
+ else if constexpr (_N0 == 16)
+ return _RV{__a[0], __a[1], __a[2], __a[3], __a[4], __a[5], __a[6], __a[7],
+ __a[8], __a[9], __a[10], __a[11], __a[12], __a[13], __a[14], __a[15],
+ __b[0], __b[1], __b[2], __b[3], __b[4], __b[5], __b[6], __b[7],
+ __b[8], __b[9], __b[10], __b[11], __b[12], __b[13], __b[14], __b[15]};
+ else if constexpr (_N0 == 32)
+ return _RV{__a[0], __a[1], __a[2], __a[3], __a[4], __a[5], __a[6], __a[7],
+ __a[8], __a[9], __a[10], __a[11], __a[12], __a[13], __a[14], __a[15],
+ __a[16], __a[17], __a[18], __a[19], __a[20], __a[21], __a[22], __a[23],
+ __a[24], __a[25], __a[26], __a[27], __a[28], __a[29], __a[30], __a[31],
+ __b[0], __b[1], __b[2], __b[3], __b[4], __b[5], __b[6], __b[7],
+ __b[8], __b[9], __b[10], __b[11], __b[12], __b[13], __b[14], __b[15],
+ __b[16], __b[17], __b[18], __b[19], __b[20], __b[21], __b[22], __b[23],
+ __b[24], __b[25], __b[26], __b[27], __b[28], __b[29], __b[30], __b[31]};
+ else
+ static_assert(false);
+#elif __has_builtin(__integer_pack)
+ return __builtin_shufflevector(__a, __b, __integer_pack(2 * _N0)...);
+#else
+#error "Neither Clang nor GCC?"
+#endif
+ }
+
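+  /** \internal
+   * Concatenate the first _N0 elements of \p __a, the first _N1 elements of \p __b, and so on.
+   * The result is widened to the next power-of-two element count; padding elements have
+   * unspecified values.
+   */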
+ template <int _N0, int _N1, int... _Ns, __vec_builtin _TV0, __vec_builtin _TV1,
+ __vec_builtin... _TVs>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_builtin_type<__vec_value_type<_TV0>,
+ __bit_ceil(unsigned(_N0 + (_N1 + ... + _Ns)))>
+ __vec_concat_sized(const _TV0& __a, const _TV1& __b, const _TVs&... __rest)
+ {
+ constexpr auto [...__is] = __iota<int[__bit_ceil(unsigned(_N0 + _N1))]>;
+ const auto __ab = __builtin_shufflevector(
+ __a, __b, (__is < _N0 ? __is
+ : __is < _N0 + _N1 ? __is - _N0 + __width_of<_TV0>
+ : -1)...);
+ if constexpr (sizeof...(__rest) == 0)
+ return __ab;
+ else
+ return __vec_concat_sized<_N0 + _N1, _Ns...>(__ab, __rest...);
+ }
+
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr __half_vec_builtin_t<_TV>
+ __vec_split_lo(_TV __v)
+ { return __builtin_shufflevector(__v, __v, __integer_pack(__width_of<_TV> / 2)...); }
+
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr __half_vec_builtin_t<_TV>
+ __vec_split_hi(_TV __v)
+ {
+ constexpr int __n = __width_of<_TV> / 2;
+ constexpr auto [...__is] = __iota<int[__n]>;
+ return __half_vec_builtin_t<_TV> {__v[(__n + __is)]...};
+ }
+
+  /**
+   * Return \p __x widened to a vector with sizeof 16 (same value type), zero-filling the added
+   * bytes.
+   */
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ __vec_zero_pad_to_16(_TV __x)
+ {
+ static_assert(sizeof(_TV) < 16);
+ using _Up = _UInt<sizeof(_TV)>;
+ __vec_builtin_type_bytes<_Up, 16> __tmp = {__builtin_bit_cast(_Up, __x)};
+ return __builtin_bit_cast(__vec_builtin_type_bytes<__vec_value_type<_TV>, 16>, __tmp);
+ }
+
+ /// Return \p __x zero-padded to \p _Bytes bytes.
+ template <size_t _Bytes, __vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr auto
+ __vec_zero_pad_to(_TV __x)
+ {
+ static_assert(sizeof(_TV) <= _Bytes);
+ if constexpr (sizeof(_TV) == _Bytes)
+ return __x;
+ else
+ return __vec_zero_pad_to<_Bytes>(__vec_concat(__x, _TV()));
+ }
+
+#if _GLIBCXX_X86
+ template <__vec_builtin _UV, __vec_builtin _TV>
+ inline _UV
+ __x86_cvt_f16c(_TV __v);
+#endif
+
+ /** \internal
+ * Simple wrapper around __builtin_convertvector to provide static_cast-like syntax.
+ *
+ * Works around GCC failing to use the F16C/AVX512F cvtps2ph/cvtph2ps instructions.
+ */
+ template <__vec_builtin _UV, __vec_builtin _TV, _ArchTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ constexpr _UV
+ __vec_cast(_TV __v)
+ {
+ static_assert(__width_of<_UV> == __width_of<_TV>);
+#if _GLIBCXX_X86
+ constexpr bool __to_f16 = is_same_v<__vec_value_type<_UV>, _Float16>;
+ constexpr bool __from_f16 = is_same_v<__vec_value_type<_TV>, _Float16>;
+ constexpr bool __needs_f16c = _Traits._M_have_f16c() and not _Traits._M_have_avx512fp16()
+ and (__to_f16 or __from_f16);
+ if (__needs_f16c and not __builtin_is_constant_evaluated() and not __builtin_constant_p(__v))
+ { // Work around PR121688
+ if constexpr (__needs_f16c)
+ return __x86_cvt_f16c<_UV>(__v);
+ }
+#endif
+ return __builtin_convertvector(__v, _UV);
+ }
+
+ /** \internal
+ * Overload of the above cast function that determines the destination vector type from a given
+   * element type \p _Up and the `__width_of` of the argument type.
+ *
+ * Calls the above overload.
+ */
+ template <__vectorizable _Up, __vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr __vec_builtin_type<_Up, __width_of<_TV>>
+ __vec_cast(_TV __v)
+ { return __vec_cast<__vec_builtin_type<_Up, __width_of<_TV>>>(__v); }
+
+ /** \internal
+   * As above, but with an additional precondition on the possible values of the argument.
+ *
+ * Precondition: __k[i] is either 0 or -1 for all i.
+ */
+ template <__vec_builtin _UV, __vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr _UV
+ __vec_mask_cast(_TV __k)
+ {
+ static_assert(signed_integral<__vec_value_type<_UV>>);
+ static_assert(signed_integral<__vec_value_type<_TV>>);
+      // TODO: __builtin_convertvector cannot be optimal here because it does not know that the
+      // input (and therefore the output) elements can only be 0 or -1.
+ return __builtin_convertvector(__k, _UV);
+ }
+
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr _TV
+ __vec_xor(_TV __a, _TV __b)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ if constexpr (is_floating_point_v<_Tp>)
+ {
+ using _UV = __vec_builtin_type<__integer_from<sizeof(_Tp)>, __width_of<_TV>>;
+ return __builtin_bit_cast(
+ _TV, __builtin_bit_cast(_UV, __a) ^ __builtin_bit_cast(_UV, __b));
+ }
+ else
+ return __a ^ __b;
+ }
+
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr _TV
+ __vec_or(_TV __a, _TV __b)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ if constexpr (is_floating_point_v<_Tp>)
+ {
+ using _UV = __vec_builtin_type<__integer_from<sizeof(_Tp)>, __width_of<_TV>>;
+ return __builtin_bit_cast(
+ _TV, __builtin_bit_cast(_UV, __a) | __builtin_bit_cast(_UV, __b));
+ }
+ else
+ return __a | __b;
+ }
+
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr _TV
+ __vec_and(_TV __a, _TV __b)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ if constexpr (is_floating_point_v<_Tp>)
+ {
+ using _UV = __vec_builtin_type<__integer_from<sizeof(_Tp)>, __width_of<_TV>>;
+ return __builtin_bit_cast(
+ _TV, __builtin_bit_cast(_UV, __a) & __builtin_bit_cast(_UV, __b));
+ }
+ else
+ return __a & __b;
+ }
+
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr _TV
+ __vec_andnot(_TV __a, _TV __b)
+ {
+ using _Tp = __vec_value_type<_TV>;
+ using _UV = __vec_builtin_type<__integer_from<sizeof(_Tp)>, __width_of<_TV>>;
+ return __builtin_bit_cast(
+ _TV, ~__builtin_bit_cast(_UV, __a) & __builtin_bit_cast(_UV, __b));
+ }
+
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr _TV
+ __vec_not(_TV __a)
+ {
+ using _UV = __vec_builtin_type_bytes<unsigned, sizeof(_TV)>;
+ if constexpr (is_floating_point_v<__vec_value_type<_TV>>)
+ return __builtin_bit_cast(_TV, ~__builtin_bit_cast(_UV, __a));
+ else
+ return ~__a;
+ }
+
+ /**
+ * An object of given type where only the sign bits are 1.
+ */
+ template <__vec_builtin _V>
+ requires std::floating_point<__vec_value_type<_V>>
+ constexpr _V _S_signmask = __vec_xor(_V() + 1, _V() - 1);
+
+ // work around __builtin_constant_p returning false unless passed a variable
+ // (__builtin_constant_p(x[0]) is false while __is_constprop(x[0]) is true)
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ __is_constprop(const auto& __x)
+ { return __builtin_is_constant_evaluated() or __builtin_constant_p(__x); }
+
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ __is_constprop(const __complex_like auto& __x)
+ {
+ return __builtin_is_constant_evaluated()
+ or (__is_constprop(__x.real()) and __is_constprop(__x.imag()));
+ }
+
+ [[__gnu__::__always_inline__]]
+ constexpr bool
+ __is_constprop_equal_to(const auto& __x, const auto& __expect)
+ { return (__builtin_is_constant_evaluated() or __builtin_constant_p(__x)) and __x == __expect; }
+
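+  /** \internal
+   * Static helper operations on vector builtins. The index pack _Is... enumerates the first _Np
+   * elements (all elements of _TV by default).
+   */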
+ template <__vec_builtin _TV, int _Np = __width_of<_TV>,
+ typename = make_integer_sequence<int, _Np>>
+ struct _VecOps;
+
+ template <__vec_builtin _TV, int _Np, int... _Is>
+ struct _VecOps<_TV, _Np, integer_sequence<int, _Is...>>
+ {
+ static_assert(_Np <= __width_of<_TV>);
+
+ using _Tp = __vec_value_type<_TV>;
+
+ using _HV = __half_vec_builtin_t<conditional_t<_Np >= 2, _TV, __double_vec_builtin_t<_TV>>>;
+
+ [[__gnu__::__always_inline__]]
+ static constexpr _TV
+ _S_broadcast_to_even(_Tp __init)
+ { return _TV {((_Is & 1) == 0 ? __init : _Tp())...}; }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr _TV
+ _S_broadcast_to_odd(_Tp __init)
+ { return _TV {((_Is & 1) == 1 ? __init : _Tp())...}; }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr bool
+ _S_all_of(_TV __k) noexcept
+ { return (... and (__k[_Is] != 0)); }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr bool
+ _S_any_of(_TV __k) noexcept
+ { return (... or (__k[_Is] != 0)); }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr bool
+ _S_none_of(_TV __k) noexcept
+ { return (... and (__k[_Is] == 0)); }
+
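+    // Return a _TV consisting of the elements of __x starting at index _Offset::value
+    // (used to narrow a wider vector).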
+ template <typename _Offset = integral_constant<int, 0>>
+ [[__gnu__::__always_inline__]]
+ static constexpr _TV
+ _S_extract(__vec_builtin auto __x, _Offset = {})
+ {
+ static_assert(is_same_v<__vec_value_type<_TV>, __vec_value_type<decltype(__x)>>);
+ return __builtin_shufflevector(__x, decltype(__x)(), (_Is + _Offset::value)...);
+ }
+
+ // swap neighboring elements
+ [[__gnu__::__always_inline__]]
+ static constexpr _TV
+ _S_swap_neighbors(_TV __x)
+ { return __builtin_shufflevector(__x, __x, (_Is ^ 1)...); }
+
+ // duplicate even indexed elements, dropping the odd ones
+ [[__gnu__::__always_inline__]]
+ static constexpr _TV
+ _S_dup_even(_TV __x)
+ { return __builtin_shufflevector(__x, __x, (_Is & ~1)...); }
+
+ // duplicate odd indexed elements, dropping the even ones
+ [[__gnu__::__always_inline__]]
+ static constexpr _TV
+ _S_dup_odd(_TV __x)
+ { return __builtin_shufflevector(__x, __x, (_Is | 1)...); }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr void
+ _S_overwrite_even_elements(_TV& __x, _HV __y) requires (_Np > 1)
+ {
+ constexpr __simd_size_type __n = __width_of<_TV>;
+ __x = __builtin_shufflevector(__x,
+#ifdef _GLIBCXX_CLANG
+ __vec_concat(__y, __y),
+#else
+ __y,
+#endif
+ ((_Is & 1) == 0 ? __n + _Is / 2 : _Is)...);
+ }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr void
+ _S_overwrite_even_elements(_TV& __xl, _TV& __xh, _TV __y)
+ {
+ constexpr __simd_size_type __nl = __width_of<_TV>;
+ constexpr __simd_size_type __nh = __nl * 3 / 2;
+ __xl = __builtin_shufflevector(__xl, __y, ((_Is & 1) == 0 ? __nl + _Is / 2 : _Is)...);
+ __xh = __builtin_shufflevector(__xh, __y, ((_Is & 1) == 0 ? __nh + _Is / 2 : _Is)...);
+ }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr void
+ _S_overwrite_odd_elements(_TV& __x, _HV __y) requires (_Np > 1)
+ {
+ constexpr __simd_size_type __n = __width_of<_TV>;
+ __x = __builtin_shufflevector(__x,
+#ifdef _GLIBCXX_CLANG
+ __vec_concat(__y, __y),
+#else
+ __y,
+#endif
+ ((_Is & 1) == 1 ? __n + _Is / 2 : _Is)...);
+ }
+
+ [[__gnu__::__always_inline__]]
+ static constexpr void
+ _S_overwrite_odd_elements(_TV& __xl, _TV& __xh, _TV __y)
+ {
+ constexpr __simd_size_type __nl = __width_of<_TV>;
+ constexpr __simd_size_type __nh = __nl * 3 / 2;
+ __xl = __builtin_shufflevector(__xl, __y, ((_Is & 1) == 1 ? __nl + _Is / 2 : _Is)...);
+ __xh = __builtin_shufflevector(__xh, __y, ((_Is & 1) == 1 ? __nh + _Is / 2 : _Is)...);
+ }
+
+ // negate every even element (real part of interleaved complex)
+ [[__gnu__::__always_inline__]]
+ static constexpr _TV
+ _S_complex_negate_real(_TV __x)
+ { return __vec_xor(_S_broadcast_to_even(_S_signmask<_TV>[0]), __x); }
+
+ // negate every odd element (imaginary part of interleaved complex)
+ [[__gnu__::__always_inline__]]
+ static constexpr _TV
+ _S_complex_negate_imag(_TV __x)
+ { return __vec_xor(_S_broadcast_to_odd(_S_signmask<_TV>[0]), __x); }
+
+ // Subtract elements with even index, add elements with odd index.
+ [[__gnu__::__always_inline__]]
+ static constexpr _TV
+ _S_addsub(_TV __x, _TV __y)
+ {
+#if 0
+ return __x + _S_complex_negate_imag(__y);
+#else
+ // GCC recognizes this pattern as addsub
+ return __builtin_shufflevector(__x - __y, __x + __y,
+ (_Is + (_Is & 1) * __width_of<_TV>)...);
+#endif
+ }
+
+    // true iff all elements are known at compile time to be equal to __ref
+ [[__gnu__::__always_inline__]]
+ static constexpr bool
+ _S_is_constprop_equal_to(_TV __x, _Tp __ref)
+ { return (__is_constprop_equal_to(__x[_Is], __ref) and ...); }
+
+ // True iff all elements at even indexes are zero. This includes signed zeros only when
+ // -fno-signed-zeros is in effect.
+ template <_OptTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ static constexpr bool
+ _S_complex_real_is_constprop_zero(_TV __x)
+ {
+ if constexpr (_Traits._M_conforming_to_STDC_annex_G())
+ {
+ using _Up = _UInt<sizeof(_Tp)>;
+ return (((_Is & 1) == 1 or __is_constprop_equal_to(__builtin_bit_cast(_Up, __x[_Is]),
+ _Up())) and ...);
+ }
+ else
+ return (((_Is & 1) == 1 or __is_constprop_equal_to(__x[_Is], _Tp())) and ...);
+ }
+
+ // True iff all elements at odd indexes are zero. This includes signed zeros only when
+ // -fno-signed-zeros is in effect.
+ template <_OptTraits _Traits = {}>
+ [[__gnu__::__always_inline__]]
+ static constexpr bool
+ _S_complex_imag_is_constprop_zero(_TV __x)
+ {
+ if constexpr (_Traits._M_conforming_to_STDC_annex_G())
+ {
+ using _Up = _UInt<sizeof(_Tp)>;
+ return (((_Is & 1) == 0 or __is_constprop_equal_to(__builtin_bit_cast(_Up, __x[_Is]),
+ _Up())) and ...);
+ }
+ else
+ return (((_Is & 1) == 0 or __is_constprop_equal_to(__x[_Is], _Tp())) and ...);
+ }
+ };
+}
+
+#pragma GCC diagnostic pop
+#endif // C++26
+#endif // _GLIBCXX_VEC_OPS_H