This implements basic_vec for all but complex value types.
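
For illustration, the kind of element-wise usage this class enables (a
minimal sketch, assuming the vec alias template and reduce() from the
other patches in this series):

    using floatv = std::simd::vec<float, 8>;

    floatv axpy(floatv a, floatv x, floatv y)
    { return a * x + y; }                // hidden-friend operators from _BinaryOps

    float sum(floatv v)
    { return std::simd::reduce(v); }     // lowered onto basic_vec::_M_reduce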

libstdc++-v3/ChangeLog:

        * include/bits/simd_vec.h: New file.

Signed-off-by: Matthias Kretz <[email protected]>
---
 libstdc++-v3/include/bits/simd_vec.h | 2130 ++++++++++++++++++++++++++
 1 file changed, 2130 insertions(+)
 create mode 100644 libstdc++-v3/include/bits/simd_vec.h


--
──────────────────────────────────────────────────────────────────────────
 Dr. Matthias Kretz                           https://mattkretz.github.io
 GSI Helmholtz Center for Heavy Ion Research               https://gsi.de
 std::simd
──────────────────────────────────────────────────────────────────────────
diff --git a/libstdc++-v3/include/bits/simd_vec.h b/libstdc++-v3/include/bits/simd_vec.h
new file mode 100644
index 00000000000..1b1f0ef2047
--- /dev/null
+++ b/libstdc++-v3/include/bits/simd_vec.h
@@ -0,0 +1,2130 @@
+/* SPDX-License-Identifier: GPL-3.0-or-later WITH GCC-exception-3.1 */
+/* Copyright © 2025      GSI Helmholtzzentrum fuer Schwerionenforschung GmbH
+ *                       Matthias Kretz <[email protected]>
+ */
+
+#ifndef _GLIBCXX_SIMD_VEC_H
+#define _GLIBCXX_SIMD_VEC_H 1
+
+#ifdef _GLIBCXX_SYSHDR
+#pragma GCC system_header
+#endif
+
+#if __cplusplus >= 202400L
+
+#include "simd_mask.h"
+#include "simd_flags.h"
+
+#include <bits/utility.h>
+#include <bits/stl_function.h>
+#include <cmath>
+
+// psabi warnings are bogus because the ABI of the internal types never leaks into user code
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpsabi"
+
+namespace std::simd
+{
+  // disabled basic_vec
+  template <typename _Tp, typename _Abi>
+    class basic_vec
+    {
+    public:
+      using value_type = _Tp;
+
+      using abi_type = _Abi;
+
+      using mask_type = basic_mask<0, void>; // disabled
+
+#define _GLIBCXX_DELETE_SIMD                                                                    \
+      _GLIBCXX_DELETE_MSG("This specialization is disabled because of an invalid combination "  \
+          "of template arguments to basic_vec.")
+
+      basic_vec() = _GLIBCXX_DELETE_SIMD;
+
+      ~basic_vec() = _GLIBCXX_DELETE_SIMD;
+
+      basic_vec(const basic_vec&) = _GLIBCXX_DELETE_SIMD;
+
+      basic_vec& operator=(const basic_vec&) = _GLIBCXX_DELETE_SIMD;
+
+#undef _GLIBCXX_DELETE_SIMD
+    };
+
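+  // Base class providing the binary operators as hidden friends, implemented in terms of the
+  // compound-assignment operators that basic_vec defines below.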
+  template <typename _Tp, typename _Abi>
+    class _BinaryOps
+    {
+      using _Vp = basic_vec<_Tp, _Abi>;
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr _Vp
+      operator+(const _Vp& __x, const _Vp& __y) noexcept
+      {
+        _Vp __r = __x;
+        __r += __y;
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr _Vp
+      operator-(const _Vp& __x, const _Vp& __y) noexcept
+      {
+        _Vp __r = __x;
+        __r -= __y;
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr _Vp
+      operator*(const _Vp& __x, const _Vp& __y) noexcept
+      {
+        _Vp __r = __x;
+        __r *= __y;
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr _Vp
+      operator/(const _Vp& __x, const _Vp& __y) noexcept
+      {
+        _Vp __r = __x;
+        __r /= __y;
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr _Vp
+      operator%(const _Vp& __x, const _Vp& __y) noexcept
+      requires requires (_Tp __a) { __a % __a; }
+      {
+        _Vp __r = __x;
+        __r %= __y;
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr _Vp
+      operator&(const _Vp& __x, const _Vp& __y) noexcept
+      requires requires (_Tp __a) { __a & __a; }
+      {
+        _Vp __r = __x;
+        __r &= __y;
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr _Vp
+      operator|(const _Vp& __x, const _Vp& __y) noexcept
+      requires requires (_Tp __a) { __a | __a; }
+      {
+        _Vp __r = __x;
+        __r |= __y;
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr _Vp
+      operator^(const _Vp& __x, const _Vp& __y) noexcept
+      requires requires (_Tp __a) { __a ^ __a; }
+      {
+        _Vp __r = __x;
+        __r ^= __y;
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr _Vp
+      operator<<(const _Vp& __x, const _Vp& __y) _GLIBCXX_SIMD_NOEXCEPT
+      requires requires (_Tp __a) { __a << __a; }
+      {
+        _Vp __r = __x;
+        __r <<= __y;
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr _Vp
+      operator<<(const _Vp& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
+      requires requires (_Tp __a, __simd_size_type __b) { __a << __b; }
+      {
+        _Vp __r = __x;
+        __r <<= __y;
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr _Vp
+      operator>>(const _Vp& __x, const _Vp& __y) _GLIBCXX_SIMD_NOEXCEPT
+      requires requires (_Tp __a) { __a >> __a; }
+      {
+        _Vp __r = __x;
+        __r >>= __y;
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr _Vp
+      operator>>(const _Vp& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
+      requires requires (_Tp __a, __simd_size_type __b) { __a >> __b; }
+      {
+        _Vp __r = __x;
+        __r >>= __y;
+        return __r;
+      }
+    };
+
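+  // Tag type used to dispatch to the internal pointer-load constructor of basic_vec.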
+  struct _LoadCtorTag
+  {};
+
+  template <typename _Cx, __vec_builtin _TV, _TargetTraits = {}>
+    [[__gnu__::__cold__]]
+    constexpr _TV
+    __cx_redo_mul(_TV __r, const _TV __x, const _TV __y, const auto __nan, int __n)
+    {
+      // redo multiplication using scalar complex-mul on (NaN, NaN) results
+      alignas(_TV) __vec_value_type<_TV> __arr[__width_of<_TV>] = {};
+      for (int __i = 0; __i < __n; __i += 2)
+        {
+          if (__nan[__i] and __nan[__i + 1])
+            {
+              using _Tc = typename _Cx::value_type;
+              const _Cx __cx(_Tc(__x[__i]), _Tc(__x[__i + 1]));
+              const _Cx __cy(_Tc(__y[__i]), _Tc(__y[__i + 1]));
+              const _Cx __cr = __cx * __cy;
+              __arr[__i] = __cr.real();
+              __arr[__i + 1] = __cr.imag();
+            }
+          else
+            {
+              __arr[__i] = __r[__i];
+              __arr[__i + 1] = __r[__i + 1];
+            }
+        }
+      return __builtin_bit_cast(_TV, __arr);
+    }
+
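+  // Redo the element-wise complex multiplications that produced NaN using scalar complex
+  // arithmetic; variant for split storage (separate real and imaginary vectors).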
+  template <typename _Cx, __vec_builtin _TV, _TargetTraits = {}>
+    [[__gnu__::__cold__]]
+    constexpr void
+    __cxctgus_redo_mul(_TV& __re0, _TV& __im0, const _TV __re1, const _TV __im1,
+                       const _TV __re, const _TV __im, const auto __nan, int __n)
+    {
+      alignas(_TV) __vec_value_type<_TV> __arr_re[__width_of<_TV>] = {};
+      alignas(_TV) __vec_value_type<_TV> __arr_im[__width_of<_TV>] = {};
+      for (int __i = 0; __i < __n; ++__i)
+        {
+          if (__nan[__i])
+            {
+              const _Cx __c0(__re0[__i], __im0[__i]);
+              const _Cx __c1(__re1[__i], __im1[__i]);
+              const _Cx __cr = __c0 * __c1;
+              __arr_re[__i] = __cr.real();
+              __arr_im[__i] = __cr.imag();
+            }
+          else
+            {
+              __arr_re[__i] = __re[__i];
+              __arr_im[__i] = __im[__i];
+            }
+        }
+      __re0 = __builtin_bit_cast(_TV, __arr_re);
+      __im0 = __builtin_bit_cast(_TV, __arr_im);
+    }
+
+  template <typename _Cx, floating_point _Tp, _TargetTraits = {}>
+    [[__gnu__::__always_inline__]]
+    constexpr void
+    __cxctgus_redo_mul(_Tp& __re0, _Tp& __im0, const _Tp __re1, const _Tp __im1,
+                       const _Tp, const _Tp, const auto, int)
+    {
+      const _Cx __c0(__re0, __im0);
+      const _Cx __c1(__re1, __im1);
+      const _Cx __cr = __c0 * __c1;
+      __re0 = __cr.real();
+      __im0 = __cr.imag();
+    }
+
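+  // Smallest shift count that is undefined behavior: shifts operate on the promoted operand,
+  // so for element types narrower than int the limit is the width of int.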
+  template <integral _Tp>
+    inline constexpr _Tp __max_shift
+      = (sizeof(_Tp) < sizeof(int) ? sizeof(int) : sizeof(_Tp)) * __CHAR_BIT__;
+
+  template <__vectorizable _Tp, __abi_tag _Ap>
+    requires (_Ap::_S_nreg == 1)
+      and (not __complex_like<_Tp>)
+    class basic_vec<_Tp, _Ap>
+    : _BinaryOps<_Tp, _Ap>
+    {
+      template <typename, typename>
+        friend class basic_vec;
+
+      template <size_t, typename>
+        friend class basic_mask;
+
+      static constexpr int _S_size = _Ap::_S_size;
+
+      static constexpr int _S_full_size = __bit_ceil(unsigned(_S_size));
+
+      static constexpr bool _S_is_scalar = is_same_v<_Ap, _ScalarAbi<_Ap::_S_size>>;
+
+      static_assert(not _S_is_scalar or _S_size == 1);
+
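+      // Whether this ABI represents masks as bitmasks (e.g. AVX-512 mask registers) rather than
+      // as vectors with element-wide true/false values.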
+      static constexpr bool _S_use_bitmask = [] {
+        if constexpr (_S_is_scalar)
+          return false;
+        else
+          return __flags_test(_Ap::_S_variant, _AbiVariant::_BitMask);
+      }();
+
+      using _DataType = typename _Ap::template _DataType<_Tp>;
+
+      _DataType _M_data;
+
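+      // true iff _M_data provides storage for more than _S_size elements, i.e. there are
+      // padding elements at the end.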
+      static constexpr bool _S_is_partial = sizeof(_M_data) > sizeof(_Tp) * _S_size;
+
+    public:
+      using value_type = _Tp;
+
+      using abi_type = _Ap;
+
+      using mask_type = basic_mask<sizeof(_Tp), abi_type>;
+
+      using iterator = __iterator<basic_vec>;
+
+      using const_iterator = __iterator<const basic_vec>;
+
+      constexpr iterator
+      begin() noexcept
+      { return {*this, 0}; }
+
+      constexpr const_iterator
+      begin() const noexcept
+      { return {*this, 0}; }
+
+      constexpr const_iterator
+      cbegin() const noexcept
+      { return {*this, 0}; }
+
+      constexpr default_sentinel_t
+      end() const noexcept
+      { return {}; }
+
+      constexpr default_sentinel_t
+      cend() const noexcept
+      { return {}; }
+
+      static constexpr auto size = __simd_size_constant<_S_size>;
+
+      // internal but public API ----------------------------------------------
+      [[__gnu__::__always_inline__]]
+      static constexpr basic_vec
+      _S_init(_DataType __x)
+      {
+        basic_vec __r;
+        __r._M_data = __x;
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr const _DataType&
+      _M_get() const
+      { return _M_data; }
+
+      [[__gnu__::__always_inline__]]
+      constexpr bool
+      _M_is_constprop() const
+      { return __builtin_constant_p(_M_data); }
+
+      [[__gnu__::__always_inline__]]
+      constexpr auto
+      _M_concat_data() const
+      {
+        if constexpr (_S_is_scalar)
+          return __vec_builtin_type<value_type, 1>{_M_data};
+        else
+          return _M_data;
+      }
+
+      template <int _Size = _S_size, int _Offset = 0, typename _A0, typename _Fp>
+        [[__gnu__::__always_inline__]]
+        static constexpr basic_vec
+        _S_static_permute(const basic_vec<value_type, _A0>& __x, _Fp&& __idxmap)
+        {
+          using _Xp = basic_vec<value_type, _A0>;
+          basic_vec __r;
+          if constexpr (_S_is_scalar)
+            {
+              constexpr __simd_size_type __j = [&] consteval {
+                if constexpr (__index_permutation_function_nosize<_Fp>)
+                  return __idxmap(_Offset);
+                else
+                  return __idxmap(_Offset, _Size);
+              }();
+              if constexpr (__j == simd::zero_element or __j == simd::uninit_element)
+                return basic_vec();
+              else
+                static_assert(__j >= 0 and __j < _Xp::_S_size);
+              __r._M_data = __x[__j];
+            }
+          else
+            {
+              auto __idxmap2 = [=](auto __i) consteval {
+                if constexpr (int(__i + _Offset) >= _Size) // _S_full_size > _Size
+                  return __simd_size_constant<simd::uninit_element>;
+                else if constexpr (__index_permutation_function_nosize<_Fp>)
+                  return __simd_size_constant<__idxmap(__i + _Offset)>;
+                else
+                  return __simd_size_constant<__idxmap(__i + _Offset, _Size)>;
+              };
+              constexpr auto __adj_idx = [](auto __i) {
+                constexpr int __j = __i;
+                if constexpr (__j == simd::zero_element)
+                  return __simd_size_constant<__bit_ceil(unsigned(_Xp::_S_size))>;
+                else if constexpr (__j == simd::uninit_element)
+                  return __simd_size_constant<-1>;
+                else
+                  {
+                    static_assert(__j >= 0 and __j < _Xp::_S_size);
+                    return __simd_size_constant<__j>;
+                  }
+              };
+              constexpr bool __needs_zero_element = [&] {
+                constexpr auto [...__is] = __iota<int[_S_size]>;
+                return ((__idxmap2(__simd_size_constant<__is>).value == simd::zero_element) || ...);
+              }();
+              constexpr auto [...__is] = __iota<int[_S_full_size]>;
+              if constexpr (_A0::_S_nreg == 2 and not __needs_zero_element)
+                {
+                  __r._M_data = __builtin_shufflevector(
+                                  __x._M_data0._M_data, __x._M_data1._M_data,
+                                  __adj_idx(__idxmap2(__simd_size_constant<__is>)).value...);
+                }
+              else
+                {
+                  __r._M_data = __builtin_shufflevector(
+                                  __x._M_concat_data(), decltype(__x._M_concat_data())(),
+                                  __adj_idx(__idxmap2(__simd_size_constant<__is>)).value...);
+                }
+            }
+          return __r;
+        }
+
+      using _HalfVec = __similar_vec<value_type, _S_size / 2, _Ap>;
+
+      [[__gnu__::__always_inline__]]
+      constexpr void
+      _M_complex_set_real(const _HalfVec& __x) requires ((_S_size & 1) == 0)
+      {
+        if (_M_is_constprop() and __x._M_is_constprop())
+          {
+            constexpr auto [...__is] = __iota<int[_S_size]>;
+            _M_data = _DataType { ((__is & 1) == 0 ? value_type(__x[__is / 2]) : _M_data[__is])...};
+          }
+        else if constexpr (_S_size == 2)
+          _M_data[0] = __x[0];
+        else
+          _VecOps<_DataType>::_S_overwrite_even_elements(_M_data, __x);
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr void
+      _M_complex_set_imag(const _HalfVec& __x) requires ((_S_size & 1) == 0)
+      {
+        if (_M_is_constprop() and __x._M_is_constprop())
+          {
+            constexpr auto [...__is] = __iota<int[_S_size]>;
+            _M_data = _DataType { ((__is & 1) == 1 ? value_type(__x[__is / 2]) : _M_data[__is])...};
+          }
+        else if constexpr (_S_size == 2)
+          _M_data[1] = __x[0];
+        else
+          _VecOps<_DataType>::_S_overwrite_odd_elements(_M_data, __x);
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec
+      _M_complex_conj() const
+      {
+        static_assert((_S_size & 1) == 0);
+        return _VecOps<_DataType>::_S_complex_negate_imag(_M_data);
+      }
+
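+      // Interprets *this and __yvec as vectors of interleaved (real, imag) pairs and multiplies
+      // them element-wise as complex numbers. The generic path computes
+      //   addsub(dup_even(x) * y, dup_odd(x) * swap_neighbors(y))
+      // i.e. (ac - bd, ad + bc) per pair; with Annex G conformance, (NaN, NaN) results are
+      // recomputed via __cx_redo_mul.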
+      template <typename _CxVec, _TargetTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        constexpr void
+        _M_complex_multiply_with(basic_vec __yvec)
+        {
+          const _DataType __x = _M_data;
+          const _DataType __y = __yvec._M_data;
+          static_assert((_S_size & 1) == 0);
+          using _VO = _VecOps<_DataType>;
+          if constexpr (_Traits.template _M_eval_as_f32<value_type>())
+            {
+              using _Vf = rebind_t<float, basic_vec>;
+              _Vf __xf = _Vf(*this);
+              __xf.template _M_complex_multiply_with<_CxVec>(_Vf(__yvec));
+              *this = basic_vec(__xf);
+              return;
+            }
+          else if (_VecOps<_DataType, _S_size>::_S_complex_imag_is_constprop_zero(__x))
+            {
+              if (_VecOps<_DataType, _S_size>::_S_complex_imag_is_constprop_zero(__y))
+                _M_data = __x * __y;
+              else
+                {
+                  if (_Traits._M_conforming_to_STDC_annex_G())
+                    {
+                      auto __a = _VO::_S_dup_even(__x) * __y;
+                      auto __b = _DataType() * _VO::_S_swap_neighbors(__y);
+#if SIMD_DIAGNOSE_INDETERMINATE_SIGNED_ZERO
+                      //if (_SuperImpl::_S_any_of(_SuperImpl::_S_equal_to(__a, 0))) // __b is ±0 by construction
+#endif
+                      _M_data = _VO::_S_addsub(__a, __b);
+                    }
+                  else
+                    _M_data = _VO::_S_dup_even(__x) * __y;
+                }
+            }
+          else if (_VecOps<_DataType, _S_size>::_S_complex_imag_is_constprop_zero(__y))
+            {
+              if (_Traits._M_conforming_to_STDC_annex_G())
+                _M_data = _VO::_S_addsub(_VO::_S_dup_even(__y) * __x,
+                                         _DataType() * _VO::_S_swap_neighbors(__x));
+              else
+                _M_data = _VO::_S_dup_even(__y) * __x;
+            }
+          else if (_VecOps<_DataType, _S_size>::_S_complex_real_is_constprop_zero(__y))
+            {
+              if (_Traits._M_conforming_to_STDC_annex_G())
+                _M_data = _VO::_S_addsub(_DataType(), _VO::_S_dup_odd(__y)
+                                           * _VO::_S_swap_neighbors(__x));
+              else
+                _M_data = _VO::_S_dup_odd(__y)
+                            * _VO::_S_complex_negate_real(_VO::_S_swap_neighbors(__x));
+            }
+          else if (_VecOps<_DataType, _S_size>::_S_complex_real_is_constprop_zero(__x))
+            {
+              if (_Traits._M_conforming_to_STDC_annex_G())
+                _M_data = _VO::_S_addsub(_DataType(), _VO::_S_dup_odd(__x)
+                                           * _VO::_S_swap_neighbors(__y));
+              else
+                _M_data = _VO::_S_dup_odd(__x)
+                            * _VO::_S_complex_negate_real(_VO::_S_swap_neighbors(__y));
+            }
+          else
+            {
+#if _GLIBCXX_X86
+              if (_Traits._M_have_fma() and not __builtin_is_constant_evaluated()
+                    and not (__builtin_constant_p(__x) and __builtin_constant_p(__y)))
+                {
+                  if constexpr (_Traits._M_have_fma())
+                    _M_data = __x86_complex_multiplies(__x, __y);
+                }
+              else
+#endif
+                _M_data = _VO::_S_addsub(_VO::_S_dup_even(__x) * __y,
+                                         _VO::_S_dup_odd(__x) * _VO::_S_swap_neighbors(__y));
+              mask_type __nan = _M_isnan();
+              if (_Traits._M_conforming_to_STDC_annex_G() and __nan._M_any_of()) [[unlikely]]
+                _M_data = __cx_redo_mul<typename _CxVec::value_type>(_M_data, __x, __y, __nan,
+                                                                     _S_size);
+            }
+        }
+
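+      // Complex multiplication for split storage: (__re0, __im0) *= (__re1, __im1), with real
+      // and imaginary parts kept in separate vectors. Lanes whose fast-path result is
+      // (NaN, NaN) are recomputed via __cxctgus_redo_mul.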
+      template <typename _Cx, _TargetTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        static constexpr void
+        _S_cxctgus_mul(basic_vec& __re0, basic_vec& __im0, basic_vec __re1, basic_vec __im1)
+        {
+#if 0 // TODO
+          if constexpr (_S_is_scalar)
+            __x = value_type(__x._M_real, __x._M_imag) * value_type(__y._M_real, __y._M_imag);
+          else
+            {
+              using _TV = _RealSimd::_DataType;
+              using _VO = _VecOps<_TV, _S_size>;
+              if (_VO::_S_is_constprop_equal_to(__x._M_imag, _TV()))
+                {
+                  if (_VO::_S_is_constprop_equal_to(__y._M_imag, _TV()))
+                    {
+                      __x._M_real *= __y._M_real;
+                      return __x;
+                    }
+                  else if (_VO::_S_is_constprop_equal_to(__y._M_real, _TV()))
+                    {
+                      __x._M_imag = __x._M_real * __y._M_imag;
+                      __x._M_real = _RealSimd();
+                      return __x;
+                    }
+                  else if constexpr (not _Traits._M_conforming_to_STDC_annex_G())
+                    { // the sign of zero can come out wrong here
+                      __x._M_imag = __x._M_real * __y._M_imag;
+                      __x._M_real = __x._M_real * __y._M_real;
+                      return __x;
+                    }
+                }
+              else if (_VO::_S_is_constprop_equal_to(__y._M_imag, _TV()))
+                {
+                  if constexpr (not _Traits._M_conforming_to_STDC_annex_G())
+                    { // the sign of zero can come out wrong here
+                      __x._M_real *= __y._M_real;
+                      __x._M_imag *= __y._M_real;
+                      return __x;
+                    }
+                }
+              else if (_VO::_S_is_constprop_equal_to(__y._M_real, _TV()))
+                {
+                  const _RealSimd __r = - __x._M_imag * __y._M_imag;
+                  __x._M_imag = __x._M_real * __y._M_imag;
+                  __x._M_real = __r;
+                }
+            }
+#endif
+          if constexpr (_S_is_scalar)
+            {
+              const _Cx __c0(__re0._M_data, __im0._M_data);
+              const _Cx __c1(__re1._M_data, __im1._M_data);
+              const _Cx __cr = __c0 * __c1;
+              __re0._M_data = __cr.real();
+              __im0._M_data = __cr.imag();
+            }
+          else if constexpr (_Traits.template _M_eval_as_f32<value_type>())
+            {
+              using _Vf = rebind_t<float, basic_vec>;
+              _Vf __re0f = __re0;
+              _Vf __im0f = __im0;
+              _Vf::template _S_cxctgus_mul<_Cx>(__re0f, __im0f, __re1, __im1);
+              __re0 = basic_vec(__re0f);
+              __im0 = basic_vec(__im0f);
+            }
+          else
+            {
+              basic_vec __re = __re0 * __re1 - __im0 * __im1;
+              basic_vec __im = __re0 * __im1 + __im0 * __re1;
+              const auto __nan = __re._M_isnan() and __im._M_isnan();
+              if (any_of(__nan)) [[unlikely]]
+                __cxctgus_redo_mul<_Cx>(__re0._M_data, __im0._M_data, __re1._M_data, __im1._M_data,
+                                        __re._M_data, __im._M_data, __nan._M_data, _S_size);
+              else
+                {
+                  __re0 = __re;
+                  __im0 = __im;
+                }
+            }
+        }
+
+      template <typename _Vp>
+        [[__gnu__::__always_inline__]]
+        constexpr auto
+        _M_chunk() const noexcept
+        {
+          constexpr int __n = _S_size / _Vp::_S_size;
+          constexpr int __rem = _S_size % _Vp::_S_size;
+          constexpr auto [...__is] = __iota<int[__n]>;
+          if constexpr (__rem == 0)
+            {
+              if constexpr (_Vp::_S_is_scalar)
+                return array<_Vp, __n> {_Vp::_S_init(_M_data[__is])...};
+              else
+                return array<_Vp, __n> {
+                  _Vp::_S_init(
+                    _VecOps<typename _Vp::_DataType>::_S_extract(
+                      _M_data, integral_constant<int, __is * _Vp::_S_size>()))...
+              };
+            }
+          else
+            {
+              using _Rest = resize_t<__rem, _Vp>;
+              _Rest __rest;
+              if constexpr (_Rest::_S_size > 1)
+                __rest = _VecOps<typename _Rest::_DataType>::_S_extract(
+                           _M_data, integral_constant<int, __n * _Vp::_S_size>());
+              else
+                __rest = _M_data[__n * _Vp::_S_size];
+              return tuple {
+                _Vp::_S_init(
+                  _VecOps<typename _Vp::_DataType>::_S_extract(
+                    _M_data, integral_constant<int, __is * _Vp::_S_size>()))...,
+                __rest
+              };
+            }
+        }
+
+      template <typename _A0, typename... _As>
+        [[__gnu__::__always_inline__]]
+        constexpr void
+        _M_assign_from(auto _Offset, const basic_vec<value_type, _A0>& __x0,
+                       const basic_vec<value_type, _As>&... __xs)
+        {
+          if constexpr (_Offset.value >= _A0::_S_size)
+            // make the pack as small as possible
+            _M_assign_from(integral_constant<int, _Offset.value - _A0::_S_size>(), __xs...);
+          else if constexpr (_A0::_S_size >= _S_size + _Offset.value)
+            {
+              if constexpr (_S_size == 1)
+                _M_data = __x0[_Offset];
+              else
+                _M_data = _VecOps<_DataType>::_S_extract(__x0._M_concat_data(), _Offset);
+            }
+          else
+            _M_data = _VecOps<_DataType>::_S_extract(
+                        __vec_concat_sized<__x0.size(), __xs.size()...>(__x0._M_concat_data(),
+                                                                        __xs._M_concat_data()...),
+                        _Offset);
+        }
+
+      template <typename _A0>
+        [[__gnu__::__always_inline__]]
+        static constexpr basic_vec
+        _S_concat(const basic_vec<value_type, _A0>& __x0) noexcept
+        { return static_cast<const basic_vec&>(__x0); }
+
+      template <typename... _As>
+        requires (sizeof...(_As) > 1)
+        [[__gnu__::__always_inline__]]
+        static constexpr basic_vec
+        _S_concat(const basic_vec<value_type, _As>&... __xs) noexcept
+        {
+          using _A0 = _As...[0];
+          using _A1 = _As...[1];
+          if constexpr (not _S_is_partial
+                          and ((not basic_vec<value_type, _As>::_S_is_partial
+                                  and _As::_S_size * sizeof...(_As) == _S_size) and ...))
+            return basic_vec::_S_init(__vec_concat(__xs._M_concat_data()...));
+
+          else
+            {
+              constexpr bool __simple_inserts
+                = sizeof...(_As) == 2 and _A1::_S_size <= 2
+                    and is_same_v<_DataType, typename basic_vec<value_type, _A0>::_DataType>;
+              if (not __builtin_is_constant_evaluated() and __simple_inserts)
+                {
+                  if constexpr (__simple_inserts)
+                    {
+                      const auto& __x0 = __xs...[0];
+                      const auto& __x1 = __xs...[1];
+                      basic_vec __r;
+                      __r._M_data = __x0._M_data;
+                      if constexpr (_A1::_S_size == 1)
+                        __r._M_data[_S_size - 1] = __x1[0];
+                      else
+                        {
+                          for (int __i = __x0.size.value; __i < _S_size; ++__i)
+                            __r._M_data[__i] = __x1._M_data[__i - __x0.size.value];
+                        }
+                      return __r;
+                    }
+                }
+              else
+                return basic_vec::_S_init(__vec_concat_sized<_As::_S_size...>(
+                                            __xs._M_concat_data()...));
+            }
+        }
+
+      [[__gnu__::__always_inline__]]
+      constexpr auto
+      _M_reduce_1(auto __binary_op) const
+      {
+        static_assert(__has_single_bit(unsigned(_S_size)));
+        auto [__a, __b] = chunk<_S_size / 2>(*this);
+        return __binary_op(__a, __b);
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr value_type
+      _M_reduce_tail(const auto& __rest, auto __binary_op) const
+      {
+        if constexpr (__rest.size() > _S_size)
+          {
+            auto [__a, __b] = __rest.template _M_chunk<basic_vec>();
+            return __binary_op(*this, __a)._M_reduce_tail(__b, __binary_op);
+          }
+        else if constexpr (_S_is_scalar)
+          return __binary_op(*this, __rest)._M_data;
+        else if constexpr (__rest.size() == _S_size)
+          return __binary_op(*this, __rest)._M_reduce(__binary_op);
+        else
+          return _M_reduce_1(__binary_op)._M_reduce_tail(__rest, __binary_op);
+      }
+
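+      // Shifts all elements _Shift positions towards index 0; the vacated high elements are
+      // filled with zeros.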
+      template <int _Shift, _ArchTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        constexpr basic_vec
+        _M_elements_shifted_down() const
+        {
+          static_assert(_Shift < _S_size and _Shift > 0);
+#ifdef __SSE2__
+          if (not __builtin_is_constant_evaluated() and not _M_is_constprop())
+            {
+              if constexpr (sizeof(_M_data) == 16)
+                return reinterpret_cast<_DataType>(
+                         __builtin_ia32_psrldqi128(__vec_bit_cast<long long>(_M_data),
+                                                   _Shift * sizeof(value_type) * 8));
+              else
+                {
+                  const auto __x = reinterpret_cast<__vec_builtin_type_bytes<long long, 16>>(
+                                     __vec_zero_pad_to_16(_M_data));
+                  const auto __shifted = __builtin_ia32_psrldqi128(
+                                           __x, _Shift * sizeof(value_type) * 8);
+                  return _VecOps<_DataType>::_S_extract(__vec_bit_cast<value_type>(__shifted));
+                }
+            }
+#endif
+          return _S_static_permute(*this, [](int __i) {
+                   return __i + _Shift >= _S_size ? zero_element : __i + _Shift;
+                 });
+        }
+
+      template <typename _BinaryOp, _ArchTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        constexpr value_type
+        _M_reduce(_BinaryOp __binary_op) const
+      {
+        if constexpr (_S_size == 1)
+          return operator[](0);
+        else if constexpr (_Traits.template _M_eval_as_f32<value_type>()
+                             and (is_same_v<_BinaryOp, plus<>>
+                                    or is_same_v<_BinaryOp, multiplies<>>))
+          return value_type(rebind_t<float, basic_vec>(*this)._M_reduce(__binary_op));
+#ifdef __SSE2__
+        else if constexpr (is_integral_v<value_type> and sizeof(value_type) == 1
+                             and is_same_v<decltype(__binary_op), multiplies<>>)
+          {
+            // convert to unsigned short because of missing 8-bit mul instruction
+            // we don't need to preserve the order of elements
+            using _V16 = resize_t<_S_size / 2, rebind_t<unsigned short, basic_vec>>;
+            auto __a = __builtin_bit_cast(_V16, *this);
+            return __binary_op(__a, __a >> 8)._M_reduce(__binary_op);
+            // alternative: return _V16(*this)._M_reduce(__binary_op);
+          }
+#endif
+        else if constexpr (__has_single_bit(unsigned(_S_size)))
+          {
+            if constexpr (sizeof(_M_data) > 16)
+              return _M_reduce_1(__binary_op)._M_reduce(__binary_op);
+            else if constexpr (_S_size == 2)
+              return _M_reduce_1(__binary_op)[0];
+            else
+              {
+                static_assert(_S_size <= 16);
+                auto __x = *this;
+#ifdef __SSE2__
+                if constexpr (sizeof(_M_data) <= 16 and is_integral_v<value_type>)
+                  {
+                    if constexpr (_S_size > 8)
+                      __x = __binary_op(__x, __x.template _M_elements_shifted_down<8>());
+                    if constexpr (_S_size > 4)
+                      __x = __binary_op(__x, __x.template _M_elements_shifted_down<4>());
+                    if constexpr (_S_size > 2)
+                      __x = __binary_op(__x, __x.template _M_elements_shifted_down<2>());
+                    return __binary_op(__x, __x.template _M_elements_shifted_down<1>())[0];
+                  }
+#endif
+                if constexpr (_S_size > 8)
+                  __x = __binary_op(__x, _S_static_permute(__x, _SwapNeighbors<8>()));
+                if constexpr (_S_size > 4)
+                  __x = __binary_op(__x, _S_static_permute(__x, _SwapNeighbors<4>()));
+#ifdef __SSE2__
+                // avoid pshufb by "promoting" to int
+                if constexpr (is_integral_v<value_type> and sizeof(value_type) <= 1)
+                  return resize_t<4, rebind_t<int, basic_vec>>(chunk<4>(__x)[0])
+                           ._M_reduce(__binary_op);
+#endif
+                if constexpr (_S_size > 2)
+                  __x = __binary_op(__x, _S_static_permute(__x, _SwapNeighbors<2>()));
+                if constexpr (is_integral_v<value_type> and sizeof(value_type) == 2)
+                  return __binary_op(__x, _S_static_permute(__x, _SwapNeighbors<1>()))[0];
+                else
+                  return __binary_op(vec<value_type, 1>(__x[0]), vec<value_type, 1>(__x[1]))[0];
+              }
+          }
+        else
+          {
+            // e.g. _S_size = 16 + 16 + 15 (vec<char, 47>)
+            // -> 8 + 8 + 7 -> 4 + 4 + 3 -> 2 + 2 + 1 -> 1
+            auto __chunked = chunk<__bit_floor(unsigned(_S_size)) / 2>(*this);
+            using _Cp = decltype(__chunked);
+            if constexpr (tuple_size_v<_Cp> == 4)
+              {
+                const auto& [__a, __b, __c, __rest] = __chunked;
+                return __binary_op(__binary_op(__a, __b), __c)._M_reduce_tail(__rest, __binary_op);
+              }
+            else if constexpr (tuple_size_v<_Cp> == 3)
+              {
+                const auto& [__a, __b, __rest] = __chunked;
+                return __binary_op(__a, __b)._M_reduce_tail(__rest, __binary_op);
+              }
+            else
+              static_assert(false);
+          }
+      }
+
+      // [simd.math] ----------------------------------------------------------
+      //
+      // ISO/IEC 60559 on the classification operations (5.7.2 General Operations):
+      // "They are never exceptional, even for signaling NaNs."
+      //
+      template <_OptTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        constexpr mask_type
+        _M_isnan() const requires is_floating_point_v<value_type>
+        {
+          if constexpr (_Traits._M_finite_math_only())
+            return mask_type(false);
+          else if constexpr (_S_is_scalar)
+            return mask_type(std::isnan(_M_data));
+          else if constexpr (_S_use_bitmask)
+            return _M_isunordered(*this);
+          else if constexpr (not _Traits._M_support_snan())
+            return not (*this == *this);
+          else if (__builtin_is_constant_evaluated() or __builtin_constant_p(_M_data))
+            return mask_type([&](int __i) { return std::isnan(_M_data[__i]); });
+          else
+            {
+              // 60559: NaN is represented as Inf + non-zero mantissa bits
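+              // e.g. for float: isnan(x) iff bit_cast<int>(fabs(x)) > 0x7f800000 (the bit
+              // pattern of +inf), because with the sign cleared NaNs compare above infinity as
+              // integers (signaling NaNs included, without raising FP exceptions)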
+              using _Ip = __integer_from<sizeof(value_type)>;
+              return __builtin_bit_cast(_Ip, numeric_limits<value_type>::infinity())
+                       < __builtin_bit_cast(rebind_t<_Ip, basic_vec>, _M_fabs());
+            }
+        }
+
+      template <_TargetTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        constexpr mask_type
+        _M_isinf() const requires is_floating_point_v<value_type>
+        {
+          if constexpr (_Traits._M_finite_math_only())
+            return mask_type(false);
+          else if constexpr (_S_is_scalar)
+            return mask_type(std::isinf(_M_data));
+          else if (__builtin_is_constant_evaluated() or __builtin_constant_p(_M_data))
+            return mask_type([&](int __i) { return std::isinf(_M_data[__i]); });
+#ifdef _GLIBCXX_X86
+          else if constexpr (_S_use_bitmask)
+            return mask_type::_S_init(__x86_bitmask_isinf(_M_data));
+          else if constexpr (_Traits._M_have_avx512dq())
+            return __x86_bit_to_vecmask<typename mask_type::_DataType>(
+                     __x86_bitmask_isinf(_M_data));
+#endif
+          else
+            {
+              using _Ip = __integer_from<sizeof(value_type)>;
+              return __vec_bit_cast<_Ip>(_M_fabs()._M_data)
+                       == __builtin_bit_cast(_Ip, numeric_limits<value_type>::infinity());
+            }
+        }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec
+      _M_abs() const
+      {
+        if constexpr (is_floating_point_v<value_type>)
+          return _M_fabs();
+        else
+          return _M_data < 0 ? -_M_data : _M_data;
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec
+      _M_fabs() const
+      {
+        static_assert(is_floating_point_v<value_type>);
+        return __vec_andnot(_S_signmask<_DataType>, _M_data);
+      }
+
+      template <_TargetTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        constexpr mask_type
+        _M_isunordered(basic_vec __y) const requires is_floating_point_v<value_type>
+        {
+          if constexpr (_Traits._M_finite_math_only())
+            return mask_type(false);
+          else if constexpr (_S_is_scalar)
+            return mask_type(std::isunordered(_M_data, __y._M_data));
+#ifdef _GLIBCXX_X86
+          else if constexpr (_S_use_bitmask)
+            return _M_bitmask_cmp<_X86Cmp::_Unord>(__y._M_data);
+#endif
+          else
+            return mask_type([&](int __i) {
+                     return std::isunordered(_M_data[__i], __y._M_data[__i]);
+                   });
+        }
+
+      // [simd.overview] default constructor ----------------------------------
+      basic_vec() = default;
+
+      // [simd.overview] impl-def conversions ---------------------------------
+      constexpr
+      basic_vec(_DataType __x) requires (not _S_is_scalar)
+        : _M_data(__x)
+      {}
+
+      constexpr
+      operator _DataType() requires (not _S_is_scalar)
+      { return _M_data; }
+
+      // [simd.ctor] broadcast constructor ------------------------------------
+      template <__simd_vec_bcast<value_type> _Up>
+        [[__gnu__::__always_inline__]]
+        constexpr explicit(not __broadcast_constructible<_Up, value_type>)
+        basic_vec(_Up&& __x) noexcept
+          : _M_data(_DataType() == _DataType() ? static_cast<value_type>(__x) : value_type())
+        {}
+
+#ifdef _GLIBCXX_SIMD_CONSTEVAL_BROADCAST
+      template <__simd_vec_bcast_consteval<value_type> _Up>
+        consteval
+        basic_vec(_Up&& __x)
+        : _M_data(_DataType() == _DataType()
+                    ? __value_preserving_cast<value_type>(__x) : value_type())
+        {
+          // TODO: I would prefer the convertible_to check to be a constraint on this constructor.
+          // However, that would change the order in overload resolution, which is undesirable.
+          static_assert(convertible_to<_Up, value_type>);
+          static_assert(is_arithmetic_v<remove_cvref_t<_Up>>);
+        }
+#endif
+
+      // [simd.ctor] conversion constructor -----------------------------------
+      template <typename _Up, typename _UAbi>
+        requires (__simd_size_v<_Up, _UAbi> == _S_size)
+        // FIXME(file LWG issue): missing constraint `constructible_from<value_type, _Up>`
+        [[__gnu__::__always_inline__]]
+        constexpr
+        explicit(not __value_preserving_convertible_to<_Up, value_type>
+                   or __higher_rank_than<_Up, value_type>)
+        basic_vec(const basic_vec<_Up, _UAbi>& __x) noexcept
+          : _M_data([&] [[__gnu__::__always_inline__]]() {
+              if constexpr (_S_is_scalar)
+                return static_cast<value_type>(__x[0]);
+              else
+                return __vec_cast<_DataType>(__x._M_concat_data());
+            }())
+        {}
+
+      // [simd.ctor] generator constructor ------------------------------------
+      template <__simd_generator_invokable<value_type, _S_size> _Fp>
+        [[__gnu__::__always_inline__]]
+        constexpr explicit
+        basic_vec(_Fp&& __gen)
+        : _M_data([&] [[__gnu__::__always_inline__]] {
+            constexpr auto [...__is] = __iota<int[_S_size]>;
+            return _DataType{static_cast<value_type>(__gen(__simd_size_constant<__is>))...};
+          }())
+        {}
+
+      template <__almost_simd_generator_invokable<value_type, _S_size> _Fp>
+        constexpr explicit
+        basic_vec(_Fp&&)
+          = _GLIBCXX_DELETE_MSG("Invalid return type of the generator function: "
+                                "Requires value-preserving conversion or implicitly "
+                                "convertible user-defined type.");
+
+      // [simd.ctor] load constructor -----------------------------------------
+      template <typename _Up>
+        [[__gnu__::__always_inline__]]
+        constexpr
+        basic_vec(_LoadCtorTag, const _Up* __ptr)
+          : _M_data()
+        {
+          if constexpr (_S_is_scalar)
+            _M_data = static_cast<value_type>(__ptr[0]);
+          else if (__builtin_is_constant_evaluated())
+            {
+              constexpr auto [...__is] = __iota<int[_S_size]>;
+              _M_data = _DataType{__ptr[__is]...};
+            }
+          else if constexpr (is_integral_v<_Up> == is_integral_v<value_type>
+                               and is_floating_point_v<_Up> == is_floating_point_v<value_type>
+                               and sizeof(_Up) == sizeof(value_type))
+            {
+              // This assumes std::floatN_t to be bitwise equal to float/double
+              __builtin_memcpy(&_M_data, __ptr, sizeof(value_type) * _S_size);
+            }
+          else
+            {
+              __vec_builtin_type<_Up, _S_full_size> __tmp = {};
+              __builtin_memcpy(&__tmp, __ptr, sizeof(_Up) * _S_size);
+              _M_data = __vec_cast<_DataType>(__tmp);
+            }
+        }
+
+      template <__static_sized_range<size.value> _Rg, typename... _Flags>
+        // FIXME(file LWG issue):
+        // 1. missing constraint on `constructible_from<value_type, range_value_t<_Rg>>`
+        // 2. Mandates should be Constraints to fix answers of convertible_to and constructible_from
+        //
+        // Consider `convertible_to<array<complex<float>, 4>, vec<float, 4>>`. It should say false
+        // but currently would be true.
+        // Also, with `flag_convert` the current Mandates doesn't catch the complex<float> -> float
+        // issue and fails with horrible diagnostics somewhere in the instantiation.
+        [[__gnu__::__always_inline__]]
+        constexpr
+        basic_vec(_Rg&& __range, flags<_Flags...> __flags = {})
+          : basic_vec(_LoadCtorTag(), __flags.template _S_adjust_pointer<basic_vec>(
+                                        std::ranges::data(__range)))
+        {
+          static_assert(__loadstore_convertible_to<std::ranges::range_value_t<_Rg>, value_type,
+                                                   _Flags...>);
+        }
+
+      // [simd.subscr] --------------------------------------------------------
+      [[__gnu__::__always_inline__]]
+      constexpr value_type
+      operator[](__simd_size_type __i) const
+      {
+        if constexpr (_S_is_scalar)
+          return _M_data;
+        else
+          return _M_data[__i];
+      }
+
+      // [simd.unary] unary operators -----------------------------------------
+      // increment and decrement are implemented in terms of operator+=/-= which avoids UB on
+      // padding elements while not breaking UBsan
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec&
+      operator++() noexcept requires requires(value_type __a) { ++__a; }
+      { return *this += value_type(1); }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec
+      operator++(int) noexcept requires requires(value_type __a) { __a++; }
+      {
+        basic_vec __r = *this;
+        *this += value_type(1);
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec&
+      operator--() noexcept requires requires(value_type __a) { --__a; }
+      { return *this -= value_type(1); }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec
+      operator--(int) noexcept requires requires(value_type __a) { __a--; }
+      {
+        basic_vec __r = *this;
+        *this -= value_type(1);
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr mask_type
+      operator!() const noexcept requires requires(value_type __a) { !__a; }
+      { return mask_type::_S_init(!_M_data); }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec
+      operator+() const noexcept requires requires(value_type __a) { +__a; }
+      { return *this; }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec
+      operator-() const noexcept requires requires(value_type __a) { -__a; }
+      { return _S_init(-_M_data); }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec
+      operator~() const noexcept requires requires(value_type __a) { ~__a; }
+      { return _S_init(~_M_data); }
+
+      // [simd.cassign] binary operators
+#define _GLIBCXX_SIMD_DEFINE_OP(sym)                                 \
+      [[__gnu__::__always_inline__]]                                 \
+      friend constexpr basic_vec&                                    \
+      operator sym##=(basic_vec& __x, const basic_vec& __y) noexcept \
+      requires requires(value_type __a) { __a sym __a; }             \
+      {                                                              \
+        __x._M_data sym##= __y._M_data;                              \
+        return __x;                                                  \
+      }
+
+      _GLIBCXX_SIMD_DEFINE_OP(&)
+      _GLIBCXX_SIMD_DEFINE_OP(|)
+      _GLIBCXX_SIMD_DEFINE_OP(^)
+
+#undef _GLIBCXX_SIMD_DEFINE_OP
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_vec&
+      operator+=(basic_vec& __x, const basic_vec& __y) noexcept
+      requires requires(value_type __a) { __a + __a; }
+      {
+        if constexpr (_S_is_partial and is_integral_v<value_type> and is_signed_v<value_type>)
+          { // avoid spurious UB on signed integer overflow of the padding element(s). But don't
+            // remove UB of the active elements (so that UBsan can still do its job).
+            using _UV = typename _Ap::template _DataType<make_unsigned_t<value_type>>;
+            const _DataType __result
+              = reinterpret_cast<_DataType>(reinterpret_cast<_UV>(__x._M_data)
+                                              + reinterpret_cast<_UV>(__y._M_data));
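+            // signed overflow happened iff adding a positive __y did not increase the result
+            // (and, symmetrically, adding a non-positive __y did increase it)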
+            const auto __positive = __y > value_type();
+            const auto __overflow = __positive != (__result > __x);
+            if (__overflow._M_any_of())
+              __builtin_unreachable(); // trigger UBsan
+            __x._M_data = __result;
+          }
+        else if constexpr (_TargetTraits()._M_eval_as_f32<value_type>())
+          __x = basic_vec(rebind_t<float, basic_vec>(__x) + __y);
+        else
+          __x._M_data += __y._M_data;
+        return __x;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_vec&
+      operator-=(basic_vec& __x, const basic_vec& __y) noexcept
+      requires requires(value_type __a) { __a - __a; }
+      {
+        if constexpr (_S_is_partial and is_integral_v<value_type> and is_signed_v<value_type>)
+          { // avoid spurious UB on signed integer overflow of the padding element(s). But don't
+            // remove UB of the active elements (so that UBsan can still do its job).
+            using _UV = typename _Ap::template _DataType<make_unsigned_t<value_type>>;
+            const _DataType __result
+              = reinterpret_cast<_DataType>(reinterpret_cast<_UV>(__x._M_data)
+                                              - reinterpret_cast<_UV>(__y._M_data));
+            const auto __positive = __y > value_type();
+            const auto __overflow = __positive != (__result < __x);
+            if (__overflow._M_any_of())
+              __builtin_unreachable(); // trigger UBsan
+            __x._M_data = __result;
+          }
+        else if constexpr (_TargetTraits()._M_eval_as_f32<value_type>())
+          __x = basic_vec(rebind_t<float, basic_vec>(__x) - __y);
+        else
+          __x._M_data -= __y._M_data;
+        return __x;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_vec&
+      operator*=(basic_vec& __x, const basic_vec& __y) noexcept
+      requires requires(value_type __a) { __a * __a; }
+      {
+        if constexpr (_S_is_partial and is_integral_v<value_type> and is_signed_v<value_type>)
+          { // avoid spurious UB on signed integer overflow of the padding element(s). But don't
+            // remove UB of the active elements (so that UBsan can still do its job).
+            for (int __i = 0; __i < _S_size; ++__i)
+              {
+                if (__builtin_mul_overflow_p(__x._M_data[__i], __y._M_data[__i], value_type()))
+                  __builtin_unreachable();
+              }
+            using _UV = typename _Ap::template _DataType<make_unsigned_t<value_type>>;
+            __x._M_data = reinterpret_cast<_DataType>(reinterpret_cast<_UV>(__x._M_data)
+                                                        * reinterpret_cast<_UV>(__y._M_data));
+          }
+
+        // 'uint16 * uint16' promotes to int and can therefore lead to UB. The standard does not
+        // require avoiding this undefined behavior, but it is unnecessary and easy to avoid. It
+        // is also unexpected because there's no UB on the vector types (which don't promote).
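+        // E.g. with 32-bit int, unsigned short(0xffff) * unsigned short(0xffff) promotes to int
+        // and overflows (0xfffe'0001 > INT_MAX).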
+        else if constexpr (_S_is_scalar and is_unsigned_v<value_type>
+                             and is_signed_v<decltype(value_type() * value_type())>)
+          __x._M_data = unsigned(__x._M_data) * unsigned(__y._M_data);
+
+        else if constexpr (_TargetTraits()._M_eval_as_f32<value_type>())
+          __x = basic_vec(rebind_t<float, basic_vec>(__x) * __y);
+
+        else
+          __x._M_data *= __y._M_data;
+        return __x;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_vec&
+      operator/=(basic_vec& __x, const basic_vec& __y) noexcept
+      requires requires(value_type __a) { __a / __a; }
+      {
+#ifdef __SSE2__
+        // x86 doesn't have integral SIMD division instructions. Doing the division in floating
+        // point is faster, but the required conversions are still a problem: see PR121274,
+        // PR121284, and PR121296 for missed optimizations wrt. conversions.
+        if (not (__x._M_is_constprop() and __y._M_is_constprop()))
+          {
+            if constexpr (is_integral_v<value_type>
+                            and __value_preserving_convertible_to<value_type, float>)
+              return __x = basic_vec(rebind_t<float, basic_vec>(__x) / __y);
+            else if constexpr (is_integral_v<value_type>
+                                 and __value_preserving_convertible_to<value_type, double>)
+              return __x = basic_vec(rebind_t<double, basic_vec>(__x) / __y);
+          }
+#endif
+        if constexpr (_TargetTraits()._M_eval_as_f32<value_type>())
+          return __x = basic_vec(rebind_t<float, basic_vec>(__x) / __y);
+
+        basic_vec __y1 = __y;
+        if constexpr (_S_is_partial)
+          {
+            if constexpr (is_integral_v<value_type>)
+              {
+                // Assume integral division doesn't have SIMD instructions and must be done per
+                // element anyway. Partial vectors should skip their padding elements.
+                if (__builtin_is_constant_evaluated())
+                  __x = basic_vec([&](int __i) -> value_type {
+                          return __x._M_data[__i] / __y._M_data[__i];
+                        });
+                else
+                  {
+                    for (int __i = 0; __i < _S_size; ++__i)
+                      __x._M_data[__i] /= __y._M_data[__i];
+                  }
+                return __x;
+              }
+            else
+              __y1 = __select_impl(mask_type::_S_init(mask_type::_S_implicit_mask),
+                                   __y, basic_vec(value_type(1)));
+          }
+        __x._M_data /= __y1._M_data;
+        return __x;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_vec&
+      operator%=(basic_vec& __x, const basic_vec& __y) noexcept
+      requires requires(value_type __a) { __a % __a; }
+      {
+        static_assert(is_integral_v<value_type>);
+        if constexpr (_S_is_partial)
+          {
+            if (__builtin_is_constant_evaluated())
+              __x = basic_vec([&](int __i) -> value_type {
+                      return __x._M_data[__i] % __y._M_data[__i];
+                    });
+            else if (__builtin_constant_p(__x._M_data % __y._M_data))
+              __x._M_data %= __y._M_data;
+            else if (__y._M_is_constprop())
+              __x._M_data %= __select_impl(mask_type::_S_init(mask_type::_S_implicit_mask),
+                                           __y, basic_vec(value_type(1)))._M_data;
+            else
+              {
+                // Assume integral division doesn't have SIMD instructions and must be done per
+                // element anyway. Partial vectors should skip their padding elements.
+                for (int __i = 0; __i < _S_size; ++__i)
+                  __x._M_data[__i] %= __y._M_data[__i];
+              }
+          }
+        else
+          __x._M_data %= __y._M_data;
+        return __x;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_vec&
+      operator<<=(basic_vec& __x, const basic_vec& __y) _GLIBCXX_SIMD_NOEXCEPT
+      requires requires(value_type __a) { __a << __a; }
+      {
+        __glibcxx_simd_precondition(is_unsigned_v<value_type> or all_of(__y >= value_type()),
+                                    "negative shift is undefined behavior");
+        __glibcxx_simd_precondition(all_of(__y < __max_shift<value_type>),
+                                    "too large shift invokes undefined behavior");
+        __x._M_data <<= __y._M_data;
+        return __x;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_vec&
+      operator>>=(basic_vec& __x, const basic_vec& __y) _GLIBCXX_SIMD_NOEXCEPT
+      requires requires(value_type __a) { __a >> __a; }
+      {
+        __glibcxx_simd_precondition(is_unsigned_v<value_type> or all_of(__y >= value_type()),
+                                    "negative shift is undefined behavior");
+        __glibcxx_simd_precondition(all_of(__y < __max_shift<value_type>),
+                                    "too large shift invokes undefined behavior");
+        __x._M_data >>= __y._M_data;
+        return __x;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_vec&
+      operator<<=(basic_vec& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
+      requires requires(value_type __a, __simd_size_type __b) { __a << __b; }
+      {
+        __glibcxx_simd_precondition(__y >= 0, "negative shift is undefined behavior");
+        __glibcxx_simd_precondition(__y < int(__max_shift<value_type>),
+                                    "too large shift invokes undefined behavior");
+        __x._M_data <<= __y;
+        return __x;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_vec&
+      operator>>=(basic_vec& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
+      requires requires(value_type __a, __simd_size_type __b) { __a >> __b; }
+      {
+        __glibcxx_simd_precondition(__y >= 0, "negative shift is undefined behavior");
+        __glibcxx_simd_precondition(__y < int(__max_shift<value_type>),
+                                    "too large shift invokes undefined behavior");
+        __x._M_data >>= __y;
+        return __x;
+      }
+
+      // [simd.comparison] ----------------------------------------------------
+#if _GLIBCXX_X86
+      template <_X86Cmp _Cmp>
+        [[__gnu__::__always_inline__]]
+        constexpr mask_type
+        _M_bitmask_cmp(_DataType __y) const
+        {
+          static_assert(_S_use_bitmask);
+          if (__builtin_is_constant_evaluated()
+                or (__builtin_constant_p(_M_data) and __builtin_constant_p(__y)))
+            {
+              constexpr auto [...__is] = __iota<int[_S_size]>;
+              constexpr auto __cmp_op = [] [[__gnu__::__always_inline__]]
+                                          (value_type __a, value_type __b) {
+                if constexpr (_Cmp == _X86Cmp::_Eq)
+                  return __a == __b;
+                else if constexpr (_Cmp == _X86Cmp::_Lt)
+                  return __a < __b;
+                else if constexpr (_Cmp == _X86Cmp::_Le)
+                  return __a <= __b;
+                else if constexpr (_Cmp == _X86Cmp::_Unord)
+                  return std::isunordered(__a, __b);
+                else if constexpr (_Cmp == _X86Cmp::_Neq)
+                  return __a != __b;
+                else if constexpr (_Cmp == _X86Cmp::_Nlt)
+                  return not (__a < __b);
+                else if constexpr (_Cmp == _X86Cmp::_Nle)
+                  return not (__a <= __b);
+                else
+                  static_assert(false);
+              };
+              return mask_type::_S_init(((__cmp_op(__vec_get(_M_data, __is), __vec_get(__y, __is))
+                                            ? (1ULL << __is) : 0) | ...));
+            }
+          else
+            return mask_type::_S_init(__x86_bitmask_cmp<_Cmp>(_M_data, __y));
+        }
+#endif
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr mask_type
+      operator==(const basic_vec& __x, const basic_vec& __y) noexcept
+      {
+#if _GLIBCXX_X86
+        if constexpr (_S_use_bitmask)
+          return __x._M_bitmask_cmp<_X86Cmp::_Eq>(__y._M_data);
+        else
+#endif
+          return mask_type::_S_init(__x._M_data == __y._M_data);
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr mask_type
+      operator!=(const basic_vec& __x, const basic_vec& __y) noexcept
+      {
+#if _GLIBCXX_X86
+        if constexpr (_S_use_bitmask)
+          return __x._M_bitmask_cmp<_X86Cmp::_Neq>(__y._M_data);
+        else
+#endif
+          return mask_type::_S_init(__x._M_data != __y._M_data);
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr mask_type
+      operator<(const basic_vec& __x, const basic_vec& __y) noexcept
+      {
+#if _GLIBCXX_X86
+        if constexpr (_S_use_bitmask)
+          return __x._M_bitmask_cmp<_X86Cmp::_Lt>(__y._M_data);
+        else
+#endif
+          return mask_type::_S_init(__x._M_data < __y._M_data);
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr mask_type
+      operator<=(const basic_vec& __x, const basic_vec& __y) noexcept
+      {
+#if _GLIBCXX_X86
+        if constexpr (_S_use_bitmask)
+          return __x._M_bitmask_cmp<_X86Cmp::_Le>(__y._M_data);
+        else
+#endif
+          return mask_type::_S_init(__x._M_data <= __y._M_data);
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr mask_type
+      operator>(const basic_vec& __x, const basic_vec& __y) noexcept
+      { return __y < __x; }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr mask_type
+      operator>=(const basic_vec& __x, const basic_vec& __y) noexcept
+      { return __y <= __x; }
+
+      // [simd.cond] ---------------------------------------------------------
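+      // Element-wise blend (as implemented below): yields __t where __k is set and __f
+      // elsewhere. When one operand is known (constant-propagated) to be all zeros, the
+      // generic vector select is replaced by a cheaper and/andnot or mask-conversion
+      // sequence.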
+      template <_TargetTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        friend constexpr basic_vec
+        __select_impl(const mask_type& __k, const basic_vec& __t, const basic_vec& __f) noexcept
+        {
+          if constexpr (_S_size == 1)
+            return __k[0] ? __t : __f;
+          else if constexpr (_S_use_bitmask)
+            {
+#if _GLIBCXX_X86
+              if (__builtin_is_constant_evaluated()
+                    or (__k._M_is_constprop() and __t._M_is_constprop() and __f._M_is_constprop()))
+                return basic_vec([&](int __i) { return __k[__i] ? __t[__i] : __f[__i]; });
+              else
+                return __x86_bitmask_blend(__k._M_data, __t._M_data, __f._M_data);
+#else
+              static_assert(false, "TODO");
+#endif
+            }
+          else if (__builtin_is_constant_evaluated())
+            return __k._M_data ? __t._M_data : __f._M_data;
+          else
+            {
+              using _VO = _VecOps<_DataType>;
+              if (_VO::_S_is_constprop_equal_to(__f._M_data, 0))
+                {
+                  if (is_integral_v<value_type> and sizeof(_M_data) >= 8
+                        and _VO::_S_is_constprop_equal_to(__t._M_data, 1))
+                    return basic_vec((-__k)._M_abs());
+                  /*                  else if (is_integral_v<value_type> and sizeof(_M_data) >= 8
+                             and _VO::_S_is_constprop_equal_to(__t._M_data, value_type(-1)))
+                    return basic_vec(-__k);*/
+                  else
+                    return __vec_and(reinterpret_cast<_DataType>(__k._M_data), __t._M_data);
+                }
+              else if (_VO::_S_is_constprop_equal_to(__t._M_data, 0))
+                {
+                  if (is_integral_v<value_type> and sizeof(_M_data) >= 8
+                        and _VO::_S_is_constprop_equal_to(__f._M_data, 1))
+                    return value_type(1) + basic_vec(-__k);
+                  else
+                    return __vec_andnot(reinterpret_cast<_DataType>(__k._M_data), __f._M_data);
+                }
+              else
+                {
+#if _GLIBCXX_X86
+                  // This works around bad code-gen when the compiler can't see that __k is a
+                  // vector-mask: the pattern is recognized and matched to the x86 blend
+                  // instructions, which only consider the sign bit of the mask register. Also,
+                  // without SSE4, if the compiler knows that __k is a vector-mask, the '< 0'
+                  // is elided.
+                  return __k._M_data < 0 ? __t._M_data : __f._M_data;
+#endif
+                  return __k._M_data ? __t._M_data : __f._M_data;
+                }
+            }
+        }
+    };
+
+  template <__vectorizable _Tp, __abi_tag _Ap>
+    requires (_Ap::_S_nreg > 1)
+      and (not __complex_like<_Tp>)
+    class basic_vec<_Tp, _Ap>
+    : _BinaryOps<_Tp, _Ap>
+    {
+      template <typename, typename>
+        friend class basic_vec;
+
+      static constexpr int _S_size = _Ap::_S_size;
+
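+      // _N0 is the largest power of two below _S_size. The elements are split
+      // into a power-of-two sized low part of _N0 elements (_M_data0) and the
+      // remaining _N1 elements (_M_data1); operations recurse into these two
+      // parts.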
+      static constexpr int _N0 = __bit_ceil(unsigned(_S_size)) / 2;
+
+      static constexpr int _N1 = _S_size - _N0;
+
+      using _DataType0 = __similar_vec<_Tp, _N0, _Ap>;
+
+      using _DataType1 = __similar_vec<_Tp, _N1, _Ap>;
+
+      static_assert(_DataType0::abi_type::_S_nreg + _DataType1::abi_type::_S_nreg == _Ap::_S_nreg);
+
+      static constexpr bool _S_is_scalar = _DataType0::_S_is_scalar;
+
+      _DataType0 _M_data0;
+
+      _DataType1 _M_data1;
+
+    public:
+      using value_type = _Tp;
+
+      using abi_type = _Ap;
+
+      using mask_type = basic_mask<sizeof(_Tp), abi_type>;
+
+      using iterator = __iterator<basic_vec>;
+
+      using const_iterator = __iterator<const basic_vec>;
+
+      constexpr iterator
+      begin() noexcept
+      { return {*this, 0}; }
+
+      constexpr const_iterator
+      begin() const noexcept
+      { return {*this, 0}; }
+
+      constexpr const_iterator
+      cbegin() const noexcept
+      { return {*this, 0}; }
+
+      constexpr default_sentinel_t
+      end() const noexcept
+      { return {}; }
+
+      constexpr default_sentinel_t
+      cend() const noexcept
+      { return {}; }
+
+      static constexpr auto size = __simd_size_constant<_S_size>;
+
+      [[__gnu__::__always_inline__]]
+      static constexpr basic_vec
+      _S_init(const _DataType0& __x, const _DataType1& __y)
+      {
+        basic_vec __r;
+        __r._M_data0 = __x;
+        __r._M_data1 = __y;
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr const _DataType0&
+      _M_get_low() const
+      { return _M_data0; }
+
+      [[__gnu__::__always_inline__]]
+      constexpr const _DataType1&
+      _M_get_high() const
+      { return _M_data1; }
+
+      [[__gnu__::__always_inline__]]
+      constexpr bool
+      _M_is_constprop() const
+      { return _M_data0._M_is_constprop() and _M_data1._M_is_constprop(); }
+
+      [[__gnu__::__always_inline__]]
+      constexpr auto
+      _M_concat_data() const
+      {
+        return __vec_concat(_M_data0._M_concat_data(),
+                            __vec_zero_pad_to<sizeof(_M_data0)>(_M_data1._M_concat_data()));
+      }
+
+      template <int _Size = _S_size, int _Offset = 0, typename _A0, typename _Fp>
+        [[__gnu__::__always_inline__]]
+        static constexpr basic_vec
+        _S_static_permute(const basic_vec<value_type, _A0>& __x, _Fp&& __idxmap)
+        {
+          return _S_init(
+                   _DataType0::template _S_static_permute<_Size, _Offset>(__x, __idxmap),
+                   _DataType1::template _S_static_permute<_Size, _Offset + _N0>(__x, __idxmap));
+        }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec
+      _M_complex_conj() const
+      { return _S_init(_M_data0._M_complex_conj(), _M_data1._M_complex_conj()); }
+
+      template <typename _CxVec, _TargetTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        constexpr void
+        _M_complex_multiply_with(const basic_vec& __yvec)
+        {
+          _M_data0.template _M_complex_multiply_with<_CxVec>(__yvec._M_data0);
+          _M_data1.template _M_complex_multiply_with<_CxVec>(__yvec._M_data1);
+        }
+
+      template <typename _Cx>
+        [[__gnu__::__always_inline__]]
+        static constexpr void
+        _S_cxctgus_mul(basic_vec& __re0, basic_vec& __im0,
+                       const basic_vec& __re1, const basic_vec& __im1)
+        {
+          _DataType0::template _S_cxctgus_mul<_Cx>(__re0._M_data0, __im0._M_data0,
+                                                   __re1._M_data0, __im1._M_data0);
+          _DataType1::template _S_cxctgus_mul<_Cx>(__re0._M_data1, __im0._M_data1,
+                                                   __re1._M_data1, __im1._M_data1);
+        }
+
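+      // Splits *this into chunks of type _Vp: an array of _Vp when
+      // _Vp::_S_size divides _S_size, otherwise a tuple whose last element is
+      // a resize_t covering the remaining elements.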
+      template <typename _Vp>
+        [[__gnu__::__always_inline__]]
+        constexpr auto
+        _M_chunk() const noexcept
+        {
+          constexpr int __n = _S_size / _Vp::_S_size;
+          constexpr int __rem = _S_size % _Vp::_S_size;
+          if constexpr (_N0 == _Vp::_S_size)
+            {
+              if constexpr (__rem == 0)
+                return array<_Vp, __n> {_M_data0, _M_data1};
+              else
+                return tuple<_Vp, resize_t<__rem, _Vp>> {_M_data0, _M_data1};
+            }
+          else if constexpr (__rem == 0)
+            {
+              using _Rp = array<_Vp, __n>;
+              if constexpr (sizeof(_Rp) == sizeof(*this))
+                {
+                  static_assert(not _Vp::_S_is_partial);
+                  return __builtin_bit_cast(_Rp, *this);
+                }
+              else
+                {
+                  constexpr auto [...__is] = __iota<int[__n]>;
+                  return _Rp {_Vp([&](int __i) { return (*this)[__i + __is * _Vp::_S_size]; })...};
+                }
+            }
+          else
+            {
+              constexpr auto [...__is] = __iota<int[__n]>;
+              using _Rest = resize_t<__rem, _Vp>;
+              // can't bit-cast because the member order of tuple is reversed
+              return tuple {
+                _Vp  ([&](int __i) { return (*this)[__i + __is * _Vp::_S_size]; })...,
+                _Rest([&](int __i) { return (*this)[__i + __n * _Vp::_S_size]; })
+              };
+            }
+        }
+
+      template <typename _A0, typename... _As>
+        [[__gnu__::__always_inline__]]
+        constexpr void
+        _M_assign_from(auto _Offset, const basic_vec<value_type, _A0>& __x0,
+                       const basic_vec<value_type, _As>&... __xs)
+        {
+          if constexpr (_Offset.value >= _A0::_S_size)
+            // make the pack as small as possible
+            _M_assign_from(integral_constant<int, _Offset.value - _A0::_S_size>(), __xs...);
+          else
+            {
+              _M_data0._M_assign_from(_Offset, __x0, __xs...);
+              _M_data1._M_assign_from(integral_constant<int, _Offset + _DataType0::size>(),
+                                      __x0, __xs...);
+            }
+        }
+
+      template <typename _A0>
+        [[__gnu__::__always_inline__]]
+        static constexpr basic_vec
+        _S_concat(const basic_vec<value_type, _A0>& __x0) noexcept
+        { return basic_vec(__x0); }
+
+      template <typename _A0, typename... _As>
+        requires (sizeof...(_As) >= 1)
+        [[__gnu__::__always_inline__]]
+        static constexpr basic_vec
+        _S_concat(const basic_vec<value_type, _A0>& __x0,
+                  const basic_vec<value_type, _As>&... __xs) noexcept
+        {
+          if constexpr (_A0::_S_size == _N0)
+            {
+              if constexpr (sizeof...(_As) == 1)
+                return _S_init(__x0, __xs...);
+              else
+                return _S_init(__x0, _DataType1::_S_concat(__xs...));
+            }
+          else if (__builtin_is_constant_evaluated()
+                     or (__x0._M_is_constprop() and ... and __xs._M_is_constprop()))
+            {
+              basic_vec __r;
+              __r._M_data0.template _M_assign_from(integral_constant<int, 0>(), __x0, __xs...);
+              __r._M_data1.template _M_assign_from(_DataType0::size, __x0, __xs...);
+              return __r;
+            }
+          else
+            {
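+              // Runtime path: assemble __r by copying the bytes of each
+              // source vector in element order.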
+              basic_vec __r = {};
+              byte* __dst = reinterpret_cast<byte*>(&__r);
+              constexpr size_t __nbytes0 = sizeof(value_type) * _A0::_S_size;
+              __builtin_memcpy(__dst, &__x0, _A0::_S_nreg == 1 ? sizeof(__x0) : __nbytes0);
+              __dst += sizeof(value_type) * _A0::_S_size;
+              template for (const auto& __x : {__xs...})
+                {
+                  constexpr size_t __nbytes = sizeof(value_type) * __x.size.value;
+                  __builtin_memcpy(__dst, &__x, __nbytes);
+                  __dst += __nbytes;
+                }
+              return __r;
+            }
+        }
+
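+      // One reduction step: combines the two equally sized halves with
+      // __binary_op, halving the number of elements.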
+      [[__gnu__::__always_inline__]]
+      constexpr auto
+      _M_reduce_1(auto __binary_op) const
+      {
+        static_assert(_N0 == _N1);
+        return __binary_op(_M_data0, _M_data1);
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr value_type
+      _M_reduce_tail(const auto& __rest, auto __binary_op) const
+      {
+        if constexpr (__rest.size() > _S_size)
+          {
+            auto [__a, __b] = __rest.template _M_chunk<basic_vec>();
+            return __binary_op(*this, __a)._M_reduce_tail(__b, __binary_op);
+          }
+        else if constexpr (__rest.size() == _S_size)
+          return __binary_op(*this, __rest)._M_reduce(__binary_op);
+        else
+          return _M_reduce_1(__binary_op)._M_reduce_tail(__rest, __binary_op);
+      }
+
+      template <typename _BinaryOp, _TargetTraits _Traits = {}>
+        [[__gnu__::__always_inline__]]
+        constexpr value_type
+        _M_reduce(_BinaryOp __binary_op) const
+        {
+          if constexpr (_Traits.template _M_eval_as_f32<value_type>()
+                          and (is_same_v<_BinaryOp, plus<>>
+                                 or is_same_v<_BinaryOp, multiplies<>>))
+            return value_type(rebind_t<float, basic_vec>(*this)._M_reduce(__binary_op));
+#ifdef __SSE2__
+          else if constexpr (is_integral_v<value_type> and sizeof(value_type) == 1
+                               and is_same_v<decltype(__binary_op), multiplies<>>)
+            {
+              // convert to unsigned short because of missing 8-bit mul instruction
+              // we don't need to preserve the order of elements
+#if 1
+              using _V16 = resize_t<_S_size / 2, rebind_t<unsigned short, basic_vec>>;
+              auto __a = __builtin_bit_cast(_V16, *this);
+              return __binary_op(__a, __a >> 8)._M_reduce(__binary_op);
+#else
+              // alternative:
+              using _V16 = rebind_t<unsigned short, basic_vec>;
+              return _V16(*this)._M_reduce(__binary_op);
+#endif
+            }
+#endif
+          else if constexpr (_N0 == _N1)
+            return _M_reduce_1(__binary_op)._M_reduce(__binary_op);
+#if 0 // needs benchmarking before we do this
+          else if constexpr (sizeof(_M_data0) == sizeof(_M_data1)
+                               and requires {
+                                 __default_identity_element<value_type, decltype(__binary_op)>();
+                               })
+            { // extend to power-of-2 with identity element for more parallelism
+              _DataType0 __v1 = __builtin_bit_cast(_DataType0, _M_data1);
+              constexpr _DataType0 __id
+                = __default_identity_element<value_type, decltype(__binary_op)>();
+              constexpr auto __k = _DataType0::mask_type::_S_partial_mask_of_n(_N1);
+              __v1 = __select_impl(__k, __v1, __id);
+              return __binary_op(_M_data0, __v1)._M_reduce(__binary_op);
+            }
+#endif
+          else
+            return _M_data0._M_reduce_1(__binary_op)._M_reduce_tail(_M_data1, __binary_op);
+        }
+
+      [[__gnu__::__always_inline__]]
+      constexpr mask_type
+      _M_isnan() const requires is_floating_point_v<value_type>
+      { return mask_type::_S_init(_M_data0._M_isnan(), _M_data1._M_isnan()); }
+
+      [[__gnu__::__always_inline__]]
+      constexpr mask_type
+      _M_isinf() const requires is_floating_point_v<value_type>
+      { return mask_type::_S_init(_M_data0._M_isinf(), _M_data1._M_isinf()); }
+
+      [[__gnu__::__always_inline__]]
+      constexpr mask_type
+      _M_isunordered(basic_vec __y) const requires is_floating_point_v<value_type>
+      {
+        return mask_type::_S_init(_M_data0._M_isunordered(__y._M_data0),
+                                  _M_data1._M_isunordered(__y._M_data1));
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec
+      _M_abs() const
+      { return _S_init(_M_data0._M_abs(), _M_data1._M_abs()); }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec
+      _M_fabs() const
+      { return _S_init(_M_data0._M_fabs(), _M_data1._M_fabs()); }
+
+      basic_vec() = default;
+
+      // [simd.overview] impl-def conversions ---------------------------------
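+      // The matching GCC vector builtin type, with the element count rounded
+      // up to the next power of two; for scalar sub-vectors no such type
+      // exists and the conversions below are disabled.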
+      using _NativeVecType
+        = decltype([] {
+            if constexpr (_S_is_scalar)
+              return _InvalidInteger();
+            else
+              return __vec_builtin_type<value_type, __bit_ceil(unsigned(_S_size))>();
+          }());
+
+      [[__gnu__::__always_inline__]]
+      constexpr
+      basic_vec(const _NativeVecType& __x) requires (not _S_is_scalar)
+      : _M_data0(_VecOps<__vec_builtin_type<value_type, _N0>>::_S_extract(__x)),
+        _M_data1(_VecOps<__vec_builtin_type<value_type, __bit_ceil(unsigned(_N1))>>
+                   ::_S_extract(__x, integral_constant<int, _N0>()))
+      {}
+
+      [[__gnu__::__always_inline__]]
+      constexpr
+      operator _NativeVecType() const requires (not _S_is_scalar)
+      { return _M_concat_data(); }
+
+      // [simd.ctor] broadcast constructor ------------------------------------
+      template <__simd_vec_bcast<value_type> _Up>
+        [[__gnu__::__always_inline__]]
+        constexpr explicit(not __broadcast_constructible<_Up, value_type>)
+        basic_vec(_Up&& __x) noexcept
+          : _M_data0(static_cast<value_type>(__x)), _M_data1(static_cast<value_type>(__x))
+        {}
+
+#ifdef _GLIBCXX_SIMD_CONSTEVAL_BROADCAST
+      template <__simd_vec_bcast_consteval<value_type> _Up>
+        consteval
+        basic_vec(_Up&& __x)
+        : _M_data0(__value_preserving_cast<value_type>(__x)),
+          _M_data1(__value_preserving_cast<value_type>(__x))
+        {
+          static_assert(convertible_to<_Up, value_type>);
+          static_assert(is_arithmetic_v<remove_cvref_t<_Up>>);
+        }
+#endif
+
+      // [simd.ctor] conversion constructor -----------------------------------
+      template <typename _Up, typename _UAbi>
+        requires (__simd_size_v<_Up, _UAbi> == _S_size)
+        // FIXME(file LWG issue): missing constraint `constructible_from<value_type, _Up>`
+        [[__gnu__::__always_inline__]]
+        constexpr
+        explicit(not __value_preserving_convertible_to<_Up, value_type>
+                   or __higher_rank_than<_Up, value_type>)
+        basic_vec(const basic_vec<_Up, _UAbi>& __x) noexcept
+          : _M_data0(get<0>(chunk<_N0>(__x))),
+            _M_data1(get<1>(chunk<_N0>(__x)))
+        {}
+
+      // [simd.ctor] generator constructor ------------------------------------
+      template <__simd_generator_invokable<value_type, _S_size> _Fp>
+        [[__gnu__::__always_inline__]]
+        constexpr explicit
+        basic_vec(_Fp&& __gen)
+          : _M_data0(__gen), _M_data1([&] [[__gnu__::__always_inline__]] (auto __i) {
+                               return __gen(__simd_size_constant<__i + _N0>);
+                             })
+        {}
+
+      template <__almost_simd_generator_invokable<value_type, _S_size> _Fp>
+        constexpr explicit
+        basic_vec(_Fp&&)
+          = _GLIBCXX_DELETE_MSG("Invalid return type of the generator function: "
+                                "Requires value-preserving conversion or implicitly "
+                                "convertible user-defined type.");
+
+      // [simd.ctor] load constructor -----------------------------------------
+      template <typename _Up>
+        [[__gnu__::__always_inline__]]
+        constexpr
+        basic_vec(_LoadCtorTag, const _Up* __ptr)
+          : _M_data0(_LoadCtorTag(), __ptr),
+            _M_data1(_LoadCtorTag(), __ptr + _N0)
+        {}
+
+      template <__static_sized_range<size.value> _Rg, typename... _Flags>
+        // FIXME: see load ctor(s) above
+        constexpr
+        basic_vec(_Rg&& __range, flags<_Flags...> __flags = {})
+        : basic_vec(_LoadCtorTag(),
+                    __flags.template _S_adjust_pointer<basic_vec>(std::ranges::data(__range)))
+        {
+          static_assert(__loadstore_convertible_to<std::ranges::range_value_t<_Rg>, value_type,
+                                                   _Flags...>);
+        }
+
+      // [simd.subscr] --------------------------------------------------------
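+      // Element access bit-casts the whole object to a plain array of
+      // value_type; unlike a reinterpret_cast this also works in constant
+      // expressions.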
+      [[__gnu__::__always_inline__]]
+      constexpr value_type
+      operator[](__simd_size_type __i) const
+      {
+        struct _Tmp
+        { alignas(basic_vec) const value_type _M_values[_S_size]; };
+        return __builtin_bit_cast(_Tmp, *this)._M_values[__i];
+      }
+
+      // [simd.unary] unary operators -----------------------------------------
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec&
+      operator++() noexcept requires requires(value_type __a) { ++__a; }
+      {
+        ++_M_data0;
+        ++_M_data1;
+        return *this;
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec
+      operator++(int) noexcept requires requires(value_type __a) { __a++; }
+      {
+        basic_vec __r = *this;
+        ++_M_data0;
+        ++_M_data1;
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec&
+      operator--() noexcept requires requires(value_type __a) { --__a; }
+      {
+        --_M_data0;
+        --_M_data1;
+        return *this;
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec
+      operator--(int) noexcept requires requires(value_type __a) { __a--; }
+      {
+        basic_vec __r = *this;
+        --_M_data0;
+        --_M_data1;
+        return __r;
+      }
+
+      [[__gnu__::__always_inline__]]
+      constexpr mask_type
+      operator!() const noexcept requires requires(value_type __a) { !__a; }
+      { return mask_type::_S_init(!_M_data0, !_M_data1); }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec
+      operator+() const noexcept requires requires(value_type __a) { +__a; }
+      { return *this; }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec
+      operator-() const noexcept requires requires(value_type __a) { -__a; }
+      { return _S_init(-_M_data0, -_M_data1); }
+
+      [[__gnu__::__always_inline__]]
+      constexpr basic_vec
+      operator~() const noexcept requires requires(value_type __a) { ~__a; }
+      { return _S_init(~_M_data0, ~_M_data1); }
+
+      // [simd.cassign] -------------------------------------------------------
+#define _GLIBCXX_SIMD_DEFINE_OP(sym)                                 \
+      [[__gnu__::__always_inline__]]                                 \
+      friend constexpr basic_vec&                                    \
+      operator sym##=(basic_vec& __x, const basic_vec& __y) _GLIBCXX_SIMD_NOEXCEPT \
+      {                                                              \
+        __x._M_data0 sym##= __y._M_data0;                            \
+        __x._M_data1 sym##= __y._M_data1;                            \
+        return __x;                                                  \
+      }
+
+      _GLIBCXX_SIMD_DEFINE_OP(+)
+      _GLIBCXX_SIMD_DEFINE_OP(-)
+      _GLIBCXX_SIMD_DEFINE_OP(*)
+      _GLIBCXX_SIMD_DEFINE_OP(/)
+      _GLIBCXX_SIMD_DEFINE_OP(%)
+      _GLIBCXX_SIMD_DEFINE_OP(&)
+      _GLIBCXX_SIMD_DEFINE_OP(|)
+      _GLIBCXX_SIMD_DEFINE_OP(^)
+      _GLIBCXX_SIMD_DEFINE_OP(<<)
+      _GLIBCXX_SIMD_DEFINE_OP(>>)
+
+#undef _GLIBCXX_SIMD_DEFINE_OP
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_vec&
+      operator<<=(basic_vec& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
+      requires requires(value_type __a, __simd_size_type __b) { __a << __b; }
+      {
+        __x._M_data0 <<= __y;
+        __x._M_data1 <<= __y;
+        return __x;
+      }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_vec&
+      operator>>=(basic_vec& __x, __simd_size_type __y) _GLIBCXX_SIMD_NOEXCEPT
+      requires requires(value_type __a, __simd_size_type __b) { __a >> __b; }
+      {
+        __x._M_data0 >>= __y;
+        __x._M_data1 >>= __y;
+        return __x;
+      }
+
+      // [simd.comparison] ----------------------------------------------------
+      [[__gnu__::__always_inline__]]
+      friend constexpr mask_type
+      operator==(const basic_vec& __x, const basic_vec& __y) noexcept
+      { return mask_type::_S_init(__x._M_data0 == __y._M_data0, __x._M_data1 == __y._M_data1); }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr mask_type
+      operator!=(const basic_vec& __x, const basic_vec& __y) noexcept
+      { return mask_type::_S_init(__x._M_data0 != __y._M_data0, __x._M_data1 != __y._M_data1); }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr mask_type
+      operator<(const basic_vec& __x, const basic_vec& __y) noexcept
+      { return mask_type::_S_init(__x._M_data0 < __y._M_data0, __x._M_data1 < __y._M_data1); }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr mask_type
+      operator<=(const basic_vec& __x, const basic_vec& __y) noexcept
+      { return mask_type::_S_init(__x._M_data0 <= __y._M_data0, __x._M_data1 <= __y._M_data1); }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr mask_type
+      operator>(const basic_vec& __x, const basic_vec& __y) noexcept
+      { return mask_type::_S_init(__x._M_data0 > __y._M_data0, __x._M_data1 > __y._M_data1); }
+
+      [[__gnu__::__always_inline__]]
+      friend constexpr mask_type
+      operator>=(const basic_vec& __x, const basic_vec& __y) noexcept
+      { return mask_type::_S_init(__x._M_data0 >= __y._M_data0, __x._M_data1 >= __y._M_data1); }
+
+      // [simd.cond] ---------------------------------------------------------
+      [[__gnu__::__always_inline__]]
+      friend constexpr basic_vec
+      __select_impl(const mask_type& __k, const basic_vec& __t, const basic_vec& __f) noexcept
+      {
+        return _S_init(__select_impl(__k._M_data0, __t._M_data0, __f._M_data0),
+                       __select_impl(__k._M_data1, __t._M_data1, __f._M_data1));
+      }
+    };
+
+  // [simd.overview] deduction guide ------------------------------------------
+  template <__static_sized_range _Rg, typename... _Ts>
+    basic_vec(_Rg&& __r, _Ts...)
+    -> basic_vec<ranges::range_value_t<_Rg>,
+                 __deduce_abi_t<ranges::range_value_t<_Rg>,
+#if 0 // PR117849
+                                ranges::size(__r)>>;
+#else
+                                decltype(std::span(__r))::extent>>;
+#endif
+
+#if 1
+  // FIXME: file new LWG issue about this missing deduction guide
+  template <size_t _Bytes, typename _Ap>
+    basic_vec(basic_mask<_Bytes, _Ap>)
+    -> basic_vec<__integer_from<_Bytes>,
+                 decltype(__abi_rebind<__integer_from<_Bytes>, basic_mask<_Bytes, _Ap>::size.value,
+                                       _Ap>())>;
+#endif
+
+  // [P3319R5] ----------------------------------------------------------------
+  template <__vectorizable _Tp>
+    requires is_arithmetic_v<_Tp>
+    inline constexpr _Tp
+    __iota<_Tp> = _Tp();
+
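+  // __iota for vec types is the constant {0, 1, 2, ..., size() - 1}; the
+  // static_assert rejects element types too narrow to represent the last index.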
+  template <typename _Tp, typename _Abi>
+    inline constexpr basic_vec<_Tp, _Abi>
+    __iota<basic_vec<_Tp, _Abi>> = basic_vec<_Tp, _Abi>([](_Tp __i) -> _Tp {
+      static_assert(__simd_size_v<_Tp, _Abi> - 1 <= numeric_limits<_Tp>::max(),
+                    "iota object would overflow");
+      return __i;
+    });
+}
+
+#pragma GCC diagnostic pop
+#endif // C++26
+#endif // _GLIBCXX_SIMD_VEC_H
