https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111231
--- Comment #5 from Mathieu Malaterre <malat at debian dot org> --- Result from cvise (+manual editing): c++ -Wfatal-errors -Wall -Wextra -Werror -o works -O1 -march=armv7-a -mfpu=neon-vfpv4 -mfloat-abi=hard -mfp16-format=ieee step7.cc -> exit(0) c++ -Wfatal-errors -Wall -Wextra -Werror -o fails -O2 -march=armv7-a -mfpu=neon-vfpv4 -mfloat-abi=hard -mfp16-format=ieee step7.cc -> exit(42) Where: % cat step7.cc /* Reduced reproducer. main() runs hwy::l(); the only exit(42) call is in AssertArrayEqual2, taken when memcmp reports that the two stored buffers differ. */ #include <inttypes.h> #include <stddef.h> /* HWY_MIN(a, b) expands to the bare, unparenthesized comparison a < b; it does not compute a minimum. */ #define HWY_MIN(a, b) a < b /* Copies kBytes bytes from the from pointer to the to pointer with __builtin_memcpy. */ template <size_t kBytes, typename From, typename To> void CopyBytes(From from, To to) { __builtin_memcpy(to, from, kBytes); } /* Always copies sizeof(int) bytes, independent of the types From and To. */ template <typename From, typename To> void CopySameSize(From from, To to) { CopyBytes<sizeof(int)>(from, to); } /* Relations::Wide is int16_t, so MakeWide<X> is int16_t for every X. */ struct Relations { using Wide = int16_t; }; template <typename> using MakeWide = Relations::Wide; /* The #if 0 branch is empty, so HWY_LANES(T) is always (sizeof(T)). */ #if 0 #else #define HWY_LANES(T) (sizeof(T)) #endif namespace detail { /* Returns N when pow2 >= 0, otherwise 0. */ constexpr size_t ScaleByPower(size_t N, int pow2) { return pow2 >= 0 ? N : 0; } } // namespace detail /* Simd tag: T is the lane type; kFrac is 0, so kPrivateLanes = ScaleByPower(N, 0) = N, and MaxBytes() returns that same value. Rebind<X> names this same Simd; Repartition<NewT> is Simd<NewT, ScaleByPower(sizeof(int), kPow2), kPow2>. */ template <typename Lane, size_t N, int kPow2> struct Simd { using T = Lane; static constexpr size_t kWhole = N; static constexpr int kFrac = 0; static constexpr size_t kPrivateLanes = detail::ScaleByPower(kWhole, kFrac); constexpr size_t MaxBytes() { return kPrivateLanes; } template <typename> static constexpr size_t RepartitionLanes() { return sizeof(int); } template <int kNewPow2, size_t kNewMaxLanes> static constexpr size_t WholeN() { return detail::ScaleByPower(kNewMaxLanes, kNewPow2); } template <int kNewPow2, size_t kNewMaxLanes> static constexpr size_t NewN() { return WholeN<kNewPow2, kNewMaxLanes>(); } template <typename> using Rebind = Simd; template <typename NewT> using Repartition = Simd<NewT, NewN<kPow2, RepartitionLanes<NewT>()>(), kPow2>; }; namespace detail { /* Because HWY_MIN(N, 6) expands to N < 6, the second Simd argument is the result of that comparison (0 or 1); the third argument is the literal 3. */ template <typename T, size_t N, int> struct ClampNAndPow2 { using type = Simd<T, HWY_MIN(N, 6), 3>; }; /* N here is HWY_LANES(T), i.e. sizeof(T); the unnamed size_t template parameter is not used. */ template <typename T, size_t, int kPow2> struct CappedTagChecker { static constexpr size_t N = HWY_LANES(T); using type 
= typename ClampNAndPow2<T, N, kPow2>::type; }; } // namespace detail /* CappedTag forwards to detail::CappedTagChecker; TFromD<D> is D::T. */ template <typename T, size_t kLimit, int kPow2 = 0> using CappedTag = typename detail::CappedTagChecker<T, kLimit, kPow2>::type; template <class D> using TFromD = typename D::T; /* MaxLanes and Lanes both return D::kPrivateLanes. */ #define HWY_MAX_LANES_D(D) D::kPrivateLanes template <class D> size_t MaxLanes(D) { return HWY_MAX_LANES_D(D); } template <class D> size_t Lanes(D) { return HWY_MAX_LANES_D(D); } /* RebindToUnsigned<D> is D::Rebind<D>; RepartitionToWide<D> is D::Repartition<int16_t>. */ template <class T, class D> using Rebind = typename D::Rebind<T>; template <class D> using RebindToUnsigned = Rebind<D, D>; template <class T, class D> using Repartition = typename D::Repartition<T>; template <class D> using RepartitionToWide = Repartition<MakeWide<D>, D>; #include <cmath> #include <cstring> #include <memory> namespace hwy { /* Vector type backed by a plain array of 16 / sizeof(T) lanes; the size_t parameter does not affect the layout. */ template <typename T, size_t> struct Vec128 { T raw[16 / sizeof(T)]; }; /* Zero is declared but not defined here; it is only used inside decltype to name the vector type for a tag. */ template <class D> Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D); template <class D> using VFromD = decltype(Zero(D())); /* Copies sizeof(int) bytes (via CopySameSize) from v into a default-initialized result; no other bytes of the result are written here. */ template <class D, class VFrom> VFromD<D> BitCast(D, VFrom v) { VFromD<D> to; CopySameSize(&v, &to); return to; } /* Writes t into lanes 0 .. MaxLanes(d)-1 of a default-initialized vector. */ template <class D, typename T2> VFromD<D> Set(D d, T2 t) { VFromD<D> v; for (size_t i = 0; i < MaxLanes(d); ++i) v.raw[i] = t; return v; } /* For the first N lanes, stores the result of the comparison (TW() + b.raw[i]) < T(), which is what the HWY_MIN expansion yields; no addition result is stored. */ template <typename T, size_t N> Vec128<T, N> SaturatedAdd(Vec128<T, N> b) { Vec128<T, N> a; using TW = T; for (size_t i = 0; i < N; ++i) a.raw[i] = HWY_MIN(TW() + b.raw[i], T()); return a; } /* Copies d.MaxBytes() bytes from v.raw to aligned. */ template <class D> void Store(VFromD<D> v, D d, TFromD<D> *aligned) { CopyBytes<d.MaxBytes()>(v.raw, aligned); } template <class D> using Vec = decltype(Zero(D())); /* Bit-casts a and b, then returns SaturatedAdd of the cast a only; b1 is computed but not used in the result. */ template <class DI16, class VU8, class VI8> Vec<DI16> SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) { RebindToUnsigned<decltype(di16)> du16; auto a1 = BitCast(du16, a), b1(BitCast(di16, b)); return SaturatedAdd(a1); } using AllocPtr = void *(void *, size_t); using FreePtr = void (*)(void *, void *); /* Returns aligned_alloc(4096, size); the AllocPtr and void* arguments are ignored. */ void *AllocateAlignedBytes(size_t size, AllocPtr, void *) { return aligned_alloc(4096, size); } /* Passes ptr to free(); the FreePtr and void* arguments are ignored. */ void FreeAlignedBytes(const 
void *ptr, FreePtr, void *) { free((void*)ptr); } AllocPtr *AllocateAlignedItems_alloc_ptr; /* is_pow2 is always 0, so the request is for items bytes, not items * sizeof(T). */ template <typename T> T *AllocateAlignedItems(size_t items) { size_t size = sizeof(T); bool is_pow2 = 0; size_t bits(size); size_t bytes = is_pow2 ? bits : items; return static_cast<T *>( AllocateAlignedBytes(bytes, AllocateAlignedItems_alloc_ptr, 0)); } /* Deleter for std::unique_ptr: stores a FreePtr and an opaque pointer and hands both, with the pointer being released, to FreeAlignedBytes. */ struct AlignedFreer { AlignedFreer(FreePtr free_ptr, void *opaque_ptr) : free_(free_ptr), opaque_ptr_(opaque_ptr) {} template <typename T> void operator()(T aligned_pointer) { FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_); } FreePtr free_; void *opaque_ptr_; }; template <typename T> using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>; FreePtr AllocateAligned_free; /* Wraps the AllocateAlignedItems<T>(items) result in a unique_ptr whose deleter is an AlignedFreer built from AllocateAligned_free and opaque. */ template <typename T> AlignedFreeUniquePtr<T[]> AllocateAligned(size_t items, void *opaque) { T *__trans_tmp_1 = AllocateAlignedItems<T>(items); return AlignedFreeUniquePtr<T[]>(__trans_tmp_1, AlignedFreer(AllocateAligned_free, opaque)); } /* Convenience overload: same as above with opaque = nullptr. */ template <typename T> AlignedFreeUniquePtr<T[]> AllocateAligned(size_t items) { return AllocateAligned<T>(items, nullptr); } /* Calls exit(42) when memcmp over the first c bytes of a and b is nonzero; the last two arguments are ignored. */ void AssertArrayEqual2(void *a, void *b, size_t c, const char *, int) { if (memcmp(a, b, c)) exit(42); } /* Stores expected and actual into two buffers obtained from AllocateAligned<o>(q), q = Lanes(d), then compares the first q bytes of those buffers. */ template <class n, typename o = TFromD<n>> void p(n d, Vec<n> expected, Vec<n> actual, const char *ab, int ac) { size_t q = Lanes(d); auto r = AllocateAligned<o>(q), s = AllocateAligned<o>(q); Store(expected, d, r.get()); Store(actual, d, s.get()); AssertArrayEqual2(r.get(), s.get(), q, ab, ac); } /* t(...) calls p(...) with __FILE__ and __LINE__ appended. */ #define t(d, expected, actual) p(d, expected, actual, __FILE__, __LINE__) /* w::x builds a CappedTag<o, u> tag d, returns early when Lanes(d) < ae, and otherwise calls v()(o(), d) if af is nonzero. */ template <typename o, size_t u, class v> struct w { static void x(size_t ae, size_t af) { CappedTag<o, u> d; size_t ag = Lanes(d); if (ag < ae) return; if (af) v()(o(), d); } }; /* y: for each of Lanes(d) iterations, checks through t() that Set(aa, 0) and SatWidenMulPairwiseAdd(aa, j, m) store identical bytes, where aa is the RepartitionToWide tag of d, j = Set(h, b()) and m = 0. */ struct y { template <typename b, class c> void operator()(b, c d) { RepartitionToWide<c> aa; size_t g = Lanes(d); RebindToUnsigned<decltype(d)> h; auto m = 0, j = Set(h, b()), k = Set(aa, 0); for (size_t i = 0; i < g; i++) t(aa, k, 
SatWidenMulPairwiseAdd(aa, j, m)); } }; /* ai::operator() calls w<o, sizeof(o), y>::x(ah, sizeof(o)); HWY_LANES(o) supplies the parenthesized initializer of ak. */ template <int ah = 1> struct ai { template <typename o> void operator()(o) { size_t aj = ah; constexpr size_t ak HWY_LANES(o); size_t af = ak; w<o, ak, y>::x(aj, af); } }; /* l() runs ai with its default ah = 1 on an int8_t value; main() below only calls hwy::l(). */ void l() { ai()(int8_t()); } } // namespace hwy int main() { hwy::l(); }