https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111231
--- Comment #5 from Mathieu Malaterre <malat at debian dot org> ---
Result from cvise (+manual editing):
c++ -Wfatal-errors -Wall -Wextra -Werror -o works -O1 -march=armv7-a
-mfpu=neon-vfpv4 -mfloat-abi=hard -mfp16-format=ieee step7.cc
-> exit(0)
c++ -Wfatal-errors -Wall -Wextra -Werror -o fails -O2 -march=armv7-a
-mfpu=neon-vfpv4 -mfloat-abi=hard -mfp16-format=ieee step7.cc
-> exit(42)
Where:
% cat step7.cc
#include <inttypes.h>
#include <stddef.h>
#define HWY_MIN(a, b) a < b
template <size_t kBytes, typename From, typename To>
void CopyBytes(From from, To to) {
__builtin_memcpy(to, from, kBytes);
}
template <typename From, typename To> void CopySameSize(From from, To to) {
CopyBytes<sizeof(int)>(from, to);
}
struct Relations {
using Wide = int16_t;
};
template <typename> using MakeWide = Relations::Wide;
#if 0
#else
#define HWY_LANES(T) (sizeof(T))
#endif
namespace detail {
constexpr size_t ScaleByPower(size_t N, int pow2) { return pow2 >= 0 ? N : 0; }
} // namespace detail
template <typename Lane, size_t N, int kPow2> struct Simd {
using T = Lane;
static constexpr size_t kWhole = N;
static constexpr int kFrac = 0;
static constexpr size_t kPrivateLanes = detail::ScaleByPower(kWhole, kFrac);
constexpr size_t MaxBytes() { return kPrivateLanes; }
template <typename> static constexpr size_t RepartitionLanes() {
return sizeof(int);
}
template <int kNewPow2, size_t kNewMaxLanes>
static constexpr size_t WholeN() {
return detail::ScaleByPower(kNewMaxLanes, kNewPow2);
}
template <int kNewPow2, size_t kNewMaxLanes> static constexpr size_t NewN() {
return WholeN<kNewPow2, kNewMaxLanes>();
}
template <typename> using Rebind = Simd;
template <typename NewT>
using Repartition =
Simd<NewT, NewN<kPow2, RepartitionLanes<NewT>()>(), kPow2>;
};
namespace detail {
template <typename T, size_t N, int> struct ClampNAndPow2 {
using type = Simd<T, HWY_MIN(N, 6), 3>;
};
template <typename T, size_t, int kPow2> struct CappedTagChecker {
static constexpr size_t N = HWY_LANES(T);
using type = typename ClampNAndPow2<T, N, kPow2>::type;
};
} // namespace detail
template <typename T, size_t kLimit, int kPow2 = 0>
using CappedTag = typename detail::CappedTagChecker<T, kLimit, kPow2>::type;
template <class D> using TFromD = typename D::T;
#define HWY_MAX_LANES_D(D) D::kPrivateLanes
template <class D> size_t MaxLanes(D) { return HWY_MAX_LANES_D(D); }
template <class D> size_t Lanes(D) { return HWY_MAX_LANES_D(D); }
template <class T, class D> using Rebind = typename D::Rebind<T>;
template <class D> using RebindToUnsigned = Rebind<D, D>;
template <class T, class D> using Repartition = typename D::Repartition<T>;
template <class D> using RepartitionToWide = Repartition<MakeWide<D>, D>;
#include <cmath>
#include <cstring>
#include <memory>
namespace hwy {
template <typename T, size_t> struct Vec128 {
T raw[16 / sizeof(T)];
};
template <class D> Vec128<TFromD<D>, HWY_MAX_LANES_D(D)> Zero(D);
template <class D> using VFromD = decltype(Zero(D()));
template <class D, class VFrom> VFromD<D> BitCast(D, VFrom v) {
VFromD<D> to;
CopySameSize(&v, &to);
return to;
}
template <class D, typename T2> VFromD<D> Set(D d, T2 t) {
VFromD<D> v;
for (size_t i = 0; i < MaxLanes(d); ++i)
v.raw[i] = t;
return v;
}
template <typename T, size_t N> Vec128<T, N> SaturatedAdd(Vec128<T, N> b) {
Vec128<T, N> a;
using TW = T;
for (size_t i = 0; i < N; ++i)
a.raw[i] = HWY_MIN(TW() + b.raw[i], T());
return a;
}
template <class D> void Store(VFromD<D> v, D d, TFromD<D> *aligned) {
CopyBytes<d.MaxBytes()>(v.raw, aligned);
}
template <class D> using Vec = decltype(Zero(D()));
template <class DI16, class VU8, class VI8>
Vec<DI16> SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) {
RebindToUnsigned<decltype(di16)> du16;
auto a1 = BitCast(du16, a), b1(BitCast(di16, b));
return SaturatedAdd(a1);
}
using AllocPtr = void *(void *, size_t);
using FreePtr = void (*)(void *, void *);
void *AllocateAlignedBytes(size_t size, AllocPtr, void *) {
return aligned_alloc(4096, size);
}
void FreeAlignedBytes(const void *ptr, FreePtr, void *) {
free((void*)ptr);
}
AllocPtr *AllocateAlignedItems_alloc_ptr;
template <typename T> T *AllocateAlignedItems(size_t items) {
size_t size = sizeof(T);
bool is_pow2 = 0;
size_t bits(size);
size_t bytes = is_pow2 ? bits : items;
return static_cast<T *>(
AllocateAlignedBytes(bytes, AllocateAlignedItems_alloc_ptr, 0));
}
struct AlignedFreer {
AlignedFreer(FreePtr free_ptr, void *opaque_ptr)
: free_(free_ptr), opaque_ptr_(opaque_ptr) {}
template <typename T> void operator()(T aligned_pointer) {
FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_);
}
FreePtr free_;
void *opaque_ptr_;
};
template <typename T>
using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>;
FreePtr AllocateAligned_free;
template <typename T>
AlignedFreeUniquePtr<T[]> AllocateAligned(size_t items, void *opaque) {
T *__trans_tmp_1 = AllocateAlignedItems<T>(items);
return AlignedFreeUniquePtr<T[]>(__trans_tmp_1,
AlignedFreer(AllocateAligned_free, opaque));
}
template <typename T> AlignedFreeUniquePtr<T[]> AllocateAligned(size_t items) {
return AllocateAligned<T>(items, nullptr);
}
void AssertArrayEqual2(void *a, void *b, size_t c, const char *, int) {
if (memcmp(a, b, c))
exit(42);
}
template <class n, typename o = TFromD<n>>
void p(n d, Vec<n> expected, Vec<n> actual, const char *ab, int ac) {
size_t q = Lanes(d);
auto r = AllocateAligned<o>(q), s = AllocateAligned<o>(q);
Store(expected, d, r.get());
Store(actual, d, s.get());
AssertArrayEqual2(r.get(), s.get(), q, ab, ac);
}
#define t(d, expected, actual) p(d, expected, actual, __FILE__, __LINE__)
template <typename o, size_t u, class v> struct w {
static void x(size_t ae, size_t af) {
CappedTag<o, u> d;
size_t ag = Lanes(d);
if (ag < ae)
return;
if (af)
v()(o(), d);
}
};
struct y {
template <typename b, class c> void operator()(b, c d) {
RepartitionToWide<c> aa;
size_t g = Lanes(d);
RebindToUnsigned<decltype(d)> h;
auto m = 0, j = Set(h, b()), k = Set(aa, 0);
for (size_t i = 0; i < g; i++)
t(aa, k, SatWidenMulPairwiseAdd(aa, j, m));
}
};
template <int ah = 1> struct ai {
template <typename o> void operator()(o) {
size_t aj = ah;
constexpr size_t ak HWY_LANES(o);
size_t af = ak;
w<o, ak, y>::x(aj, af);
}
};
void l() { ai()(int8_t()); }
} // namespace hwy
int main() { hwy::l(); }