https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87139
Bug ID: 87139
Summary: 6.4 x86_64 incorrect code generation with -O3 around _addcarry_u64
Product: gcc
Version: 6.4.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: cbcode at gmail dot com
Target Milestone: ---

x86_64 gcc-6 (tested with 6.4.0 and 6.3.0 under linux and 6.4.0 under windows/mingw) generates incorrect code when optimizing with -O3 in certain situations around _addcarry_u64. gcc-6 with -O2 is fine. gcc-5, gcc-7 and gcc-8 with -O3 are all fine. It is difficult to pinpoint the exact problem but the following should narrow it down enough to be helpful.

#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <x86intrin.h>

using std::size_t;
using std::uint64_t;
//The reproducer treats size_t and uint64_t as interchangeable limb types; refuse to build otherwise.
static_assert(sizeof(size_t) == sizeof(uint64_t), "");

//big-integer multiplication, limb-size is size_t is uint64_t
//
//Multiplies the Na-limb integer aa by the Nb-limb integer bb and returns the
//(Na + Nb)-limb product. Limb 0 is the least significant one (main() prints the
//highest index first).
//For every pair (na, nb) the 128-bit product aa[na] * bb[nb] is stored into the
//two-element buffer lohi, lohi[0] is added into cc[na + nb], lohi[1] plus the
//carry is added into cc[na + nb + 1], and any remaining carry is rippled
//through the higher limbs of cc.
//The three preprocessor branches below are alternative spellings of the
//"add high half and propagate carry" step; as written only the first
//(#if 1) branch is compiled.
template<size_t Na, size_t Nb>
std::array<size_t, Na + Nb> big_mul(std::array<size_t, Na> const& aa, std::array<size_t, Nb> const& bb) noexcept {
    static constexpr size_t const Nc = Na + Nb;
    std::array<size_t, Nc> cc = {{0,}};
    for (size_t na = 0; na != Na; ++na) {
        for (size_t nb = 0; nb != Nb; ++nb) {
            //lohi receives the full 128-bit product; on little-endian x86_64
            //lohi[0] is the low 64 bits and lohi[1] the high 64 bits.
            //NOTE(review): this writes a size_t[2] object through a __uint128_t
            //lvalue - confirm whether this type punning matters for the
            //behaviour being reported.
            alignas(__uint128_t) size_t lohi[2];
            reinterpret_cast<__uint128_t&>(lohi) = __uint128_t(aa[na]) * __uint128_t(bb[nb]);
            //Note: placement-new instead of reinterpret_cast makes no difference
            size_t n = na + nb;
            //Add the low half into limb n; c becomes the carry-out (0 or 1).
            unsigned char c = __builtin_add_overflow(cc[n], lohi[0], &cc[n]);
#if 1 //Note: x86_64-gcc-6 -O3 generates bad code; gcc-6 -O2 is fine; gcc-5 and gcc-7 and above are fine
            //Add the high half plus carry-in into limb n + 1, then add the
            //carry into every limb from n + 2 up to Nc - 1 (no early exit).
            //NOTE(review): cc holds size_t but is accessed here as
            //unsigned long long - presumably distinct types on LP64 targets;
            //verify whether that is relevant to the miscompile.
            c = _addcarry_u64(c, cc[n + 1], lohi[1], &reinterpret_cast<unsigned long long&>(cc[n + 1]));
            for (n += 2; n <= Nc - 1; ++n)
                c = __builtin_add_overflow(cc[n], c, &cc[n]);
#elif 1 //Note: added 'c &&' in for-loop condition: problem goes away
            //Same as the first branch, but the ripple loop stops as soon as the carry is zero.
            c = _addcarry_u64(c, cc[n + 1], lohi[1], &reinterpret_cast<unsigned long long&>(cc[n + 1]));
            for (n += 2; c && n <= Nc - 1; ++n)
                c = __builtin_add_overflow(cc[n], c, &cc[n]);
#else //Note: inserted '++n' between add_overflow and add_carry: problem goes away
            //Same as the second branch, but n is incremented first so the
            //intrinsic indexes cc[n] instead of cc[n + 1].
            ++n;
            c = _addcarry_u64(c, cc[n], lohi[1], &reinterpret_cast<unsigned long long&>(cc[n]));
            for (++n; c && n <= Nc - 1; ++n)
                c = __builtin_add_overflow(cc[n], c, &cc[n]);
#endif
        }
    }
    return cc;
}

//simple random-number-generator, https://github.com/svaarala/duktape/blob/master/misc/splitmix64.c
//
//rnd_state is the file-scope generator state; it starts at 1, so every run
//produces the same sequence.
static uint64_t rnd_state = 1;
//Advances rnd_state by a fixed constant and returns the new state scrambled by
//two xor-shift/multiply steps and a final xor-shift.
static inline uint64_t rnd() {
    uint64_t z = rnd_state += 0x9e3779b97f4a7c15;
    z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
    z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
    return z ^ (z >> 31);
}

//main() writes python3-script; pipe into python3; no output means no error
//g++-6 -std=c++11 -O3 -o test test.cpp && ./test 100 | python3 #bad
//g++-6 -std=c++11 -O2 -o test test.cpp && ./test 100 | python3 #Ok
//g++-5 -std=c++11 -O3 -o test test.cpp && ./test 100 | python3 #Ok
//g++-7 -std=c++11 -O3 -o test test.cpp && ./test 100 | python3 #Ok
//g++-8 -std=c++11 -O3 -o test test.cpp && ./test 100 | python3 #Ok
//
//argv[1] is the number of random test cases M. For each case main() fills aa
//and bb with rnd() values, multiplies them with big_mul, and prints one line
//assigning a, b and c as hex literals followed by a Python 'if' line that
//prints the operands, a*b and c when a*b differs from c.
int main(int argc, char const** argv) {
    //NOTE(review): assert is compiled out under NDEBUG, leaving argv[1] unchecked.
    assert(argc == 2);
    //Base 0 lets strtoull accept decimal, octal (0...) and hex (0x...) input.
    //NOTE(review): std::strtoull is declared in <cstdlib>, which is not among
    //the includes above - presumably it arrives transitively; confirm.
    size_t const M = size_t(std::strtoull(argv[1], nullptr, 0));
    static constexpr size_t const Na = 2, Nb = 2; //Note: any combination of Na>=2, Nb>=2 causes the problem
    std::array<size_t, Na > aa;
    std::array<size_t, Nb > bb;
    std::array<size_t, Na + Nb> cc;
    for (size_t m = 0; m != M; ++m) {
        for (size_t& a: aa) a = rnd();
        for (size_t& b: bb) b = rnd();
        cc = big_mul(aa, bb);
        //Each number is printed most significant limb first, 16 hex digits per limb.
        std::printf( "a=0x");for (size_t n = Na; n--;) std::printf("%016zx", aa[n]);
        std::printf("; b=0x");for (size_t n = Nb; n--;) std::printf("%016zx", bb[n]);
        std::printf("; c=0x");for (size_t n = Na+Nb; n--;) std::printf("%016zx", cc[n]);
        std::printf("\nif a*b!=c: print(hex(a),'*', hex(b),'\\n=',hex(a*b),'\\n?',hex(c))\n");
    }
    return 0;
}