https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87139
Bug ID: 87139
Summary: 6.4 x86_64 incorrect code generation with -O3 around
_addcarry_u64
Product: gcc
Version: 6.4.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: cbcode at gmail dot com
Target Milestone: ---
x86_64 gcc-6 (tested with 6.4.0 and 6.3.0 on Linux, and with 6.4.0 on
Windows/MinGW) generates incorrect code when optimizing with -O3 in certain
situations around _addcarry_u64. gcc-6 with -O2 is fine. gcc-5, gcc-7 and gcc-8
with -O3 are all fine. It is difficult to pinpoint the exact problem, but the
following test case should narrow it down enough to be helpful:
#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <x86intrin.h>
using std::size_t;
using std::uint64_t;
static_assert(sizeof(size_t) == sizeof(uint64_t), "");
// Big-integer schoolbook multiplication; the limb type is size_t, which the
// file-level static_assert requires to be the same size as uint64_t. Limbs are
// stored least-significant first.
//
// Template parameters:
//   Na, Nb - number of limbs in each operand.
// Parameters:
//   aa, bb - the operands.
// Returns:
//   The (Na + Nb)-limb product aa * bb.
//
// This function is the reproducer for the reported miscompilation, so the
// arithmetic and the three preprocessor variants are kept exactly as the
// reporter described them. Only the comments that had been broken across
// lines (leaving stray tokens in the code) were rejoined.
template<size_t Na, size_t Nb>
std::array<size_t, Na + Nb> big_mul(std::array<size_t, Na> const& aa,
                                    std::array<size_t, Nb> const& bb) noexcept {
    static constexpr size_t const Nc = Na + Nb;
    std::array<size_t, Nc> cc = {{0,}};
    for (size_t na = 0; na != Na; ++na) {
        for (size_t nb = 0; nb != Nb; ++nb) {
            // 64x64 -> 128-bit partial product; lohi[0] is used as the low
            // limb and lohi[1] as the high limb below.
            // Note: placement-new instead of reinterpret_cast makes no difference.
            // NOTE(review): this writes size_t storage through a __uint128_t
            // lvalue, which looks like a strict-aliasing violation -- worth
            // ruling out (e.g. with -fno-strict-aliasing) before blaming the
            // optimizer.
            alignas(__uint128_t) size_t lohi[2];
            reinterpret_cast<__uint128_t&>(lohi) =
                __uint128_t(aa[na]) * __uint128_t(bb[nb]);

            // Add the low limb at position n; c is the carry out.
            size_t n = na + nb;
            unsigned char c = __builtin_add_overflow(cc[n], lohi[0], &cc[n]);

            // Add the high limb plus carry at position n + 1, then ripple the
            // carry through the remaining limbs.
            // NOTE(review): where size_t is not unsigned long long (presumably
            // LP64 Linux, where it is unsigned long), the cast of cc[...] to
            // unsigned long long& is a second type-punned access -- confirm.
#if 1 // Note: x86_64-gcc-6 -O3 generates bad code; gcc-6 -O2 is fine; gcc-5 and gcc-7 and above are fine
            c = _addcarry_u64(c, cc[n + 1], lohi[1],
                              &reinterpret_cast<unsigned long long&>(cc[n + 1]));
            for (n += 2; n <= Nc - 1; ++n)
                c = __builtin_add_overflow(cc[n], c, &cc[n]);
#elif 1 // Note: added 'c &&' in for-loop condition: problem goes away
            c = _addcarry_u64(c, cc[n + 1], lohi[1],
                              &reinterpret_cast<unsigned long long&>(cc[n + 1]));
            for (n += 2; c && n <= Nc - 1; ++n)
                c = __builtin_add_overflow(cc[n], c, &cc[n]);
#else // Note: inserted '++n' between add_overflow and add_carry: problem goes away
            ++n;
            c = _addcarry_u64(c, cc[n], lohi[1],
                              &reinterpret_cast<unsigned long long&>(cc[n]));
            for (++n; c && n <= Nc - 1; ++n)
                c = __builtin_add_overflow(cc[n], c, &cc[n]);
#endif
        }
    }
    return cc;
}
// Simple random-number generator (splitmix64), see
// https://github.com/svaarala/duktape/blob/master/misc/splitmix64.c
// The state starts at the fixed value 1, so every run produces the same
// sequence.
static uint64_t rnd_state = 1;

// Advances rnd_state by a fixed odd increment and returns a 64-bit value
// obtained by mixing the new state with two xor-shift/multiply rounds and a
// final xor-shift.
static inline uint64_t rnd() {
    uint64_t z = rnd_state += 0x9e3779b97f4a7c15;
    z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
    z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
    return z ^ (z >> 31);
}
//main() writes python3-script; pipe into python3; no output means no error
//g++-6 -std=c++11 -O3 -o test test.cpp && ./test 100 | python3 #bad
//g++-6 -std=c++11 -O2 -o test test.cpp && ./test 100 | python3 #Ok
//g++-5 -std=c++11 -O3 -o test test.cpp && ./test 100 | python3 #Ok
//g++-7 -std=c++11 -O3 -o test test.cpp && ./test 100 | python3 #Ok
//g++-8 -std=c++11 -O3 -o test test.cpp && ./test 100 | python3 #Ok
// Test driver: argv[1] is the number of random test cases M. For each case it
// fills two operands with rnd(), multiplies them with big_mul and prints a
// python3 snippet that assigns a, b and c (most-significant limb first) and
// prints a diagnostic if a*b != c. Piping the output into python3 therefore
// prints nothing when every product is correct.
int main(int argc, char const** argv) {
    assert(argc == 2);
    size_t const M = size_t(std::strtoull(argv[1], nullptr, 0));
    static constexpr size_t const Na = 2, Nb = 2; // Note: any combination of Na>=2, Nb>=2 causes the problem
    std::array<size_t, Na     > aa;
    std::array<size_t, Nb     > bb;
    std::array<size_t, Na + Nb> cc;
    for (size_t m = 0; m != M; ++m) {
        for (size_t& a: aa) a = rnd();
        for (size_t& b: bb) b = rnd();
        cc = big_mul(aa, bb);

        // Limbs are printed from the highest index down so that the
        // concatenated hex digits form one big-endian integer literal.
        std::printf(  "a=0x");
        for (size_t n = Na; n--;) std::printf("%016zx", aa[n]);
        std::printf("; b=0x");
        for (size_t n = Nb; n--;) std::printf("%016zx", bb[n]);
        std::printf("; c=0x");
        for (size_t n = Na + Nb; n--;) std::printf("%016zx", cc[n]);
        std::printf("\nif a*b!=c: print(hex(a),'*', hex(b),'\\n=',hex(a*b),'\\n?',hex(c))\n");
    }
    return 0;
}