https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95218
--- Comment #8 from Martin Liška <marxin at gcc dot gnu.org> ---
There's partially reduced test-case:

$ cat fma.i
/* Partially reduced reproducer: three element-wise multiply/subtract/add
   loops over 32 doubles; the results of the first two loops are compared
   against precomputed tables and abort is called on any mismatch.
   The code shape is deliberately left exactly as reduced. */

/* Reference values compared against m4 after the first loop in fma_test,
   which computes ((a * b) - c) * a - b; for i = 0 that is
   ((1 * 2) - 3) * 1 - 2 = -3. */
double res_test0101[] = {
    -3,    1,     17,    51,    109,   197,   321,   487,
    701,   969,   1297,  1691,  2157,  2701,  3329,  4047,
    4861,  5777,  6801,  7939,  9197,  10581, 12097, 13751,
    15549, 17497, 19601, 21867, 24301, 26909, 29697, 32671};

/* Reference values compared against m4 after the second loop in fma_test,
   which computes -((a * b) - c) * a + b; for i = 0 that is
   -((1 * 2) - 3) * 1 + 2 = 3. */
double res_test0110[] = {
    3,      -1,     -17,    -51,    -109,   -197,   -321,   -487,
    -701,   -969,   -1297,  -1691,  -2157,  -2701,  -3329,  -4047,
    -4861,  -5777,  -6801,  -7939,  -9197,  -10581, -12097, -13751,
    -15549, -17497, -19601, -21867, -24301, -26909, -29697, -32671};

/* Both attribute lists are empty; presumably leftovers of the reduction
   (the build command shown below passes -Wno-attributes). */
extern void abort() __attribute__(()) __attribute__(());

/* Executes the cpuid instruction with __leaf as the input operand tied to
   operand 0 (the a-constrained output) and stores the four outputs through
   __eax, __ebx, __ecx and __edx.
   NOTE(review): declared to return int, but this reduced form contains no
   return statement even though main tests the result. */
static __inline int __get_cpuid(unsigned int __leaf, unsigned int *__eax,
                                unsigned int *__ebx, unsigned int *__ecx,
                                unsigned int *__edx) {
  __asm__("cpuid\n\t"
          : "=a"(*__eax), "=b"(*__ebx), "=c"(*__ecx), "=d"(*__edx)
          : "0"(__leaf));
}

static void fma_test();

/* Queries cpuid leaf 1 and runs fma_test only when bit 12 of ecx is set;
   always returns 0. */
int main() {
  unsigned int eax, ebx, ecx, edx;
  /* The body of this if is the expression statement 0, so the branch has
     no effect either way. */
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
    0;
  /* NOTE(review): bit 12 of ecx for leaf 1 is presumably the FMA feature
     flag; confirm against the CPUID documentation. */
  if (ecx & (1 << 12))
    fma_test();
  return 0;
}

/* Inputs: m1[i] = i + 1, m2[i] = i + 2, m3[i] = i + 3 for i in 0..31. */
double m1[] = {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
               12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
               23, 24, 25, 26, 27, 28, 29, 30, 31, 32};
double m2[] = {2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
               13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
               24, 25, 26, 27, 28, 29, 30, 31, 32, 33};
double m3[] = {3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13,
               14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
               25, 26, 27, 28, 29, 30, 31, 32, 33, 34};

/* Output buffer written by each loop in fma_test. */
double m4[32];

/* Set to 1 by compare_result on a mismatch; nothing in this file resets it. */
int test_fails = 0;

/* Compares the 32 elements of m4 with res using exact != comparison and
   sets test_fails to 1 if any element differs. test_fails is left
   untouched when all elements match. */
void compare_result(double *res) {
  int i;
  int good = 1;
  i = 0;
  for (i; i < 32; i++)
    if (m4[i] != res[i])
      if (good)
        good = 0;
  if (!good)
    test_fails = 1;
}

/* Fills m4 three times from m1, m2 and m3 using three sign variants of
   ((a * b) - c) * a and b, checks the first two results against the
   reference tables, and calls abort if test_fails is set at the end. */
static void fma_test() {
  double __trans_tmp_3;
  double __trans_tmp_2;
  double __trans_tmp_1;
  int i;
  /* Clear the output buffer before the first computation. */
  for (i = 0; i < 32; i++)
    m4[i] = 0;
  i = 0;
  /* Variant 1: ((a * b) - c) * a - b, checked against res_test0101. */
  for (i; i < 32; i++) {
    double a = m1[i];
    double b = m2[i];
    double c = m3[i];
    __trans_tmp_1 = ((a * b) - c) * a - b;
    m4[i] = __trans_tmp_1;
  }
  compare_result(res_test0101);
  i = 0;
  /* Variant 2: -((a * b) - c) * a + b, checked against res_test0110.
     The extra inner scope is part of the reduced test case. */
  for (i; i < 32; i++) {
    {
      double a = m1[i];
      double b = m2[i];
      double c = m3[i];
      __trans_tmp_3 = -((a * b) - c) * a + b;
    }
    m4[i] = __trans_tmp_3;
  }
  compare_result(res_test0110);
  i = 0;
  /* Variant 3: -((a * b) - c) * a - b. The result is stored in m4 but no
     compare_result call follows it in this reduced form. */
  for (i; i < 32; i++) {
    double a = m1[i];
    double b = m2[i];
    double c = m3[i];
    __trans_tmp_2 = -((a * b) - c) * a - b;
    m4[i] = __trans_tmp_2;
  }
  if (test_fails)
    abort();
}

$ gcc -O3 -Wno-attributes -mfpmath=sse -mfma fma.i && ./a.out
Aborted (core dumped)