https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98544
--- Comment #12 from Richard Biener <rguenth at gcc dot gnu.org> --- So at least the following doesn't produce any different answers for me comparing -O -ftree-loop-vectorize -mavx to -O0 typedef unsigned long size_t; template<typename T> inline void PM(T &a, T &b, T c, T d) { a=c+d; b=c-d; } template<typename T1, typename T2, typename T3> inline void MULPM (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) { a=c*e+d*f; b=c*f-d*e; } typedef double T; void __attribute__((noipa)) radb2(size_t ido, size_t l1, const T * __restrict cc, T * __restrict ch, const T * __restrict wa) { auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; }; auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T& { return cc[a+ido*(b+2*c)]; }; auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T& { return ch[a+ido*(b+l1*c)]; }; for (size_t k=0; k<l1;++k) for (size_t i=2; i<ido; i+=2) { size_t ic=ido-i; T ti2, tr2; PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k)); PM (ti2,CH(i ,k,0),CC(i ,0,k),CC(ic ,1,k)); MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2); } } int main() { double *cc, *ch, *wa; cc = new double[8*8*8]; ch = new double[8*8*8]; wa = new double[8*8*8]; for (size_t i = 0; i < 8*8*8; ++i) cc[i] = i; for (size_t i = 0; i < 8*8*8; ++i) ch[i] = i + 0.25; for (size_t i = 0; i < 8*8*8; ++i) wa[i] = i - 0.5; radb2 (8, 8, cc, ch, wa); for (size_t i = 0; i < 8*8*8; ++i) __builtin_printf ("%f %f %f\n", cc[i], ch[i], wa[i]); return 0; }