https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97343
Richard Biener <rguenth at gcc dot gnu.org> changed: What |Removed |Added ---------------------------------------------------------------------------- Status|UNCONFIRMED |NEW Component|target |tree-optimization Ever confirmed|0 |1 Target| |x86_64-*-* i?86-*-* Blocks| |53947 Keywords| |missed-optimization Last reconfirmed| |2020-10-09 CC| |rguenth at gcc dot gnu.org --- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> --- All below for Part 2. Without -ffast-math you are seeing GCC using in-order reductions now while with -ffast-math the vectorizer gets a bit confused about reassociations done before, for me producing .L3: vmovupd 32(%rsi,%rax), %ymm3 vmovupd (%rdx,%rax), %ymm7 vinsertf128 $1, (%rsi,%rax), %ymm3, %ymm0 vinsertf128 $1, 32(%rdx,%rax), %ymm7, %ymm2 vmovupd 32(%rsi,%rax), %ymm5 vpermpd $136, %ymm0, %ymm4 vpermpd $40, %ymm2, %ymm7 vpermpd $221, %ymm0, %ymm1 vpermpd $125, %ymm2, %ymm3 vperm2f128 $49, (%rsi,%rax), %ymm5, %ymm0 vmovupd (%rdx,%rax), %ymm2 vperm2f128 $49, 32(%rdx,%rax), %ymm2, %ymm2 addq $64, %rax vpermpd $136, %ymm0, %ymm5 vpermpd $221, %ymm0, %ymm0 vpermpd $40, %ymm2, %ymm8 vpermpd $125, %ymm2, %ymm2 vmulpd %ymm8, %ymm5, %ymm5 vmulpd %ymm2, %ymm0, %ymm0 vfmadd132pd %ymm3, %ymm5, %ymm1 vfmadd231pd %ymm7, %ymm4, %ymm0 vaddpd %ymm0, %ymm1, %ymm0 vaddpd %ymm0, %ymm6, %ymm6 cmpq %rcx, %rax jne .L3 -ffast-math vs. non-ffast-math we're using a SLP reduction vs. 
4 reduction chains and this SLP reduction ends up looking like t5.c:7:21: note: Vectorizing SLP tree: t5.c:7:21: note: node 0x4100c20 (max_nunits=4, refcnt=2) t5.c:7:21: note: stmt 0 acc_imre_158 = acc_imre_3 + _34; t5.c:7:21: note: stmt 1 acc_reim_156 = acc_reim_1 + _8; t5.c:7:21: note: stmt 2 acc_imim_154 = _21 + acc_imim_35; t5.c:7:21: note: stmt 3 acc_rere_146 = _11 + acc_rere_29; t5.c:7:21: note: children 0x3f272e0 0x4100bb0 t5.c:7:21: note: node 0x3f272e0 (max_nunits=4, refcnt=1) t5.c:7:21: note: stmt 0 acc_imre_3 = PHI <acc_imre_158(7), 0.0(8)> t5.c:7:21: note: stmt 1 acc_reim_1 = PHI <acc_reim_156(7), 0.0(8)> t5.c:7:21: note: stmt 2 acc_imim_35 = PHI <acc_imim_154(7), 0.0(8)> t5.c:7:21: note: stmt 3 acc_rere_29 = PHI <acc_rere_146(7), 0.0(8)> t5.c:7:21: note: children 0x4100c20 t5.c:7:21: note: node 0x4100bb0 (max_nunits=4, refcnt=1) t5.c:7:21: note: stmt 0 _34 = _36 + _157; t5.c:7:21: note: stmt 1 _8 = _30 + _155; t5.c:7:21: note: stmt 2 _21 = _15 + _153; t5.c:7:21: note: stmt 3 _11 = _6 + _145; t5.c:7:21: note: children 0x4100920 0x4100b40 t5.c:7:21: note: node 0x4100920 (max_nunits=4, refcnt=1) t5.c:7:21: note: stmt 0 _36 = _37 + _73; t5.c:7:21: note: stmt 1 _30 = _32 + _71; t5.c:7:21: note: stmt 2 _15 = _10 + _69; t5.c:7:21: note: stmt 3 _6 = _31 + _61; t5.c:7:21: note: children 0x41004e0 0x41008b0 t5.c:7:21: note: node 0x41004e0 (max_nunits=4, refcnt=1) t5.c:7:21: note: stmt 0 _37 = _101 + _129; t5.c:7:21: note: stmt 1 _32 = _99 + _127; t5.c:7:21: note: stmt 2 _10 = _97 + _125; t5.c:7:21: note: stmt 3 _31 = _89 + _117; t5.c:7:21: note: children 0x3f2a550 0x3f28700 t5.c:7:21: note: node 0x3f2a550 (max_nunits=4, refcnt=1) t5.c:7:21: note: stmt 0 _101 = _88 * _94; t5.c:7:21: note: stmt 1 _99 = _86 * _96; t5.c:7:21: note: stmt 2 _97 = _94 * _96; t5.c:7:21: note: stmt 3 _89 = _86 * _88; t5.c:7:21: note: children 0x40b6990 0x3f29e00 t5.c:7:21: note: node 0x40b6990 (max_nunits=4, refcnt=1) t5.c:7:21: note: stmt 0 _88 = *_87; t5.c:7:21: note: stmt 1 _96 = 
*_95; t5.c:7:21: note: stmt 2 _96 = *_95; t5.c:7:21: note: stmt 3 _88 = *_87; t5.c:7:21: note: load permutation { 1 5 5 1 } t5.c:7:21: note: node 0x3f29e00 (max_nunits=4, refcnt=1) t5.c:7:21: note: stmt 0 _94 = *_93; t5.c:7:21: note: stmt 1 _86 = *_85; t5.c:7:21: note: stmt 2 _94 = *_93; t5.c:7:21: note: stmt 3 _86 = *_85; t5.c:7:21: note: load permutation { 5 1 5 1 } t5.c:7:21: note: node 0x3f28700 (max_nunits=4, refcnt=1) t5.c:7:21: note: stmt 0 _129 = _116 * _122; t5.c:7:21: note: stmt 1 _127 = _114 * _124; t5.c:7:21: note: stmt 2 _125 = _122 * _124; t5.c:7:21: note: stmt 3 _117 = _114 * _116; t5.c:7:21: note: children 0x3f287e0 0x3f28770 t5.c:7:21: note: node 0x3f287e0 (max_nunits=4, refcnt=1) t5.c:7:21: note: stmt 0 _116 = *_115; t5.c:7:21: note: stmt 1 _124 = *_123; t5.c:7:21: note: stmt 2 _124 = *_123; t5.c:7:21: note: stmt 3 _116 = *_115; t5.c:7:21: note: load permutation { 2 6 6 2 } t5.c:7:21: note: node 0x3f28770 (max_nunits=4, refcnt=1) t5.c:7:21: note: stmt 0 _122 = *_121; t5.c:7:21: note: stmt 1 _114 = *_113; t5.c:7:21: note: stmt 2 _122 = *_121; t5.c:7:21: note: stmt 3 _114 = *_113; t5.c:7:21: note: load permutation { 6 2 6 2 } t5.c:7:21: note: node 0x41008b0 (max_nunits=4, refcnt=1) t5.c:7:21: note: stmt 0 _73 = _60 * _66; t5.c:7:21: note: stmt 1 _71 = _58 * _68; t5.c:7:21: note: stmt 2 _69 = _66 * _68; t5.c:7:21: note: stmt 3 _61 = _58 * _60; t5.c:7:21: note: children 0x4100290 0x4100810 t5.c:7:21: note: node 0x4100290 (max_nunits=4, refcnt=1) t5.c:7:21: note: stmt 0 _60 = *_59; t5.c:7:21: note: stmt 1 _68 = *_67; t5.c:7:21: note: stmt 2 _68 = *_67; t5.c:7:21: note: stmt 3 _60 = *_59; t5.c:7:21: note: load permutation { 0 4 4 0 } t5.c:7:21: note: node 0x4100810 (max_nunits=4, refcnt=1) t5.c:7:21: note: stmt 0 _66 = *_65; t5.c:7:21: note: stmt 1 _58 = *_57; t5.c:7:21: note: stmt 2 _66 = *_65; t5.c:7:21: note: stmt 3 _58 = *_57; t5.c:7:21: note: load permutation { 4 0 4 0 } t5.c:7:21: note: node 0x4100b40 (max_nunits=4, refcnt=1) t5.c:7:21: note: stmt 
0 _157 = _144 * _150; t5.c:7:21: note: stmt 1 _155 = _142 * _152; t5.c:7:21: note: stmt 2 _153 = _150 * _152; t5.c:7:21: note: stmt 3 _145 = _142 * _144; t5.c:7:21: note: children 0x4100990 0x4100a50 t5.c:7:21: note: node 0x4100990 (max_nunits=4, refcnt=1) t5.c:7:21: note: stmt 0 _144 = *_143; t5.c:7:21: note: stmt 1 _152 = *_151; t5.c:7:21: note: stmt 2 _152 = *_151; t5.c:7:21: note: stmt 3 _144 = *_143; t5.c:7:21: note: load permutation { 3 7 7 3 } t5.c:7:21: note: node 0x4100a50 (max_nunits=4, refcnt=1) t5.c:7:21: note: stmt 0 _150 = *_149; t5.c:7:21: note: stmt 1 _142 = *_141; t5.c:7:21: note: stmt 2 _150 = *_149; t5.c:7:21: note: stmt 3 _142 = *_141; t5.c:7:21: note: load permutation { 7 3 7 3 } which eventually shows some non-obvious permute optimization opportunities. I'm currently working on a permute optimization phase, but for a start it will only handle cases that do not help here. Btw, if I use -ffast-math but disable reassociation via -fno-tree-reassoc I get the reduction chain variant which optimizes to .L3: vmovupd 32(%rsi,%rax), %ymm6 vmovupd 32(%rdx,%rax), %ymm7 vmovupd (%rsi,%rax), %ymm5 vfmadd231pd (%rdx,%rax), %ymm6, %ymm0 vfmadd231pd (%rdx,%rax), %ymm5, %ymm3 vfmadd231pd (%rsi,%rax), %ymm7, %ymm1 addq $64, %rax vfmadd231pd %ymm6, %ymm7, %ymm2 cmpq %rcx, %rax jne .L3 even with GCC 10 (-Ofast -march=core-avx2 -fno-tree-reassoc). 
Which means the following source change helps: void __attribute__((optimize("no-tree-reassoc"))) cdot(double* res, const double* a, const double* b, int N) { double acc_rere = 0; double acc_imim = 0; double acc_reim = 0; double acc_imre = 0; for (int c = 0; c < N; ++c) { for (int k = 0; k < 4; ++k) { acc_rere += a[c*8+k+0]*b[c*8+k+0]; acc_imim += a[c*8+k+4]*b[c*8+k+4]; acc_reim += a[c*8+k+0]*b[c*8+k+4]; acc_imre += a[c*8+k+4]*b[c*8+k+0]; } } res[0] = acc_rere+acc_imim; res[4] = acc_imre-acc_reim; } With this change the reduction epilogue ends up like vextractf128 $0x1, %ymm3, %xmm4 vaddpd %xmm3, %xmm4, %xmm3 vunpckhpd %xmm3, %xmm3, %xmm4 vaddpd %xmm3, %xmm4, %xmm3 vextractf128 $0x1, %ymm2, %xmm4 vaddpd %xmm2, %xmm4, %xmm4 vunpckhpd %xmm4, %xmm4, %xmm2 vaddpd %xmm4, %xmm2, %xmm2 vextractf128 $0x1, %ymm1, %xmm4 vaddpd %xmm1, %xmm4, %xmm4 vaddsd %xmm2, %xmm3, %xmm2 vunpckhpd %xmm4, %xmm4, %xmm1 vaddpd %xmm4, %xmm1, %xmm1 vextractf128 $0x1, %ymm0, %xmm4 vaddpd %xmm0, %xmm4, %xmm4 vunpckhpd %xmm4, %xmm4, %xmm0 vaddpd %xmm4, %xmm0, %xmm0 vsubsd %xmm1, %xmm0, %xmm0 vzeroupper vmovsd %xmm2, (%rdi) vmovsd %xmm0, 32(%rdi) which is not optimal since we miss the opportunity to vectorize the adds of the accumulators: res[0] = acc_rere+acc_imim; res[4] = acc_imre-acc_reim; Referenced Bugs: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947 [Bug 53947] [meta-bug] vectorizer missed-optimizations