https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79722
Bug ID: 79722 Summary: Missed opportunity for fused multiply/add with avx2 Product: gcc Version: unknown Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: tkoenig at gcc dot gnu.org Target Milestone: --- Created attachment 40835 --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=40835&action=edit Output of gcc -Ofast -mavx2 -S -o bar-gcc.s bar.c The test case is the same as PR 79709: typedef double v4do __attribute__((vector_size (32))); typedef long int v4i __attribute__((vector_size (32))); #define VSET(vect,val) do { vect[0]=val; vect[1]=val; vect[2]=val; vect[3]=val; } while (0) void foo(v4do cx, v4do cy, v4i *r) { v4do x, y, xn, yn; v4i add, res; v4do two, four; long int done; VSET(res, 0L); VSET(two, 2.0); VSET(four, 4.0); x = cx; y = cy; done = 0; while (1) { xn = x*x - y*y + cx; yn = two*x*y + cy; add = xn+xn + yn*yn < four; res += add; if (add[0] == 0 || add[1] == 0 || add[2] || add[3]) break; x = xn; y = yn; } *r = res; } With gcc, the inner loop is translated into .L13: vpextrq $1, %xmm2, %rax testq %rax, %rax je .L2 vextracti128 $0x1, %ymm2, %xmm2 vmovq %xmm2, %rax testq %rax, %rax jne .L2 vpextrq $1, %xmm2, %rax vmovapd %ymm4, %ymm3 testq %rax, %rax jne .L2 .L3: vmulpd %ymm3, %ymm3, %ymm4 vmulpd %ymm9, %ymm3, %ymm3 vaddpd %ymm0, %ymm4, %ymm4 vmulpd %ymm7, %ymm3, %ymm3 vsubpd %ymm10, %ymm4, %ymm4 vaddpd %ymm1, %ymm3, %ymm9 vaddpd %ymm4, %ymm4, %ymm2 vmulpd %ymm9, %ymm9, %ymm10 vaddpd %ymm10, %ymm2, %ymm2 vcmpltpd %ymm6, %ymm2, %ymm2 vmovq %xmm2, %rax vpaddq %ymm2, %ymm8, %ymm8 testq %rax, %rax jne .L13 icc -O3 -march=core-avx2 -S results in ..B1.6: # Preds ..B1.5 # Execution count [6.94e-01] vmovdqa %ymm11, %ymm5 #27.7 # LOE rbx rdi r12 r13 r14 r15 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7 ..B1.2: # Preds ..B1.6 ..B1.1 # Execution count [1.69e+00] vmovaps %ymm4, %ymm11 #21.24 vfmsub213pd %ymm6, %ymm4, %ymm11 #21.24 vfmsub231pd %ymm5, %ymm5, %ymm11 #21.24 vmulpd %ymm3, %ymm5, %ymm5 #22.16 
vaddpd %ymm11, %ymm11, %ymm8 #23.16 vfmadd213pd %ymm7, %ymm5, %ymm4 #22.22 vfmadd231pd %ymm4, %ymm4, %ymm8 #23.24 vcmpltpd %ymm2, %ymm8, %ymm9 #23.29 vandpd %ymm9, %ymm1, %ymm10 #23.29 vmovups %ymm10, -64(%rsp) #23.7 vpaddq %ymm10, %ymm0, %ymm0 #24.7 cmpq $0, -64(%rsp) #25.21 je ..B1.7 # Prob 20% #25.21 # LOE rbx rdi r12 r13 r14 r15 ymm0 ymm1 ymm2 ymm3 ymm4 ymm6 ymm7 ymm11 ..B1.3: # Preds ..B1.2 # Execution count [1.36e+00] cmpq $0, -56(%rsp) #25.36 je ..B1.7 # Prob 20% #25.36 # LOE rbx rdi r12 r13 r14 r15 ymm0 ymm1 ymm2 ymm3 ymm4 ymm6 ymm7 ymm11 ..B1.4: # Preds ..B1.3 # Execution count [1.08e+00] cmpq $0, -48(%rsp) #25.45 jne ..B1.7 # Prob 20% #25.45 # LOE rbx rdi r12 r13 r14 r15 ymm0 ymm1 ymm2 ymm3 ymm4 ymm6 ymm7 ymm11 ..B1.5: # Preds ..B1.4 # Execution count [8.67e-01] cmpq $0, -40(%rsp) #25.55 je ..B1.6 # Prob 80% #25.55 # LOE rbx rdi r12 r13 r14 r15 ymm0 ymm1 ymm2 ymm3 ymm4 ymm6 ymm7 ymm11 where icc uses eight double precision floating point operations vs. gcc's ten.