http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50713
--- Comment #4 from Richard Guenther <rguenth at gcc dot gnu.org> 2011-10-14
13:23:52 UTC ---
(In reply to comment #3)
> thanks for the splitting: it is indeed the most serious problem.
> what about "complex ops in scalar code"? keep it here and shall I spawn a
> specific one (maybe once PR50622 is fixed…)?
Yes, splitting that would be a good idea as well.
Btw, for the float sum() testcase the Intel compiler generates:
# sum(A, A) as emitted by the Intel compiler (AT&T syntax: op src, dst).
# Performs four scalar single-precision adds between parameter 1 and
# parameter 2 and returns the four sums in %xmm0 / %xmm1.
# Only the low 64 bits (two floats) of each register are spilled and reloaded;
# below, "lane 0" is bits 0-31 and "lane 1" is bits 32-63 of a register.
# Scratch memory is addressed at negative offsets from %rsp without adjusting
# %rsp (presumably relying on the SysV AMD64 red zone of a leaf function).
# The trailing "#11.1"-style comments are the compiler's own annotations
# (presumably source line.column).
_Z3sum1AS_:
# parameter 1: %xmm0 %xmm1
# parameter 2: %xmm2 %xmm3
..B2.1: # Preds ..B2.0
..___tag_value__Z3sum1AS_.4: #11.1
# Copy parameter 2's registers, then shuffle each copy with selector $1 so
# that its lane 1 moves into lane 0, where the scalar addss below can use it:
#   %xmm4 lane 0 = %xmm2 lane 1,  %xmm5 lane 0 = %xmm3 lane 1.
movaps %xmm2, %xmm4 #11.1
movaps %xmm3, %xmm5 #11.1
shufps $1, %xmm2, %xmm4 #11.1
shufps $1, %xmm3, %xmm5 #11.1
# Spill the low 64 bits of parameter 1 to the stack so that its lane-1 floats
# become addressable as memory operands:
#   -24: %xmm0 lane 0   -20: %xmm0 lane 1   -16: %xmm1 lane 0   -12: %xmm1 lane 1
movlps %xmm0, -24(%rsp) #11.1
movlps %xmm1, -16(%rsp) #11.1
# Four scalar adds. Lane 0 values are added register-to-register; lane 1
# values of parameter 2 (now in %xmm4/%xmm5 lane 0) are added to the spilled
# lane 1 values of parameter 1.
addss %xmm2, %xmm0 #12.3
addss -20(%rsp), %xmm4 #12.3
addss -12(%rsp), %xmm5 #12.3
addss %xmm3, %xmm1 #12.3
# Store the four sums contiguously at -24, -20, -16, -12(%rsp), overwriting
# the spilled copy of parameter 1 ...
movss %xmm0, -24(%rsp) #12.3
movss %xmm4, -20(%rsp) #12.3
movss %xmm1, -16(%rsp) #12.3
movss %xmm5, -12(%rsp) #12.3
# ... and reload them as two 64-bit pairs into the return registers.
movsd -24(%rsp), %xmm0 #16.10
movsd -16(%rsp), %xmm1 #16.10
ret #16.10
That is awful, but still slightly better than what GCC generates when vectorization is turned off:
# sum(A, A) as emitted by GCC with vectorization turned off (AT&T syntax:
# op src, dst). All four incoming registers are spilled to the stack, each
# float is reloaded and added individually, the sums are stored back, and the
# result is reloaded as two 64-bit pairs into %xmm0 / %xmm1.
# Presumably %xmm0/%xmm1 carry the first argument and %xmm2/%xmm3 the second
# (SysV AMD64) - the listing itself does not say.
# Scratch memory is addressed at negative offsets from %rsp without adjusting
# %rsp (presumably the SysV AMD64 red zone of a leaf function).
# Below, "lo" is bits 0-31 and "hi" is bits 32-63 of a register.
_Z3sum1AS_:
.LFB1:
# NOTE(review): the matching .cfi_endproc is not part of the quoted listing.
.cfi_startproc
# Spill the low 64 bits of every argument register. Resulting stack layout:
#   -56: %xmm2 lo   -52: %xmm2 hi   -48: %xmm3 lo   -44: %xmm3 hi
#   -40: %xmm0 lo   -36: %xmm0 hi   -32: %xmm1 lo   -28: %xmm1 hi
movq %xmm3, -48(%rsp)
movq %xmm2, -56(%rsp)
movq %xmm0, -40(%rsp)
movq %xmm1, -32(%rsp)
# Reload each float of %xmm2/%xmm3 from its slot and add the corresponding
# float of %xmm0/%xmm1 from memory (loads and adds are interleaved):
#   %xmm2 = [-52] + [-36]   (hi of old %xmm2 + hi of old %xmm0)
#   %xmm1 = [-48] + [-32]   (lo of old %xmm3 + lo of old %xmm1)
#   %xmm0 = [-44] + [-28]   (hi of old %xmm3 + hi of old %xmm1)
#   %xmm3 = [-56] + [-40]   (lo of old %xmm2 + lo of old %xmm0)
movss -52(%rsp), %xmm2
movss -48(%rsp), %xmm1
movss -44(%rsp), %xmm0
addss -32(%rsp), %xmm1
movss -56(%rsp), %xmm3
addss -28(%rsp), %xmm0
addss -36(%rsp), %xmm2
addss -40(%rsp), %xmm3
# Store the four sums so that they end up contiguous in element order at
# -24, -20, -16, -12(%rsp).
movss %xmm1, -16(%rsp)
movss %xmm0, -12(%rsp)
movss %xmm2, -20(%rsp)
movss %xmm3, -24(%rsp)
# Reload the sums as two 64-bit pairs into the return registers.
movq -16(%rsp), %xmm1
movq -24(%rsp), %xmm0
ret
(well, eventually)