[Bug tree-optimization/50713] SLP vs loop: code generated differs

rguenth at gcc dot gnu.org Fri, 14 Oct 2011 06:24:48 -0700

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50713


--- Comment #4 from Richard Guenther <rguenth at gcc dot gnu.org> 2011-10-14 13:23:52 UTC ---
(In reply to comment #3)
> thanks for the splitting: it is indeed the most serious problem.
> what about "complex ops in scalar code"? keep it here and shall I spawn a
> specific one (maybe once PR50622 is fixed…)?

Yes, splitting that would be a good idea as well.

Btw, for the float sum() testcase the Intel compiler generates:

# _Z3sum1AS_ -- Intel compiler output for the float sum() testcase
# (x86-64, AT&T syntax; the symbol demangles to sum(A, A)).
# Per the compiler's own annotations below, parameter 1 arrives in
# %xmm0/%xmm1 and parameter 2 in %xmm2/%xmm3.  Only the low two floats of
# each register feed the result.  The four sums are computed with four
# scalar addss instructions (no packed add is used) and are reassembled
# through scratch slots at -24(%rsp)..-12(%rsp); %rsp itself is never
# adjusted.
_Z3sum1AS_:
# parameter 1: %xmm0 %xmm1
# parameter 2: %xmm2 %xmm3
..B2.1:                         # Preds ..B2.0
..___tag_value__Z3sum1AS_.4:                                    #11.1
        # Copy %xmm2/%xmm3, then shufps $1 brings element 1 of each copy
        # down into element 0, the lane that addss operates on.
        movaps    %xmm2, %xmm4                                  #11.1
        movaps    %xmm3, %xmm5                                  #11.1
        shufps    $1, %xmm2, %xmm4                              #11.1
        shufps    $1, %xmm3, %xmm5                              #11.1
        # Spill the low 64 bits of %xmm0/%xmm1 so that their element 1 can
        # be read back as a scalar memory operand at -20(%rsp) / -12(%rsp).
        movlps    %xmm0, -24(%rsp)                              #11.1
        movlps    %xmm1, -16(%rsp)                              #11.1
        # Four scalar adds: the element-0 pairs are added register to
        # register, the element-1 pairs are added against the spilled copies.
        addss     %xmm2, %xmm0                                  #12.3
        addss     -20(%rsp), %xmm4                              #12.3
        addss     -12(%rsp), %xmm5                              #12.3
        addss     %xmm3, %xmm1                                  #12.3
        # Store the four sums contiguously at -24, -20, -16 and -12(%rsp),
        # overwriting the spilled inputs.
        movss     %xmm0, -24(%rsp)                              #12.3
        movss     %xmm4, -20(%rsp)                              #12.3
        movss     %xmm1, -16(%rsp)                              #12.3
        movss     %xmm5, -12(%rsp)                              #12.3
        # Reload the sums as two 64-bit pairs into %xmm0/%xmm1
        # (presumably the registers carrying the return value).
        movsd     -24(%rsp), %xmm0                              #16.10
        movsd     -16(%rsp), %xmm1                              #16.10
        ret                                                     #16.10

Awful, but still slightly better than what GCC generates when vectorization is turned off:

# _Z3sum1AS_ -- GCC output for the same function with vectorization turned
# off (x86-64, AT&T syntax; the symbol demangles to sum(A, A)).
# All four input registers are first stored below %rsp (which is never
# adjusted), every float is then reloaded and added individually with
# addss, and the four sums are stored again before being reloaded as two
# 64-bit pairs.  Presumably %xmm0/%xmm1 and %xmm2/%xmm3 carry the two
# arguments -- confirm against the testcase.
_Z3sum1AS_:
.LFB1:
        # NOTE: the matching .cfi_endproc is not part of this quoted excerpt.
        .cfi_startproc
        # Spill the low 64 bits of each input register.  Resulting layout:
        #   -56(%rsp): %xmm2 elements 0,1     -48(%rsp): %xmm3 elements 0,1
        #   -40(%rsp): %xmm0 elements 0,1     -32(%rsp): %xmm1 elements 0,1
        movq    %xmm3, -48(%rsp)
        movq    %xmm2, -56(%rsp)
        movq    %xmm0, -40(%rsp)
        movq    %xmm1, -32(%rsp)
        # Reload each float of the %xmm2/%xmm3 spill as a scalar and add the
        # float at the corresponding position of the %xmm0/%xmm1 spill:
        #   %xmm2 = [-52] + [-36]     %xmm1 = [-48] + [-32]
        #   %xmm0 = [-44] + [-28]     %xmm3 = [-56] + [-40]
        movss   -52(%rsp), %xmm2
        movss   -48(%rsp), %xmm1
        movss   -44(%rsp), %xmm0
        addss   -32(%rsp), %xmm1
        movss   -56(%rsp), %xmm3
        addss   -28(%rsp), %xmm0
        addss   -36(%rsp), %xmm2
        addss   -40(%rsp), %xmm3
        # Store the four sums contiguously at -24, -20, -16 and -12(%rsp).
        movss   %xmm1, -16(%rsp)
        movss   %xmm0, -12(%rsp)
        movss   %xmm2, -20(%rsp)
        movss   %xmm3, -24(%rsp)
        # Reload the sums as two 64-bit pairs into %xmm1/%xmm0
        # (presumably the registers carrying the return value).
        movq    -16(%rsp), %xmm1
        movq    -24(%rsp), %xmm0
        ret

(well, eventually)

[Bug tree-optimization/50713] SLP vs loop: code generated differs

Reply via email to