http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54390

--- Comment #1 from H.J. Lu <hjl.tools at gmail dot com> 2012-08-28 13:21:29 UTC ---
Without AVX, we got

    xorps    %xmm4, %xmm4
    shufps    $0xe4, %xmm4, %xmm0
    movlhps    %xmm1, %xmm0
    movaps    %xmm4, %xmm1
    movaps    %xmm2, %xmm4
    shufps    $0xe4, %xmm1, %xmm4
    movaps    %xmm4, %xmm1
    movlhps    %xmm3, %xmm1
    addps    %xmm1, %xmm0
    movhps    %xmm0, -16(%rsp)
    movq    -16(%rsp), %rax
    movlps    %xmm0, -24(%rsp)
    movq    %rax, -48(%rsp)
    movq    -48(%rsp), %xmm1
    movq    -24(%rsp), %xmm0
    ret

Adding -mtune=corei7, we got

    movq    %xmm1, -16(%rsp)
    movq    %xmm0, -24(%rsp)
    movq    %xmm2, -40(%rsp)
    movups    -24(%rsp), %xmm0
    movq    %xmm3, -32(%rsp)
    movups    -40(%rsp), %xmm1
    addps    %xmm1, %xmm0
    movups    %xmm0, -24(%rsp)
    movq    -16(%rsp), %rax
    movq    -24(%rsp), %xmm0
    movd    %rax, %xmm1
    ret

With AVX, we got

    vmovq    %xmm0, -24(%rsp)
    vmovq    %xmm1, -16(%rsp)
    vmovq    %xmm2, -40(%rsp)
    vmovq    %xmm3, -32(%rsp)
    vmovups    -24(%rsp), %xmm1
    vmovups    -40(%rsp), %xmm0
    vaddps    %xmm0, %xmm1, %xmm0
    vmovups    %xmm0, -24(%rsp)
    movq    -16(%rsp), %rax
    movq    %rax, -48(%rsp)
    vmovq    -24(%rsp), %xmm0
    vmovq    -48(%rsp), %xmm1
    ret

I think -mavx/-mtune=corei7 enables unaligned load/store, which improves the
code the vectorizer can generate.
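
For context, the assembly above is consistent with a test case along these
lines (my reconstruction from the register usage, not necessarily the reduced
test case attached to the PR): a 16-byte struct of four floats is passed and
returned in two XMM register halves under the SysV AMD64 ABI, and the
element-wise add is vectorized into a single addps/vaddps.

    /* Hypothetical reproducer, reconstructed from the assembly above;
       the real test case in the PR may differ.  */
    struct v4 { float a, b, c, d; };  /* 16 bytes: two SSE eightbytes */

    struct v4
    add4 (struct v4 x, struct v4 y)   /* x in xmm0/xmm1, y in xmm2/xmm3 */
    {
      struct v4 r;
      r.a = x.a + y.a;
      r.b = x.b + y.b;
      r.c = x.c + y.c;
      r.d = x.d + y.d;
      return r;                       /* returned in xmm0/xmm1 */
    }

Compiling something like this with "gcc -O3 -S", "gcc -O3 -mtune=corei7 -S",
and "gcc -O3 -mavx -S" should reproduce the three variants shown.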
