https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65135

--- Comment #2 from H.J. Lu <hjl.tools at gmail dot com> ---
The assembly code generated by r220674 is much shorter:

# bar (i386, AT&T syntax, PIC) -- listing that addresses FPtr, inc, FEOF and F
# directly as offsets from the GOT base (@GOTOFF), so each global costs a
# single memory access.  Uses only %eax, %ecx and %edx, no stack saves, and
# no branch (the conditional is a cmov).
# Rough C equivalent of what the instructions do:
#     p = FPtr; if (FEOF < p + inc*4) p = F; *p = arg; FPtr = p + inc*4;
bar:
    call    __x86.get_pc_thunk.ax           # thunk not shown; presumably returns the current PC in %eax
    addl    $_GLOBAL_OFFSET_TABLE_, %eax    # %eax = GOT base address
    movl    FPtr@GOTOFF(%eax), %edx         # %edx = FPtr (loaded directly, GOT-relative)
    movl    inc@GOTOFF(%eax), %ecx          # %ecx = inc
    leal    (%edx,%ecx,4), %ecx             # %ecx = FPtr + inc*4
    cmpl    %ecx, FEOF@GOTOFF(%eax)         # flags <- FEOF - (FPtr + inc*4)
    movl    4(%esp), %ecx                   # %ecx = word above the return address (first stack arg); mov keeps flags
    cmovb    F@GOTOFF(%eax), %edx           # if FEOF < FPtr + inc*4 (unsigned): %edx = F
    movl    %ecx, (%edx)                    # store the argument through the chosen pointer
    movl    inc@GOTOFF(%eax), %ecx          # reload inc after the store
    leal    (%edx,%ecx,4), %edx             # %edx = pointer + inc*4
    movl    %edx, FPtr@GOTOFF(%eax)         # FPtr = advanced pointer
    ret

vs

# bar (i386, AT&T syntax, PIC) -- listing that reaches FPtr, inc, FEOF and F
# through GOT entries (@GOT): each global first needs its address loaded from
# the GOT, then a second access to read or write the value.  Holding those
# addresses requires %ebx, %esi and %edi, which are pushed on entry and popped
# before returning; the conditional is a branch rather than a cmov.
# Rough C equivalent of what the instructions do:
#     p = FPtr; if (FEOF < p + inc*4) p = F; *p = arg; FPtr = p + inc*4;
bar:
    call    __x86.get_pc_thunk.dx           # thunk not shown; presumably returns the current PC in %edx
    addl    $_GLOBAL_OFFSET_TABLE_, %edx    # %edx = GOT base address
    pushl    %edi                           # save %edi (restored before ret)
    pushl    %esi                           # save %esi (restored before ret)
    movl    FPtr@GOT(%edx), %ecx            # %ecx = &FPtr (address read from its GOT entry)
    pushl    %ebx                           # save %ebx (restored before ret)
    movl    inc@GOT(%edx), %ebx             # %ebx = &inc
    movl    FEOF@GOT(%edx), %edi            # %edi = &FEOF
    movl    (%ecx), %eax                    # %eax = FPtr
    movl    (%ebx), %esi                    # %esi = inc
    leal    (%eax,%esi,4), %esi             # %esi = FPtr + inc*4
    cmpl    %esi, (%edi)                    # flags <- FEOF - (FPtr + inc*4)
    jnb    .L2                              # FEOF >= FPtr + inc*4 (unsigned): keep %eax = FPtr
    movl    F@GOT(%edx), %eax               # %eax = &F
    movl    (%eax), %eax                    # %eax = F
.L2:
    movl    16(%esp), %edx                  # first stack arg: 4 (return address) + 3 pushes * 4 bytes
    movl    %edx, (%eax)                    # store the argument through the chosen pointer
    movl    (%ebx), %edx                    # reload inc after the store
    leal    (%eax,%edx,4), %eax             # %eax = pointer + inc*4
    movl    %eax, (%ecx)                    # FPtr = advanced pointer
    popl    %ebx                            # restore saved registers in reverse push order
    popl    %esi
    popl    %edi
    ret
    .size    bar, .-bar

Why doesn't it improve performance? Why does it hurt performance instead?

Reply via email to