https://gcc.gnu.org/bugzilla/show_bug.cgi?id=21485
--- Comment #63 from Jeffrey A. Law <law at redhat dot com> ---
So going back to the gcc-3.4 code, the major change is we make much more use
of the complex addressing modes on x86:

NumSift:
.LFB2:
        pushq   %rbx
.LCFI0:
        movq    %rsi, %r8
        .p2align 4,,7
.L11:
        leaq    (%r8,%r8), %rcx
        cmpq    %rdx, %rcx
        ja      .L10
.L14:
        cmpq    %rdx, %rcx
        movq    %rcx, %rsi
        jae     .L4
        movq    8(%rdi,%rcx,8), %rbx
        cmpq    %rbx, (%rdi,%rcx,8)
        leaq    1(%rcx), %rax
        cmovl   %rax, %rsi
.L4:
        movq    (%rdi,%r8,8), %rax
        movq    (%rdi,%rsi,8), %rcx
        cmpq    %rcx, %rax
        jge     .L6
        movq    %rax, (%rdi,%rsi,8)
        movq    %rcx, (%rdi,%r8,8)
        movq    %rsi, %r8
        leaq    (%r8,%r8), %rcx
        cmpq    %rdx, %rcx
        jbe     .L14
.L10:
        popq    %rbx
        ret
        .p2align 4,,7
.L6:
        leaq    1(%rdx), %r8
        jmp     .L11

Contrast the loop body to what we generate now:

NumSift:
.LFB0:
        .cfi_startproc
        pushq   %rbx
        .cfi_def_cfa_offset 16
        .cfi_offset 3, -16
        leaq    1(%rdx), %rbx
.L2:
        leaq    (%rsi,%rsi), %rcx
        cmpq    %rdx, %rcx
        ja      .L9
.L6:
        movq    %rsi, %rax
        salq    $4, %rax
        addq    %rdi, %rax
        cmpq    %rdx, %rcx
        movq    (%rax), %r8
        jnb     .L3
        leaq    1(%rcx), %r9
        leaq    (%rdi,%r9,8), %r10
        movq    (%r10), %r11
        cmpq    %r8, %r11
        jle     .L3
        movq    %r11, %r8
        movq    %r10, %rax
        movq    %r9, %rcx
.L3:
        leaq    (%rdi,%rsi,8), %r9
        movq    %rbx, %rsi
        movq    (%r9), %r10
        cmpq    %r8, %r10
        jge     .L2
        movq    %rcx, %rsi
        movq    %r10, (%rax)
        movq    %r8, (%r9)
        leaq    (%rsi,%rsi), %rcx
        cmpq    %rdx, %rcx
        jbe     .L6
.L9:
        popq    %rbx

I haven't looked deeply at the dumps, but I suspect that CSE/PRE on the
address arithmetic spoils our ability to utilize the complex addressing
modes.  That would tend to match the findings from c#62, where a change in a
conditional inhibits PRE and we end up with better code.