[Bug tree-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475

lucier at math dot purdue dot edu Tue, 05 May 2009 20:43:53 -0700


------- Comment #53 from lucier at math dot purdue dot edu  2009-05-06 03:43 -------
I posted a possible fix to gcc-patches with the subject line


Possible fix for 30% performance regression in PR 33928

Here's the assembly for the main loop after the changes I proposed:

# ----------------------------------------------------------------------
# Main loop (x86-64, AT&T operand order: op src, dst).
# Compiler-generated listing; the comments below only annotate what the
# instructions visibly do.
#
# Registers / memory as used in this listing:
#   %r11        loop counter, advanced by 8 per iteration
#   %r13        loop bound compared against %r11
#   %r10        base of a memory block holding:
#                  8(%r10)   stride added to form successive indices
#                 24(%r10)   pointer to four doubles (offsets 7,15,23,31)
#                 40(%r10)   pointer to the double array read and written
#                -8..-48(%r10)  spill slots for the six computed indices
#   %xmm0-%xmm14  scalar double temporaries
#
# Every array access has the form 7(base,index,2); the same eight
# locations that are loaded at the top are stored back at the bottom.
# ----------------------------------------------------------------------
.L4230:
        # Build three successive indices r11+s, r11+2s, r11+3s where
        # s = 8(%r10).  Note 8(%r10) is read from memory three times.
        movq    %r11, %rdi
        addq    8(%r10), %rdi
        movq    8(%r10), %rsi
        movq    8(%r10), %rdx
        movq    40(%r10), %rax
        leaq    4(%r11), %rbx
        addq    %rdi, %rsi
        leaq    4(%rdi), %r9
        # Each index (and its +4 companion) is spilled to a slot below
        # %r10; the store sequence at the bottom reloads them from there.
        movq    %rdi, -8(%r10)
        addq    %rsi, %rdx
        leaq    4(%rsi), %r8
        movq    %rsi, -24(%r10)
        leaq    4(%rdx), %rcx
        movq    %r9, -16(%r10)
        movq    %rdx, -40(%r10)
        movq    %r8, -32(%r10)
        # Fold the constant displacement 7 into the array base once for
        # the loads that follow.
        addq    $7, %rax
        movq    %rcx, -48(%r10)
        # Load the eight input doubles.  %rcx is reused as 2*(r11+4) and
        # %rdx as the address base+2*r11 (both from the pre-increment r11).
        movsd   (%rax,%rcx,2), %xmm12
        leaq    (%rbx,%rbx), %rcx
        movsd   (%rax,%rdx,2), %xmm3
        leaq    (%rax,%r11,2), %rdx
        addq    $8, %r11
        movsd   (%rax,%r8,2), %xmm14
        # Loop test is issued early: flags from this compare are consumed
        # by the jg at the bottom.  Everything in between is mov/lea/SSE
        # scalar arithmetic, none of which writes EFLAGS.
        cmpq    %r11, %r13
        movsd   (%rax,%rsi,2), %xmm13
        movsd   (%rax,%r9,2), %xmm11
        movsd   (%rax,%rdi,2), %xmm10
        movsd   (%rax,%rcx), %xmm8
        # Switch %rax to the second pointer and load four coefficients:
        # xmm1 = 7(%rax), xmm2 = 15(%rax), xmm5 = 23(%rax), xmm9 = 31(%rax).
        movq    24(%r10), %rax
        movsd   (%rdx), %xmm7
        movsd   15(%rax), %xmm2
        movsd   7(%rax), %xmm1
        movapd  %xmm2, %xmm0
        movsd   31(%rax), %xmm9
        # First stage.  The products below have the shape of two complex
        # multiplies by (xmm1 + i*xmm2) -- presumably butterfly twiddle
        # factors; confirm against the source routine:
        #   xmm4 = xmm1*xmm3  - xmm2*xmm12     xmm6 = xmm1*xmm12 + xmm2*xmm3
        #   xmm1 = xmm1*xmm13 - xmm2*xmm14     xmm3 = xmm1*xmm14 + xmm2*xmm13
        # followed by sum/difference pairs against xmm7, xmm8, xmm10, xmm11.
        movapd  %xmm1, %xmm6
        mulsd   %xmm3, %xmm0
        movapd  %xmm1, %xmm4
        mulsd   %xmm12, %xmm6
        mulsd   %xmm3, %xmm4
        movapd  %xmm1, %xmm3
        mulsd   %xmm13, %xmm1
        mulsd   %xmm14, %xmm3
        addsd   %xmm0, %xmm6
        movapd  %xmm2, %xmm0
        movsd   23(%rax), %xmm5
        mulsd   %xmm12, %xmm0
        movapd  %xmm7, %xmm12
        subsd   %xmm0, %xmm4
        movapd  %xmm2, %xmm0
        mulsd   %xmm14, %xmm2
        movapd  %xmm8, %xmm14
        mulsd   %xmm13, %xmm0
        movapd  %xmm11, %xmm13
        addsd   %xmm6, %xmm11
        subsd   %xmm6, %xmm13
        subsd   %xmm2, %xmm1
        movapd  %xmm10, %xmm2
        addsd   %xmm0, %xmm3
        movapd  %xmm5, %xmm0
        subsd   %xmm4, %xmm2
        addsd   %xmm4, %xmm10
        subsd   %xmm1, %xmm12
        addsd   %xmm1, %xmm7
        movapd  %xmm9, %xmm1
        subsd   %xmm3, %xmm14
        mulsd   %xmm2, %xmm0
        # xmm1 = xmm9 XOR the constant at .LC5 -- presumably a sign-bit
        # mask, i.e. xmm1 = -xmm9; .LC5 is not shown in this excerpt.
        xorpd   .LC5(%rip), %xmm1
        addsd   %xmm3, %xmm8
        # Second stage: products with the remaining coefficients xmm5,
        # xmm9 and the XOR-ed copy in xmm1, combined pairwise as above.
        movapd  %xmm1, %xmm3
        mulsd   %xmm2, %xmm1
        movapd  %xmm5, %xmm2
        mulsd   %xmm13, %xmm3
        mulsd   %xmm11, %xmm2
        addsd   %xmm0, %xmm3
        movapd  %xmm5, %xmm0
        mulsd   %xmm10, %xmm5
        mulsd   %xmm13, %xmm0
        subsd   %xmm0, %xmm1
        movapd  %xmm9, %xmm0
        mulsd   %xmm11, %xmm9
        mulsd   %xmm10, %xmm0
        subsd   %xmm9, %xmm5
        addsd   %xmm0, %xmm2
        movapd  %xmm7, %xmm0
        addsd   %xmm5, %xmm0
        subsd   %xmm5, %xmm7
        # Store the eight results to the locations loaded above.  From
        # here on the array base 40(%r10) and each spilled index are
        # re-read from memory before every store rather than kept in
        # registers.  NOTE(review): presumably the compiler cannot prove
        # the double stores leave the %r10 block unchanged -- confirm.
        movsd   %xmm0, (%rdx)
        movapd  %xmm8, %xmm0
        movq    40(%r10), %rax
        subsd   %xmm2, %xmm8
        addsd   %xmm2, %xmm0
        movsd   %xmm0, 7(%rcx,%rax)
        movq    -8(%r10), %rdx
        movq    40(%r10), %rax
        movapd  %xmm12, %xmm0
        subsd   %xmm1, %xmm12
        movsd   %xmm7, 7(%rax,%rdx,2)
        movq    -16(%r10), %rdx
        movq    40(%r10), %rax
        addsd   %xmm1, %xmm0
        movsd   %xmm8, 7(%rax,%rdx,2)
        movq    -24(%r10), %rdx
        movq    40(%r10), %rax
        movsd   %xmm0, 7(%rax,%rdx,2)
        movapd  %xmm14, %xmm0
        movq    -32(%r10), %rdx
        movq    40(%r10), %rax
        subsd   %xmm3, %xmm14
        addsd   %xmm3, %xmm0
        movsd   %xmm0, 7(%rax,%rdx,2)
        movq    -40(%r10), %rdx
        movq    40(%r10), %rax
        movsd   %xmm12, 7(%rax,%rdx,2)
        movq    -48(%r10), %rdx
        movq    40(%r10), %rax
        movsd   %xmm14, 7(%rax,%rdx,2)
        # Signed test on the flags set by the earlier cmpq: continue
        # while %r13 > %r11 (with %r11 already advanced by 8).
        jg      .L4230
        # Loop exit: %r13 receives %rbx, which holds (last pre-increment
        # r11) + 4.
        movq    %rbx, %r13
# Code following the loop is not shown in this excerpt.
.L4228:


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928

[Bug tree-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475

Reply via email to