[Bug rtl-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475

lucier at math dot purdue dot edu Wed, 06 May 2009 13:44:30 -0700


------- Comment #64 from lucier at math dot purdue dot edu  2009-05-06 20:43 -------
In answer to comment 60, here's the command line where I added
-fforward-propagate -fno-move-loop-invariants:


/pkgs/gcc-mainline/bin/gcc -save-temps -I../include -I. -Wall -W -Wno-unused
-O1 -fno-math-errno -fschedule-insns2 -fno-trapping-math -fno-strict-aliasing
-fwrapv -fomit-frame-pointer -fPIC -fno-common -mieee-fp -fforward-propagate
-fno-move-loop-invariants -DHAVE_CONFIG_H -D___PRIMAL -D___LIBRARY
-D___GAMBCDIR="\"/usr/local/Gambit-C/v4.1.2\"" -D___SYS_TYPE_CPU="\"x86_64\""
-D___SYS_TYPE_VENDOR="\"unknown\"" -D___SYS_TYPE_OS="\"linux-gnu\"" -c _num.c

here's the compiler:

/pkgs/gcc-mainline/bin/gcc -v
Using built-in specs.
Target: x86_64-unknown-linux-gnu
Configured with: /tmp/lucier/gcc/mainline/configure --enable-checking=release
--prefix=/pkgs/gcc-mainline --enable-languages=c
Thread model: posix
gcc version 4.5.0 20090506 (experimental) [trunk revision 147199] (GCC) 

and the runtime didn't change substantially:

    132 ms cpu time (132 user, 0 system)

and the loop looks pretty much just as bad (it's 117 instructions long, by my
count):

.L2752:
        movq    %rcx, %rdx
        addq    8(%rax), %rdx
        leaq    4(%rcx), %rdi
        movq    %rdx, -8(%rax)
        leaq    4(%rdx), %rbx
        addq    8(%rax), %rdx
        movq    %rbx, -16(%rax)
        movq    %rdx, -24(%rax)
        leaq    4(%rdx), %rbx
        addq    8(%rax), %rdx
        movq    %rbx, -32(%rax)
        movq    %rdx, -40(%rax)
        leaq    4(%rdx), %rbx
        movq    40(%rax), %rdx
        movq    %rbx, -48(%rax)
        movsd   7(%rdx,%rbx,2), %xmm9
        movq    -40(%rax), %rbx
        leaq    7(%rdx,%rcx,2), %r8
        addq    $8, %rcx
        movsd   (%r8), %xmm4
        cmpq    %rcx, %r13
        movsd   7(%rdx,%rbx,2), %xmm11
        movq    -32(%rax), %rbx
        movsd   7(%rdx,%rbx,2), %xmm5
        movq    -24(%rax), %rbx
        movsd   7(%rdx,%rbx,2), %xmm7
        movq    -16(%rax), %rbx
        movsd   7(%rdx,%rbx,2), %xmm14
        movq    -8(%rax), %rbx
        movsd   7(%rdx,%rbx,2), %xmm6
        leaq    (%rdi,%rdi), %rbx
        movsd   7(%rbx,%rdx), %xmm8
        movq    24(%rax), %rdx
        movapd  %xmm6, %xmm13
        movsd   15(%rdx), %xmm1
        movsd   7(%rdx), %xmm2
        movapd  %xmm1, %xmm10
        movsd   31(%rdx), %xmm3
        movapd  %xmm2, %xmm12
        mulsd   %xmm11, %xmm10
        mulsd   %xmm9, %xmm12
        mulsd   %xmm2, %xmm11
        mulsd   %xmm1, %xmm9
        movsd   23(%rdx), %xmm0
        addsd   %xmm12, %xmm10
        movapd  %xmm2, %xmm12
        mulsd   %xmm7, %xmm2
        subsd   %xmm9, %xmm11
        movapd  %xmm1, %xmm9
        mulsd   %xmm5, %xmm12
        mulsd   %xmm5, %xmm1
        movapd  %xmm8, %xmm5
        mulsd   %xmm7, %xmm9
        movapd  %xmm4, %xmm7
        subsd   %xmm11, %xmm13
        addsd   %xmm6, %xmm11
        movsd   .LC5(%rip), %xmm6
        subsd   %xmm1, %xmm2
        movapd  %xmm0, %xmm1
        addsd   %xmm12, %xmm9
        movapd  %xmm14, %xmm12
        xorpd   %xmm3, %xmm6
        subsd   %xmm10, %xmm12
        mulsd   %xmm13, %xmm1
        subsd   %xmm2, %xmm7
        addsd   %xmm4, %xmm2
        movapd  %xmm6, %xmm4
        addsd   %xmm14, %xmm10
        mulsd   %xmm13, %xmm6
        mulsd   %xmm12, %xmm4
        subsd   %xmm9, %xmm5
        mulsd   %xmm0, %xmm12
        addsd   %xmm8, %xmm9
        movapd  %xmm0, %xmm8
        mulsd   %xmm11, %xmm0
        addsd   %xmm1, %xmm4
        movapd  %xmm3, %xmm1
        mulsd   %xmm10, %xmm3
        subsd   %xmm12, %xmm6
        mulsd   %xmm11, %xmm1
        mulsd   %xmm10, %xmm8
        subsd   %xmm3, %xmm0
        addsd   %xmm1, %xmm8
        movapd  %xmm2, %xmm1
        addsd   %xmm0, %xmm1
        subsd   %xmm0, %xmm2
        movapd  %xmm7, %xmm0
        subsd   %xmm6, %xmm7
        addsd   %xmm6, %xmm0
        movsd   %xmm1, (%r8)
        movapd  %xmm9, %xmm1
        movq    40(%rax), %rdx
        subsd   %xmm8, %xmm9
        addsd   %xmm8, %xmm1
        movsd   %xmm1, 7(%rbx,%rdx)
        movq    -8(%rax), %rbx
        movq    40(%rax), %rdx
        movsd   %xmm2, 7(%rdx,%rbx,2)
        movq    -16(%rax), %rbx
        movq    40(%rax), %rdx
        movsd   %xmm9, 7(%rdx,%rbx,2)
        movq    -24(%rax), %rbx
        movq    40(%rax), %rdx
        movsd   %xmm0, 7(%rdx,%rbx,2)
        movapd  %xmm5, %xmm0
        movq    -32(%rax), %rbx
        movq    40(%rax), %rdx
        subsd   %xmm4, %xmm5
        addsd   %xmm4, %xmm0
        movsd   %xmm0, 7(%rdx,%rbx,2)
        movq    -40(%rax), %rbx
        movq    40(%rax), %rdx
        movsd   %xmm7, 7(%rdx,%rbx,2)
        movq    -48(%rax), %rbx
        movq    40(%rax), %rdx
        movsd   %xmm5, 7(%rdx,%rbx,2)
        jg      .L2752
        movq    %rdi, %r13
.L2751:


-- 

lucier at math dot purdue dot edu changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|WAITING                     |NEW


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928

[Bug rtl-optimization/33928] [4.3/4.4/4.5 Regression] 30% performance slowdown in floating-point code caused by r118475

Reply via email to