------- Comment #32 from lucier at math dot purdue dot edu 2008-05-30 16:01
-------
I've decided to test the current ira branch with this problem. I used the
build instructions in comment 24.
With -fno-ira I get the same results as with 4.3.0 (no surprise there).
With -fira I get the time
(time (direct-fft-recursive-4 a table))
422 ms real time
421 ms cpu time (421 user, 0 system)
no collections
64 bytes allocated
no minor faults
no major faults
which is an improvement, and the code at the beginning of the loop is
.L7262:
movq %rdx, %rcx
addq (%rsi), %rcx
leaq 4(%rdx), %r15
movq %rcx, (%rbx)
addq $4, %rcx
movq %rcx, (%rbp)
movq (%rbx), %rcx
addq (%rsi), %rcx
movq %rcx, (%rdi)
addq $4, %rcx
movq %rcx, (%r8)
movq (%rdi), %rcx
addq (%rsi), %rcx
leaq 4(%rcx), %r10
movq %rcx, (%r9)
movq %r10, (%r13)
movq (%rax), %rcx
addq $7, %rcx
movsd (%rcx,%r10,2), %xmm4
movq (%r9), %r10
leaq (%rcx,%rdx,2), %r11
addq $8, %rdx
movsd (%r11), %xmm11
movsd (%rcx,%r10,2), %xmm5
movq (%r8), %r10
movsd (%rcx,%r10,2), %xmm6
movq (%rdi), %r10
movsd (%rcx,%r10,2), %xmm7
movq (%rbp), %r10
movsd (%rcx,%r10,2), %xmm8
movq (%rbx), %r10
movapd %xmm8, %xmm14
movsd (%rcx,%r10,2), %xmm9
leaq (%r15,%r15), %r10
movsd (%rcx,%r10), %xmm10
movq (%r12), %rcx
movapd %xmm9, %xmm15
movsd 15(%rcx), %xmm1
movsd 7(%rcx), %xmm2
movapd %xmm1, %xmm13
movsd 31(%rcx), %xmm3
movapd %xmm2, %xmm12
which is also an improvement, but it is still nowhere near the result for
4.2.2.
So, whatever is causing this problem, it appears the new register allocator
isn't going to fix it.
The code generated by today's mainline (136210) isn't better than 4.3.0; the
time is
(time (direct-fft-recursive-4 a table))
469 ms real time
469 ms cpu time (469 user, 0 system)
no collections
64 bytes allocated
no minor faults
no major faults
and the code is essentially the same as for 4.3.0.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928