------- Comment #8 from uros at kss-loka dot si 2006-08-17 07:45 -------
It is also interesting that -march=pentium4 produces the following "de-optimized"
code, which adds a couple more instructions and wastes the %eax register:
.L8:
leal (%ebx,%ebx), %eax
movl 40(%esp), %edx
movl (%edx,%eax,2), %edx
movl %edx, (%esp)
movl 40(%esp), %edx
movl 4(%edx,%eax,2), %ecx
movapd %xmm2, %xmm1
cmpl %ecx, (%esp)
jge .L11
movl (%esp), %edx
.L12:
Some additional timings (gcc-4.2 -O2 -fomit-frame-pointer):
-march=pentium4: 0m2.756s
-march=pentium4 -fno-ivopts: 0m2.500s
-march=pentium4 -fno-ivopts -mfpmath=sse: 0m2.461s
-msse2 -fno-ivopts -mfpmath=sse: 0m2.311s
In the last case, the generated code is identical to the code generated by gcc-3.2:
.L8:
movl 36(%esp), %edx
movapd %xmm2, %xmm1
movl (%edx,%ebx,4), %eax
movl 4(%edx,%ebx,4), %ecx
cmpl %ecx, %eax
jge .L11
movl %eax, %edx
.p2align 4,,7
.L12:
movl (%edi,%edx,4), %eax
movsd (%esi,%eax,8), %xmm0
mulsd (%ebp,%edx,8), %xmm0
addl $1, %edx
cmpl %edx, %ecx
addsd %xmm0, %xmm1
jg .L12
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21676