------- Additional Comments From uros at kss-loka dot si 2004-11-04 09:33 -------
The ASM code produced with CVS gcc dated 04 Nov 2004 looks much better, but it is
still not as good as the code produced by gcc 3.2:
# LU_copy_matrix -- compiler-generated 32-bit x86 code (AT&T syntax) for a
# doubly nested copy loop: for each outer index, one pointer is fetched from
# each of two pointer arrays, and 8-byte floating-point elements are copied
# from one pointed-to row to the other through the x87 stack.
#
# Stack arguments as seen after the four pushes below (assuming the usual
# 32-bit convention with the return address at 16(%esp) -- TODO confirm
# against the C prototype, which is not shown here):
#   20(%esp)  outer trip count
#   24(%esp)  inner trip count
#   28(%esp)  array of destination row pointers
#   32(%esp)  array of source row pointers
#
# Register roles inside the loops:
#   %ebp  inner trip count        %edi  outer index
#   %esi  walking pointer into the source pointer array
#   %ebx  destination row         %ecx  source row
#   %edx  inner index             %eax  scratch (byte offset %edx*8)
#
# The "<= (*1)" and "|<= (*2)" markers are footnote references from the
# surrounding bug report, not assembler syntax.
LU_copy_matrix:
# Save the four registers this function overwrites and restores at .L8.
pushl %ebp
pushl %edi
pushl %esi
pushl %ebx
movl 24(%esp), %ebp
# Skip everything when the outer trip count is <= 0 (signed test).
movl 20(%esp), %eax
testl %eax, %eax
jle .L8
movl 32(%esp), %esi
xorl %edi, %edi
# Outer loop head: one iteration per row.
.L4:
# Skip the inner loop when the inner trip count is <= 0.
testl %ebp, %ebp
jle .L6
# %ebx = element %edi of the pointer array at 28(%esp) (store target below).
movl 28(%esp), %eax
movl (%eax,%edi,4), %ebx
# %ecx = current element of the pointer array walked by %esi (load source below).
movl (%esi), %ecx <= (*1)
xorl %edx, %edx
# Inner loop: copy element %edx; %eax holds its byte offset (%edx * 8).
.L5:
leal 0(,%edx,8), %eax |<= (*2)
fldl (%ecx,%eax) |
fstpl (%ebx,%eax) |
addl $1, %edx
# Repeat while inner count > %edx (signed).
cmpl %edx, %ebp
jg .L5
# Outer loop tail: advance the row index and the source-pointer cursor.
.L6:
addl $1, %edi
addl $4, %esi <= (*1)
# Repeat while outer count > %edi (signed); the count is re-read from the stack.
cmpl %edi, 20(%esp)
jg .L4
# Common exit: restore saved registers in reverse push order.
.L8:
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
(*1): "movl (%esi,%edi,4), %ecx" could be used here, with %esi left pointing at
the start of the array. The second addl in .L6 ("addl $4, %esi") could then be
eliminated.
(*2): Why not use:
fldl (%ecx,%edx,8)
fstpl (%ebx,%edx,8)
directly? The lea instruction would be eliminated, together with the use of the
%eax register.
Uros.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17647