------- Additional Comments From uros at kss-loka dot si  2004-11-04 09:33 -------
The asm code produced by CVS gcc dated 4 Nov 2004 looks much better, but it is
still not as good as the code produced by 3.2:

LU_copy_matrix:
        pushl   %ebp
        pushl   %edi
        pushl   %esi
        pushl   %ebx
        movl    24(%esp), %ebp
        movl    20(%esp), %eax
        testl   %eax, %eax
        jle     .L8
        movl    32(%esp), %esi
        xorl    %edi, %edi
.L4:
        testl   %ebp, %ebp
        jle     .L6
        movl    28(%esp), %eax
        movl    (%eax,%edi,4), %ebx
        movl    (%esi), %ecx          <= (*1)
        xorl    %edx, %edx
.L5:
        leal    0(,%edx,8), %eax     |<= (*2)
        fldl    (%ecx,%eax)          |
        fstpl   (%ebx,%eax)          |
        addl    $1, %edx
        cmpl    %edx, %ebp
        jg      .L5
.L6:
        addl    $1, %edi
        addl    $4, %esi              <= (*1)
        cmpl    %edi, 20(%esp)
        jg      .L4
.L8:
        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        ret

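(*1):  "movl    (%esi,%edi,4), %ecx" could be used here. In that case the second
addl in the outer loop that starts at .L4 ("addl $4, %esi", after .L6) could be
eliminated, because %esi would stay fixed at the base address loaded from
32(%esp) and %edi already counts the rows.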

(*2): Why not use:

       fldl    (%ecx,%edx,8)
       fstpl   (%ebx,%edx,8)

directly? The lea instruction would be eliminated, together with the use of the
%eax register in the inner loop.

Uros.

-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17647

Reply via email to