[Bug rtl-optimization/19780] Floating point computation far slower for -mfpmath=sse

ubizjak at gmail dot com Tue, 03 Apr 2007 05:32:56 -0700


------- Comment #9 from ubizjak at gmail dot com  2007-04-03 13:32 -------
(In reply to comment #8)
> what's the generated code for -ffast-math? in principle i don't see a reason
> why it should make any difference...


Trying to answer your question, I have played a bit with compile flags and
things are getting really strange:

[EMAIL PROTECTED] test]$ gcc -O2 -mfpmath=387 pr19780.c 
[EMAIL PROTECTED] test]$ time ./a.out
Start?
Stop!
Result = 0.000000, 0.000000, 1.000000

real    0m1.211s
user    0m1.212s
sys     0m0.004s
[EMAIL PROTECTED] test]$ gcc -O2 -mfpmath=387 -msse pr19780.c 
[EMAIL PROTECTED] test]$ time ./a.out
Start?
Stop!
Result = 0.000000, 0.000000, 1.000000

real    0m0.555s
user    0m0.552s
sys     0m0.004s

Note that -msse should have no effect on the calculations here, since both
builds use -mfpmath=387. The difference between the two asm dumps is:

--- pr19780.s   2007-04-03 14:28:14.000000000 +0200
+++ pr19780.s_  2007-04-03 14:28:01.000000000 +0200
@@ -17,69 +17,61 @@
        pushl   %ebp
        movl    %esp, %ebp
        pushl   %ecx
-       subl    $84, %esp
+       subl    $100, %esp
        movl    $.LC0, (%esp)
        call    puts
        xorl    %eax, %eax
-       fldz
        fld1
        fsts    -16(%ebp)
+       fldz
+       fsts    -12(%ebp)
+       fld     %st(0)
        fld     %st(1)
-       fld     %st(2)
-       fld     %st(3)
        jmp     .L2
        .p2align 4,,7
 .L7:
-       fstp    %st(5)
-       fstp    %st(0)
-       fxch    %st(1)
-       fxch    %st(2)
-       fxch    %st(3)
-       fxch    %st(4)
        fxch    %st(3)
+       fxch    %st(2)
 .L2:
-       fld     %st(1)
+       fld     %st(2)
        addl    $1, %eax
-       fmul    %st(3), %st
+       fmul    %st(1), %st
        cmpl    $100000000, %eax
-       fstps   -12(%ebp)
+       flds    -12(%ebp)
+       fmul    %st(5), %st
+       fsubrp  %st, %st(1)
+       flds    -12(%ebp)
+       fmul    %st(3), %st
        flds    -16(%ebp)
-       fmul    %st(1), %st
-       fsubrs  -12(%ebp)
-       fstps   -12(%ebp)
-       fmul    %st(4), %st
-       fld     %st(3)
        fmul    %st(3), %st
        fsubrp  %st, %st(1)
        flds    -16(%ebp)
-       fmulp   %st, %st(4)
-       fxch    %st(1)
+       fmul    %st(6), %st
+       fxch    %st(5)
        fmul    %st(4), %st
-       fsubrp  %st, %st(3)
-       flds    -16(%ebp)
-       fld     %st(3)
+       fsubrp  %st, %st(5)
        fxch    %st(2)
-       fsts    -16(%ebp)
-       flds    -12(%ebp)
+       fstps   -12(%ebp)
+       fxch    %st(2)
+       fstps   -16(%ebp)
        jne     .L7
-       fstp    %st(0)
-       fstp    %st(5)
-       fstp    %st(0)
-       fstp    %st(0)
-       fstp    %st(0)
+       fstp    %st(3)
+       fxch    %st(1)
        movl    $.LC3, (%esp)
        fstps   -40(%ebp)
+       fxch    %st(1)
        fstps   -56(%ebp)
+       fstps   -72(%ebp)
        call    puts
        flds    -40(%ebp)
        fstpl   20(%esp)
        flds    -56(%ebp)
        fstpl   12(%esp)
-       flds    -12(%ebp)
+       flds    -72(%ebp)
        fstpl   4(%esp)
        movl    $.LC4, (%esp)
        call    printf
-       addl    $84, %esp
+       addl    $100, %esp
        xorl    %eax, %eax
        popl    %ecx
        popl    %ebp

where the (+++) side (pr19780.s_) was compiled with -msse and the (---) side
(pr19780.s) without it.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19780

[Bug rtl-optimization/19780] Floating point computation far slower for -mfpmath=sse

Reply via email to