m68k - GCC 4.4.0 generates not so good code from asm inline

2009-07-29 Thread ami_stuff
Hi,

Here is some C source code that I compiled with GCC 3.4.0 and GCC 4.4.0. The
GCC 3.4.0 output looks a lot better.

#include 
#include 

/*
 * umul_ppmm(xh, xl, a, b) - unsigned 32x32 -> 64-bit multiply for m68k,
 * built from the 16x16 -> 32-bit "mulu" instruction.
 *
 *   xh  lvalue receiving the upper 32 bits of a * b (asm operand %1)
 *   xl  lvalue receiving the lower 32 bits of a * b (asm operand %0)
 *   a   first factor; tied to %0 by the "0" matching constraint
 *   b   second factor; tied to %1 by the "1" matching constraint
 *
 * With a = ah:al and b = bh:bl split into 16-bit halves, the sequence
 * forms the four partial products al*bl (%0), al*bh (d2), ah*bl (%1) and
 * ah*bh (d4).  The low halves of the two cross products are summed with
 * add.w and shifted into the upper half of the low word; the carries of
 * that add.w and of the final add.l are propagated into the high word
 * through the X flag by the two addx.l instructions, so the instruction
 * order must not be changed.
 *
 * d2-d5 are scratch registers (d3 holds the shift count 16) and are
 * listed as clobbers.  "%/" is GCC's m68k escape for the register prefix.
 * The first line of the template is an assembler comment marking the
 * inlined sequence.  The expansion is a single __asm__ statement without
 * a trailing semicolon, so the caller supplies it.
 */
#define umul_ppmm(xh, xl, a, b) \
__asm__ ("| Inlined umul_ppmm\n" \
" move.l %0,%/d5\n" \
" move.l %1,%/d4\n" \
" moveq #16,%/d3\n" \
" move.l %0,%/d2\n" \
" mulu %1,%0\n" \
" lsr.l %/d3,%/d4\n" \
" lsr.l %/d3,%/d5\n" \
" mulu %/d4,%/d2\n" \
" mulu %/d5,%1\n" \
" mulu %/d5,%/d4\n" \
" move.l %/d2,%/d5\n" \
" lsr.l %/d3,%/d2\n" \
" add.w %1,%/d5\n" \
" addx.l %/d2,%/d4\n" \
" lsl.l %/d3,%/d5\n" \
" lsr.l %/d3,%1\n" \
" add.l %/d5,%0\n" \
" addx.l %/d4,%1" \
: "=d" ((uint32_t) (xl)), "=d" ((uint32_t) (xh)) \
: "0" ((uint32_t) (a)), "1" ((uint32_t) (b)) \
: "d2", "d3", "d4", "d5")

/*
 * MUL64 - signed 32x32 -> 64-bit multiply built on the unsigned
 * umul_ppmm() primitive.
 *
 * a, b: signed factors (the code assumes int is 32 bits wide).
 * Returns the full product a * b as int64_t.
 */
inline int64_t MUL64(int a, int b)
{
/* Reinterpret the operands as unsigned 32-bit values for umul_ppmm(). */
uint32_t au = a;
uint32_t bu = b;

uint32_t resh, resl;
uint64_t res;

/* resh:resl = au * bu (unsigned 64-bit product). */
umul_ppmm(resh, resl, au, bu);

/*
 * Signed correction: a negative operand, read as unsigned, is 2^32 too
 * large, so the unsigned product contains an extra 2^32 * (other operand).
 * Subtracting the other operand from the high word removes it (mod 2^64).
 */
if (a < 0)
resh -= bu;
if (b < 0)
resh -= au;

/* Reassemble the two 32-bit halves into one 64-bit value. */
res = ((uint64_t)resh << 32) | resl;

/* Converted from uint64_t to the int64_t return type here. */
return res;
} 


GCC 4.4.0 asm output:

#NO_APP
.text
.even
.globl _MUL64
_MUL64:
movem.l #16128,-(sp)
move.l 28(sp),d0
move.l 32(sp),a0
move.l d0,d6
move.l a0,d1
#APP
;# 36 "mul642.c" 1
| Inlined umul_ppmm
move.l d6,d5
move.l d1,d4
moveq #16,d3
move.l d6,d2
mulu d1,d6
lsr.l d3,d4
lsr.l d3,d5
mulu d4,d2
mulu d5,d1
mulu d5,d4
move.l d2,d5
lsr.l d3,d2
add.w d1,d5
addx.l d2,d4
lsl.l d3,d5
lsr.l d3,d1
add.l d5,d6
addx.l d4,d1
#NO_APP
tst.l d0
jlt L6
tst.l a0
jlt L7
L3:
move.l d1,d2
clr.l d3
move.l d2,d0
move.l d3,d1
or.l d6,d1
move.l d0,d6
move.l d1,d7
move.l d7,d1
movem.l (sp)+,#252
rts
L7:
sub.l d0,d1
move.l d1,d2
clr.l d3
move.l d2,d0
move.l d3,d1
or.l d6,d1
move.l d0,d6
move.l d1,d7
move.l d7,d1
movem.l (sp)+,#252
rts
L6:
sub.l a0,d1
tst.l a0
jge L3
jra L7 

GCC 3.4.0 asm output:

#NO_APP
.text
.even
.globl _MUL64
_MUL64:
moveml #0x3f00,sp@-
movel sp@(28),d1
movel sp@(32),d0
movel d1,d7
movel d0,d6
#APP
| Inlined umul_ppmm
move.l d7,d5
move.l d6,d4
moveq #16,d3
move.l d7,d2
mulu d6,d7
lsr.l d3,d4
lsr.l d3,d5
mulu d4,d2
mulu d5,d6
mulu d5,d4
move.l d2,d5
lsr.l d3,d2
add.w d6,d5
addx.l d2,d4
lsl.l d3,d5
lsr.l d3,d6
add.l d5,d7
addx.l d4,d6
#NO_APP
tstl d1
jlt L5
tstl d0
jge L3
jra L6
.even
L5:
subl d0,d6
tstl d0
jge L3
.even
L6:
subl d1,d6
.even
L3:
movel d6,d0
clrl d1
orl d7,d1
moveml sp@+,#0xfc
rts 

Is it a regression?

Regards



Re: m68k - GCC 4.4.0 generates not so good code from asm inline

2009-07-29 Thread ami_stuff
When I use -O1 with GCC 4.4.0 (-m68060 -fomit-frame-pointer), I get better code.

#include 
#include 

/*
 * MUL64 - signed 32x32 -> 64-bit multiply for m68k with the unsigned
 * multiply written directly as inline assembly.
 *
 * a, b: signed factors (the code assumes int is 32 bits wide).
 * Returns the full product a * b as int64_t.
 */
inline int64_t MUL64(int a, int b)
{

uint32_t resh, resl;
/* Reinterpret the operands as unsigned 32-bit values for the asm. */
uint32_t au = a;
uint32_t bu = b;

/*
 * Unsigned 32x32 -> 64 multiply from four 16x16 "mulu" partial products.
 * %0 starts as au and ends as the low 32 bits (resl); %1 starts as bu and
 * ends as the high 32 bits (resh).  d2-d5 are scratch (d3 = shift count
 * 16).  Each addx.l consumes the X flag set by the add immediately before
 * it, so the instruction order must be preserved.  Register names are
 * written without the "%/" prefix escape here.  The "\n\t" after the last
 * instruction leaves an empty line in the generated assembly.
 */
__asm__ ("move.l %0, d5\n\t"
"move.l %1, d4\n\t"
"moveq #16, d3\n\t"
"move.l %0, d2\n\t"
"mulu %1, %0\n\t"
"lsr.l d3, d4\n\t"
"lsr.l d3, d5\n\t"
"mulu d4, d2\n\t"
"mulu d5, %1\n\t"
"mulu d5, d4\n\t"
"move.l d2, d5\n\t"
"lsr.l d3, d2\n\t"
"add.w %1, d5\n\t"
"addx.l d2, d4\n\t"
"lsl.l d3, d5\n\t"
"lsr.l d3, %1\n\t"
"add.l d5, %0\n\t"
"addx.l d4, %1\n\t"
: "=d"(resl), "=d"(resh)
: "0"(au), "1"(bu)
: "d2", "d3", "d4", "d5");

/*
 * Signed correction: a negative operand, read as unsigned, is 2^32 too
 * large, so the unsigned product contains an extra 2^32 * (other operand).
 * Subtracting the other operand from the high word removes it (mod 2^64).
 */
if (a < 0)
resh -= bu;
if (b < 0)
resh -= au;

/* Combine the halves; the uint64_t value converts to int64_t on return. */
return ((uint64_t)resh << 32) | resl;
}

GCC 4.4.0 -O3:

#NO_APP
.text
.even
.globl _MUL64
_MUL64:
movem.l #16128,-(sp)
move.l 28(sp),d0
move.l 32(sp),a0
move.l d0,d6
move.l a0,d1
#APP
;# 11 "mul645.c" 1
move.l d6, d5
move.l d1, d4
moveq #16, d3
move.l d6, d2
mulu d1, d6
lsr.l d3, d4
lsr.l d3, d5
mulu d4, d2
mulu d5, d1
mulu d5, d4
move.l d2, d5
lsr.l d3, d2
add.w d1, d5
addx.l d2, d4
lsl.l d3, d5
lsr.l d3, d1
add.l d5, d6
addx.l d4, d1

#NO_APP
tst.l d0
jlt L6
tst.l a0
jlt L7
L3:
move.l d1,d2
clr.l d3
move.l d2,d0
move.l d3,d1
or.l d6,d1
move.l d0,d6
move.l d1,d7
move.l d7,d1
movem.l (sp)+,#252
rts
L7:
sub.l d0,d1
move.l d1,d2
clr.l d3
move.l d2,d0
move.l d3,d1
or.l d6,d1
move.l d0,d6
move.l d1,d7
move.l d7,d1
movem.l (sp)+,#252
rts
L6:
sub.l a0,d1
tst.l a0
jge L3
jra L7

GCC 4.4.0 -O2:

#NO_APP
.text
.even
.globl _MUL64
_MUL64:
movem.l #16128,-(sp)
move.l 28(sp),d0
move.l 32(sp),a0
move.l d0,d6
move.l a0,d1
#APP
;# 11 "mul645.c" 1
move.l d6, d5
move.l d1, d4
moveq #16, d3
move.l d6, d2
mulu d1, d6
lsr.l d3, d4
lsr.l d3, d5
mulu d4, d2
mulu d5, d1
mulu d5, d4
move.l d2, d5
lsr.l d3, d2
add.w d1, d5
addx.l d2, d4
lsl.l d3, d5
lsr.l d3, d1
add.l d5, d6
addx.l d4, d1

#NO_APP
tst.l d0
jlt L6
tst.l a0
jlt L7
L3:
move.l d1,d2
clr.l d3
move.l d2,d0
move.l d3,d1
or.l d6,d1
move.l d0,d6
move.l d1,d7
move.l d7,d1
movem.l (sp)+,#252
rts
L7:
sub.l d0,d1
move.l d1,d2
clr.l d3
move.l d2,d0
move.l d3,d1
or.l d6,d1
move.l d0,d6
move.l d1,d7
move.l d7,d1
movem.l (sp)+,#252
rts
L6:
sub.l a0,d1
tst.l a0
jge L3
jra L7

GCC 4.4.0 -O1:

#NO_APP
.text
.even
.globl _MUL64
_MUL64:
movem.l #16176,-(sp)
move.l 40(sp),d0
move.l 36(sp),a2
move.l a2,d7
move.l d0,d6
#APP
;# 11 "mul645.c" 1
move.l d7, d5
move.l d6, d4
moveq #16, d3
move.l d7, d2
mulu d6, d7
lsr.l d3, d4
lsr.l d3, d5
mulu d4, d2
mulu d5, d6
mulu d5, d4
move.l d2, d5
lsr.l d3, d2
add.w d6, d5
addx.l d2, d4
lsl.l d3, d5
lsr.l d3, d6
add.l d5, d7
addx.l d4, d6

#NO_APP
tst.l a2
jge L2
sub.l d0,d6
L2:
tst.l d0
jge L3
sub.l a2,d6
L3:
move.l d6,d1
clr.l d2
or.l d7,d2
move.l d1,d0
move.l d2,d1
movem.l (sp)+,#3324
rts

GCC 4.4.0 -O0:

#NO_APP
.text
.even
.globl _MUL64
_MUL64:
lea (-16,sp),sp
movem.l #16128,-(sp)
move.l 44(sp),32(sp)
move.l 48(sp),36(sp)
move.l 32(sp),d1
move.l 36(sp),d0
#APP
;# 11 "mul645.c" 1
move.l d1, d5
move.l d0, d4
moveq #16, d3
move.l d1, d2
mulu d0, d1
lsr.l d3, d4
lsr.l d3, d5
mulu d4, d2
mulu d5, d0
mulu d5, d4
move.l d2, d5
lsr.l d3, d2
add.w d0, d5
addx.l d2, d4
lsl.l d3, d5
lsr.l d3, d0
add.l d5, d1
addx.l d4, d0

#NO_APP
move.l d1,28(sp)
move.l d0,24(sp)
tst.l 44(sp)
jge L2
move.l 36(sp),d0
sub.l d0,24(sp)
L2:
tst.l 48(sp)
jge L3
move.l 32(sp),d2
sub.l d2,24(sp)
L3:
move.l 24(sp),d7
clr.l d6
move.l d7,d0
clr.l d1
move.l 28(sp),a1
lea 0.w,a0
move.l a0,d2
move.l a1,d3
or.l d2,d0
or.l d3,d1
movem.l (sp)+,#252
lea (16,sp),sp
rts 

Regards



GCC's bug reports not available for search

2009-08-05 Thread ami_stuff
Hi,

Could someone please change the status of these bug reports so that they can
be found with the bug tracker's search engine:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40819
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40977

Thanks

Regards



Re: GCC's bug reports not available for search

2009-08-05 Thread ami_stuff
Hi,

> You have to be more specific on what you refer to with "search engine
> of bugtracker".

http://gcc.gnu.org/bugzilla/query.cgi

Sorry, these bugs are available for search, but not with the "m68k" keyword.
Maybe this can be fixed.

Regards



Re: GCC's bug reports not available for search

2009-08-05 Thread ami_stuff
Hi,

OK, I didn't use "advanced search". Problem solved.

Regards



GCC 4.4.x speed regression - help?

2009-08-16 Thread ami_stuff
Hi,

I found out that a GCC 4.4.x build of minigzip from the zlib package is a lot
slower than a GCC 3.4.0 build.
Could someone compile minigzip for their system with GCC 3.4.x and GCC 4.4.x
and compare the compression times
on a larger file? That way we would know whether this regression only happens
with the m68k GCC or on other
platforms too.

I really don't like regressions like this :/

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40454

Regards



Re: GCC 4.4.x speed regression - help?

2009-08-16 Thread ami_stuff
Hi,

The problematic source code is deflate.c from libz.

CFLAGS=-O3 -DUSE_MMAP -m68060 -fomit-frame-pointer

When I compile all the source code with GCC 4.4.1, I get a slow minigzip binary.

When I compile all the source code with GCC 4.4.1 except deflate.c (which I
compile with GCC 3.4.0), I get a minigzip binary with normal speed.

Regards



"-fno-unswitch-loops" option have no effect?

2009-08-16 Thread ami_stuff
Hi,

I found out that when I use the "-fno-unswitch-loops" option, it has no effect -
loops are unswitched anyway:

-O3 -fno-unswitch-loops

Because of that, to avoid the -funswitch-loops optimization I must use something
like this:

-O2 -finline-functions -fpredictive-commoning -fgcse-after-reload 
-ftree-vectorize 

Is that a bug?

Tested with GCC 4.4.1 (m68k).

Regards



Re: "-fno-unswitch-loops" option have no effect?

2009-08-17 Thread ami_stuff
Hi,

> Hundreds and hundreds of people read this list, so every low-detail
> "I think there may be a bug" message you send wastes hours of other
> people's time.

OK, OK, but if someone can reproduce the same problem on their system, I can
file a bug report;
otherwise I will only waste my time, hours of my time, preparing a detailed
bug report.
Maybe this is only the fault of my GCC build?

> If it *is* a bug, on the bug tracker, it will be picked up by people
> interested in that specific area

It may take months before someone looks at the bug report. If it's really a
bug, IMHO
it's a fairly critical one.

> though you don't seem to do much
> in-depth research, nor do you supply any detail of what you did and
> what you expected.

There is no need for much detail: just compile any larger source file with -O3
-fno-... and
then with -O2 ... If the object files are not identical, something is
wrong
and I can start a bug report.

Regards



Re: "-fno-unswitch-loops" option have no effect?

2009-08-17 Thread ami_stuff
Hi,

> Works fine for me.  gcc.dg/tree-ssa/loop-6.c is unswitched with -O3
> but not with -O3 -fno-unswitch-loops.

This one works for me too.

Could you try to compile "deflate.c" from libz?

Here are my results:

-m68060 -O3 -fno-unswitch-loops 
   - 12,9kb
-m68060 -O3 
   - 12,9kb
-m68060 -O2 -finline-functions -fgcse-after-reload -ftree-vectorize 
-fpredictive-commoning - 12,4kb

but it's a lot more visible with "libavcodec/dsputil.c" from FFmpeg package:

OPTFLAGS= -mnobitfield -m68060 -std=c99  -Wdeclaration-after-statement 
-Wdisabled-optimization -fno-math-errno -D_ISOC99_SOURCE 
-D_POSIX_C_SOURCE=200112 -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -fno-common 
-fomit-frame-pointer -Wall -Wno-switch -Wpointer-arith -Wredundant-decls 
-Wcast-qual -Wwrite-strings -Wundef -O2 -finline-functions -fgcse-after-reload 
-ftree-vectorize -fpredictive-commoning:

306kb

OPTFLAGS= -mnobitfield -m68060 -std=c99  -Wdeclaration-after-statement 
-Wdisabled-optimization -fno-math-errno -D_ISOC99_SOURCE 
-D_POSIX_C_SOURCE=200112 -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -fno-common 
-fomit-frame-pointer -Wall -Wno-switch -Wpointer-arith -Wredundant-decls 
-Wcast-qual -Wwrite-strings -Wundef -O3 -fno-unswitch-loops:

1,18mb

OPTFLAGS= -mnobitfield -m68060 -std=c99  -Wdeclaration-after-statement 
-Wdisabled-optimization -fno-math-errno -D_ISOC99_SOURCE 
-D_POSIX_C_SOURCE=200112 -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -fno-common 
-fomit-frame-pointer -Wall -Wno-switch -Wpointer-arith -Wredundant-decls 
-Wcast-qual -Wwrite-strings -Wundef -O3

1,18mb

Regards



Re: "-fno-unswitch-loops" option have no effect?

2009-08-17 Thread ami_stuff
Hi,

> >
> > Here are my results:
> >
> > -m68060 -O3 -fno-unswitch-loops 
> >- 12,9kb
> > -m68060 -O3 
> >- 12,9kb
> > -m68060 -O2 -finline-functions -fgcse-after-reload -ftree-vectorize 
> > -fpredictive-commoning - 12,4kb
> 
> And why do you think that loop unswitching has anything to do with the 
> size growth?

Shouldn't I get the same file sizes with:

-m68060 -O3 -fno-unswitch-loops   

and

-m68060 -O2 -finline-functions -fgcse-after-reload -ftree-vectorize 
-fpredictive-commoning

?

I don't know, but I think the same optimizations should be used in both
cases, so the file size should be the same.
Am I wrong?



Re: "-fno-unswitch-loops" option have no effect?

2009-08-17 Thread ami_stuff
Hi,

> gcc > 4 AOS 68k builds are build from offical gcc sourcetree and there are
> no changes 

Yes, I know, but some problems may happen anyway, as has happened before
(problems with defines).
I think that if there is a problem, the best approach is to compare GCC builds
from different sources.

> how much is size diffrent ?

306kb vs 1,18mb - "dsputil.c"

> .but i
> think you can easy test, compile the ffmpeg, if it give no internal
> compiler error with -fno then it work.

I get the same ICE with "-O3" and "-O3 -fno-unswitch-loops" - "mpegvideo.c" 
file.

Regards



Speed regression (m68k)

2009-06-13 Thread ami_stuff
Hi,

I noticed a speed regression of about 20% with a GCC 4.4.0 build of PNGCrush
compared to a GCC 3.4.0 build (Amiga 68060@50MHz).

CFLAGS = -I. -DNO_FSEEKO -O2 -fomit-frame-pointer -Wall -m68060 -s

Here are the results:

GCC 3.4.0:

CPU time used = 267.340 seconds (decoding 16.940,
encoding 247.800, other 2.600 seconds)

GCC 4.4.0:

CPU time used = 328.360 seconds (decoding 16.800,
encoding 309.260, other 2.300 seconds) 

Maybe someone with m68k Debian/PPC/x86 can compile PNGCrush with GCC 3.4.0 and 
GCC 4.4.0, so we will know if this regression happens there too?

Regards




Re: Speed regression (m68k)

2009-06-15 Thread ami_stuff
Hi,

> I assume that you are measuring the speed of the compiled code, not the
> speed of the compiler itself.

Yes.

> Can you open a bug report about this,
> with enough information for other people to reproduce the results (e.g.,
> pointer to source code being compiled and input file(s)).

Done. Bug #40454.

Regards