Hi, when I compile the following source code with GCC 4.4.1 (-m68060 -O3 -fomit-frame-pointer):
#include <stdint.h> #define umul_ppmm(xh, xl, a, b) \ __asm__ ("| Inlined umul_ppmm\n" \ " move%.l %2,%/d0\n" \ " move%.l %3,%/d1\n" \ " move%.l %/d0,%/d2\n" \ " swap %/d0\n" \ " move%.l %/d1,%/d3\n" \ " swap %/d1\n" \ " move%.w %/d2,%/d4\n" \ " mulu %/d3,%/d4\n" \ " mulu %/d1,%/d2\n" \ " mulu %/d0,%/d3\n" \ " mulu %/d0,%/d1\n" \ " move%.l %/d4,%/d0\n" \ " eor%.w %/d0,%/d0\n" \ " swap %/d0\n" \ " add%.l %/d0,%/d2\n" \ " add%.l %/d3,%/d2\n" \ " jcc 1f\n" \ " add%.l %#65536,%/d1\n" \ "1: swap %/d2\n" \ " moveq %#0,%/d0\n" \ " move%.w %/d2,%/d0\n" \ " move%.w %/d4,%/d2\n" \ " move%.l %/d2,%1\n" \ " add%.l %/d1,%/d0\n" \ " move%.l %/d0,%0" \ : "=g" ((uint32_t) (xh)), \ "=g" ((uint32_t) (xl)) \ : "g" ((uint32_t) (a)), \ "g" ((uint32_t) (b)) \ : "d0", "d1", "d2", "d3", "d4") int64_t MUL64(int a, int b) { uint32_t au = a; uint32_t bu = b; uint32_t resh, resl; uint64_t res; umul_ppmm(resh, resl, au, bu); if (a < 0) resh -= bu; if (b < 0) resh -= au; res = ((uint64_t)resh << 32) | resl; return res; } I get this asm output: #NO_APP .text .even .globl _MUL64 _MUL64: movem.l #16128,-(sp) move.l 28(sp),a0 move.l 32(sp),a1 #APP | Inlined umul_ppmm move.l a0,d0 move.l a1,d1 move.l d0,d2 swap d0 move.l d1,d3 swap d1 move.w d2,d4 mulu d3,d4 mulu d1,d2 mulu d0,d3 mulu d0,d1 move.l d4,d0 eor.w d0,d0 swap d0 add.l d0,d2 add.l d3,d2 jcc 1f add.l #65536,d1 1: swap d2 moveq #0,d0 move.w d2,d0 move.w d4,d2 move.l d2,d6 add.l d1,d0 move.l d0,d5 #NO_APP tst.l a0 jlt L6 tst.l a1 jlt L7 L3: move.l d5,d0 clr.l d1 move.l d0,d2 move.l d1,d3 or.l d6,d3 move.l d2,d6 move.l d3,d7 move.l d2,d0 move.l d7,d1 movem.l (sp)+,#252 rts L7: sub.l a0,d5 move.l d5,d0 clr.l d1 move.l d0,d2 move.l d1,d3 or.l d6,d3 move.l d2,d6 move.l d3,d7 move.l d2,d0 move.l d7,d1 movem.l (sp)+,#252 rts L6: sub.l a1,d5 tst.l a1 jge L3 jra L7 The asm output is not good (not optimized), because GCC 4.4.1 has problems with this part of the code: res = ((uint64_t)resh << 32) | resl; GCC 3.4.0 generates better asm output (-m68060 
-O3 -fomit-frame-pointer): #NO_APP .text .even .globl _MUL64 _MUL64: moveml #0x3e00,sp@- movel sp@(24),a1 movel sp@(28),a0 #APP | Inlined umul_ppmm movel a1,d0 movel a0,d1 movel d0,d2 swap d0 movel d1,d3 swap d1 movew d2,d4 mulu d3,d4 mulu d1,d2 mulu d0,d3 mulu d0,d1 movel d4,d0 eorw d0,d0 swap d0 addl d0,d2 addl d3,d2 jcc 1f addl #65536,d1 1: swap d2 moveq #0,d0 movew d2,d0 movew d4,d2 movel d2,d6 addl d1,d0 movel d0,d5 #NO_APP tstl a1 jlt L5 tstl a0 jge L3 jra L6 .even L5: subl a0,d5 tstl a0 jge L3 .even L6: subl a1,d5 .even L3: movel d5,d0 clrl d1 orl d6,d1 moveml sp@+,#0x7c rts For GCC 4.4.1 the best asm code is generated from this C code: typedef unsigned int uint32_t; typedef unsigned long long uint64_t; #define umul_ppmm(xh, xl, a, b) \ __asm__ ("| Inlined umul_ppmm\n" \ " move%.l %2,%/d5\n" \ " move%.l %3,%/d6\n" \ " move%.l %/d5,%/d2\n" \ " swap %/d5\n" \ " move%.l %/d6,%/d3\n" \ " swap %/d6\n" \ " move%.w %/d2,%/d4\n" \ " mulu %/d3,%/d4\n" \ " mulu %/d6,%/d2\n" \ " mulu %/d5,%/d3\n" \ " mulu %/d5,%/d6\n" \ " move%.l %/d4,%/d5\n" \ " eor%.w %/d5,%/d5\n" \ " swap %/d5\n" \ " add%.l %/d5,%/d2\n" \ " add%.l %/d3,%/d2\n" \ " jcc 1f\n" \ " add%.l %#65536,%/d6\n" \ "1: swap %/d2\n" \ " moveq %#0,%/d5\n" \ " move%.w %/d2,%/d5\n" \ " move%.w %/d4,%/d2\n" \ " move%.l %/d2,%1\n" \ " add%.l %/d6,%/d5\n" \ " move%.l %/d5,%0" \ : "=g" ((uint32_t) (xh)), \ "=g" ((uint32_t) (xl)) \ : "g" ((uint32_t) (a)), \ "g" ((uint32_t) (b)) \ : "d2", "d3", "d4", "d5", "d6") inline uint64_t MUL64(int a, int b) { uint32_t au = a; uint32_t bu = b; union { struct { uint32_t h, l; } parts; uint64_t whole; } res; umul_ppmm(res.parts.h, res.parts.l, au, bu); if (a < 0) res.parts.h -= bu; if (b < 0) res.parts.h -= au; return res.whole; } Asm output: #NO_APP .text .even .globl _MUL64 _xxMULH: movem.l #15872,-(sp) move.l 24(sp),a0 move.l 28(sp),a1 #APP | Inlined umul_ppmm move.l a0,d5 move.l a1,d6 move.l d5,d2 swap d5 move.l d6,d3 swap d6 move.w d2,d4 mulu d3,d4 mulu d6,d2 mulu d5,d3 mulu 
d5,d6 move.l d4,d5 eor.w d5,d5 swap d5 add.l d5,d2 add.l d3,d2 jcc 1f add.l #65536,d6 1: swap d2 moveq #0,d5 move.w d2,d5 move.w d4,d2 move.l d2,d1 add.l d6,d5 move.l d5,d0 #NO_APP tst.l a0 jlt L6 tst.l a1 jlt L7 L3: movem.l (sp)+,#124 rts L7: sub.l a0,d0 movem.l (sp)+,#124 rts L6: sub.l a1,d0 tst.l a1 jge L3 Could someone teach GCC 4.4.x to generate optimized code for: res = ((uint64_t)resh << 32) | resl; ? Regards -- Summary: Problem with code like this: res = ((uint64_t)resh << 32) | resl; Product: gcc Version: 4.4.1 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: ami_stuff at o2 dot pl GCC host triplet: i686-cygwin GCC target triplet: m68k-amigaos http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40977