[Bug c/40977] New: Problem with code like this: res = ((uint64_t)resh << 32) | resl;

ami_stuff at o2 dot pl Wed, 05 Aug 2009 13:26:09 -0700

Hi,

When I compile source code with GCC 4.4.1 (-m68060 -O3 -fomit-frame-pointer):


#include <stdint.h>

/*
 * umul_ppmm(xh, xl, a, b): unsigned 32x32 -> 64-bit multiply written as
 * GCC inline assembly for m68k.
 *
 *   xh, xl  output operands ("=g"): receive the high and the low 32 bits
 *           of the product respectively.
 *   a, b    input operands ("g"): the two factors, each cast to uint32_t.
 *
 * The product is built from four 16x16 `mulu` partial products:
 *   d4 = lo(a)*lo(b), d2 = lo(a)*hi(b), d3 = hi(a)*lo(b), d1 = hi(a)*hi(b).
 * The upper half of d4 and the two middle products are summed in d2; when
 * the second addition carries, 65536 is added to d1 (`jcc 1f` skips that
 * adjustment when there was no carry).  The low result is the lower half
 * of that sum placed above the lower half of d4; the high result is d1
 * plus the upper half of the sum.
 *
 * Both inputs are copied into d0/d1 by the first two instructions, i.e.
 * before either output operand is written.  d0-d4 serve as scratch
 * registers and are all named in the clobber list.
 *
 * `%.`, `%/` and `%#` are template escapes; in the GCC 4.4.1 output quoted
 * in this report `move%.l %2,%/d0` appears as `move.l a0,d0` and
 * `%#65536` as `#65536`.
 */
#define umul_ppmm(xh, xl, a, b) \
  __asm__ ("| Inlined umul_ppmm\n"                                      \
           "    move%.l %2,%/d0\n"                                      \
           "    move%.l %3,%/d1\n"                                      \
           "    move%.l %/d0,%/d2\n"                                    \
           "    swap    %/d0\n"                                         \
           "    move%.l %/d1,%/d3\n"                                    \
           "    swap    %/d1\n"                                         \
           "    move%.w %/d2,%/d4\n"                                    \
           "    mulu    %/d3,%/d4\n"                                    \
           "    mulu    %/d1,%/d2\n"                                    \
           "    mulu    %/d0,%/d3\n"                                    \
           "    mulu    %/d0,%/d1\n"                                    \
           "    move%.l %/d4,%/d0\n"                                    \
           "    eor%.w  %/d0,%/d0\n"                                    \
           "    swap    %/d0\n"                                         \
           "    add%.l  %/d0,%/d2\n"                                    \
           "    add%.l  %/d3,%/d2\n"                                    \
           "    jcc     1f\n"                                           \
           "    add%.l  %#65536,%/d1\n"                                 \
           "1:  swap    %/d2\n"                                         \
           "    moveq   %#0,%/d0\n"                                     \
           "    move%.w %/d2,%/d0\n"                                    \
           "    move%.w %/d4,%/d2\n"                                    \
           "    move%.l %/d2,%1\n"                                      \
           "    add%.l  %/d1,%/d0\n"                                    \
           "    move%.l %/d0,%0"                                        \
           : "=g" ((uint32_t) (xh)),                                    \
             "=g" ((uint32_t) (xl))                                     \
           : "g" ((uint32_t) (a)),                                      \
             "g" ((uint32_t) (b))                                       \
           : "d0", "d1", "d2", "d3", "d4")


/*
 * Signed 32x32 -> 64-bit multiply built on the unsigned umul_ppmm macro.
 *
 * a, b: signed factors.
 * Returns the 64-bit product; the uint64_t `res` is implicitly converted
 * to the int64_t return type.
 */
int64_t MUL64(int a, int b)
 {
   /* Unsigned views of the factors, as required by umul_ppmm. */
   uint32_t au = a;
   uint32_t bu = b;

   uint32_t resh, resl;
   uint64_t res;

   /* resh:resl = au * bu as an unsigned 64-bit product. */
   umul_ppmm(resh, resl, au, bu);

   /*
    * Signed correction: a negative factor reads as (factor + 2^32) when
    * viewed unsigned, which adds (other factor) * 2^32 to the product.
    * Subtracting the other factor from the high word removes that excess.
    */
   if (a < 0)
     resh -= bu;
   if (b < 0)
     resh -= au;

   /*
    * Recombine the halves.  This shift/or expression is the construct the
    * report identifies as being compiled poorly by GCC 4.4.1.
    */
   res = ((uint64_t)resh << 32) | resl;

   return res;
}

I get this asm output:

#NO_APP
        .text
        .even
        .globl  _MUL64
_MUL64:
        movem.l #16128,-(sp)
        move.l 28(sp),a0
        move.l 32(sp),a1
#APP
        | Inlined umul_ppmm
        move.l  a0,d0
        move.l  a1,d1
        move.l  d0,d2
        swap    d0
        move.l  d1,d3
        swap    d1
        move.w  d2,d4
        mulu    d3,d4
        mulu    d1,d2
        mulu    d0,d3
        mulu    d0,d1
        move.l  d4,d0
        eor.w   d0,d0
        swap    d0
        add.l   d0,d2
        add.l   d3,d2
        jcc     1f
        add.l   #65536,d1
1:      swap    d2
        moveq   #0,d0
        move.w  d2,d0
        move.w  d4,d2
        move.l  d2,d6
        add.l   d1,d0
        move.l  d0,d5
#NO_APP
        tst.l a0
        jlt L6
        tst.l a1
        jlt L7
L3:
        move.l d5,d0
        clr.l d1
        move.l d0,d2
        move.l d1,d3
        or.l d6,d3
        move.l d2,d6
        move.l d3,d7
        move.l d2,d0
        move.l d7,d1
        movem.l (sp)+,#252
        rts
L7:
        sub.l a0,d5
        move.l d5,d0
        clr.l d1
        move.l d0,d2
        move.l d1,d3
        or.l d6,d3
        move.l d2,d6
        move.l d3,d7
        move.l d2,d0
        move.l d7,d1
        movem.l (sp)+,#252
        rts
L6:
        sub.l a1,d5
        tst.l a1
        jge L3
        jra L7

The asm output is not good (not optimized), because GCC 4.4.1 has problems
with this part of the code:

res = ((uint64_t)resh << 32) | resl;


GCC 3.4.0 generates better asm output (-m68060 -O3 -fomit-frame-pointer):

#NO_APP
        .text
        .even
        .globl  _MUL64
_MUL64:
        moveml #0x3e00,sp@-
        movel sp@(24),a1
        movel sp@(28),a0
#APP
        | Inlined umul_ppmm
        movel   a1,d0
        movel   a0,d1
        movel   d0,d2
        swap    d0
        movel   d1,d3
        swap    d1
        movew   d2,d4
        mulu    d3,d4
        mulu    d1,d2
        mulu    d0,d3
        mulu    d0,d1
        movel   d4,d0
        eorw    d0,d0
        swap    d0
        addl    d0,d2
        addl    d3,d2
        jcc     1f
        addl    #65536,d1
1:      swap    d2
        moveq   #0,d0
        movew   d2,d0
        movew   d4,d2
        movel   d2,d6
        addl    d1,d0
        movel   d0,d5
#NO_APP
        tstl a1
        jlt L5
        tstl a0
        jge L3
        jra L6
        .even
L5:
        subl a0,d5
        tstl a0
        jge L3
        .even
L6:
        subl a1,d5
        .even
L3:
        movel d5,d0
        clrl d1
        orl d6,d1
        moveml sp@+,#0x7c
        rts


For GCC 4.4.1 the best asm code is generated from this C code:

/*
 * Local definitions of the fixed-width names used below, in place of
 * including <stdint.h>.
 * NOTE(review): assumes `unsigned int` is 32 bits and `unsigned long long`
 * is 64 bits on the target -- confirm.
 */
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;

/*
 * umul_ppmm(xh, xl, a, b): unsigned 32x32 -> 64-bit multiply written as
 * GCC inline assembly for m68k.
 *
 *   xh, xl  output operands ("=g"): receive the high and the low 32 bits
 *           of the product respectively.
 *   a, b    input operands ("g"): the two factors, each cast to uint32_t.
 *
 * The product is built from four 16x16 `mulu` partial products:
 *   d4 = lo(a)*lo(b), d2 = lo(a)*hi(b), d3 = hi(a)*lo(b), d6 = hi(a)*hi(b).
 * The upper half of d4 and the two middle products are summed in d2; when
 * the second addition carries, 65536 is added to d6 (`jcc 1f` skips that
 * adjustment when there was no carry).  The low result is the lower half
 * of that sum placed above the lower half of d4; the high result is d6
 * plus the upper half of the sum.
 *
 * This variant uses d2-d6 as scratch (all named in the clobber list) and
 * does not touch d0/d1.  In the compiler output quoted after this example
 * the two results are stored directly into d1 and d0.
 * NOTE(review): presumably the register choice is what lets GCC allocate
 * the outputs to the 64-bit return register pair -- confirm.
 */
#define umul_ppmm(xh, xl, a, b)                 \
  __asm__ ("| Inlined umul_ppmm\n"              \
           "    move%.l %2,%/d5\n"              \
           "    move%.l %3,%/d6\n"              \
           "    move%.l %/d5,%/d2\n"            \
           "    swap    %/d5\n"                 \
           "    move%.l %/d6,%/d3\n"            \
           "    swap    %/d6\n"                 \
           "    move%.w %/d2,%/d4\n"            \
           "    mulu    %/d3,%/d4\n"            \
           "    mulu    %/d6,%/d2\n"            \
           "    mulu    %/d5,%/d3\n"            \
           "    mulu    %/d5,%/d6\n"            \
           "    move%.l %/d4,%/d5\n"            \
           "    eor%.w  %/d5,%/d5\n"            \
           "    swap    %/d5\n"                 \
           "    add%.l  %/d5,%/d2\n"            \
           "    add%.l  %/d3,%/d2\n"            \
           "    jcc     1f\n"                   \
           "    add%.l  %#65536,%/d6\n"         \
           "1:  swap    %/d2\n"                 \
           "    moveq   %#0,%/d5\n"             \
           "    move%.w %/d2,%/d5\n"            \
           "    move%.w %/d4,%/d2\n"            \
           "    move%.l %/d2,%1\n"              \
           "    add%.l  %/d6,%/d5\n"            \
           "    move%.l %/d5,%0"                \
           : "=g" ((uint32_t) (xh)),            \
             "=g" ((uint32_t) (xl))             \
           : "g" ((uint32_t) (a)),              \
             "g" ((uint32_t) (b))               \
           : "d2", "d3", "d4", "d5", "d6")

/*
 * Signed 32x32 -> 64-bit multiply that returns the two result halves
 * through a union instead of recombining them with a shift and an OR.
 *
 * a, b: signed factors.
 * Returns the 64-bit result as uint64_t.
 *
 * NOTE(review): reading `whole` after writing `parts` relies on `h`
 * overlaying the most significant half of the 64-bit value, i.e. a
 * big-endian layout (the report's target is m68k-amigaos) -- confirm
 * before reusing on another target.
 * NOTE(review): plain `inline` without `static`/`extern` behaves
 * differently under GNU89 and C99 inline rules -- confirm the intended
 * language mode.
 */
inline uint64_t MUL64(int a, int b)
{
  /* Unsigned views of the factors, as required by umul_ppmm. */
  uint32_t au = a;
  uint32_t bu = b;

  /* Overlay of the 64-bit result with its high (h) and low (l) words. */
  union
  {
    struct
    {
      uint32_t h, l;
    } parts;

    uint64_t whole;
  } res;

  /* res.parts.h:res.parts.l = au * bu as an unsigned 64-bit product. */
  umul_ppmm(res.parts.h, res.parts.l, au, bu);

  /*
   * Signed correction: a negative factor reads as (factor + 2^32) when
   * viewed unsigned, so the other factor is subtracted from the high word.
   */
  if (a < 0)
    res.parts.h -= bu;
  if (b < 0)
    res.parts.h -= au;

  return res.whole;
}

Asm output:

#NO_APP
        .text
        .even
        .globl  _MUL64
_xxMULH:
        movem.l #15872,-(sp)
        move.l 24(sp),a0
        move.l 28(sp),a1
#APP
        | Inlined umul_ppmm
        move.l  a0,d5
        move.l  a1,d6
        move.l  d5,d2
        swap    d5
        move.l  d6,d3
        swap    d6
        move.w  d2,d4
        mulu    d3,d4
        mulu    d6,d2
        mulu    d5,d3
        mulu    d5,d6
        move.l  d4,d5
        eor.w   d5,d5
        swap    d5
        add.l   d5,d2
        add.l   d3,d2
        jcc     1f
        add.l   #65536,d6
1:      swap    d2
        moveq   #0,d5
        move.w  d2,d5
        move.w  d4,d2
        move.l  d2,d1
        add.l   d6,d5
        move.l  d5,d0
#NO_APP
        tst.l a0
        jlt L6
        tst.l a1
        jlt L7
L3:
        movem.l (sp)+,#124
        rts
L7:
        sub.l a0,d0
        movem.l (sp)+,#124
        rts
L6:
        sub.l a1,d0
        tst.l a1
        jge L3


Maybe someone could teach GCC 4.4.x to generate optimized code for:

res = ((uint64_t)resh << 32) | resl;

?

Regards


-- 
           Summary: Problem with code like this: res = ((uint64_t)resh <<
                    32) | resl;
           Product: gcc
           Version: 4.4.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: ami_stuff at o2 dot pl
  GCC host triplet: i686-cygwin
GCC target triplet: m68k-amigaos


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40977

[Bug c/40977] New: Problem with code like this: res = ((uint64_t)resh << 32) | resl;

Reply via email to