The function below, compiled with -O9 -fomit-frame-pointer, fails to optimize
the final 16-bit swap into an x86 "roll" instruction *if* the return type is
uint64_t.  If the return type is uint32_t, a "roll" instruction is generated,
and the resulting code is 6 instructions (19 bytes) shorter.  All of the
arithmetic in the function is done on 32-bit unsigned values, so ideally the
only effect of changing the return type to uint64_t should be that the
compiler arranges for %edx to be zero at the end.

/* NOTE(review): these typedefs assume an ILP32 target such as i686, where
   unsigned long is 32 bits and unsigned long long is 64 bits -- consistent
   with the i686-pc-linux-gnu triplet reported below. */
typedef unsigned long long uint64_t;
typedef unsigned long uint32_t;
/* One stage of the classic bit reversal: exchange adjacent groups of SHIFT
   bits, where MASK selects the low group of each pair.  The complement mask
   is ANDed with 0xFFFFFFFFUL to stay within 32 bits.
   NOTE(review): VAR is evaluated several times, so arguments with side
   effects would misbehave; all callers here pass a plain lvalue. */
#define REV32STEP(VAR, SHIFT, MASK) \
  VAR = (((VAR >> SHIFT) & MASK) | ((VAR << SHIFT) & (0xFFFFFFFFUL & ~MASK)))
/* Reverse the bit order of a 32-bit value using five swap stages of
   doubling width; the result is returned zero-extended to 64 bits. */
uint64_t bitreverse2 (uint32_t n) {
  uint32_t lower = n;
  REV32STEP(lower,  1, 0x55555555UL); /* odd/even bits */
  REV32STEP(lower,  2, 0x33333333UL); /* bitpairs */
  REV32STEP(lower,  4, 0x0F0F0F0FUL); /* nibbles */
  REV32STEP(lower,  8, 0x00FF00FFUL); /* bytes */
  REV32STEP(lower, 16, 0x0000FFFFUL); /* halfwords */
  return lower;
}

64-bit version:

bitreverse2:
        movl    4(%esp), %edx
        movl    %edx, %eax
        addl    %edx, %edx
        shrl    %eax
        andl    $-1431655766, %edx
        andl    $1431655765, %eax
        orl     %edx, %eax
        movl    %eax, %edx
        shrl    $2, %edx
        sall    $2, %eax
        andl    $858993459, %edx
        andl    $-858993460, %eax
        orl     %eax, %edx
        movl    %edx, %eax
        shrl    $4, %eax
        sall    $4, %edx
        andl    $252645135, %eax
        andl    $-252645136, %edx
        orl     %edx, %eax
        xorl    %edx, %edx   # <- edx is cleared here
        movl    %eax, %ecx
        shrl    $8, %ecx
        sall    $8, %eax
        andl    $16711935, %ecx
        andl    $-16711936, %eax
        orl     %eax, %ecx
        # final swap starts here
        movl    %ecx, %eax
        shrl    $16, %eax
        sall    $16, %ecx
        andl    $65535, %eax    # <- redundant after shift
        andl    $-65536, %ecx  # <- redundant after shift
        orl     %ecx, %eax
        ret

32-bit version:

bitreverse2:
        movl    4(%esp), %edx
        movl    %edx, %eax
        addl    %edx, %edx
        shrl    %eax
        andl    $-1431655766, %edx
        andl    $1431655765, %eax
        orl     %edx, %eax
        movl    %eax, %edx
        shrl    $2, %edx
        sall    $2, %eax
        andl    $858993459, %edx
        andl    $-858993460, %eax
        orl     %eax, %edx
        movl    %edx, %ecx
        shrl    $4, %ecx
        sall    $4, %edx
        andl    $252645135, %ecx
        andl    $-252645136, %edx
        orl     %edx, %ecx
        movl    %ecx, %eax
        shrl    $8, %eax
        sall    $8, %ecx
        andl    $16711935, %eax
        andl    $-16711936, %ecx
        orl     %ecx, %eax
        roll    $16, %eax     # <- final swap
        ret

This is with compiler sources from CVS as of about 6AM this morning US/Eastern.

-- 
           Summary: returning 64-bit value turns off some 32-bit
                    optimizations
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: minor
          Priority: P2
         Component: rtl-optimization
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: raeburn at raeburn dot org
                CC: gcc-bugs at gcc dot gnu dot org
 GCC build triplet: i686-pc-linux-gnu
  GCC host triplet: i686-pc-linux-gnu
GCC target triplet: i686-pc-linux-gnu


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=23811

Reply via email to