The function below, compiled with -O9 -fomit-frame-pointer, fails to optimize the final 16-bit swap into an x86 "roll" instruction *if* the return type is uint64_t. If the return type is uint32_t, a roll instruction is generated, and the resulting code is 6 instructions (19 bytes) shorter. All of the math in the function is done in 32-bit unsigned values, so ideally the only effect of setting the return type to uint64_t should be that the compiler arranges for %edx to be zero at the end.
typedef unsigned long long uint64_t;
typedef unsigned long uint32_t;

#define REV32STEP(VAR, SHIFT, MASK) \
  VAR = (((VAR >> SHIFT) & MASK) | ((VAR << SHIFT) & (0xFFFFFFFFUL & ~MASK)))

uint64_t bitreverse2 (uint32_t n)
{
  uint32_t lower = n;
  REV32STEP(lower, 1, 0x55555555UL); /* odd/even bits */
  REV32STEP(lower, 2, 0x33333333UL); /* bitpairs */
  REV32STEP(lower, 4, 0x0F0F0F0FUL); /* nibbles */
  REV32STEP(lower, 8, 0x00FF00FFUL); /* bytes */
  REV32STEP(lower, 16, 0x0000FFFFUL); /* halfwords */
  return lower;
}

64-bit version:

bitreverse2:
        movl    4(%esp), %edx
        movl    %edx, %eax
        addl    %edx, %edx
        shrl    %eax
        andl    $-1431655766, %edx
        andl    $1431655765, %eax
        orl     %edx, %eax
        movl    %eax, %edx
        shrl    $2, %edx
        sall    $2, %eax
        andl    $858993459, %edx
        andl    $-858993460, %eax
        orl     %eax, %edx
        movl    %edx, %eax
        shrl    $4, %eax
        sall    $4, %edx
        andl    $252645135, %eax
        andl    $-252645136, %edx
        orl     %edx, %eax
        xorl    %edx, %edx              # <- edx is cleared here
        movl    %eax, %ecx
        shrl    $8, %ecx
        sall    $8, %eax
        andl    $16711935, %ecx
        andl    $-16711936, %eax
        orl     %eax, %ecx
        # final swap starts here
        movl    %ecx, %eax
        shrl    $16, %eax
        sall    $16, %ecx
        andl    $65535, %eax            # <- redundant after shift
        andl    $-65536, %ecx           # <- redundant after shift
        orl     %ecx, %eax
        ret

32-bit version:

bitreverse2:
        movl    4(%esp), %edx
        movl    %edx, %eax
        addl    %edx, %edx
        shrl    %eax
        andl    $-1431655766, %edx
        andl    $1431655765, %eax
        orl     %edx, %eax
        movl    %eax, %edx
        shrl    $2, %edx
        sall    $2, %eax
        andl    $858993459, %edx
        andl    $-858993460, %eax
        orl     %eax, %edx
        movl    %edx, %ecx
        shrl    $4, %ecx
        sall    $4, %edx
        andl    $252645135, %ecx
        andl    $-252645136, %edx
        orl     %edx, %ecx
        movl    %ecx, %eax
        shrl    $8, %eax
        sall    $8, %ecx
        andl    $16711935, %eax
        andl    $-16711936, %ecx
        orl     %ecx, %eax
        roll    $16, %eax               # <- final swap
        ret

This is with compiler sources from CVS as of about 6AM this morning US/Eastern.
-- 
           Summary: returning 64-bit value turns off some 32-bit optimizations
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: minor
          Priority: P2
         Component: rtl-optimization
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: raeburn at raeburn dot org
                CC: gcc-bugs at gcc dot gnu dot org
 GCC build triplet: i686-pc-linux-gnu
  GCC host triplet: i686-pc-linux-gnu
GCC target triplet: i686-pc-linux-gnu

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=23811