http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54802
Bug #: 54802 Summary: Trivial code changes result in different assembly with respect to rotations and bswap. Classification: Unclassified Product: gcc Version: 4.8.0 Status: UNCONFIRMED Severity: enhancement Priority: P3 Component: c AssignedTo: unassig...@gcc.gnu.org ReportedBy: jasongross9+bugzi...@gmail.com Created attachment 28347 --> http://gcc.gnu.org/bugzilla/attachment.cgi?id=28347 Code files In some C code, manually inlining constants changes whether gcc compiles things to rotations or to bswaps. In particular, the following code: uint64_t reverse0(uint64_t v) { v = ((v >> 1) & 0x5555555555555555ULL) | ((v & 0x5555555555555555ULL) << 1); v = ((v >> 2) & 0x3333333333333333ULL) | ((v & 0x3333333333333333ULL) << 2); v = ((v >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((v & 0x0F0F0F0F0F0F0F0FULL) << 4); v = ((v >> 8) & 0x00FF00FF00FF00FFULL) | ((v & 0x00FF00FF00FF00FFULL) << 8); v = ((v >> 16) & 0x0000FFFF0000FFFFULL) | ((v & 0x0000FFFF0000FFFFULL) << 16); const uint64_t va = ((v >> 32) & 0x00000000FFFFFFFFULL), vb = ((v & 0x00000000FFFFFFFFULL) << 32); v = va | vb; return v; } uint64_t reverse1(uint64_t v) { v = ((v >> 1) & 0x5555555555555555ULL) | ((v & 0x5555555555555555ULL) << 1); v = ((v >> 2) & 0x3333333333333333ULL) | ((v & 0x3333333333333333ULL) << 2); v = ((v >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((v & 0x0F0F0F0F0F0F0F0FULL) << 4); v = ((v >> 8) & 0x00FF00FF00FF00FFULL) | ((v & 0x00FF00FF00FF00FFULL) << 8); v = ((v >> 16) & 0x0000FFFF0000FFFFULL) | ((v & 0x0000FFFF0000FFFFULL) << 16); v = ((v >> 32) & 0x00000000FFFFFFFFULL) | ((v & 0x00000000FFFFFFFFULL) << 32); return v; } compiles to: reverse0: .LFB8: .cfi_startproc movq %rdi, %rdx movabsq $6148914691236517205, %rax movabsq $3689348814741910323, %rcx shrq %rdx andq %rax, %rdx andq %rdi, %rax addq %rax, %rax orq %rdx, %rax movq %rax, %rdx andq %rcx, %rax shrq $2, %rdx salq $2, %rax andq %rcx, %rdx movabsq $1085102592571150095, %rcx orq %rdx, %rax movq %rax, %rdx andq %rcx, %rax shrq $4, %rdx salq $4, %rax andq 
%rcx, %rdx orq %rdx, %rax bswap %rax ret .cfi_endproc .LFE8: .size reverse0, .-reverse0 .p2align 4,,15 .globl reverse1 .type reverse1, @function reverse1: .LFB9: .cfi_startproc movq %rdi, %rdx movabsq $6148914691236517205, %rax movabsq $3689348814741910323, %rcx shrq %rdx andq %rax, %rdx andq %rdi, %rax addq %rax, %rax orq %rdx, %rax movq %rax, %rdx andq %rcx, %rax shrq $2, %rdx salq $2, %rax andq %rcx, %rdx movabsq $1085102592571150095, %rcx orq %rdx, %rax movq %rax, %rdx andq %rcx, %rax shrq $4, %rdx salq $4, %rax andq %rcx, %rdx movabsq $71777214294589695, %rcx orq %rdx, %rax movq %rax, %rdx andq %rcx, %rax shrq $8, %rdx salq $8, %rax andq %rcx, %rdx movabsq $281470681808895, %rcx orq %rdx, %rax movq %rax, %rdx andq %rcx, %rax shrq $16, %rdx salq $16, %rax andq %rcx, %rdx orq %rdx, %rax rorq $32, %rax ret .cfi_endproc .LFE9: .size reverse1, .-reverse1 .p2align 4,,15 .globl reverse2 .type reverse2, @function In the code where I use this, reverse0 is 30% faster than reverse1. I don't think that manually inlining constants, when each constant is used exactly once, should change the assembly that gcc generates. The relevant files (.c, .i, .s, and a log of the command line) are attached.