https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65056
            Bug ID: 65056
           Summary: Missed optimization (x86): Redundant test/compare
                    after arithmetic operations
           Product: gcc
           Version: 5.0
               URL: http://marc.info/?l=linux-kernel&m=142373514630907
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: linux at horizon dot com
              Host: i586-linux-gnu
            Target: x86_64-*-*, i?86-*-*

The following code misses some really obvious optimizations on x86, with both
-m32 and -m64: GCC generates separate test and compare instructions for
conditions that are already available in the condition codes set by preceding
arithmetic operations.

Bugs #30455 and #31799 are similar, but the problems there are caused by a
memory destination, which isn't the case here.

This happens with -O2, -O3 and -Os, and with various models specified, so it
doesn't appear to be some obscure model-specific optimization.

Versions tested:
* gcc (Debian 4.9.2-10) 4.9.2
* gcc-5 (Debian 5-20150205-1) 5.0.0 20150205 (experimental) [trunk revision 220455]
* gcc (Debian 20150211-1) 5.0.0 20150211 (experimental) [trunk revision 220605]

#include <stddef.h>

#define BITS_PER_LONG (8 * sizeof(long))
#define DIV_ROUND_UP(x,n) (((x) + (n) - 1) / (n))
#define LAST_WORD_MASK(x) (~0ull >> (-(x) & BITS_PER_LONG - 1))

/*
 * __fls: find last set bit in word
 * @word: The word to search
 *
 * Undefined if no set bit exists, so code should check against 0 first.
 */
static inline unsigned long __fls(unsigned long word)
{
	asm("bsr %1,%0" : "=r" (word) : "rm" (word));
	return word;
}

size_t find_last_bit(const unsigned long *addr, size_t size)
{
	size_t idx = DIV_ROUND_UP(size, BITS_PER_LONG);
	unsigned long mask = LAST_WORD_MASK(size);

	while (idx--) {
		unsigned long val = addr[idx] & mask;

		if (val)
			return idx * BITS_PER_LONG + __fls(val);
		mask = ~0ul;
	}
	return size;
}

The code generated is (-m32 is equivalent):

	.file	"flb.c"
	.section	.text.unlikely,"ax",@progbits
.LCOLDB0:
	.text
.LHOTB0:
	.p2align 4,,15
	.globl	find_last_bit
	.type	find_last_bit, @function
find_last_bit:
.LFB1:
	.cfi_startproc
	movl	%esi, %ecx
	leaq	63(%rsi), %rdx
	movq	$-1, %r8
	negl	%ecx
	shrq	%cl, %r8
	shrq	$6, %rdx
	movq	%r8, %rcx
	jmp	.L2
	.p2align 4,,10
	.p2align 3
.L4:
	andq	(%rdi,%rdx,8), %rcx
	movq	%rcx, %r8
	movq	$-1, %rcx
	testq	%r8, %r8
	jne	.L8
.L2:
	subq	$1, %rdx
	cmpq	$-1, %rdx
	jne	.L4
	movq	%rsi, %rax
	ret
	.p2align 4,,10
	.p2align 3
.L8:
	salq	$6, %rdx
#APP
# 15 "flb.c" 1
	bsr %r8,%r8
# 0 "" 2
#NO_APP
	leaq	(%rdx,%r8), %rax
	ret
	.cfi_endproc
.LFE1:
	.size	find_last_bit, .-find_last_bit
	.section	.text.unlikely
.LCOLDE0:
	.text
.LHOTE0:
	.ident	"GCC: (Debian 20150211-1) 5.0.0 20150211 (experimental) [trunk revision 220605]"
	.section	.note.GNU-stack,"",@progbits

In the loop at .L4, there's a completely unnecessary "movq %rcx, %r8; testq
%r8, %r8": the jne could go right after the andq (with the code at .L8 changed
to expect the masked value in %rcx rather than %r8).

At .L2, it's even more ridiculous. The subq generates a borrow when the value
wraps to -1, so why is that not just "subq $1, %rdx; jnc .L4"? A smarter
compiler would notice that %rdx must have its top 6 bits clear, so
"decq %rdx; jns .L4" would also be legal. (For non-x86 weenies: the "dec"
instructions do not modify the carry flag, originally so they could be used
for loop control in multi-word arithmetic. This partial flags update makes
them slower than "subq $1" on many processors, so which is used depends on
the model flags.)
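To make the suggestion concrete, here is a hand-written sketch (not compiler
output) of the loop with both changes applied:

.L4:
	andq	(%rdi,%rdx,8), %rcx	# ZF already reflects the masked value
	jne	.L8			# .L8 would take the value in %rcx
	movq	$-1, %rcx		# reload mask; mov doesn't touch flags
.L2:
	subq	$1, %rdx		# sets CF only when idx wraps past zero
	jnc	.L4			# replaces cmpq $-1, %rdx / jne .L4
	movq	%rsi, %rax
	ret

That removes two instructions from the loop body and one from the loop
control, with no change in semantics.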
I tried reorganizing the source to encourage the first optimization:

size_t find_last_bit2(const unsigned long *addr, size_t size)
{
	unsigned long val = LAST_WORD_MASK(size);
	size_t idx = DIV_ROUND_UP(size, BITS_PER_LONG);

	while (idx--) {
		val &= addr[idx];
		if (val)
			return idx * BITS_PER_LONG + __fls(val);
		val = ~0ul;
	}
	return size;
}

... but the generated code is identical. This version:

size_t find_last_bit3(const unsigned long *addr, size_t size)
{
	if (size) {
		unsigned long val = LAST_WORD_MASK(size);
		size_t idx = (size-1) / BITS_PER_LONG;

		do {
			val &= addr[idx];
			if (val)
				return idx * BITS_PER_LONG + __fls(val);
			val = ~0ul;
		} while (idx--);
	}
	return size;
}

gets the first optimization, and is at least clever with the second, but it's
still three instructions rather than two for an absolutely bog-standard
decrement loop:

find_last_bit3:
.LFB3:
	.cfi_startproc
	xorl	%eax, %eax
	testq	%rsi, %rsi
	je	.L17
	movl	%esi, %ecx
	leaq	-1(%rsi), %rax
	movq	$-1, %rdx
	negl	%ecx
	shrq	%cl, %rdx
	shrq	$6, %rax
	jmp	.L19
	.p2align 4,,10
	.p2align 3
.L18:
	subq	$1, %rax
	movq	$-1, %rdx
	cmpq	%rdx, %rax
	je	.L23
.L19:
	andq	(%rdi,%rax,8), %rdx
	je	.L18
	salq	$6, %rax
#APP
# 15 "flb.c" 1
	bsr %rdx,%rdx
# 0 "" 2
#NO_APP
	addq	%rdx, %rax
	ret
	.p2align 4,,10
	.p2align 3
.L23:
	movq	%rsi, %rax
.L17:
	rep ret
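For anyone who wants to reproduce this: a minimal harness (mine, with the
expected answer worked out by hand) that can be appended to the file above;
compile with "gcc -O2 -S flb.c" to inspect the loops, or build and run it:

#include <stdio.h>

int main(void)
{
	unsigned long words[2] = { 0x10UL, 0UL };

	/* Bit 4 of word 0 is the highest set bit below size 100,
	 * so this should print 4. */
	printf("%zu\n", find_last_bit(words, 100));
	return 0;
}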