https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65056

            Bug ID: 65056
           Summary: Missed optimization (x86): Redundant test/compare
                    after arithmetic operations
           Product: gcc
           Version: 5.0
               URL: http://marc.info/?l=linux-kernel&m=142373514630907
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: linux at horizon dot com
              Host: i586-linux-gnu
            Target: x86_64-*-*, i?86-*-*

GCC seems to miss some really obvious optimizations on x86 when compiling the
following code, both with -m32 and -m64.  It generates separate test and
compare instructions for conditions that are already available in the
condition codes set by the preceding arithmetic operations.

Bugs #30455 and #31799 are similar, but the problems there are caused by a
memory destination, which isn't the case here.

This happens with -O2, -O3 and -Os and with various models specified, so it
doesn't appear to be some obscure model-specific optimization.
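
Schematically, the problem looks like this (a hand-written illustration, not
taken from the output below; the .Lloop label is made up):

    # emitted: a separate flag-setting instruction after the arithmetic op
    subq    $1, %rdx
    testq   %rdx, %rdx      # redundant: ZF/SF already describe %rdx
    jne     .Lloop

    # expected: reuse the flags set by the subq itself
    subq    $1, %rdx
    jne     .Lloop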

Versions tested:
* gcc (Debian 4.9.2-10) 4.9.2
* gcc-5 (Debian 5-20150205-1) 5.0.0 20150205 (experimental) [trunk revision
220455]
* gcc (Debian 20150211-1) 5.0.0 20150211 (experimental) [trunk revision 220605]

#include <stddef.h>

#define BITS_PER_LONG (8 * sizeof(long))
#define DIV_ROUND_UP(x,n) (((x) + (n) - 1) / (n))
#define LAST_WORD_MASK(x) (~0ull >> (-(x) & BITS_PER_LONG - 1))

/*
 * __fls: find last set bit in word
 * @word: The word to search
 *
 * Undefined if no set bit exists, so code should check against 0 first.
 */
static inline unsigned long __fls(unsigned long word)
{
    asm("bsr %1,%0"
        : "=r" (word)
        : "rm" (word));
    return word;
}

size_t find_last_bit(const unsigned long *addr, size_t size)
{
    size_t idx = DIV_ROUND_UP(size, BITS_PER_LONG);
    unsigned long mask = LAST_WORD_MASK(size);

    while (idx--) {
        unsigned long val = addr[idx] & mask;
        if (val)
            return idx * BITS_PER_LONG + __fls(val);
        mask = ~0ul;
    }
    return size;
}

The code generated is (-m32 is equivalent):

    .file    "flb.c"
    .section    .text.unlikely,"ax",@progbits
.LCOLDB0:
    .text
.LHOTB0:
    .p2align 4,,15
    .globl    find_last_bit
    .type    find_last_bit, @function
find_last_bit:
.LFB1:
    .cfi_startproc
    movl    %esi, %ecx
    leaq    63(%rsi), %rdx
    movq    $-1, %r8
    negl    %ecx
    shrq    %cl, %r8
    shrq    $6, %rdx
    movq    %r8, %rcx
    jmp    .L2
    .p2align 4,,10
    .p2align 3
.L4:
    andq    (%rdi,%rdx,8), %rcx
    movq    %rcx, %r8
    movq    $-1, %rcx
    testq    %r8, %r8
    jne    .L8
.L2:
    subq    $1, %rdx
    cmpq    $-1, %rdx
    jne    .L4
    movq    %rsi, %rax
    ret
    .p2align 4,,10
    .p2align 3
.L8:
    salq    $6, %rdx
#APP
# 15 "flb.c" 1
    bsr %r8,%r8
# 0 "" 2
#NO_APP
    leaq    (%rdx,%r8), %rax
    ret
    .cfi_endproc
.LFE1:
    .size    find_last_bit, .-find_last_bit
    .section    .text.unlikely
.LCOLDE0:
    .text
.LHOTE0:
    .ident    "GCC: (Debian 20150211-1) 5.0.0 20150211 (experimental) [trunk
revision 220605]"
    .section    .note.GNU-stack,"",@progbits

In the loop at .L4, the "movq %rcx, %r8; testq %r8, %r8" pair is completely
unnecessary: the jne could go right after the andq, with the code at .L8
changed to expect the masked value in %rcx rather than %r8.
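
What one would expect instead is roughly the following (a hand-written sketch,
not compiler output; the .L2 block is left exactly as GCC emitted it):

.L4:
    andq    (%rdi,%rdx,8), %rcx     # ZF now reflects the masked value
    jne     .L8                     # reuse the flags set by the andq
    movq    $-1, %rcx               # reload the all-ones mask for the next word
.L2:
    subq    $1, %rdx
    cmpq    $-1, %rdx
    jne     .L4
    movq    %rsi, %rax
    ret
.L8:
    salq    $6, %rdx
    bsr     %rcx, %rcx              # .L8 now expects the masked value in %rcx
    leaq    (%rdx,%rcx), %rax
    ret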

At .L2, it's even more ridiculous.  The subq generates a borrow if the value
wraps to -1.  Why is that not just "subq $1, %rdx; jnc .L4"?

A smarter compiler would notice that %rdx must have its top 6 bits clear
and thus "decq %rdx; jns .L4" would also be legal.  (For non-x86 weenies,
the "dec" instructions do not modify the carry flag, originally so they
could be used for loop control in multi-word arithmetic.  This partial flags
update makes them slower than "subq $1" on many processors, so which one is
used depends on the model flags.)
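
Concretely, the .L2 block could then shrink to either of these (hand-written
sketches, not compiler output):

.L2:
    subq    $1, %rdx
    jnc     .L4                     # no borrow: the old index was nonzero
    movq    %rsi, %rax              # wrapped past zero: return size
    ret

# or, since the top 6 bits of %rdx are known to be clear:
.L2:
    decq    %rdx
    jns     .L4                     # result still non-negative: keep looping
    movq    %rsi, %rax
    ret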

I tried reorganizing the source to encourage the first optimization:

size_t find_last_bit2(const unsigned long *addr, size_t size)
{
    unsigned long val = LAST_WORD_MASK(size);
    size_t idx = DIV_ROUND_UP(size, BITS_PER_LONG);

    while (idx--) {
        val &= addr[idx];
        if (val)
            return idx * BITS_PER_LONG + __fls(val);
        val = ~0ul;
    }
    return size;
}

... but the generated code is identical.


This version:

size_t find_last_bit3(const unsigned long *addr, size_t size)
{
    if (size) {
        unsigned long val = LAST_WORD_MASK(size);
        size_t idx = (size-1) / BITS_PER_LONG;

        do {
            val &= addr[idx];
            if (val)
                return idx * BITS_PER_LONG + __fls(val);
            val = ~0ul;
        } while (idx--);
    }
    return size;
}

This gets the first optimization, and is at least clever with the second, but
it's still three instructions rather than two for an absolutely bog-standard
decrement loop (a two-instruction alternative is sketched after the listing):

find_last_bit3:
.LFB3:
    .cfi_startproc
    xorl    %eax, %eax
    testq    %rsi, %rsi
    je    .L17
    movl    %esi, %ecx
    leaq    -1(%rsi), %rax
    movq    $-1, %rdx
    negl    %ecx
    shrq    %cl, %rdx
    shrq    $6, %rax
    jmp    .L19
    .p2align 4,,10
    .p2align 3
.L18:
    subq    $1, %rax
    movq    $-1, %rdx
    cmpq    %rdx, %rax
    je    .L23
.L19:
    andq    (%rdi,%rax,8), %rdx
    je    .L18
    salq    $6, %rax
#APP
# 15 "flb.c" 1
    bsr %rdx,%rdx
# 0 "" 2
#NO_APP
    addq    %rdx, %rax
    ret
    .p2align 4,,10
    .p2align 3
.L23:
    movq    %rsi, %rax
.L17:
    rep ret
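
For comparison, a two-instruction version of that loop control could look
roughly like this (a hand-written sketch; the mask reload stays where it is,
since mov does not alter the flags, and the block still falls through to .L19):

.L18:
    subq    $1, %rax                # sets CF if the old index was zero
    movq    $-1, %rdx               # reload the all-ones mask; flags untouched
    jc      .L23                    # borrow: we have run out of words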
