[Bug tree-optimization/49203] New: missed-optimization: useless expressions not moved out of loop

wouter.vermaelen at scarlet dot be Fri, 27 May 2011 15:03:55 -0700

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=49203


           Summary: missed-optimization: useless expressions not moved out
                    of loop
           Product: gcc
           Version: 4.7.0
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P3
         Component: tree-optimization
        AssignedTo: unassig...@gcc.gnu.org
        ReportedBy: wouter.vermae...@scarlet.be


Hi all,

Below is (a simplified version of) some real code I recently
encountered. The stores to the 'output' array are written in the inner
loop, but the intention was probably to have them in the outer loop.

Gcc is able to 'correct' this programming mistake, but only partly:
the stores themselves are moved to the outer loop, but the instructions
that calculate the stored values remain in the inner loop.

For this particular example, the best solution is of course to fix the
C code. But maybe this missed-optimization can also occur in other,
more valid, contexts.

Below I've included the x86_64 code generated for this example by
recent versions of both gcc and llvm.

///////////////////////////////////////////////////////////////////

// Source bytes. f() reads input[0] .. input[43] (i + j is at most 28 + 15).
unsigned char input[100];
// Destination bytes. f() writes output[0] .. output[31].
unsigned char output[100];

// Reproducer for the missed optimization.
// For each group start i = 0, 4, ..., 28, packs the low two bits of the
// 16 bytes input[i] .. input[i + 15] into tmp and writes tmp's four bytes
// to output[i] .. output[i + 3].
// The four stores are deliberately left inside the inner loop: that
// placement is what the bug report is about, so do not "fix" it here.
void f() {
    for (int i = 0; i < 32; i += 4) {
        // Accumulator, reset for every group of four output bytes.
        unsigned tmp = 0;
        for (int j = 0; j < 16; ++j) {
            // Shift in the low two bits of the next input byte.
            tmp = (tmp << 2) | (input[i + j] & 0x03);
            // Store tmp's bytes, most significant first. Every inner
            // iteration overwrites the same four locations, so only the
            // values written when j == 15 remain once the loop finishes.
            output[i + 0] = (tmp >> 24) & 0xFF;
            output[i + 1] = (tmp >> 16) & 0xFF;
            output[i + 2] = (tmp >>  8) & 0xFF;
            output[i + 3] = (tmp >>  0) & 0xFF;
        }
    }
}

///////////////////////////////////////////////////////////////////

g++ (GCC) 4.7.0 20110527 (experimental)
g++ -O2 -S

        # gcc -O2 output for f() (AT&T syntax, fragment without the function label).
        # Register roles, as read from the code below:
        #   r10 = address of output           r9  = &output[i]
        #   esi = i (r9 - output)             edx = j
        #   eax = tmp                         ecx = tmp << 2, later tmp >> 8
        #   r8d = tmp >> 24                   edi = tmp >> 16
        # The four byte stores were sunk below the inner loop (.L3), but the
        # copies and shifts that produce the stored values still execute on
        # every inner iteration.
        movl    $output, %r10d          # r10 = address of output (32-bit write zero-extends)
        movq    %r10, %r9               # r9 = &output[0]
        .p2align 4,,10
        .p2align 3
.L2:                                    # outer loop: one pass per group of four output bytes
        movl    %r9d, %esi
        xorl    %edx, %edx              # j = 0
        xorl    %eax, %eax              # tmp = 0
        subl    %r10d, %esi             # esi = r9 - output = i
        .p2align 4,,10
        .p2align 3
.L3:                                    # inner loop: 16 iterations
        leal    0(,%rax,4), %ecx        # ecx = tmp << 2
        leal    (%rdx,%rsi), %eax       # eax = j + i
        addl    $1, %edx                # ++j
        cltq                            # sign-extend eax into rax for use as an index
        movzbl  input(%rax), %eax       # eax = input[i + j]
        andl    $3, %eax                # keep the low two bits
        orl     %ecx, %eax              # eax = new tmp
        movl    %eax, %r8d              # from here to the last shrl: values only
        movl    %eax, %edi              # consumed by the stores after the loop,
        movl    %eax, %ecx              # yet recomputed on every iteration
        shrl    $24, %r8d               # r8d = tmp >> 24
        shrl    $16, %edi               # edi = tmp >> 16
        shrl    $8, %ecx                # ecx = tmp >> 8
        cmpl    $16, %edx
        jne     .L3                     # loop while j != 16
        movb    %r8b, (%r9)             # output[i + 0]
        movb    %dil, 1(%r9)            # output[i + 1]
        movb    %cl, 2(%r9)             # output[i + 2]
        movb    %al, 3(%r9)             # output[i + 3] = low byte of tmp
        addq    $4, %r9                 # advance to the next group (i += 4)
        cmpq    $output+32, %r9
        jne     .L2                     # loop until r9 reaches output + 32
        rep                             # NOTE(review): presumably the two-byte "rep ret" return idiom - confirm
        ret

///////////////////////////////////////////////////////////////////

clang version 3.0 (http://llvm.org/git/clang.git
855f41963e545172a935d07b4713d079e258a207)
clang++ -O2 -S

# clang -O2 output for f() (AT&T syntax, fragment without the function label).
# Register roles, as read from the code below:
#   rax = i                       rdx = j - 16 (counts up from -16 to 0)
#   esi = tmp                     ecx = tmp as it was before the current iteration
#   edi = low two bits of input[i + j]
# The inner loop (.LBB0_2) only updates tmp; the shifts that produce the
# stored bytes run once per outer iteration, after the inner loop.
# BB#0:                                 # %entry
        xorl    %eax, %eax              # i = 0
        .align  16, 0x90
.LBB0_1:                                # %for.cond4.preheader
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_2 Depth 2
        xorl    %esi, %esi              # tmp = 0
        movq    $-16, %rdx              # rdx = -16, so j = rdx + 16 = 0
        .align  16, 0x90
.LBB0_2:                                # %for.body7
                                        #   Parent Loop BB0_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
        movl    %esi, %ecx              # ecx = tmp before this iteration
        movzbl  input+16(%rdx,%rax), %edi   # edi = input[i + j]
        andl    $3, %edi                # keep the low two bits
        leal    (,%rcx,4), %esi         # esi = previous tmp << 2
        orl     %edi, %esi              # esi = new tmp
        incq    %rdx
        jne     .LBB0_2                 # loop until rdx reaches 0 (16 iterations)
# BB#3:                                 # %for.inc44
                                        #   in Loop: Header=BB0_1 Depth=1
        # ecx still holds tmp from before the last iteration, and
        # tmp = (ecx << 2) | bits, so the low byte of tmp >> 8, >> 16, >> 24
        # equals the low byte of ecx >> 6, >> 14, >> 22 respectively.
        movb    %sil, output+3(%rax)    # output[i + 3] = low byte of tmp
        movl    %ecx, %edx
        shrl    $6, %edx
        movb    %dl, output+2(%rax)     # output[i + 2]
        movl    %ecx, %edx
        shrl    $14, %edx
        movb    %dl, output+1(%rax)     # output[i + 1]
        shrl    $22, %ecx
        movb    %cl, output(%rax)       # output[i + 0]
        addq    $4, %rax                # i += 4
        cmpq    $32, %rax
        jne     .LBB0_1                 # loop while i != 32
# BB#4:                                 # %for.end47
        ret

[Bug tree-optimization/49203] New: missed-optimization: useless expressions not moved out of loop

Reply via email to