[Bug rtl-optimization/115344] New: Missing loop counter reversal

cmuellner at gcc dot gnu.org via Gcc-bugs Tue, 04 Jun 2024 05:47:17 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115344


            Bug ID: 115344
           Summary: Missing loop counter reversal
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: cmuellner at gcc dot gnu.org
  Target Milestone: ---

Let's take a simple for-loop with an unknown bound:

void bar ();
void foo1 (int n) {
    for (int i = 0; i < n; i++) {
        bar ();
    }
}

We see that two variables are in the program,
but we could eliminate the loop variable `i` as follows:

void bar ();
void foo2 (int n) {
    while (n) {
        bar ();
        n--;
    }
}

Optimizing the loop as above has the following benefits:
- No need for a register for the loop variable `i`
- No need for an additional slot in the stack frame
- No need for instructions to save/restore the loop variable register in the
prologue/epilogue
- No need for an initialization instruction for the loop variable `i` (to zero)

LLVM does this transformation on (at least) x86-64,  RISC-V (rv64gc),
and AArch64 with -O3, but GCC does not.
Tests have been done with trunk and older GCC releases (I've tested down to GCC
4.4).

Related bug tickets:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=22041 (open - with uses of the
loop counter as an array index)
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=31238 (closed - fixed for GCC
4.5.0)
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40886 (closed - fixed for GCC
4.5.0)

GCC AArch64 -O3:
foo1:
        cmp     w0, 0
        ble     .L6
        stp     x29, x30, [sp, -32]!
        mov     x29, sp
        stp     x19, x20, [sp, 16]
        mov     w20, w0
        mov     w19, 0
.L3:
        add     w19, w19, 1
        bl      bar
        cmp     w20, w19
        bne     .L3
        ldp     x19, x20, [sp, 16]
        ldp     x29, x30, [sp], 32
        ret
.L6:
        ret
foo2:
        cbz     w0, .L18
        stp     x29, x30, [sp, -32]!
        mov     x29, sp
        str     x19, [sp, 16]
        mov     w19, w0
.L12:
        bl      bar
        subs    w19, w19, #1
        bne     .L12
        ldr     x19, [sp, 16]
        ldp     x29, x30, [sp], 32
        ret
.L18:
        ret

LLVM AArch64 -O3:
foo1:                                   // @foo1
        cmp     w0, #1
        b.lt    .LBB0_4
        stp     x29, x30, [sp, #-32]!           // 16-byte Folded Spill
        str     x19, [sp, #16]                  // 8-byte Folded Spill
        mov     x29, sp
        mov     w19, w0
.LBB0_2:                                // =>This Inner Loop Header: Depth=1
        bl      bar
        subs    w19, w19, #1
        b.ne    .LBB0_2
        ldr     x19, [sp, #16]                  // 8-byte Folded Reload
        ldp     x29, x30, [sp], #32             // 16-byte Folded Reload
.LBB0_4:
        ret
foo2:                                   // @foo2
        cbz     w0, .LBB1_4
        stp     x29, x30, [sp, #-32]!           // 16-byte Folded Spill
        str     x19, [sp, #16]                  // 8-byte Folded Spill
        mov     x29, sp
        mov     w19, w0
.LBB1_2:                                // =>This Inner Loop Header: Depth=1
        bl      bar
        subs    w19, w19, #1
        b.ne    .LBB1_2
        ldr     x19, [sp, #16]                  // 8-byte Folded Reload
        ldp     x29, x30, [sp], #32             // 16-byte Folded Reload
.LBB1_4:
        ret

GCC RISC-V -O3 -march=rv64gc:
foo1:
        ble     a0,zero,.L6
        addi    sp,sp,-32
        sd      s0,16(sp)
        sd      s1,8(sp)
        sd      ra,24(sp)
        mv      s1,a0
        li      s0,0
.L3:
        addiw   s0,s0,1
        call    bar
        bne     s1,s0,.L3
        ld      ra,24(sp)
        ld      s0,16(sp)
        ld      s1,8(sp)
        addi    sp,sp,32
        jr      ra
.L6:
        ret
foo2:
        beq     a0,zero,.L18
        addi    sp,sp,-16
        sd      s0,0(sp)
        sd      ra,8(sp)
        mv      s0,a0
.L12:
        addiw   s0,s0,-1
        call    bar
        bne     s0,zero,.L12
        ld      ra,8(sp)
        ld      s0,0(sp)
        addi    sp,sp,16
        jr      ra
.L18:
        ret

LLVM RISC-V -O3 -march=rv64gc
foo1:                                   # @foo1
        blez    a0, .LBB0_4
        addi    sp, sp, -16
        sd      ra, 8(sp)                       # 8-byte Folded Spill
        sd      s0, 0(sp)                       # 8-byte Folded Spill
        mv      s0, a0
.LBB0_2:                                # =>This Inner Loop Header: Depth=1
        call    bar
        addiw   s0, s0, -1
        bnez    s0, .LBB0_2
        ld      ra, 8(sp)                       # 8-byte Folded Reload
        ld      s0, 0(sp)                       # 8-byte Folded Reload
        addi    sp, sp, 16
.LBB0_4:
        ret
foo2:                                   # @foo2
        beqz    a0, .LBB1_4
        addi    sp, sp, -16
        sd      ra, 8(sp)                       # 8-byte Folded Spill
        sd      s0, 0(sp)                       # 8-byte Folded Spill
        mv      s0, a0
.LBB1_2:                                # =>This Inner Loop Header: Depth=1
        call    bar
        addiw   s0, s0, -1
        bnez    s0, .LBB1_2
        ld      ra, 8(sp)                       # 8-byte Folded Reload
        ld      s0, 0(sp)                       # 8-byte Folded Reload
        addi    sp, sp, 16
.LBB1_4:
        ret

GCC x86-64 -O3:
foo1:
        testl   %edi, %edi
        jle     .L6
        pushq   %rbp
        movl    %edi, %ebp
        pushq   %rbx
        xorl    %ebx, %ebx
        subq    $8, %rsp
.L3:
        xorl    %eax, %eax
        addl    $1, %ebx
        call    bar
        cmpl    %ebx, %ebp
        jne     .L3
        addq    $8, %rsp
        popq    %rbx
        popq    %rbp
        ret
.L6:
        ret
foo2:
        testl   %edi, %edi
        je      .L18
        pushq   %rbx
        movl    %edi, %ebx
.L12:
        xorl    %eax, %eax
        call    bar
        subl    $1, %ebx
        jne     .L12
        popq    %rbx
        ret

LLVM x86-64 -O3:
foo1:                                   # @foo1
        testl   %edi, %edi
        jle     .LBB0_4
        pushq   %rbx
        movl    %edi, %ebx
.LBB0_2:                                # =>This Inner Loop Header: Depth=1
        xorl    %eax, %eax
        callq   bar@PLT
        decl    %ebx
        jne     .LBB0_2
        popq    %rbx
.LBB0_4:
        retq
foo2:                                   # @foo2
        testl   %edi, %edi
        je      .LBB1_4
        pushq   %rbx
        movl    %edi, %ebx
.LBB1_2:                                # =>This Inner Loop Header: Depth=1
        xorl    %eax, %eax
        callq   bar@PLT
        decl    %ebx
        jne     .LBB1_2
        popq    %rbx
.LBB1_4:
        retq

[Bug rtl-optimization/115344] New: Missing loop counter reversal

Reply via email to