https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115344
Bug ID: 115344
Summary: Missing loop counter reversal
Product: gcc
Version: 15.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: rtl-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: cmuellner at gcc dot gnu.org
Target Milestone: ---
Let's take a simple for-loop with an unknown bound:
void bar ();
void foo1 (int n) {
for (int i = 0; i < n; i++) {
bar ();
}
}
We see that two variables are in the program,
but we could eliminate the loop variable `i` as follows:
void bar ();
void foo2 (int n) {
while (n) {
bar ();
n--;
}
}
Optimizing the loop as above has the following benefits:
- No need for a register for the loop variable `i`
- No need for an additional slot in the stack frame
- No need for instructions to save/restore the loop variable register in the
prologue/epilogue
- No need for an initialization instruction for the loop variable `i` (to zero)
LLVM does this transformation on (at least) x86-64, RISC-V (rv64gc),
and AArch64 with -O3, but GCC does not.
Tests have been done with trunk and older GCC releases (I've tested down to GCC
4.4).
Related bug tickets:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=22041 (open - with uses of the
loop counter as an array index)
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=31238 (closed - fixed for GCC
4.5.0)
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40886 (closed - fixed for GCC
4.5.0)
GCC AArch64 -O3:
foo1:
cmp w0, 0
ble .L6
stp x29, x30, [sp, -32]!
mov x29, sp
stp x19, x20, [sp, 16]
mov w20, w0
mov w19, 0
.L3:
add w19, w19, 1
bl bar
cmp w20, w19
bne .L3
ldp x19, x20, [sp, 16]
ldp x29, x30, [sp], 32
ret
.L6:
ret
foo2:
cbz w0, .L18
stp x29, x30, [sp, -32]!
mov x29, sp
str x19, [sp, 16]
mov w19, w0
.L12:
bl bar
subs w19, w19, #1
bne .L12
ldr x19, [sp, 16]
ldp x29, x30, [sp], 32
ret
.L18:
ret
LLVM AArch64 -O3:
foo1: // @foo1
cmp w0, #1
b.lt .LBB0_4
stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
str x19, [sp, #16] // 8-byte Folded Spill
mov x29, sp
mov w19, w0
.LBB0_2: // =>This Inner Loop Header: Depth=1
bl bar
subs w19, w19, #1
b.ne .LBB0_2
ldr x19, [sp, #16] // 8-byte Folded Reload
ldp x29, x30, [sp], #32 // 16-byte Folded Reload
.LBB0_4:
ret
foo2: // @foo2
cbz w0, .LBB1_4
stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
str x19, [sp, #16] // 8-byte Folded Spill
mov x29, sp
mov w19, w0
.LBB1_2: // =>This Inner Loop Header: Depth=1
bl bar
subs w19, w19, #1
b.ne .LBB1_2
ldr x19, [sp, #16] // 8-byte Folded Reload
ldp x29, x30, [sp], #32 // 16-byte Folded Reload
.LBB1_4:
ret
GCC RISC-V -O3 -march=rv64gc:
foo1:
ble a0,zero,.L6
addi sp,sp,-32
sd s0,16(sp)
sd s1,8(sp)
sd ra,24(sp)
mv s1,a0
li s0,0
.L3:
addiw s0,s0,1
call bar
bne s1,s0,.L3
ld ra,24(sp)
ld s0,16(sp)
ld s1,8(sp)
addi sp,sp,32
jr ra
.L6:
ret
foo2:
beq a0,zero,.L18
addi sp,sp,-16
sd s0,0(sp)
sd ra,8(sp)
mv s0,a0
.L12:
addiw s0,s0,-1
call bar
bne s0,zero,.L12
ld ra,8(sp)
ld s0,0(sp)
addi sp,sp,16
jr ra
.L18:
ret
LLVM RISC-V -O3 -march=rv64gc
foo1: # @foo1
blez a0, .LBB0_4
addi sp, sp, -16
sd ra, 8(sp) # 8-byte Folded Spill
sd s0, 0(sp) # 8-byte Folded Spill
mv s0, a0
.LBB0_2: # =>This Inner Loop Header: Depth=1
call bar
addiw s0, s0, -1
bnez s0, .LBB0_2
ld ra, 8(sp) # 8-byte Folded Reload
ld s0, 0(sp) # 8-byte Folded Reload
addi sp, sp, 16
.LBB0_4:
ret
foo2: # @foo2
beqz a0, .LBB1_4
addi sp, sp, -16
sd ra, 8(sp) # 8-byte Folded Spill
sd s0, 0(sp) # 8-byte Folded Spill
mv s0, a0
.LBB1_2: # =>This Inner Loop Header: Depth=1
call bar
addiw s0, s0, -1
bnez s0, .LBB1_2
ld ra, 8(sp) # 8-byte Folded Reload
ld s0, 0(sp) # 8-byte Folded Reload
addi sp, sp, 16
.LBB1_4:
ret
GCC x86-64 -O3:
foo1:
testl %edi, %edi
jle .L6
pushq %rbp
movl %edi, %ebp
pushq %rbx
xorl %ebx, %ebx
subq $8, %rsp
.L3:
xorl %eax, %eax
addl $1, %ebx
call bar
cmpl %ebx, %ebp
jne .L3
addq $8, %rsp
popq %rbx
popq %rbp
ret
.L6:
ret
foo2:
testl %edi, %edi
je .L18
pushq %rbx
movl %edi, %ebx
.L12:
xorl %eax, %eax
call bar
subl $1, %ebx
jne .L12
popq %rbx
ret
LLVM x86-64 -O3:
foo1: # @foo1
testl %edi, %edi
jle .LBB0_4
pushq %rbx
movl %edi, %ebx
.LBB0_2: # =>This Inner Loop Header: Depth=1
xorl %eax, %eax
callq bar@PLT
decl %ebx
jne .LBB0_2
popq %rbx
.LBB0_4:
retq
foo2: # @foo2
testl %edi, %edi
je .LBB1_4
pushq %rbx
movl %edi, %ebx
.LBB1_2: # =>This Inner Loop Header: Depth=1
xorl %eax, %eax
callq bar@PLT
decl %ebx
jne .LBB1_2
popq %rbx
.LBB1_4:
retq