https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115344
Bug ID: 115344 Summary: Missing loop counter reversal Product: gcc Version: 15.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: rtl-optimization Assignee: unassigned at gcc dot gnu.org Reporter: cmuellner at gcc dot gnu.org Target Milestone: --- Let's take a simple for-loop with an unknown bound: void bar (); void foo1 (int n) { for (int i = 0; i < n; i++) { bar (); } } We see that two variables are in the program, but we could eliminate the loop variable `i` as follows: void bar (); void foo2 (int n) { while (n) { bar (); n--; } } Optimizing the loop as above has the following benefits: - No need for a register for the loop variable `i` - No need for an additional slot in the stack frame - No need for instructions to save/restore the loop variable register in the prologue/epilogue - No need for an initialization instruction for the loop variable `i` (to zero) LLVM does this transformation on (at least) x86-64, RISC-V (rv64gc), and AArch64 with -O3, but GCC does not. Tests have been done with trunk and older GCC releases (I've tested down to GCC 4.4). Related bug tickets: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=22041 (open - with uses of the loop counter as an array index) https://gcc.gnu.org/bugzilla/show_bug.cgi?id=31238 (closed - fixed for GCC 4.5.0) https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40886 (closed - fixed for GCC 4.5.0) GCC AArch64 -O3: foo1: cmp w0, 0 ble .L6 stp x29, x30, [sp, -32]! mov x29, sp stp x19, x20, [sp, 16] mov w20, w0 mov w19, 0 .L3: add w19, w19, 1 bl bar cmp w20, w19 bne .L3 ldp x19, x20, [sp, 16] ldp x29, x30, [sp], 32 ret .L6: ret foo2: cbz w0, .L18 stp x29, x30, [sp, -32]! mov x29, sp str x19, [sp, 16] mov w19, w0 .L12: bl bar subs w19, w19, #1 bne .L12 ldr x19, [sp, 16] ldp x29, x30, [sp], 32 ret .L18: ret LLVM AArch64 -O3: foo1: // @foo1 cmp w0, #1 b.lt .LBB0_4 stp x29, x30, [sp, #-32]! // 16-byte Folded Spill str x19, [sp, #16] // 8-byte Folded Spill mov x29, sp mov w19, w0 .LBB0_2: // =>This Inner Loop Header: Depth=1 bl bar subs w19, w19, #1 b.ne .LBB0_2 ldr x19, [sp, #16] // 8-byte Folded Reload ldp x29, x30, [sp], #32 // 16-byte Folded Reload .LBB0_4: ret foo2: // @foo2 cbz w0, .LBB1_4 stp x29, x30, [sp, #-32]! // 16-byte Folded Spill str x19, [sp, #16] // 8-byte Folded Spill mov x29, sp mov w19, w0 .LBB1_2: // =>This Inner Loop Header: Depth=1 bl bar subs w19, w19, #1 b.ne .LBB1_2 ldr x19, [sp, #16] // 8-byte Folded Reload ldp x29, x30, [sp], #32 // 16-byte Folded Reload .LBB1_4: ret GCC RISC-V -O3 -march=rv64gc: foo1: ble a0,zero,.L6 addi sp,sp,-32 sd s0,16(sp) sd s1,8(sp) sd ra,24(sp) mv s1,a0 li s0,0 .L3: addiw s0,s0,1 call bar bne s1,s0,.L3 ld ra,24(sp) ld s0,16(sp) ld s1,8(sp) addi sp,sp,32 jr ra .L6: ret foo2: beq a0,zero,.L18 addi sp,sp,-16 sd s0,0(sp) sd ra,8(sp) mv s0,a0 .L12: addiw s0,s0,-1 call bar bne s0,zero,.L12 ld ra,8(sp) ld s0,0(sp) addi sp,sp,16 jr ra .L18: ret LLVM RISC-V -O3 -march=rv64gc foo1: # @foo1 blez a0, .LBB0_4 addi sp, sp, -16 sd ra, 8(sp) # 8-byte Folded Spill sd s0, 0(sp) # 8-byte Folded Spill mv s0, a0 .LBB0_2: # =>This Inner Loop Header: Depth=1 call bar addiw s0, s0, -1 bnez s0, .LBB0_2 ld ra, 8(sp) # 8-byte Folded Reload ld s0, 0(sp) # 8-byte Folded Reload addi sp, sp, 16 .LBB0_4: ret foo2: # @foo2 beqz a0, .LBB1_4 addi sp, sp, -16 sd ra, 8(sp) # 8-byte Folded Spill sd s0, 0(sp) # 8-byte Folded Spill mv s0, a0 .LBB1_2: # =>This Inner Loop Header: Depth=1 call bar addiw s0, s0, -1 bnez s0, .LBB1_2 ld ra, 8(sp) # 8-byte Folded Reload ld s0, 0(sp) # 8-byte Folded Reload addi sp, sp, 16 .LBB1_4: ret GCC x86-64 -O3: foo1: testl %edi, %edi jle .L6 pushq %rbp movl %edi, %ebp pushq %rbx xorl %ebx, %ebx subq $8, %rsp .L3: xorl %eax, %eax addl $1, %ebx call bar cmpl %ebx, %ebp jne .L3 addq $8, %rsp popq %rbx popq %rbp ret .L6: ret foo2: testl %edi, %edi je .L18 pushq %rbx movl %edi, %ebx .L12: xorl %eax, %eax call bar subl $1, %ebx jne .L12 popq %rbx ret LLVM x86-64 -O3: foo1: # @foo1 testl %edi, %edi jle .LBB0_4 pushq %rbx movl %edi, %ebx .LBB0_2: # =>This Inner Loop Header: Depth=1 xorl %eax, %eax callq bar@PLT decl %ebx jne .LBB0_2 popq %rbx .LBB0_4: retq foo2: # @foo2 testl %edi, %edi je .LBB1_4 pushq %rbx movl %edi, %ebx .LBB1_2: # =>This Inner Loop Header: Depth=1 xorl %eax, %eax callq bar@PLT decl %ebx jne .LBB1_2 popq %rbx .LBB1_4: retq