https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109537
Bug ID: 109537 Summary: Improve code generation for dynamic loop unrolling Product: gcc Version: 13.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: middle-end Assignee: unassigned at gcc dot gnu.org Reporter: helijia.i at foxmail dot com Target Milestone: --- For the current dynamic loop unrolling implementation, we will try to do the following transform: ''' for (i = 0; i < n; i++) body; ==> (LOOP->LPT_DECISION.TIMES == 3) i = 0; mod = n % 4; switch (mod) { case 3: body; i++; case 2: body; i++; case 1: body; i++; case 0: ; } while (i < n) { body; i++; body; i++; body; i++; body; i++; } ''' It would be better if we could carry out loop unrolling in the following way (R == # unrolls) ''' i=0; if (i > n-R-1) goto remain (not needed with loop bounds as shown) for(; i< n-R-1; i+= R) { body; body; ... body; // R times } remain: if (i < n) for(; i < n; i++) body ''' For the following sample code: ''' void matrix_add_const(int N1, int* A, int val) { int i, j; for (j = 0; j < N1; j++) A[j] += val; } ''' GCC will generate more jump instructions than clang. 
gcc's assembly ''' $ cat unroll.gcc.s .file "unroll.c" .text .p2align 4 .globl matrix_add_const .type matrix_add_const, @function matrix_add_const: .LFB0: .cfi_startproc testl %edi, %edi jle .L1 movslq %edi, %rdi leaq -4(,%rdi,4), %rax leaq (%rsi,%rdi,4), %rcx shrq $2, %rax addq $1, %rax andl $7, %eax je .L3 cmpq $1, %rax je .L26 cmpq $2, %rax je .L27 cmpq $3, %rax je .L28 cmpq $4, %rax je .L29 cmpq $5, %rax je .L30 cmpq $6, %rax jne .L41 .L31: addl %edx, (%rsi) addq $4, %rsi .L30: addl %edx, (%rsi) addq $4, %rsi .L29: addl %edx, (%rsi) addq $4, %rsi .L28: addl %edx, (%rsi) addq $4, %rsi .L27: addl %edx, (%rsi) addq $4, %rsi .L26: addl %edx, (%rsi) addq $4, %rsi cmpq %rcx, %rsi je .L42 .L3: addl %edx, (%rsi) addl %edx, 4(%rsi) addl %edx, 8(%rsi) addl %edx, 12(%rsi) addl %edx, 16(%rsi) addl %edx, 20(%rsi) addl %edx, 24(%rsi) addl %edx, 28(%rsi) addq $32, %rsi cmpq %rcx, %rsi jne .L3 .L1: ret .p2align 4,,10 .p2align 3 .L41: addl %edx, (%rsi) addq $4, %rsi jmp .L31 .L42: ret .cfi_endproc .LFE0: .size matrix_add_const, .-matrix_add_const .ident "GCC: (GNU) 13.0.0 20221022 (experimental)" .section .note.GNU-stack,"",@progbits ''' clang's assembly ''' $ cat unroll.clang.s .text .file "unroll.c" .globl matrix_add_const # -- Begin function matrix_add_const .p2align 4, 0x90 .type matrix_add_const,@function matrix_add_const: # @matrix_add_const .cfi_startproc # %bb.0: testl %edi, %edi jle .LBB0_11 # %bb.1: movl %edi, %r9d cmpl $8, %edi jae .LBB0_3 # %bb.2: xorl %ecx, %ecx jmp .LBB0_10 .LBB0_3: movl %r9d, %ecx andl $-8, %ecx movd %edx, %xmm0 pshufd $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0] leaq -8(%rcx), %rax movq %rax, %rdi shrq $3, %rdi addq $1, %rdi movl %edi, %r8d andl $1, %r8d testq %rax, %rax je .LBB0_4 # %bb.5: movq %r8, %rax subq %rdi, %rax xorl %edi, %edi .p2align 4, 0x90 .LBB0_6: # =>This Inner Loop Header: Depth=1 movdqu (%rsi,%rdi,4), %xmm1 movdqu 16(%rsi,%rdi,4), %xmm2 movdqu 32(%rsi,%rdi,4), %xmm3 movdqu 48(%rsi,%rdi,4), %xmm4 paddd %xmm0, %xmm1 paddd %xmm0, %xmm2 
movdqu %xmm1, (%rsi,%rdi,4) movdqu %xmm2, 16(%rsi,%rdi,4) paddd %xmm0, %xmm3 paddd %xmm0, %xmm4 movdqu %xmm3, 32(%rsi,%rdi,4) movdqu %xmm4, 48(%rsi,%rdi,4) addq $16, %rdi addq $2, %rax jne .LBB0_6 # %bb.7: testq %r8, %r8 je .LBB0_9 .LBB0_8: movdqu (%rsi,%rdi,4), %xmm1 movdqu 16(%rsi,%rdi,4), %xmm2 paddd %xmm0, %xmm1 paddd %xmm0, %xmm2 movdqu %xmm1, (%rsi,%rdi,4) movdqu %xmm2, 16(%rsi,%rdi,4) .LBB0_9: cmpq %r9, %rcx je .LBB0_11 .p2align 4, 0x90 .LBB0_10: # =>This Inner Loop Header: Depth=1 addl %edx, (%rsi,%rcx,4) addq $1, %rcx cmpq %rcx, %r9 jne .LBB0_10 .LBB0_11: retq .LBB0_4: xorl %edi, %edi testq %r8, %r8 jne .LBB0_8 jmp .LBB0_9 .Lfunc_end0: .size matrix_add_const, .Lfunc_end0-matrix_add_const .cfi_endproc # -- End function .ident "clang version 10.0.0-4ubuntu1 " .section ".note.GNU-stack","",@progbits .addrsig '''