https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109537

            Bug ID: 109537
           Summary: Improve code generation for dynamic loop unrolling
           Product: gcc
           Version: 13.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: helijia.i at foxmail dot com
  Target Milestone: ---

For the current dynamic loop unrolling implementation, we will try to do the
following transform:
'''
for (i = 0; i < n; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
         body; i++;
       case 2:
         body; i++;
       case 1:
         body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
'''

It would be better if we could carry out loop unrolling in the following way
(R == the unroll factor, i.e. the number of copies of the loop body):

'''
i=0; if (i > n-R-1) goto remain (not needed with loop bounds as shown)
for(; i< n-R-1; i+= R)
  {
  body;
  body;
  ...
  body;  // R times
  }
remain:
if (i < n)
  for(; i < n; i++)
    body
'''

For the following sample code:
'''
void matrix_add_const(int N1, int* A, int val)
{
  int i, j;
  for (j = 0; j < N1; j++)
    A[j] += val;
}
'''

GCC generates more jump instructions than clang.

gcc's assembly
'''
$ cat unroll.gcc.s 
        .file   "unroll.c"
        .text
        .p2align 4
        .globl  matrix_add_const
        .type   matrix_add_const, @function
matrix_add_const:
.LFB0:
        .cfi_startproc
        testl   %edi, %edi
        jle     .L1
        movslq  %edi, %rdi
        leaq    -4(,%rdi,4), %rax
        leaq    (%rsi,%rdi,4), %rcx
        shrq    $2, %rax
        addq    $1, %rax
        andl    $7, %eax
        je      .L3
        cmpq    $1, %rax
        je      .L26
        cmpq    $2, %rax
        je      .L27
        cmpq    $3, %rax
        je      .L28
        cmpq    $4, %rax
        je      .L29
        cmpq    $5, %rax
        je      .L30
        cmpq    $6, %rax
        jne     .L41
.L31:
        addl    %edx, (%rsi)
        addq    $4, %rsi
.L30:
        addl    %edx, (%rsi)
        addq    $4, %rsi
.L29:
        addl    %edx, (%rsi)
        addq    $4, %rsi
.L28:
        addl    %edx, (%rsi)
        addq    $4, %rsi
.L27:
        addl    %edx, (%rsi)
        addq    $4, %rsi
.L26:
        addl    %edx, (%rsi)
        addq    $4, %rsi
        cmpq    %rcx, %rsi
        je      .L42
.L3:
        addl    %edx, (%rsi)
        addl    %edx, 4(%rsi)
        addl    %edx, 8(%rsi)
        addl    %edx, 12(%rsi)
        addl    %edx, 16(%rsi)
        addl    %edx, 20(%rsi)
        addl    %edx, 24(%rsi)
        addl    %edx, 28(%rsi)
        addq    $32, %rsi
        cmpq    %rcx, %rsi
        jne     .L3
.L1:
        ret
        .p2align 4,,10
        .p2align 3
.L41:
        addl    %edx, (%rsi)
        addq    $4, %rsi
        jmp     .L31
.L42:
        ret
        .cfi_endproc
.LFE0:
        .size   matrix_add_const, .-matrix_add_const
        .ident  "GCC: (GNU) 13.0.0 20221022 (experimental)"
        .section        .note.GNU-stack,"",@progbits
'''

clang's assembly
'''
$ cat unroll.clang.s 
        .text
        .file   "unroll.c"
        .globl  matrix_add_const        # -- Begin function matrix_add_const
        .p2align        4, 0x90
        .type   matrix_add_const,@function
matrix_add_const:                       # @matrix_add_const
        .cfi_startproc
# %bb.0:
        testl   %edi, %edi
        jle     .LBB0_11
# %bb.1:
        movl    %edi, %r9d
        cmpl    $8, %edi
        jae     .LBB0_3
# %bb.2:
        xorl    %ecx, %ecx
        jmp     .LBB0_10
.LBB0_3:
        movl    %r9d, %ecx
        andl    $-8, %ecx
        movd    %edx, %xmm0
        pshufd  $0, %xmm0, %xmm0        # xmm0 = xmm0[0,0,0,0]
        leaq    -8(%rcx), %rax
        movq    %rax, %rdi
        shrq    $3, %rdi
        addq    $1, %rdi
        movl    %edi, %r8d
        andl    $1, %r8d
        testq   %rax, %rax
        je      .LBB0_4
# %bb.5:
        movq    %r8, %rax
        subq    %rdi, %rax
        xorl    %edi, %edi
        .p2align        4, 0x90
.LBB0_6:                                # =>This Inner Loop Header: Depth=1
        movdqu  (%rsi,%rdi,4), %xmm1
        movdqu  16(%rsi,%rdi,4), %xmm2
        movdqu  32(%rsi,%rdi,4), %xmm3
        movdqu  48(%rsi,%rdi,4), %xmm4
        paddd   %xmm0, %xmm1
        paddd   %xmm0, %xmm2
        movdqu  %xmm1, (%rsi,%rdi,4)
        movdqu  %xmm2, 16(%rsi,%rdi,4)
        paddd   %xmm0, %xmm3
        paddd   %xmm0, %xmm4
        movdqu  %xmm3, 32(%rsi,%rdi,4)
        movdqu  %xmm4, 48(%rsi,%rdi,4)
        addq    $16, %rdi
        addq    $2, %rax
        jne     .LBB0_6
# %bb.7:
        testq   %r8, %r8
        je      .LBB0_9
.LBB0_8:
        movdqu  (%rsi,%rdi,4), %xmm1
        movdqu  16(%rsi,%rdi,4), %xmm2
        paddd   %xmm0, %xmm1
        paddd   %xmm0, %xmm2
        movdqu  %xmm1, (%rsi,%rdi,4)
        movdqu  %xmm2, 16(%rsi,%rdi,4)
.LBB0_9:
        cmpq    %r9, %rcx
        je      .LBB0_11
        .p2align        4, 0x90
.LBB0_10:                               # =>This Inner Loop Header: Depth=1
        addl    %edx, (%rsi,%rcx,4)
        addq    $1, %rcx
        cmpq    %rcx, %r9
        jne     .LBB0_10
.LBB0_11:
        retq
.LBB0_4:
        xorl    %edi, %edi
        testq   %r8, %r8
        jne     .LBB0_8
        jmp     .LBB0_9
.Lfunc_end0:
        .size   matrix_add_const, .Lfunc_end0-matrix_add_const
        .cfi_endproc
                                        # -- End function
        .ident  "clang version 10.0.0-4ubuntu1 "
        .section        ".note.GNU-stack","",@progbits
        .addrsig

'''

Reply via email to