https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109537

--- Comment #1 from Li Jia He <helijia.i at foxmail dot com> ---
Updating GCC's generated assembly, since GCC does not enable auto-vectorization at -O2.
The compilation command for GCC is 'cc1 unroll.c -O3 -funroll-loops';
the compilation command for Clang is 'clang unroll.c -O3 -S'.
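
For reference, a minimal sketch of what unroll.c plausibly contains, reconstructed from the assembly below; the source is not included in this comment, so the signature and parameter names are assumptions:
'''
/* Assumed test case: add a constant to every element of an int array.
   Reconstructed from the generated code below; names are guesses. */
void matrix_add_const(int N, int *matrix, int val)
{
    for (int i = 0; i < N; i++)
        matrix[i] += val;
}
'''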
'''
$ cat unroll.gcc.s
        .file   "unroll.c"
        .text
        .p2align 4
        .globl  matrix_add_const
        .type   matrix_add_const, @function
matrix_add_const:
.LFB0:
        .cfi_startproc
        movl    %edi, %ecx
        movl    %edx, %edi
        testl   %ecx, %ecx
        jle     .L1
        leal    -1(%rcx), %eax
        cmpl    $2, %eax
        jbe     .L6
        movl    %ecx, %r9d
        movd    %edx, %xmm2
        movq    %rsi, %r10
        shrl    $2, %r9d
        pshufd  $0, %xmm2, %xmm0
        salq    $4, %r9
        leaq    (%r9,%rsi), %r8
        subq    $16, %r9
        shrq    $4, %r9
        addq    $1, %r9
        andl    $7, %r9d
        je      .L4
        cmpq    $1, %r9
        je      .L29
        cmpq    $2, %r9
        je      .L30
        cmpq    $3, %r9
        je      .L31
        cmpq    $4, %r9
        je      .L32
        cmpq    $5, %r9
        je      .L33
        cmpq    $6, %r9
        jne     .L45
.L34:
        movdqu  (%r10), %xmm3
        addq    $16, %r10
        paddd   %xmm0, %xmm3
        movups  %xmm3, -16(%r10)
.L33:
        movdqu  (%r10), %xmm4
        addq    $16, %r10
        paddd   %xmm0, %xmm4
        movups  %xmm4, -16(%r10)
.L32:
        movdqu  (%r10), %xmm5
        addq    $16, %r10
        paddd   %xmm0, %xmm5
        movups  %xmm5, -16(%r10)
.L31:
        movdqu  (%r10), %xmm6
        addq    $16, %r10
        paddd   %xmm0, %xmm6
        movups  %xmm6, -16(%r10)
.L30:
        movdqu  (%r10), %xmm7
        addq    $16, %r10
        paddd   %xmm0, %xmm7
        movups  %xmm7, -16(%r10)
.L29:
        movdqu  (%r10), %xmm8
        addq    $16, %r10
        paddd   %xmm0, %xmm8
        movups  %xmm8, -16(%r10)
        cmpq    %r8, %r10
        je      .L43
.L4:
        movdqu  (%r10), %xmm9
        movdqu  16(%r10), %xmm10
        subq    $-128, %r10
        movdqu  -96(%r10), %xmm11
        movdqu  -80(%r10), %xmm12
        movdqu  -64(%r10), %xmm13
        paddd   %xmm0, %xmm9
        paddd   %xmm0, %xmm10
        movdqu  -48(%r10), %xmm14
        movdqu  -32(%r10), %xmm15
        movdqu  -16(%r10), %xmm2
        paddd   %xmm0, %xmm11
        paddd   %xmm0, %xmm12
        paddd   %xmm0, %xmm13
        paddd   %xmm0, %xmm14
        movups  %xmm9, -128(%r10)
        paddd   %xmm0, %xmm15
        paddd   %xmm0, %xmm2
        movups  %xmm10, -112(%r10)
        movups  %xmm11, -96(%r10)
        movups  %xmm12, -80(%r10)
        movups  %xmm13, -64(%r10)
        movups  %xmm14, -48(%r10)
        movups  %xmm15, -32(%r10)
        movups  %xmm2, -16(%r10)
        cmpq    %r8, %r10
        jne     .L4
.L43:
        movl    %ecx, %edx
        andl    $-4, %edx
        testb   $3, %cl
        je      .L46
.L3:
        movslq  %edx, %r11
        leal    1(%rdx), %eax
        salq    $2, %r11
        addl    %edi, (%rsi,%r11)
        cmpl    %eax, %ecx
        jle     .L1
        addl    $2, %edx
        addl    %edi, 4(%rsi,%r11)
        cmpl    %edx, %ecx
        jle     .L1
        addl    %edi, 8(%rsi,%r11)
.L1:
        ret
        .p2align 4,,10
        .p2align 3
.L46:
        ret
        .p2align 4,,10
        .p2align 3
.L45:
        movdqu  (%rsi), %xmm1
        leaq    16(%rsi), %r10
        paddd   %xmm0, %xmm1
        movups  %xmm1, (%rsi)
        jmp     .L34
.L6:
        xorl    %edx, %edx
        jmp     .L3
        .cfi_endproc
.LFE0:
        .size   matrix_add_const, .-matrix_add_const
        .ident  "GCC: (GNU) 13.0.0 20221022 (experimental)"
        .section        .note.GNU-stack,"",@progbits
'''
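
To read the GCC output above: the constant is broadcast into %xmm0 via movd/pshufd, the prologue computes the vector iteration count modulo 8 (andl $7, %r9d), the cmpq/je ladder peels the leftover vector iterations through .L29-.L34, the main loop at .L4 then processes eight 16-byte vectors (128 bytes) per pass, and the scalar tail at .L3 handles the final 0-3 elements.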
