https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109537
--- Comment #1 from Li Jia He <helijia.i at foxmail dot com> --- Update gcc's assembly as gcc did not turn on auto vectorization during O2, The compilation command for gcc is 'cc1 unroll.c -O3 -funroll-loops', The compilation command for clang is 'clang unroll.c -O3 -S'
'''
$ cat unroll.gcc.s
        .file   "unroll.c"
        .text
        .p2align 4
        .globl  matrix_add_const
        .type   matrix_add_const, @function
# ----------------------------------------------------------------------
# matrix_add_const: adds one 32-bit value to each of the first n 32-bit
# elements of an array, in place.
#
# Syntax: AT&T / GAS (op src, dst).  The C source is not part of this
# listing; the register roles below are read off the code and match the
# System V AMD64 argument order (presumably
# `void matrix_add_const(int n, int *a, int c)` -- TODO confirm).
#
# In:    edi = n   (element count; n <= 0 returns immediately)
#        rsi = a   (array base; only unaligned vector accesses are used)
#        edx = c   (value added to every element)
# Uses:  eax, ecx, edx, edi, r8-r11, xmm0-xmm15, flags
#
# Layout:
#   1. n <= 3              -> scalar tail only           (.L6 -> .L3)
#   2. (n/4) mod 8 vectors -> single-vector remainder chain
#                                                        (.L45, .L34 .. .L29)
#   3. remaining vectors   -> main loop, 8 vectors/iter  (.L4)
#   4. n mod 4 elements    -> scalar tail                (.L43 -> .L3)
# ----------------------------------------------------------------------
matrix_add_const:
.LFB0:
        .cfi_startproc
        movl    %edi, %ecx              # ecx = n
        movl    %edx, %edi              # edi = c (edx is reused as an index later)
        testl   %ecx, %ecx
        jle     .L1                     # n <= 0: nothing to do
        leal    -1(%rcx), %eax
        cmpl    $2, %eax
        jbe     .L6                     # n - 1 <= 2 (n <= 3): fewer than 4 elements, scalar only
        movl    %ecx, %r9d
        movd    %edx, %xmm2             # low lane of xmm2 = c
        movq    %rsi, %r10              # r10 = running pointer into a[]
        shrl    $2, %r9d                # r9 = n / 4 = number of 4-int vectors
        pshufd  $0, %xmm2, %xmm0        # xmm0 = {c, c, c, c}
        salq    $4, %r9                 # r9 = byte length of the vector part
        leaq    (%r9,%rsi), %r8         # r8 = end pointer of the vector part
        subq    $16, %r9
        shrq    $4, %r9
        addq    $1, %r9                 # r9 = (bytes - 16) / 16 + 1 = vector count again
        andl    $7, %r9d                # r9 = vector count mod 8 (.L4 handles 8 per iteration)
        je      .L4                     # multiple of 8: go straight to the main loop

        # Remainder dispatch: for remainder k (1..7) enter the fall-through
        # chain below at the label that leaves exactly k single-vector steps.
        cmpq    $1, %r9
        je      .L29
        cmpq    $2, %r9
        je      .L30
        cmpq    $3, %r9
        je      .L31
        cmpq    $4, %r9
        je      .L32
        cmpq    $5, %r9
        je      .L33
        cmpq    $6, %r9
        jne     .L45                    # remainder 7: .L45 does one step, then joins .L34
                                        # remainder 6 falls through into .L34

        # Each step below: load 4 ints, advance r10 by 16, add c to every
        # lane, store back to the address just left (-16(%r10)).
.L34:
        movdqu  (%r10), %xmm3
        addq    $16, %r10
        paddd   %xmm0, %xmm3
        movups  %xmm3, -16(%r10)
.L33:
        movdqu  (%r10), %xmm4
        addq    $16, %r10
        paddd   %xmm0, %xmm4
        movups  %xmm4, -16(%r10)
.L32:
        movdqu  (%r10), %xmm5
        addq    $16, %r10
        paddd   %xmm0, %xmm5
        movups  %xmm5, -16(%r10)
.L31:
        movdqu  (%r10), %xmm6
        addq    $16, %r10
        paddd   %xmm0, %xmm6
        movups  %xmm6, -16(%r10)
.L30:
        movdqu  (%r10), %xmm7
        addq    $16, %r10
        paddd   %xmm0, %xmm7
        movups  %xmm7, -16(%r10)
.L29:
        movdqu  (%r10), %xmm8
        addq    $16, %r10
        paddd   %xmm0, %xmm8
        movups  %xmm8, -16(%r10)
        cmpq    %r8, %r10
        je      .L43                    # remainder steps already reached the end: skip main loop

        # Main loop: 8 vectors (128 bytes) per iteration.  r10 is advanced
        # after the first two loads, so every later offset is relative to
        # the already-advanced pointer (-128 .. -16).
.L4:
        movdqu  (%r10), %xmm9
        movdqu  16(%r10), %xmm10
        subq    $-128, %r10             # r10 += 128, written as subtracting -128
        movdqu  -96(%r10), %xmm11
        movdqu  -80(%r10), %xmm12
        movdqu  -64(%r10), %xmm13
        paddd   %xmm0, %xmm9
        paddd   %xmm0, %xmm10
        movdqu  -48(%r10), %xmm14
        movdqu  -32(%r10), %xmm15
        movdqu  -16(%r10), %xmm2        # xmm2 is free again: c now lives in xmm0
        paddd   %xmm0, %xmm11
        paddd   %xmm0, %xmm12
        paddd   %xmm0, %xmm13
        paddd   %xmm0, %xmm14
        movups  %xmm9, -128(%r10)
        paddd   %xmm0, %xmm15
        paddd   %xmm0, %xmm2
        movups  %xmm10, -112(%r10)
        movups  %xmm11, -96(%r10)
        movups  %xmm12, -80(%r10)
        movups  %xmm13, -64(%r10)
        movups  %xmm14, -48(%r10)
        movups  %xmm15, -32(%r10)
        movups  %xmm2, -16(%r10)
        cmpq    %r8, %r10
        jne     .L4                     # loop until the end of the vector part

.L43:
        movl    %ecx, %edx
        andl    $-4, %edx               # edx = n rounded down to a multiple of 4
                                        #     = index of the first element not yet updated
        testb   $3, %cl
        je      .L46                    # n is a multiple of 4: no scalar tail

        # Scalar tail: on entry edx = i, and at least one element a[i]
        # remains; at most three elements are updated (a[i] .. a[i+2]).
.L3:
        movslq  %edx, %r11
        leal    1(%rdx), %eax           # eax = i + 1
        salq    $2, %r11                # r11 = i * 4 (byte offset of a[i])
        addl    %edi, (%rsi,%r11)       # a[i] += c
        cmpl    %eax, %ecx
        jle     .L1                     # n <= i + 1: done
        addl    $2, %edx                # edx = i + 2
        addl    %edi, 4(%rsi,%r11)      # a[i+1] += c
        cmpl    %edx, %ecx
        jle     .L1                     # n <= i + 2: done
        addl    %edi, 8(%rsi,%r11)      # a[i+2] += c
.L1:
        ret
        .p2align 4,,10                  # pad to 16 bytes if that takes at most 10 bytes,
        .p2align 3                      # otherwise settle for 8-byte alignment
.L46:
        ret                             # separate return for the "n multiple of 4" exit
        .p2align 4,,10
        .p2align 3

        # Remainder == 7: perform the first single-vector step directly on
        # a[0..3], then join the chain at .L34 for the other six steps.
.L45:
        movdqu  (%rsi), %xmm1
        leaq    16(%rsi), %r10          # r10 = a + 16 bytes (one vector consumed)
        paddd   %xmm0, %xmm1
        movups  %xmm1, (%rsi)
        jmp     .L34

        # n in 1..3: no vector work at all, run the scalar tail from index 0.
.L6:
        xorl    %edx, %edx              # i = 0
        jmp     .L3
        .cfi_endproc
.LFE0:
        .size   matrix_add_const, .-matrix_add_const
        .ident  "GCC: (GNU) 13.0.0 20221022 (experimental)"
        .section        .note.GNU-stack,"",@progbits
'''