https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90878
--- Comment #1 from H.J. Lu <hjl.tools at gmail dot com> ---

If we make integer register store more expensive, this testcase will regress:

[hjl@gnu-cfl-1 unroll]$ cat x.i
void
foo (long p2, long *diag, long d, long i)
{
  long k;
  k = p2 < 3 ? p2 + p2 : p2 + 3;
  while (i < k)
    diag[i++] = d;
}
[hjl@gnu-cfl-1 unroll]$ make
/export/build/gnu/tools-build/gcc-wip-debug/build-x86_64-linux/gcc/xgcc -B/export/build/gnu/tools-build/gcc-wip-debug/build-x86_64-linux/gcc/ -O3 -march=skylake -S x.i
[hjl@gnu-cfl-1 unroll]$ cat x.s
	.file	"x.i"
	.text
	.p2align 4
	.globl	foo
	.type	foo, @function
foo:
.LFB0:
	.cfi_startproc
	leaq	(%rdi,%rdi), %rax
	leaq	3(%rdi), %r8
	cmpq	$2, %rdi
	cmovle	%rax, %r8
	cmpq	%rcx, %r8
	jle	.L10
	movq	%rcx, %rax
	notq	%rax
	movq	%r8, %r9
	addq	%r8, %rax
	subq	%rcx, %r9
	cmpq	$3, %rax
	jbe	.L5
	movq	%r9, %rdi
	shrq	$2, %rdi
	vmovq	%rdx, %xmm1
	leaq	(%rsi,%rcx,8), %rax
	salq	$5, %rdi
	vpbroadcastq	%xmm1, %ymm0
	addq	%rax, %rdi
	.p2align 4,,10
	.p2align 3
.L6:
	vmovdqu	%ymm0, (%rax)
	addq	$32, %rax
	cmpq	%rdi, %rax
	jne	.L6
	movq	%r9, %rax
	andq	$-4, %rax
	addq	%rax, %rcx
	cmpq	%rax, %r9
	je	.L9
	vzeroupper
.L5:
	leaq	1(%rcx), %rax
	movq	%rdx, (%rsi,%rcx,8)
	cmpq	%r8, %rax
	jge	.L10
	leaq	2(%rcx), %rdi
	movq	%rdx, (%rsi,%rax,8)
	cmpq	%rdi, %r8
	jle	.L10
	addq	$3, %rcx
	movq	%rdx, (%rsi,%rdi,8)
	cmpq	%rcx, %r8
	jle	.L10
	movq	%rdx, (%rsi,%rcx,8)
	ret
	.p2align 4,,10
	.p2align 3
.L9:
	vzeroupper
.L10:
	ret
	.cfi_endproc
.LFE0:
	.size	foo, .-foo
	.ident	"GCC: (GNU) 10.0.0 20190613 (experimental)"
	.section	.note.GNU-stack,"",@progbits
[hjl@gnu-cfl-1 unroll]$

since a higher integer register store cost will reduce the loop unroll count.