https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81303
--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
Without peeling for alignment the numbers improve, but we still regress
from 176s to 205s.  The innermost (unrolled) loop is:

.L11:
        vmovsd  (%rdi,%r15,2), %xmm2
        vmovsd  (%rsi,%r15,2), %xmm1
        movq    -56(%rbp), %rbx
        vmovhpd (%rdi,%r14), %xmm2, %xmm0
        vmovsd  (%rdi), %xmm2
        vmovhpd (%rsi,%r14), %xmm1, %xmm6
        vmovsd  (%rsi), %xmm1
        vmovhpd (%rdi,%r15), %xmm2, %xmm2
        vmovhpd (%rsi,%r15), %xmm1, %xmm1
        addq    %r11, %rdi
        addq    %r11, %rsi
        vinsertf128     $0x1, %xmm0, %ymm2, %ymm2
        vmovsd  (%rcx,%r15,2), %xmm0
        vinsertf128     $0x1, %xmm6, %ymm1, %ymm1
        vmulpd  (%rbx,%rax), %ymm2, %ymm3
        movq    -72(%rbp), %rbx
        vmovapd %ymm1, %ymm2
        vmovhpd (%rcx,%r14), %xmm0, %xmm6
        vmovsd  (%rcx), %xmm0
        vfmadd132pd     (%rbx,%rax), %ymm3, %ymm2
        movq    -88(%rbp), %rbx
        vmovhpd (%rcx,%r15), %xmm0, %xmm0
        addq    %r11, %rcx
        vinsertf128     $0x1, %xmm6, %ymm0, %ymm0
        vmulpd  (%rbx,%rax), %ymm0, %ymm3
        vmovsd  (%rdx,%r15,2), %xmm0
        movq    -64(%rbp), %rbx
        vmovhpd (%rdx,%r14), %xmm0, %xmm6
        vmovsd  (%rdx), %xmm0
        vmovhpd (%rdx,%r15), %xmm0, %xmm0
        addq    %r11, %rdx
        vinsertf128     $0x1, %xmm6, %ymm0, %ymm0
        vfmadd132pd     (%rbx,%rax), %ymm3, %ymm0
        vaddpd  %ymm0, %ymm2, %ymm1
        vmovsd  (%r9,%r15,2), %xmm0
        vmovhpd (%r9,%r14), %xmm0, %xmm3
        vmovsd  (%r9), %xmm0
        vmovhpd (%r9,%r15), %xmm0, %xmm0
        addq    %r11, %r9
        vinsertf128     $0x1, %xmm3, %ymm0, %ymm0
        vmulpd  (%r12,%rax), %ymm0, %ymm2
        vmovsd  (%r8,%r15,2), %xmm0
        vmovhpd (%r8,%r14), %xmm0, %xmm3
        vmovsd  (%r8), %xmm0
        vmovhpd (%r8,%r15), %xmm0, %xmm0
        movq    -80(%rbp), %rbx
        addq    %r11, %r8
        vinsertf128     $0x1, %xmm3, %ymm0, %ymm0
        vfmadd132pd     (%rbx,%rax), %ymm2, %ymm0
        vaddpd  %ymm0, %ymm1, %ymm0
        vmovsd  (%r10,%r15,2), %xmm1
        vmovhpd (%r10,%r14), %xmm1, %xmm2
        vmovsd  (%r10), %xmm1
        vmovhpd (%r10,%r15), %xmm1, %xmm1
        addq    %r11, %r10
        vinsertf128     $0x1, %xmm2, %ymm1, %ymm1
        vfmadd231pd     0(%r13,%rax), %ymm1, %ymm4
        addq    $32, %rax
        vaddpd  %ymm4, %ymm0, %ymm4
        cmpq    -96(%rbp), %rax
        jne     .L11

vs

.L10:
        vmovsd  (%rax,%rbx,8), %xmm0
        vmulsd  (%r15,%rdx), %xmm0, %xmm0
        vmovsd  (%r8,%rdx), %xmm1
        vfmadd132sd     (%rax,%r11,8), %xmm0, %xmm1
        vmovsd  (%rax,%rsi,8), %xmm0
        vmulsd  (%r12,%rdx), %xmm0, %xmm0
        vmovsd  0(%rbp,%rdx), %xmm4
        vfmadd231sd     (%rax), %xmm4, %xmm0
        vmovsd  (%r14,%rdx), %xmm5
        vmovsd  (%rdi,%rdx), %xmm6
        vfmadd231sd     (%rax,%r9,8), %xmm6, %xmm2
        vaddsd  %xmm0, %xmm1, %xmm0
        vmovsd  (%rax,%r10,8), %xmm1
        vmulsd  0(%r13,%rdx), %xmm1, %xmm1
        vfmadd231sd     (%rax,%rcx,8), %xmm5, %xmm1
        addq    -112(%rsp), %rdx
        addq    $8, %rax
        vaddsd  %xmm2, %xmm1, %xmm2
        vaddsd  %xmm2, %xmm0, %xmm2
        cmpq    -120(%rsp), %rax
        jne     .L10

Looks like register pressure is high and IVO (induction variable
optimization) doesn't do the best job either.  The vectorized loop might
also run into CPU arch limits with respect to the loop cache (it's 310
bytes long).
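
For reference, the shape of loop that produces code like .L11 is roughly the
following (a minimal C sketch with made-up names, not the testcase of this
PR): one operand is read with a non-unit stride, so after vectorization each
vector of that operand has to be assembled from four scalar loads
(vmovsd/vmovhpd plus vinsertf128), while the other operand is contiguous and
can be used directly as a memory operand of vmulpd/vfmadd.

/* Minimal sketch (hypothetical names, not the PR's testcase): a
   strided-times-contiguous multiply-add reduction.  Vectorizing over i
   makes x[i*stride + k] a strided access, which on AVX2 is emulated with
   element-wise loads and vinsertf128 as in .L11 above, while y[i] stays
   a plain contiguous vector load.  */
double strided_dot (int n, int stride, const double *x, const double *y,
                    int k)
{
  double sum = 0.0;
  for (int i = 0; i < n; i++)
    sum += x[(long) i * stride + k] * y[i];
  return sum;
}

In the scalar loop each product is one load plus one fma, so the vectorized
version trades 4x fewer iterations for roughly 4x the instructions per
strided operand plus the extra live registers holding bases and strides.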