https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98172
--- Comment #9 from Hongtao.liu <crazylht at gmail dot com> ---
> .L3:
>         vmovupd (%rcx,%rax), %xmm3
>         vmovupd (%rsi,%rax), %xmm4
>         vinsertf128 $0x1, 16(%rcx,%rax), %ymm3, %ymm0
>         vinsertf128 $0x1, 16(%rsi,%rax), %ymm4, %ymm2
>         vmovupd (%rdi,%rax), %xmm5
>         vinsertf128 $0x1, 16(%rdi,%rax), %ymm5, %ymm1
>         vfmadd132pd %ymm2, %ymm1, %ymm0
>         vmovupd %xmm0, (%rdx,%rax)
>         vextractf128 $0x1, %ymm0, 16(%rdx,%rax)
>         addq $32, %rax
>         cmpq $2048, %rax
>         jne .L3
>         vzeroupper
>         ret

The kernel loop could be better as:

.L3:
        vmovupd (%rcx,%rax), %ymm0
        vmovupd (%rdi,%rax), %ymm1
        vfmadd132pd (%rsi,%rax), %ymm1, %ymm0
        vmovupd %ymm0, (%rdx,%rax)
        addq $32, %rax
        cmpq $2048, %rax
        jne .L3