https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105617
--- Comment #20 from Mason <slash.tmp at free dot fr> ---

Doh! You're right.
I come from a background where overlapping/aliasing inputs are heresy,
thus got blindsided :(

This would be the optimal code, right?

add4i:
# rdi = dst, rsi = a, rdx = b
        movq     0(%rdx), %r8
        movq     8(%rdx), %rax
        movq    16(%rdx), %rcx
        movq    24(%rdx), %rdx
        addq     0(%rsi), %r8
        adcq     8(%rsi), %rax
        adcq    16(%rsi), %rcx
        adcq    24(%rsi), %rdx
        movq    %r8,   0(%rdi)
        movq    %rax,  8(%rdi)
        movq    %rcx, 16(%rdi)
        movq    %rdx, 24(%rdi)
        ret

FWIW, trunk generates even nastier code for znver2:

add4i:
        movq    (%rdx), %rax
        addq    (%rsi), %rax
        movq    8(%rsi), %rcx
        adcq    8(%rdx), %rcx
        movq    16(%rsi), %r8
        adcq    16(%rdx), %r8
        movq    24(%rdx), %rdx
        adcq    24(%rsi), %rdx
        vmovq   %rax, %xmm2
        vpinsrq $1, %rcx, %xmm2, %xmm0
        vmovq   %r8, %xmm1
        vpinsrq $1, %rdx, %xmm1, %xmm1
        vinserti128     $0x1, %xmm1, %ymm0, %ymm0
        vmovdqu %ymm0, (%rdi)
        vzeroupper
        ret