https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105617

--- Comment #20 from Mason <slash.tmp at free dot fr> ---
Doh! You're right.
I come from a background where overlapping/aliasing inputs are heresy,
so I got blindsided :(

This would be the optimal code, right?

add4i:
# rdi = dst, rsi = a, rdx = b
#
# Adds the four 64-bit words at a to the four 64-bit words at b, carrying
# from each word into the next, and writes the four result words to dst.
# Every load is issued before the first store, so the result is still
# correct when dst overlaps a or b.
# Registers written: r8, rax, rcx, rdx and the flags. Nothing is done with
# the carry out of the top word before ret.

        # Load all four words of b into scratch registers.
        movq     0(%rdx), %r8
        movq     8(%rdx), %rax
        movq    16(%rdx), %rcx
        movq    24(%rdx), %rdx          # overwrites the b pointer, so this load must be the last one through rdx

        # Add the words of a straight from memory; addq starts the carry
        # chain and each adcq consumes the carry of the word below it.
        addq     0(%rsi), %r8
        adcq     8(%rsi), %rax
        adcq    16(%rsi), %rcx
        adcq    24(%rsi), %rdx

        # Only now write to dst: a and b have been fully read.
        movq    %r8,   0(%rdi)
        movq    %rax,  8(%rdi)
        movq    %rcx, 16(%rdi)
        movq    %rdx, 24(%rdi)
        ret


FWIW, trunk generates even nastier code for znver2:

add4i:
# Same register roles as the listing above: rdi = dst, rsi = a, rdx = b.
#
# Performs the same four-word add-with-carry, but interleaves the loads with
# the add/adc chain (movq does not modify the flags, so the carry survives),
# then packs the four 64-bit results into ymm0 and writes them to dst with a
# single 32-byte store instead of four scalar stores.
# Registers written: rax, rcx, r8, rdx, xmm0-xmm2 (ymm0) and the flags.

        # Word 0: rax = b[0] + a[0]; starts the carry chain.
        movq    (%rdx), %rax
        addq    (%rsi), %rax
        # Word 1: rcx = a[1] + b[1] + carry.
        movq    8(%rsi), %rcx
        adcq    8(%rdx), %rcx
        # Word 2: r8 = a[2] + b[2] + carry.
        movq    16(%rsi), %r8
        adcq    16(%rdx), %r8
        # Word 3: rdx = b[3] + a[3] + carry; the load overwrites the b pointer.
        movq    24(%rdx), %rdx
        adcq    24(%rsi), %rdx

        # Move the results from the integer registers into vector registers.
        vmovq   %rax, %xmm2             # xmm2 = { word 0, 0 }
        vpinsrq $1, %rcx, %xmm2, %xmm0  # xmm0 = { word 0, word 1 }
        vmovq   %r8, %xmm1              # xmm1 = { word 2, 0 }
        vpinsrq $1, %rdx, %xmm1, %xmm1  # xmm1 = { word 2, word 3 }
        vinserti128     $0x1, %xmm1, %ymm0, %ymm0   # ymm0 = { word 0, word 1, word 2, word 3 }

        # One unaligned 32-byte store to dst, issued after every load from a and b.
        vmovdqu %ymm0, (%rdi)
        vzeroupper                      # zero the upper ymm halves before returning
        ret

Reply via email to