https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105816

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
For example for

void __attribute__((noipa)) test_lo (short * __restrict dst, short *src1,
                                     short *src2, int n)
{
  for (int i = 0; i < n; ++i)
    {
      dst[0] = src1[0];
      dst[1] = src1[1];
      dst[2] = src1[2];
      dst[3] = src1[3];
      dst[4] = src2[0];
      dst[5] = src2[1];
      dst[6] = src2[2];
      dst[7] = src2[3];
      dst+=8;
      src1+=4;
      src2+=4;
    }
}

we generate

.L4:
        movdqu  (%rsi,%rax), %xmm0
        movq    %xmm0, (%rdi,%rax,2)
        movhps  %xmm0, 16(%rdi,%rax,2)
        movdqu  (%rdx,%rax), %xmm0
        movq    %xmm0, 8(%rdi,%rax,2)
        movhps  %xmm0, 24(%rdi,%rax,2)
        addq    $16, %rax
        cmpq    %r8, %rax
        jne     .L4

but ideally we'd interleave the two V2DImode vectors and perform two full SSE
vector stores instead (see the intrinsics sketch after the AVX2 listing
below).  With AVX2 the above becomes

.L4:
        vmovdqu (%r10,%rdx), %ymm0
        addq    $64, %rax
        vmovq   %xmm0, -64(%rax)
        vpextrq $1, %xmm0, -48(%rax)
        vextracti128    $0x1, %ymm0, %xmm0
        vmovq   %xmm0, -32(%rax)
        vpextrq $1, %xmm0, -16(%rax)
        vmovdqu (%r9,%rdx), %ymm0
        addq    $32, %rdx
        vmovq   %xmm0, -56(%rax)
        vpextrq $1, %xmm0, -40(%rax)
        vextracti128    $0x1, %ymm0, %xmm0
        vmovq   %xmm0, -24(%rax)
        vpextrq $1, %xmm0, -8(%rax)
        cmpq    %rcx, %rdx
        jne     .L4
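
Going back to the SSE case, a minimal intrinsics sketch of the desired
interleave (the function name test_lo_ideal is hypothetical; it assumes n is
even, with a scalar epilogue handling any remainder):

#include <emmintrin.h>

void
test_lo_ideal (short * __restrict dst, short *src1, short *src2, int n)
{
  /* One vector iteration covers two scalar iterations.  */
  for (int i = 0; i < n; i += 2)
    {
      __m128i a = _mm_loadu_si128 ((const __m128i *) src1); /* {a0, a1} */
      __m128i b = _mm_loadu_si128 ((const __m128i *) src2); /* {b0, b1} */
      /* punpcklqdq / punpckhqdq interleave the 64-bit halves, so both
         stores can be full 16-byte vector stores.  */
      _mm_storeu_si128 ((__m128i *) dst, _mm_unpacklo_epi64 (a, b));       /* {a0, b0} */
      _mm_storeu_si128 ((__m128i *) (dst + 8), _mm_unpackhi_epi64 (a, b)); /* {a1, b1} */
      dst += 16;
      src1 += 8;
      src2 += 8;
    }
}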

The unpacks would be present in the SLP tree if we did not split the
instance.
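
For the AVX2 case an analogous interleave needs cross-lane fixups on top of
the unpacks, since vpunpcklqdq/vpunpckhqdq only shuffle within 128-bit lanes;
a hypothetical sketch (assumes n is a multiple of 4):

#include <immintrin.h>

void
test_lo_ideal_avx2 (short * __restrict dst, short *src1, short *src2, int n)
{
  /* One vector iteration covers four scalar iterations.  */
  for (int i = 0; i < n; i += 4)
    {
      __m256i a = _mm256_loadu_si256 ((const __m256i *) src1); /* {a0,a1,a2,a3} */
      __m256i b = _mm256_loadu_si256 ((const __m256i *) src2); /* {b0,b1,b2,b3} */
      /* vpunpck{l,h}qdq interleave within each 128-bit lane.  */
      __m256i lo = _mm256_unpacklo_epi64 (a, b); /* {a0,b0,a2,b2} */
      __m256i hi = _mm256_unpackhi_epi64 (a, b); /* {a1,b1,a3,b3} */
      /* Fix up the lane order with cross-lane permutes.  */
      _mm256_storeu_si256 ((__m256i *) dst,
                           _mm256_permute2x128_si256 (lo, hi, 0x20)); /* {a0,b0,a1,b1} */
      _mm256_storeu_si256 ((__m256i *) (dst + 16),
                           _mm256_permute2x128_si256 (lo, hi, 0x31)); /* {a2,b2,a3,b3} */
      dst += 32;
      src1 += 16;
      src2 += 16;
    }
}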
