https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105816
--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
For example for

void __attribute__((noipa))
test_lo (short * __restrict dst, short *src1, short *src2, int n)
{
  for (int i = 0; i < n; ++i)
    {
      dst[0] = src1[0];
      dst[1] = src1[1];
      dst[2] = src1[2];
      dst[3] = src1[3];
      dst[4] = src2[0];
      dst[5] = src2[1];
      dst[6] = src2[2];
      dst[7] = src2[3];
      dst += 8;
      src1 += 4;
      src2 += 4;
    }
}

we generate

.L4:
        movdqu  (%rsi,%rax), %xmm0
        movq    %xmm0, (%rdi,%rax,2)
        movhps  %xmm0, 16(%rdi,%rax,2)
        movdqu  (%rdx,%rax), %xmm0
        movq    %xmm0, 8(%rdi,%rax,2)
        movhps  %xmm0, 24(%rdi,%rax,2)
        addq    $16, %rax
        cmpq    %r8, %rax
        jne     .L4

but ideally we'd interleave two V2DImode vectors and perform two SSE
vector stores.  With AVX2 the above gets

.L4:
        vmovdqu (%r10,%rdx), %ymm0
        addq    $64, %rax
        vmovq   %xmm0, -64(%rax)
        vpextrq $1, %xmm0, -48(%rax)
        vextracti128    $0x1, %ymm0, %xmm0
        vmovq   %xmm0, -32(%rax)
        vpextrq $1, %xmm0, -16(%rax)
        vmovdqu (%r9,%rdx), %ymm0
        addq    $32, %rdx
        vmovq   %xmm0, -56(%rax)
        vpextrq $1, %xmm0, -40(%rax)
        vextracti128    $0x1, %ymm0, %xmm0
        vmovq   %xmm0, -24(%rax)
        vpextrq $1, %xmm0, -8(%rax)
        cmpq    %rcx, %rdx
        jne     .L4

The unpacks would be there in the SLP tree if we did not split the instance.
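
For illustration, a minimal intrinsics sketch of the SSE codegen asked
for above, covering two scalar loop iterations per step (the function
name test_lo_ideal_step and the per-step factoring are made up for
this sketch, not something the vectorizer emits):

#include <immintrin.h>

/* One unrolled step covering iterations i and i+1: the low qwords of
   a and b hold src1[0..3] and src2[0..3] of iteration i, the high
   qwords those of iteration i+1.  Two V2DImode interleaves
   (punpcklqdq / punpckhqdq) replace the four scalar movq/movhps
   stores in the loop body above.  */
void
test_lo_ideal_step (short *dst, short *src1, short *src2)
{
  __m128i a = _mm_loadu_si128 ((const __m128i *) src1);
  __m128i b = _mm_loadu_si128 ((const __m128i *) src2);
  /* dst[0..7] of iteration i: low src1 half, then low src2 half.  */
  _mm_storeu_si128 ((__m128i *) dst, _mm_unpacklo_epi64 (a, b));
  /* dst[0..7] of iteration i+1 from the high halves.  */
  _mm_storeu_si128 ((__m128i *) (dst + 8), _mm_unpackhi_epi64 (a, b));
}

That is one movdqu load per source plus one unpack and one full-width
store per output vector, instead of the movq/movhps pairs the current
split SLP instance produces.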