https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106475

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
The loop seems to be vectorized just fine?  The only issue is that we need a
runtime alias check, because of the variable stride and because we need a
vectorization factor (VF) of two to fill up a 16-byte vector:

.L5:
        movq    (%rcx), %xmm1
        movq    (%rdx), %xmm0
        addl    $1, %esi
        movhps  (%rcx,%r10), %xmm1
        movhps  (%rdx,%r9), %xmm0
        addq    %r14, %rcx
        addq    %r13, %rdx
        paddb   %xmm1, %xmm0
        paddb   %xmm2, %xmm0
        movq    %xmm0, (%rax)
        movhps  %xmm0, (%rax,%rdi)
        addq    %r12, %rax
        cmpl    %esi, %r15d
        jne     .L5
        movl    %ebp, %eax
        andl    $-2, %eax
        andl    $1, %ebp
        je      .L1
.L4:
        imulq   %rax, %r10
        imulq   %rax, %r9
        imulq   %rdi, %rax
        movq    (%rbx,%r10), %xmm0
        movq    (%r8,%r9), %xmm1
        paddb   %xmm1, %xmm0
        movq    .LC1(%rip), %xmm1
        paddb   %xmm1, %xmm0
        movq    %xmm0, (%r11,%rax)

Yes, the BB (basic-block) vectorization result is smaller, but it only uses half of a vector (8 of the 16 bytes):

.L3:
        movq    (%r8), %xmm0
        movq    (%rdx), %xmm1
        addl    $1, %ecx
        addq    %rdi, %rdx
        addq    %rsi, %r8
        paddb   %xmm1, %xmm0
        paddb   %xmm2, %xmm0
        movq    %xmm0, (%rax)
        addq    %r10, %rax
        cmpl    %ecx, %r11d
        jne     .L3

Reply via email to