https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96208

--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
Smaller testcase, avoiding reductions and negative step:

void
test(double * __restrict a, double *b, double *k, int n)
{
  for (int i = 0; i < n; ++i)
    {
      /* Each iteration stores one adjacent pair a[2*i], a[2*i + 1], computed
         from the matching pair of b.  Both elements of the pair are scaled
         by the same value k[ONE*i].
         ONE is not defined in this snippet; it is expected to be supplied on
         the compiler command line (-DONE=1 or -DONE=2) and selects the
         stride of the access to k.  */
      a[2*i] = b[2*i] * k[ONE*i];
      a[2*i + 1] = b[2*i + 1] * k[ONE*i];
    }
}

With -DONE=1 this is vectorized using interleaving, because SLP discovery fails:

t.c:4:21: missed:   Build SLP failed: not grouped load _9 = *_8;

.L4:
        movupd  (%r8,%rax,2), %xmm1
        movupd  (%rdx,%rax), %xmm2
        movupd  16(%r8,%rax,2), %xmm0
        movlpd  8(%r8,%rax,2), %xmm0
        movhpd  16(%r8,%rax,2), %xmm1
        mulpd   %xmm2, %xmm1
        mulpd   %xmm2, %xmm0
        movapd  %xmm1, %xmm2
        unpcklpd        %xmm0, %xmm2
        unpckhpd        %xmm0, %xmm1
        movups  %xmm2, (%rdi,%rax,2)
        movups  %xmm1, 16(%rdi,%rax,2)
        addq    $16, %rax
        cmpq    %rax, %rsi
        jne     .L4

But when we make the access to 'k' non-unit-stride with -DONE=2,
SLP works fine:

t.c:4:21: note:   Vectorizing SLP tree:
t.c:4:21: note:   node 0x48580d0 (max_nunits=2, refcnt=1) vector(2) double
t.c:4:21: note:   op template: *_8 = _9;
t.c:4:21: note:         stmt 0 *_8 = _9;
t.c:4:21: note:         stmt 1 *_14 = _15;
t.c:4:21: note:         children 0x4858158
t.c:4:21: note:   node 0x4858158 (max_nunits=2, refcnt=1) vector(2) double
t.c:4:21: note:   op template: _9 = _5 * _7;
t.c:4:21: note:         stmt 0 _9 = _5 * _7;
t.c:4:21: note:         stmt 1 _15 = _7 * _13;
t.c:4:21: note:         children 0x4858268 0x48582f0
t.c:4:21: note:   node 0x4858268 (max_nunits=2, refcnt=1) vector(2) double
t.c:4:21: note:   op template: _5 = *_4;
t.c:4:21: note:         stmt 0 _5 = *_4;
t.c:4:21: note:         stmt 1 _13 = *_12;
t.c:4:21: note:   node 0x48582f0 (max_nunits=2, refcnt=1) vector(2) double
t.c:4:21: note:   op template: _7 = *_6;
t.c:4:21: note:         stmt 0 _7 = *_6;
t.c:4:21: note:         stmt 1 _7 = *_6;
t.c:4:21: note:         load permutation { 0 0 }

and we get

.L4:
        movupd  (%rdx,%rax), %xmm0
        movupd  (%rsi,%rax,2), %xmm2
        unpcklpd        %xmm0, %xmm0
        mulpd   %xmm2, %xmm0
        movups  %xmm0, (%rdi,%rax,2)
        addq    $8, %rax
        cmpq    %rax, %r8
        jne     .L4

Reply via email to