https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66721

--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
So I have a "solution" that I hope to prettify a bit still.  The basic issue
is that SLP is "broken" in that it builds a tree of operations instead of a
graph.  That is, it un-CSEs all uses of n in the testcase:

  for (i = 0; i < size; ++i)
    {
      float n = sqrt (in[i].x * in[i].x + in[i].y * in[i].y
                      + in[i].z * in[i].z);
      out[i].x = in[i].x / n;
      out[i].y = in[i].y / n;
      out[i].z = in[i].z / n;
    }
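
To make the un-CSEing concrete, the SLP tree effectively represents
something like the following scalar code (a sketch of mine, not the
actual GIMPLE), with the full norm expression rebuilt once per store
lane:

  out[i].x = in[i].x / sqrt (in[i].x * in[i].x + in[i].y * in[i].y
                             + in[i].z * in[i].z);
  out[i].y = in[i].y / sqrt (in[i].x * in[i].x + in[i].y * in[i].y
                             + in[i].z * in[i].z);
  out[i].z = in[i].z / sqrt (in[i].x * in[i].x + in[i].y * in[i].y
                             + in[i].z * in[i].z);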

This un-CSEing results in bloat in the SLP tree and in the initial
vectorized code, and also in the cost, which is good in this case.
Usually, with nicely vector-size-aligned things, later passes can CSE
all the mess again and the generated code isn't so bad (so the costs
are overly conservative).  But in this case, with an "interleaving
size" of 3, we end up with three vectors containing 4 groups we'd
want to CSE:

 sqrt { in[0].x * in[0].x + in[0].y * in[0].y + ...,
        in[0].x * ...,
        in[0].x * ...,
        in[1].x * in[1].x + in[1].y * ... }
 sqrt { ... }
 sqrt { ... }
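
Spelled out (my reading of the lane layout, not copied from a dump):
with a group size of 3 and 4-lane vectors, twelve store lanes cover
four structs, so writing n(i) for the norm of in[i] the three sqrt
vectors hold four distinct values:

 sqrt vector 0: { n(0), n(0), n(0), n(1) }
 sqrt vector 1: { n(1), n(1), n(2), n(2) }
 sqrt vector 2: { n(2), n(3), n(3), n(3) }

Each n(i) is recomputed in every lane it occupies instead of being
computed once and reused.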

Built this way we avoid the permutes on the stores, of course, but the
lack of CSE makes the generated code worse (you can look at it with
-fno-vect-cost-model; the cost model rejects the vectorized code, even
though it might be profitable at a slightly larger threshold in
practice):

.L5:
        movups  (%r8), %xmm8
        addl    $1, %r10d
        addq    $48, %r8
        addq    $48, %rax
        movups  -16(%r8), %xmm6
        movups  -32(%r8), %xmm7
        movaps  %xmm8, %xmm13
        movdqa  %xmm6, %xmm11
        movdqa  %xmm6, %xmm10
        movaps  %xmm6, %xmm1
        movdqa  %xmm7, %xmm9
        movdqa  %xmm7, %xmm0
        movaps  %xmm7, %xmm12
        shufps  $252, %xmm6, %xmm1
        palignr $8, %xmm7, %xmm11
        shufps  $175, %xmm7, %xmm13
        mulps   %xmm1, %xmm1
        movaps  %xmm7, %xmm2
        palignr $12, %xmm7, %xmm10
        shufps  $252, %xmm11, %xmm11
        mulps   %xmm11, %xmm11
        shufps  $240, %xmm7, %xmm12
        shufps  $252, %xmm10, %xmm10
        mulps   %xmm10, %xmm10
        palignr $4, %xmm8, %xmm9
        shufps  $5, %xmm6, %xmm2
        shufps  $192, %xmm9, %xmm9
        mulps   %xmm9, %xmm9
        palignr $8, %xmm8, %xmm0
        shufps  $192, %xmm0, %xmm0
        mulps   %xmm0, %xmm0
        mulps   %xmm13, %xmm13
        addps   %xmm10, %xmm11
        mulps   %xmm12, %xmm12
        movaps  %xmm8, %xmm10
        shufps  $192, %xmm8, %xmm10
        mulps   %xmm10, %xmm10
        mulps   %xmm2, %xmm2
        addps   %xmm11, %xmm1
        addps   %xmm13, %xmm12
        addps   %xmm10, %xmm9
        movaps  %xmm3, %xmm10
        addps   %xmm12, %xmm2
        addps   %xmm9, %xmm0
        rsqrtps %xmm0, %xmm9
        cmpneqps        %xmm0, %xmm10
        andps   %xmm10, %xmm9
        mulps   %xmm9, %xmm0
        movaps  %xmm3, %xmm10
        cmpneqps        %xmm2, %xmm10
        mulps   %xmm0, %xmm9
        mulps   %xmm4, %xmm0
        addps   %xmm5, %xmm9
        mulps   %xmm9, %xmm0
        rsqrtps %xmm2, %xmm9
        andps   %xmm10, %xmm9
        mulps   %xmm9, %xmm2
        movaps  %xmm3, %xmm10
        cmpneqps        %xmm1, %xmm10
        mulps   %xmm2, %xmm9
        mulps   %xmm4, %xmm2
        addps   %xmm5, %xmm9
        mulps   %xmm9, %xmm2
        rsqrtps %xmm1, %xmm9
        andps   %xmm10, %xmm9
        mulps   %xmm9, %xmm1
        mulps   %xmm1, %xmm9
        mulps   %xmm4, %xmm1
        addps   %xmm5, %xmm9
        mulps   %xmm9, %xmm1
        rcpps   %xmm0, %xmm9
        mulps   %xmm9, %xmm0
        mulps   %xmm9, %xmm0
        addps   %xmm9, %xmm9
        subps   %xmm0, %xmm9
        rcpps   %xmm2, %xmm0
        mulps   %xmm9, %xmm8
        mulps   %xmm0, %xmm2
        movups  %xmm8, -48(%rax)
        mulps   %xmm0, %xmm2
        addps   %xmm0, %xmm0
        subps   %xmm2, %xmm0
        mulps   %xmm7, %xmm0
        movups  %xmm0, -32(%rax)
        rcpps   %xmm1, %xmm0
        mulps   %xmm0, %xmm1
        mulps   %xmm0, %xmm1
        addps   %xmm0, %xmm0
        subps   %xmm1, %xmm0
        mulps   %xmm6, %xmm0
        movups  %xmm0, -16(%rax)
        cmpl    %r10d, %r9d
        ja      .L5

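For reference, the rsqrtps and rcpps blocks in the loop are the usual
-ffast-math reciprocal expansions with one Newton-Raphson refinement
step each; a scalar sketch of my reading of the sequences (helper
names are hypothetical; rsqrt_est/rcp_est stand for the ~12-bit
hardware estimates):

  /* sqrt via rsqrtps: the cmpneqps/andps pair zeroes the estimate
     for x == 0 so that x * rsqrt(x) = 0 * inf doesn't produce a NaN,
     then one Newton-Raphson step refines the result.  */
  static float fast_sqrt (float x)
  {
    float r = x != 0.0f ? rsqrt_est (x) : 0.0f;
    return (x * r) * (1.5f - 0.5f * (x * r) * r);
  }

  /* Division via rcpps, again with one Newton-Raphson step:
     1/b ~= r * (2 - b*r).  */
  static float fast_div (float a, float b)
  {
    float r = rcp_est (b);
    return a * (2.0f * r - (b * r) * r);
  }
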
Of course the generated code doesn't use any blend instruction.  The
testcase was meant to test support for interleaving of size 3, so it
would better have used different operations to make SLP impossible (of
course the testcase also comes from real-world code, which means this
is a relevant regression).
