https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67577
Bug ID: 67577
Summary: Trivial float-vectorization foiled by a loop
Product: gcc
Version: 5.2.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: rtl-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: bisqwit at iki dot fi
Target Milestone: ---

This code is written as if tailored to be SIMD-optimized by GCC, but GCC
somehow blows it:

template<typename T, unsigned N>
struct vec
{
    T d[N];

    vec<T,N> operator* (const T& b)
    {
        vec<T,N> result;
        for(unsigned n=0u; n<N; ++n) result.d[n] = d[n] * b;
        return result;
    }
    vec<T,N> operator+ (const vec<T,N>& b)
    {
        vec<T,N> result;
        for(unsigned n=0u; n<N; ++n) result.d[n] = d[n] + b.d[n];
        return result;
    }
    vec<T,N> operator- (const vec<T,N>& b)
    {
        vec<T,N> result;
        for(unsigned n=0u; n<N; ++n) result.d[n] = d[n] - b.d[n];
        return result;
    }
};

float scale;
vec<float,8> a, b, c;

void x()
{
    for(int n=0; n<1; ++n)
    {
        vec<float,8> result = b + (a - b) * scale;
        c = result;
    }
}

Generated code (inner loop):

    movss   b+4(%rip), %xmm6
    movss   a+4(%rip), %xmm7
    subss   %xmm6, %xmm7
    movss   scale(%rip), %xmm0
    movss   b+8(%rip), %xmm5
    movss   b+12(%rip), %xmm4
    movss   b+16(%rip), %xmm3
    mulss   %xmm0, %xmm7
    movss   b+20(%rip), %xmm1
    movss   b+24(%rip), %xmm2
    movss   b+28(%rip), %xmm9
    movss   b(%rip), %xmm8
    addss   %xmm6, %xmm7
    movss   a+8(%rip), %xmm6
    subss   %xmm5, %xmm6
    movss   %xmm7, c+4(%rip)
    mulss   %xmm0, %xmm6
    addss   %xmm5, %xmm6
    movss   a+12(%rip), %xmm5
    subss   %xmm4, %xmm5
    movss   %xmm6, c+8(%rip)
    mulss   %xmm0, %xmm5
    addss   %xmm4, %xmm5
    movss   a+16(%rip), %xmm4
    subss   %xmm3, %xmm4
    movss   %xmm5, c+12(%rip)
    mulss   %xmm0, %xmm4
    addss   %xmm3, %xmm4
    movss   a+20(%rip), %xmm3
    subss   %xmm1, %xmm3
    movss   %xmm4, c+16(%rip)
    mulss   %xmm0, %xmm3
    addss   %xmm1, %xmm3
    movss   a+24(%rip), %xmm1
    subss   %xmm2, %xmm1
    movss   %xmm3, c+20(%rip)
    mulss   %xmm0, %xmm1
    addss   %xmm2, %xmm1
    movss   a+28(%rip), %xmm2
    subss   %xmm9, %xmm2
    movss   %xmm1, c+24(%rip)
    mulss   %xmm0, %xmm2
    addss   %xmm9, %xmm2
    movss   a(%rip), %xmm9
    subss   %xmm8, %xmm9
    movss   %xmm2, c+28(%rip)
    mulss   %xmm9, %xmm0
    addss   %xmm8, %xmm0
    movss   %xmm0, c(%rip)

Platform: amd64; GCC version 5.2.1.

If I comment away the dummy for-loop (or change the float "scale"
variable into a function parameter; both variants are sketched in
source form at the end of this report), the inner loop changes into
much simpler code that vectorizes as I intended. With the loop removed,
GCC generates:

    movaps  b(%rip), %xmm3
    movaps  b+16(%rip), %xmm1
    movaps  a+16(%rip), %xmm0
    movaps  a(%rip), %xmm2
    subps   %xmm1, %xmm0
    movss   scale(%rip), %xmm4
    subps   %xmm3, %xmm2
    shufps  $0, %xmm4, %xmm4
    mulps   %xmm4, %xmm0
    mulps   %xmm4, %xmm2
    addps   %xmm1, %xmm0
    addps   %xmm3, %xmm2
    movaps  %xmm0, -24(%rsp)
    movq    -16(%rsp), %rax
    movaps  %xmm2, -40(%rsp)
    movq    %xmm2, c(%rip)
    movq    %xmm0, c+16(%rip)
    movq    -32(%rsp), %rdx
    movq    %rax, c+24(%rip)
    movq    %rdx, c+8(%rip)

There is still some glitch causing dummy memory transfers (the packed
results are bounced through the stack at -24(%rsp) and -40(%rsp)), but
at least the calculations are now done in packed registers.

If I instead change the global "scale" variable into a function
parameter, the following shorter code is generated; it is essentially
the same as what Clang produces in all three cases:

    movaps  b+16(%rip), %xmm2
    shufps  $0, %xmm0, %xmm0
    movaps  a+16(%rip), %xmm1
    subps   %xmm2, %xmm1
    movaps  b(%rip), %xmm3
    mulps   %xmm0, %xmm1
    addps   %xmm2, %xmm1
    movaps  a(%rip), %xmm2
    subps   %xmm3, %xmm2
    movaps  %xmm1, c+16(%rip)
    mulps   %xmm2, %xmm0
    addps   %xmm3, %xmm0
    movaps  %xmm0, c(%rip)

Something causes GCC's tree vectorization to be rickety and easily
foiled by trivial changes in the code, and I'd like to see it fixed, at
least in these particular cases.
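For reference, here is a minimal sketch of the two workaround variants
described above. The function names x_no_loop and x_param are
illustrative (they are not from the original test case); both bodies
compute the same thing as x() above.

// Variant 1: the dummy for-loop removed; "scale" is still the global.
// As reported above, GCC 5.2 vectorizes this (second listing), albeit
// with some redundant stack traffic.
void x_no_loop()
{
    vec<float,8> result = b + (a - b) * scale;
    c = result;
}

// Variant 2: "scale" passed as a parameter (shadowing the global)
// instead of being read from memory.  This yields the short packed
// code in the last listing.
void x_param(float scale)
{
    vec<float,8> result = b + (a - b) * scale;
    c = result;
}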