[Bug tree-optimization/58095] SIMD code requiring auxiliary array for best optimization

siavashserver at gmail dot com Tue, 06 Aug 2013 22:14:21 -0700

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58095


--- Comment #3 from Siavash Eliasi <siavashserver at gmail dot com> ---
I did an experiment using raw float data types instead of the __m128 data
type. This time GCC, Clang and ICC were all able to generate the desired code,
even without using the __restrict__ keyword, although it is a little dirtier
(pointer arithmetic).

Maybe not most, but I'm sure that many new video decoders/encoders, game
engines and similar applications use __m128 data types directly instead of
float data types, because doing so (1) guarantees the data is 16-byte aligned,
(2) removes the need to manually load/store data between memory and XMM/YMM
registers, (3) makes the source code smaller and easier to maintain, and
(4) results in much cleaner and smaller generated code.

In conclusion, I don't think the issue that I and other people are facing is
related to not using the __restrict__ keyword. All compilers fail to generate
optimal code when facing __m128 data types. The one exception is ICC, which is
able to generate optimal code when __m128 data types are combined with the
__restrict__ keyword.

Here is what I have tried:

#include <xmmintrin.h>

/*
 * fooFloat - element-wise multiply-add over float arrays: c = a + d * b.
 *
 * Each loop iteration handles one block of 32 floats as eight 4-float SSE
 * vectors: it loads the block from a, b and d, computes a + d * b per
 * vector, and stores the result to c.
 *
 * a, b, d: input arrays, read with _mm_load_ps (aligned load).
 * c:       output array, written with _mm_store_ps (aligned store).
 * size:    loop bound compared against i; nothing is done when it is 0.
 *
 * _mm_load_ps / _mm_store_ps require 16-byte-aligned addresses, so every
 * accessed block of all four arrays must be 16-byte aligned.
 *
 * There is no tail handling: each iteration always touches a full 32-float
 * block, even when fewer than 32 elements remain below `size`.
 *
 * NOTE(review): i advances by 32 per iteration AND every index is i*32 + k,
 * so consecutive iterations access blocks that are 1024 floats apart
 * (offsets 0, 1024, 2048, ...). If `size` is meant to be the number of
 * floats in each array, the indices were presumably intended to be i + k;
 * as written the function skips data and reads/writes far beyond `size`
 * elements. Confirm the intent before reusing this code.
 */
void fooFloat(float* a, float* b, float* d, float* c, unsigned int size)
{
    /* i is unsigned int, so i*32 + k is evaluated modulo 2^32. */
    for (unsigned int i = 0; i < size; i+=32)
    {
        /* Local vector temporaries: 8 vectors x 4 floats = 32 floats each. */
        __m128 ax[8], bx[8], cx[8], dx[8];

        /* Load the 32-float block of a (the addend). */
        ax[0] = _mm_load_ps(&a[i*32+0]);
        ax[1] = _mm_load_ps(&a[i*32+4]);
        ax[2] = _mm_load_ps(&a[i*32+8]);
        ax[3] = _mm_load_ps(&a[i*32+12]);
        ax[4] = _mm_load_ps(&a[i*32+16]);
        ax[5] = _mm_load_ps(&a[i*32+20]);
        ax[6] = _mm_load_ps(&a[i*32+24]);
        ax[7] = _mm_load_ps(&a[i*32+28]);

        /* Load the matching block of b (second multiplicand). */
        bx[0] = _mm_load_ps(&b[i*32+0]);
        bx[1] = _mm_load_ps(&b[i*32+4]);
        bx[2] = _mm_load_ps(&b[i*32+8]);
        bx[3] = _mm_load_ps(&b[i*32+12]);
        bx[4] = _mm_load_ps(&b[i*32+16]);
        bx[5] = _mm_load_ps(&b[i*32+20]);
        bx[6] = _mm_load_ps(&b[i*32+24]);
        bx[7] = _mm_load_ps(&b[i*32+28]);

        /* Load the matching block of d (first multiplicand). */
        dx[0] = _mm_load_ps(&d[i*32+0]);
        dx[1] = _mm_load_ps(&d[i*32+4]);
        dx[2] = _mm_load_ps(&d[i*32+8]);
        dx[3] = _mm_load_ps(&d[i*32+12]);
        dx[4] = _mm_load_ps(&d[i*32+16]);
        dx[5] = _mm_load_ps(&d[i*32+20]);
        dx[6] = _mm_load_ps(&d[i*32+24]);
        dx[7] = _mm_load_ps(&d[i*32+28]);

        /* cx = ax + dx * bx, as a separate multiply followed by an add. */
        cx[0] = _mm_add_ps(ax[0], _mm_mul_ps(dx[0], bx[0]));
        cx[1] = _mm_add_ps(ax[1], _mm_mul_ps(dx[1], bx[1]));
        cx[2] = _mm_add_ps(ax[2], _mm_mul_ps(dx[2], bx[2]));
        cx[3] = _mm_add_ps(ax[3], _mm_mul_ps(dx[3], bx[3]));
        cx[4] = _mm_add_ps(ax[4], _mm_mul_ps(dx[4], bx[4]));
        cx[5] = _mm_add_ps(ax[5], _mm_mul_ps(dx[5], bx[5]));
        cx[6] = _mm_add_ps(ax[6], _mm_mul_ps(dx[6], bx[6]));
        cx[7] = _mm_add_ps(ax[7], _mm_mul_ps(dx[7], bx[7]));

        /* Write the 32 results back to the same offsets in c. */
        _mm_store_ps(&c[i*32+0], cx[0]);
        _mm_store_ps(&c[i*32+4], cx[1]);
        _mm_store_ps(&c[i*32+8], cx[2]);
        _mm_store_ps(&c[i*32+12], cx[3]);
        _mm_store_ps(&c[i*32+16], cx[4]);
        _mm_store_ps(&c[i*32+20], cx[5]);
        _mm_store_ps(&c[i*32+24], cx[6]);
        _mm_store_ps(&c[i*32+28], cx[7]);
    }
}

And its output using GCC 4.8.1 -O2 :

# Compiler-generated x86-64 code (Intel operand order: destination first) for
# fooFloat. Per loop iteration it computes eight 4-float vectors of
# a + d * b and stores them to c.
#
# Argument registers as used below (System V AMD64 integer argument order):
#   rdi = a, rsi = b, rdx = d, rcx = c, r8d = size
#
# Register roles inside the loop:
#   r15d       = i, the loop counter (advanced by 32 per iteration)
#   r14d       = i*32      -> element index of vector 0
#   eax, r13d  = i*32 + 4  -> element index of vector 1
#   r12d, ebp, ebx, r11d, r10d, r9d
#              = i*32 + 8, +12, +16, +20, +24, +28 -> vectors 2..7
#   xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0
#              = results for vectors 0, 1, 2, 3, 4, 5, 6, 7
#
# Every element index is computed in a 32-bit register (which also zeroes the
# upper half of the 64-bit register), so the index wraps modulo 2^32 before it
# is scaled by 4 bytes in the addressing mode. This is why a separate index
# register is kept per vector instead of one base pointer plus displacements.
#
# rbx, rbp and r12-r15 are callee-saved, so they are pushed on entry and
# popped before returning. All memory accesses use movaps or memory operands
# of mulps/addps, which require 16-byte-aligned addresses.
fooFloat(float*, float*, float*, float*, unsigned int):
    push    r15
    # r15d = i = 0
    xor    r15d, r15d
    # Set ZF if size == 0. The mov and the pushes below do not modify flags,
    # so the je further down still tests this result.
    test    r8d, r8d
    # eax = index of vector 1 for the first iteration (i*32 + 4 with i = 0)
    mov    eax, 4
    push    r14
    push    r13
    push    r12
    push    rbp
    push    rbx
    # size == 0: skip the loop entirely
    je    .L15
.L19:
    # Derive the element indices of vectors 2..7 from eax (= i*32 + 4).
    lea    r12d, [rax+4]
    lea    ebp, [rax+8]
    lea    ebx, [rax+12]
    lea    r11d, [rax+16]
    lea    r10d, [rax+20]
    lea    r9d, [rax+24]
    # r14d = old i (shifted to i*32 below); r13d = index of vector 1
    mov    r14d, r15d
    mov    r13d, eax
    # i += 32
    add    r15d, 32
    # r14d = i*32 -> index of vector 0
    sal    r14d, 5
    # From here on: load d, multiply by b, add a. Loop bookkeeping is
    # interleaved with the vector work.
    movaps    xmm6, XMMWORD PTR [rdx+r13*4]
    # Advance eax to the next iteration's vector-1 index: 32 * 32 = 1024
    add    eax, 1024
    # Compare size with the updated i. None of the SSE instructions between
    # here and the ja at the bottom of the loop modify flags.
    cmp    r8d, r15d
    movaps    xmm7, XMMWORD PTR [rdx+r14*4]
    mulps    xmm6, XMMWORD PTR [rsi+r13*4]
    movaps    xmm5, XMMWORD PTR [rdx+r12*4]
    mulps    xmm7, XMMWORD PTR [rsi+r14*4]
    movaps    xmm4, XMMWORD PTR [rdx+rbp*4]
    mulps    xmm5, XMMWORD PTR [rsi+r12*4]
    movaps    xmm3, XMMWORD PTR [rdx+rbx*4]
    mulps    xmm4, XMMWORD PTR [rsi+rbp*4]
    movaps    xmm2, XMMWORD PTR [rdx+r11*4]
    mulps    xmm3, XMMWORD PTR [rsi+rbx*4]
    movaps    xmm1, XMMWORD PTR [rdx+r10*4]
    mulps    xmm2, XMMWORD PTR [rsi+r11*4]
    movaps    xmm0, XMMWORD PTR [rdx+r9*4]
    mulps    xmm1, XMMWORD PTR [rsi+r10*4]
    addps    xmm7, XMMWORD PTR [rdi+r14*4]
    mulps    xmm0, XMMWORD PTR [rsi+r9*4]
    addps    xmm6, XMMWORD PTR [rdi+r13*4]
    addps    xmm5, XMMWORD PTR [rdi+r12*4]
    addps    xmm4, XMMWORD PTR [rdi+rbp*4]
    addps    xmm3, XMMWORD PTR [rdi+rbx*4]
    addps    xmm2, XMMWORD PTR [rdi+r11*4]
    addps    xmm1, XMMWORD PTR [rdi+r10*4]
    addps    xmm0, XMMWORD PTR [rdi+r9*4]
    # Store the eight result vectors to c at the same element indices.
    movaps    XMMWORD PTR [rcx+r14*4], xmm7
    movaps    XMMWORD PTR [rcx+r13*4], xmm6
    movaps    XMMWORD PTR [rcx+r12*4], xmm5
    movaps    XMMWORD PTR [rcx+rbp*4], xmm4
    movaps    XMMWORD PTR [rcx+rbx*4], xmm3
    movaps    XMMWORD PTR [rcx+r11*4], xmm2
    movaps    XMMWORD PTR [rcx+r10*4], xmm1
    movaps    XMMWORD PTR [rcx+r9*4], xmm0
    # Unsigned compare from the cmp above: loop again while size > i
    ja    .L19
.L15:
    # Restore the callee-saved registers in reverse push order and return.
    pop    rbx
    pop    rbp
    pop    r12
    pop    r13
    pop    r14
    pop    r15
    ret

[Bug tree-optimization/58095] SIMD code requiring auxiliary array for best optimization

Reply via email to