http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58095

--- Comment #4 from Siavash Eliasi <siavashserver at gmail dot com> ---
In the end, here is what I would really like GCC to generate for me: the same
output for function (foo) as for function (bar) when using GCC with the -O3
-march=core2 switches:

#include <xmmintrin.h>

#define BATCHSIZE 8

/*
 * foo - element-wise multiply-add over `size` rows of BATCHSIZE vectors.
 *
 * For every row i in [0, size) and every column j in [0, BATCHSIZE), stores
 * a[i][j] + d[i][j] * b[i][j] into c[i][j]. Each result is written to c
 * right after it is computed, before the next column's inputs are read.
 *
 * a, b, d: input arrays; only read through these parameters.
 * c:       output array; one element is written per (i, j) pair.
 * size:    number of rows to process; the function does nothing when it is 0.
 */
void foo(__m128 a[][BATCHSIZE], __m128 b[][BATCHSIZE], __m128 d[][BATCHSIZE],
__m128 c[][BATCHSIZE], unsigned int size)
{
    for (unsigned int i = 0; i < size; i++)
    {
        /* Fixed trip count (BATCHSIZE) for the columns of row i. */
        for (unsigned int j=0; j<BATCHSIZE; j++)
        {
            /* Multiply first, then add, and store straight into the output. */
            c[i][j] = _mm_add_ps(a[i][j], _mm_mul_ps(d[i][j], b[i][j]));
        }
    }
}

/*
 * bar - same multiply-add as foo, staged through a local row buffer.
 *
 * For every row i in [0, size), first computes a[i][j] + d[i][j] * b[i][j]
 * for all j in [0, BATCHSIZE) into the local array cx, and only then copies
 * cx into c[i] in a second loop. No element of c[i] is written until every
 * input of row i has been read.
 *
 * a, b, d: input arrays; only read through these parameters.
 * c:       output array; one element is written per (i, j) pair.
 * size:    number of rows to process; the function does nothing when it is 0.
 *
 * NOTE(review): presumably the staging matters because c may alias a, b or d,
 * so in foo the compiler cannot move later loads above earlier stores to c --
 * confirm against the GCC scheduling/alias analysis discussion.
 */
void bar(__m128 a[][BATCHSIZE], __m128 b[][BATCHSIZE], __m128 d[][BATCHSIZE],
__m128 c[][BATCHSIZE], unsigned int size)
{
    for (unsigned int i = 0; i < size; i++)
    {
        /* Per-row scratch buffer holding the BATCHSIZE results of row i. */
        __m128 cx[BATCHSIZE];

        /* Pass 1: compute every result of the row; nothing is stored to c. */
        for (unsigned int j=0; j<BATCHSIZE; j++)
        {
            cx[j] = _mm_add_ps(a[i][j], _mm_mul_ps(d[i][j], b[i][j]));
        }

        /* Pass 2: copy the finished row into the output array. */
        for (unsigned int j=0; j<BATCHSIZE; j++)
        {
            c[i][j] = cx[j]; 
        }
    }
}

Generated asm code:

foo(float __vector (*) [8], float __vector (*) [8], float __vector (*) [8],
float __vector (*) [8], unsigned int):
    test    r8d, r8d
    je    .L1
    xor    eax, eax
.L4:
    movaps    xmm0, XMMWORD PTR [rdx]
    add    eax, 1
    sub    rsi, -128
    sub    rdx, -128
    sub    rdi, -128
    sub    rcx, -128
    mulps    xmm0, XMMWORD PTR [rsi-128]
    addps    xmm0, XMMWORD PTR [rdi-128]
    movaps    XMMWORD PTR [rcx-128], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-112]
    mulps    xmm0, XMMWORD PTR [rsi-112]
    addps    xmm0, XMMWORD PTR [rdi-112]
    movaps    XMMWORD PTR [rcx-112], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-96]
    mulps    xmm0, XMMWORD PTR [rsi-96]
    addps    xmm0, XMMWORD PTR [rdi-96]
    movaps    XMMWORD PTR [rcx-96], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-80]
    mulps    xmm0, XMMWORD PTR [rsi-80]
    addps    xmm0, XMMWORD PTR [rdi-80]
    movaps    XMMWORD PTR [rcx-80], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-64]
    mulps    xmm0, XMMWORD PTR [rsi-64]
    addps    xmm0, XMMWORD PTR [rdi-64]
    movaps    XMMWORD PTR [rcx-64], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-48]
    mulps    xmm0, XMMWORD PTR [rsi-48]
    addps    xmm0, XMMWORD PTR [rdi-48]
    movaps    XMMWORD PTR [rcx-48], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-32]
    mulps    xmm0, XMMWORD PTR [rsi-32]
    addps    xmm0, XMMWORD PTR [rdi-32]
    movaps    XMMWORD PTR [rcx-32], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-16]
    mulps    xmm0, XMMWORD PTR [rsi-16]
    addps    xmm0, XMMWORD PTR [rdi-16]
    movaps    XMMWORD PTR [rcx-16], xmm0
    cmp    eax, r8d
    jne    .L4
.L1:
    rep; ret
bar(float __vector (*) [8], float __vector (*) [8], float __vector (*) [8],
float __vector (*) [8], unsigned int):
    test    r8d, r8d
    je    .L6
    xor    eax, eax
.L9:
    movaps    xmm7, XMMWORD PTR [rdx]
    add    eax, 1
    sub    rsi, -128
    movaps    xmm6, XMMWORD PTR [rdx+16]
    sub    rdi, -128
    sub    rdx, -128
    movaps    xmm5, XMMWORD PTR [rdx-96]
    sub    rcx, -128
    movaps    xmm4, XMMWORD PTR [rdx-80]
    movaps    xmm3, XMMWORD PTR [rdx-64]
    movaps    xmm2, XMMWORD PTR [rdx-48]
    movaps    xmm1, XMMWORD PTR [rdx-32]
    movaps    xmm0, XMMWORD PTR [rdx-16]
    mulps    xmm7, XMMWORD PTR [rsi-128]
    mulps    xmm6, XMMWORD PTR [rsi-112]
    mulps    xmm5, XMMWORD PTR [rsi-96]
    mulps    xmm4, XMMWORD PTR [rsi-80]
    mulps    xmm3, XMMWORD PTR [rsi-64]
    mulps    xmm2, XMMWORD PTR [rsi-48]
    mulps    xmm1, XMMWORD PTR [rsi-32]
    mulps    xmm0, XMMWORD PTR [rsi-16]
    addps    xmm7, XMMWORD PTR [rdi-128]
    addps    xmm6, XMMWORD PTR [rdi-112]
    addps    xmm5, XMMWORD PTR [rdi-96]
    addps    xmm4, XMMWORD PTR [rdi-80]
    addps    xmm3, XMMWORD PTR [rdi-64]
    addps    xmm2, XMMWORD PTR [rdi-48]
    addps    xmm1, XMMWORD PTR [rdi-32]
    addps    xmm0, XMMWORD PTR [rdi-16]
    movaps    XMMWORD PTR [rcx-128], xmm7
    movaps    XMMWORD PTR [rcx-112], xmm6
    movaps    XMMWORD PTR [rcx-96], xmm5
    movaps    XMMWORD PTR [rcx-80], xmm4
    movaps    XMMWORD PTR [rcx-64], xmm3
    movaps    XMMWORD PTR [rcx-48], xmm2
    movaps    XMMWORD PTR [rcx-32], xmm1
    movaps    XMMWORD PTR [rcx-16], xmm0
    cmp    eax, r8d
    jne    .L9
.L6:
    rep; ret

Reply via email to