http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58095
--- Comment #4 from Siavash Eliasi <siavashserver at gmail dot com> ---

In the end, here is what I would really like GCC to generate for me: the same
output for function (foo) as it currently produces for function (bar), when
using GCC with the -O3 -march=core2 switches:

#include <xmmintrin.h>

#define BATCHSIZE 8

void foo(__m128 a[][BATCHSIZE], __m128 b[][BATCHSIZE], __m128 d[][BATCHSIZE], __m128 c[][BATCHSIZE], unsigned int size)
{
    for (unsigned int i = 0; i < size; i++)
    {
        for (unsigned int j=0; j<BATCHSIZE; j++)
        {
            c[i][j] = _mm_add_ps(a[i][j], _mm_mul_ps(d[i][j], b[i][j]));
        }
    }
}

void bar(__m128 a[][BATCHSIZE], __m128 b[][BATCHSIZE], __m128 d[][BATCHSIZE], __m128 c[][BATCHSIZE], unsigned int size)
{
    for (unsigned int i = 0; i < size; i++)
    {
        __m128 cx[BATCHSIZE];

        for (unsigned int j=0; j<BATCHSIZE; j++)
        {
            cx[j] = _mm_add_ps(a[i][j], _mm_mul_ps(d[i][j], b[i][j]));
        }

        for (unsigned int j=0; j<BATCHSIZE; j++)
        {
            c[i][j] = cx[j];
        }
    }
}

Generated asm code:

foo(float __vector (*) [8], float __vector (*) [8], float __vector (*) [8], float __vector (*) [8], unsigned int):
    test r8d, r8d
    je .L1
    xor eax, eax
.L4:
    movaps xmm0, XMMWORD PTR [rdx]
    add eax, 1
    sub rsi, -128
    sub rdx, -128
    sub rdi, -128
    sub rcx, -128
    mulps xmm0, XMMWORD PTR [rsi-128]
    addps xmm0, XMMWORD PTR [rdi-128]
    movaps XMMWORD PTR [rcx-128], xmm0
    movaps xmm0, XMMWORD PTR [rdx-112]
    mulps xmm0, XMMWORD PTR [rsi-112]
    addps xmm0, XMMWORD PTR [rdi-112]
    movaps XMMWORD PTR [rcx-112], xmm0
    movaps xmm0, XMMWORD PTR [rdx-96]
    mulps xmm0, XMMWORD PTR [rsi-96]
    addps xmm0, XMMWORD PTR [rdi-96]
    movaps XMMWORD PTR [rcx-96], xmm0
    movaps xmm0, XMMWORD PTR [rdx-80]
    mulps xmm0, XMMWORD PTR [rsi-80]
    addps xmm0, XMMWORD PTR [rdi-80]
    movaps XMMWORD PTR [rcx-80], xmm0
    movaps xmm0, XMMWORD PTR [rdx-64]
    mulps xmm0, XMMWORD PTR [rsi-64]
    addps xmm0, XMMWORD PTR [rdi-64]
    movaps XMMWORD PTR [rcx-64], xmm0
    movaps xmm0, XMMWORD PTR [rdx-48]
    mulps xmm0, XMMWORD PTR [rsi-48]
    addps xmm0, XMMWORD PTR [rdi-48]
    movaps XMMWORD PTR [rcx-48], xmm0
    movaps xmm0, XMMWORD PTR [rdx-32]
    mulps xmm0, XMMWORD PTR [rsi-32]
    addps xmm0, XMMWORD PTR [rdi-32]
    movaps XMMWORD PTR [rcx-32], xmm0
    movaps xmm0, XMMWORD PTR [rdx-16]
    mulps xmm0, XMMWORD PTR [rsi-16]
    addps xmm0, XMMWORD PTR [rdi-16]
    movaps XMMWORD PTR [rcx-16], xmm0
    cmp eax, r8d
    jne .L4
.L1:
    rep; ret

bar(float __vector (*) [8], float __vector (*) [8], float __vector (*) [8], float __vector (*) [8], unsigned int):
    test r8d, r8d
    je .L6
    xor eax, eax
.L9:
    movaps xmm7, XMMWORD PTR [rdx]
    add eax, 1
    sub rsi, -128
    movaps xmm6, XMMWORD PTR [rdx+16]
    sub rdi, -128
    sub rdx, -128
    movaps xmm5, XMMWORD PTR [rdx-96]
    sub rcx, -128
    movaps xmm4, XMMWORD PTR [rdx-80]
    movaps xmm3, XMMWORD PTR [rdx-64]
    movaps xmm2, XMMWORD PTR [rdx-48]
    movaps xmm1, XMMWORD PTR [rdx-32]
    movaps xmm0, XMMWORD PTR [rdx-16]
    mulps xmm7, XMMWORD PTR [rsi-128]
    mulps xmm6, XMMWORD PTR [rsi-112]
    mulps xmm5, XMMWORD PTR [rsi-96]
    mulps xmm4, XMMWORD PTR [rsi-80]
    mulps xmm3, XMMWORD PTR [rsi-64]
    mulps xmm2, XMMWORD PTR [rsi-48]
    mulps xmm1, XMMWORD PTR [rsi-32]
    mulps xmm0, XMMWORD PTR [rsi-16]
    addps xmm7, XMMWORD PTR [rdi-128]
    addps xmm6, XMMWORD PTR [rdi-112]
    addps xmm5, XMMWORD PTR [rdi-96]
    addps xmm4, XMMWORD PTR [rdi-80]
    addps xmm3, XMMWORD PTR [rdi-64]
    addps xmm2, XMMWORD PTR [rdi-48]
    addps xmm1, XMMWORD PTR [rdi-32]
    addps xmm0, XMMWORD PTR [rdi-16]
    movaps XMMWORD PTR [rcx-128], xmm7
    movaps XMMWORD PTR [rcx-112], xmm6
    movaps XMMWORD PTR [rcx-96], xmm5
    movaps XMMWORD PTR [rcx-80], xmm4
    movaps XMMWORD PTR [rcx-64], xmm3
    movaps XMMWORD PTR [rcx-48], xmm2
    movaps XMMWORD PTR [rcx-32], xmm1
    movaps XMMWORD PTR [rcx-16], xmm0
    cmp eax, r8d
    jne .L9
.L6:
    rep; ret