http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58095
--- Comment #3 from Siavash Eliasi <siavashserver at gmail dot com> --- I did an experiment using raw float data types instead of the __m128 data type. This time GCC, Clang and ICC were all able to generate the desired code, even without the __restrict__ keyword, though it is a little dirtier (pointer arithmetic). Maybe not most, but I'm sure that new video decoders/encoders, game engines and similar applications are using the __m128 data type directly instead of float, because it (1) guarantees that the data is 16-byte aligned, (2) removes the need to manually load/store data between memory and the XMM/YMM registers, (3) makes the source code smaller and easier to maintain, and (4) results in much cleaner and smaller generated code. In conclusion, I don't think the issue that I and other people are facing is related to not using the __restrict__ keyword. All compilers fail to generate optimal code when facing the __m128 data type. As an exception, however, ICC is able to generate optimal code when the __m128 data type is combined with the __restrict__ keyword.
Here is what I have tried: #include <xmmintrin.h> void fooFloat(float* a, float* b, float* d, float* c, unsigned int size) { for (unsigned int i = 0; i < size; i+=32) { __m128 ax[8], bx[8], cx[8], dx[8]; ax[0] = _mm_load_ps(&a[i*32+0]); ax[1] = _mm_load_ps(&a[i*32+4]); ax[2] = _mm_load_ps(&a[i*32+8]); ax[3] = _mm_load_ps(&a[i*32+12]); ax[4] = _mm_load_ps(&a[i*32+16]); ax[5] = _mm_load_ps(&a[i*32+20]); ax[6] = _mm_load_ps(&a[i*32+24]); ax[7] = _mm_load_ps(&a[i*32+28]); bx[0] = _mm_load_ps(&b[i*32+0]); bx[1] = _mm_load_ps(&b[i*32+4]); bx[2] = _mm_load_ps(&b[i*32+8]); bx[3] = _mm_load_ps(&b[i*32+12]); bx[4] = _mm_load_ps(&b[i*32+16]); bx[5] = _mm_load_ps(&b[i*32+20]); bx[6] = _mm_load_ps(&b[i*32+24]); bx[7] = _mm_load_ps(&b[i*32+28]); dx[0] = _mm_load_ps(&d[i*32+0]); dx[1] = _mm_load_ps(&d[i*32+4]); dx[2] = _mm_load_ps(&d[i*32+8]); dx[3] = _mm_load_ps(&d[i*32+12]); dx[4] = _mm_load_ps(&d[i*32+16]); dx[5] = _mm_load_ps(&d[i*32+20]); dx[6] = _mm_load_ps(&d[i*32+24]); dx[7] = _mm_load_ps(&d[i*32+28]); cx[0] = _mm_add_ps(ax[0], _mm_mul_ps(dx[0], bx[0])); cx[1] = _mm_add_ps(ax[1], _mm_mul_ps(dx[1], bx[1])); cx[2] = _mm_add_ps(ax[2], _mm_mul_ps(dx[2], bx[2])); cx[3] = _mm_add_ps(ax[3], _mm_mul_ps(dx[3], bx[3])); cx[4] = _mm_add_ps(ax[4], _mm_mul_ps(dx[4], bx[4])); cx[5] = _mm_add_ps(ax[5], _mm_mul_ps(dx[5], bx[5])); cx[6] = _mm_add_ps(ax[6], _mm_mul_ps(dx[6], bx[6])); cx[7] = _mm_add_ps(ax[7], _mm_mul_ps(dx[7], bx[7])); _mm_store_ps(&c[i*32+0], cx[0]); _mm_store_ps(&c[i*32+4], cx[1]); _mm_store_ps(&c[i*32+8], cx[2]); _mm_store_ps(&c[i*32+12], cx[3]); _mm_store_ps(&c[i*32+16], cx[4]); _mm_store_ps(&c[i*32+20], cx[5]); _mm_store_ps(&c[i*32+24], cx[6]); _mm_store_ps(&c[i*32+28], cx[7]); } } And its output using GCC 4.8.1 -O2 : fooFloat(float*, float*, float*, float*, unsigned int): push r15 xor r15d, r15d test r8d, r8d mov eax, 4 push r14 push r13 push r12 push rbp push rbx je .L15 .L19: lea r12d, [rax+4] lea ebp, [rax+8] lea ebx, [rax+12] lea r11d, [rax+16] lea r10d, [rax+20] 
lea r9d, [rax+24] mov r14d, r15d mov r13d, eax add r15d, 32 sal r14d, 5 movaps xmm6, XMMWORD PTR [rdx+r13*4] add eax, 1024 cmp r8d, r15d movaps xmm7, XMMWORD PTR [rdx+r14*4] mulps xmm6, XMMWORD PTR [rsi+r13*4] movaps xmm5, XMMWORD PTR [rdx+r12*4] mulps xmm7, XMMWORD PTR [rsi+r14*4] movaps xmm4, XMMWORD PTR [rdx+rbp*4] mulps xmm5, XMMWORD PTR [rsi+r12*4] movaps xmm3, XMMWORD PTR [rdx+rbx*4] mulps xmm4, XMMWORD PTR [rsi+rbp*4] movaps xmm2, XMMWORD PTR [rdx+r11*4] mulps xmm3, XMMWORD PTR [rsi+rbx*4] movaps xmm1, XMMWORD PTR [rdx+r10*4] mulps xmm2, XMMWORD PTR [rsi+r11*4] movaps xmm0, XMMWORD PTR [rdx+r9*4] mulps xmm1, XMMWORD PTR [rsi+r10*4] addps xmm7, XMMWORD PTR [rdi+r14*4] mulps xmm0, XMMWORD PTR [rsi+r9*4] addps xmm6, XMMWORD PTR [rdi+r13*4] addps xmm5, XMMWORD PTR [rdi+r12*4] addps xmm4, XMMWORD PTR [rdi+rbp*4] addps xmm3, XMMWORD PTR [rdi+rbx*4] addps xmm2, XMMWORD PTR [rdi+r11*4] addps xmm1, XMMWORD PTR [rdi+r10*4] addps xmm0, XMMWORD PTR [rdi+r9*4] movaps XMMWORD PTR [rcx+r14*4], xmm7 movaps XMMWORD PTR [rcx+r13*4], xmm6 movaps XMMWORD PTR [rcx+r12*4], xmm5 movaps XMMWORD PTR [rcx+rbp*4], xmm4 movaps XMMWORD PTR [rcx+rbx*4], xmm3 movaps XMMWORD PTR [rcx+r11*4], xmm2 movaps XMMWORD PTR [rcx+r10*4], xmm1 movaps XMMWORD PTR [rcx+r9*4], xmm0 ja .L19 .L15: pop rbx pop rbp pop r12 pop r13 pop r14 pop r15 ret