------- Additional Comments From uros at kss-loka dot si 2005-01-12 10:54 ------- Another testcase that I think should be optimized: #include <xmmintrin.h>
__m128 test() { float val1[4] = {0.0f, 0.0f, 0.0f, 0.0f}; return _mm_loadu_ps(val1); } This is currently compiled to: test: pushl %ebp movl $0x00000000, %eax movl %esp, %ebp subl $16, %esp movl %eax, -16(%ebp) movl %eax, -12(%ebp) movl %eax, -8(%ebp) movl %eax, -4(%ebp) movups -16(%ebp), %xmm0 leave ret But I think gcc should produce something like: test: pushl %ebp xorps %xmm0, %xmm0 movl %esp, %ebp subl $16, %esp (*) movups %xmm0, -16(%ebp) leave ret Perhaps even the store to stack is not necessary in this particular case. -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18562