------- Comment #7 from ubizjak at gmail dot com 2008-03-08 07:21 -------
For the updated testcase:
typedef int __m64 __attribute__ ((__vector_size__ (8)));
typedef long long __v1di __attribute__ ((__vector_size__ (8)));

__m64
unsigned_add3 (const __m64 * a, const __m64 * b, unsigned long count)
{
  __m64 sum;
  unsigned int i;

  for (i = 1; i < count; i++)
    sum = (__m64) __builtin_ia32_paddq ((__v1di) a[i], (__v1di) b[i]);

  return sum;
}

we now generate:

.L3:
        addl    $1, %eax
        movq    (%edx), %mm0
        addl    $8, %edx
        paddq   (%ecx), %mm0
        addl    $8, %ecx
        cmpl    %eax, %ebx
        ja      .L3

and with -fno-ivopts added we are back to optimal code:

.L3:
        movq    (%ebx,%eax,8), %mm0
        paddq   (%ecx,%eax,8), %mm0
        addl    $1, %eax
        cmpl    %eax, %edx
        ja      .L3

So, fixed.

-- 
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=22152
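
For reference, a minimal self-contained driver sketch for the testcase, assuming a 32-bit x86 target with MMX and SSE2 enabled (e.g. gcc -m32 -O2 -mmmx -msse2, since __builtin_ia32_paddq needs SSE2). The main function, array sizes, and expected value are illustrative and not part of the original report:

/* Hypothetical standalone driver (not part of the PR); the testcase is
   repeated so the file compiles on its own.  */
#include <stdio.h>
#include <string.h>

typedef int __m64 __attribute__ ((__vector_size__ (8)));
typedef long long __v1di __attribute__ ((__vector_size__ (8)));

__m64
unsigned_add3 (const __m64 * a, const __m64 * b, unsigned long count)
{
  __m64 sum;
  unsigned int i;

  for (i = 1; i < count; i++)
    sum = (__m64) __builtin_ia32_paddq ((__v1di) a[i], (__v1di) b[i]);

  return sum;
}

int
main (void)
{
  __m64 a[4], b[4];
  __m64 sum;
  long long result;
  unsigned int i;

  /* Fill both arrays with small, recognizable values.  */
  for (i = 0; i < 4; i++)
    {
      a[i] = (__m64) { (int) i, 0 };
      b[i] = (__m64) { 100, 0 };
    }

  /* Only the last iteration's result survives, i.e. a[3] + b[3].  */
  sum = unsigned_add3 (a, b, 4);
  memcpy (&result, &sum, sizeof result);

  /* Reset the MMX/x87 state before calling back into libc.  */
  __builtin_ia32_emms ();

  printf ("sum = %lld\n", result);   /* expect 103 on little-endian x86 */
  return 0;
}

Compiling this file with gcc -S, with and without -fno-ivopts, should show the two loop forms quoted above.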