------- Comment #7 from ubizjak at gmail dot com 2008-03-08 07:21 -------
For the updated testcase:
/* MMX vector types: __m64 as 2 x int, __v1di as 1 x long long (both 8 bytes).  */
typedef int __m64 __attribute__ ((__vector_size__ (8)));
typedef long long __v1di __attribute__ ((__vector_size__ (8)));
/* Reduced testcase for the ivopts addressing-mode regression (PR 22152):
   feeds each pair a[i]/b[i] through the MMX paddq builtin.
   NOTE(review): `sum` is overwritten (not accumulated) each iteration and
   is left uninitialized when count <= 1, so the return value is then
   undefined -- deliberate for the reproducer, not production code.  */
__m64
unsigned_add3 (const __m64 * a, const __m64 * b, unsigned long count)
{
__m64 sum;
unsigned int i;
/* Loop intentionally starts at 1; element 0 is never read.  */
for (i = 1; i < count; i++)
sum = (__m64) __builtin_ia32_paddq ((__v1di) a[i], (__v1di) b[i]);
return sum;
}
we now generate:
.L3:
addl $1, %eax
movq (%edx), %mm0
addl $8, %edx
paddq (%ecx), %mm0
addl $8, %ecx
cmpl %eax, %ebx
ja .L3
and with -mno-ivopts added we are back to optimal code:
.L3:
movq (%ebx,%eax,8), %mm0
paddq (%ecx,%eax,8), %mm0
addl $1, %eax
cmpl %eax, %edx
ja .L3
So, fixed.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=22152