------- Comment #7 from ubizjak at gmail dot com  2008-03-08 07:21 -------
For the updated testcase:

typedef int __m64 __attribute__ ((__vector_size__ (8)));
typedef long long __v1di __attribute__ ((__vector_size__ (8)));

__m64
unsigned_add3 (const __m64 * a, const __m64 * b, unsigned long count)
{
  __m64 sum;
  unsigned int i;

  for (i = 1; i < count; i++)
    sum = (__m64) __builtin_ia32_paddq ((__v1di) a[i], (__v1di) b[i]);

  return sum;
}
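
(The command line is not recorded in this comment; a plausible invocation
for a 32-bit x86 target, assuming the testcase is saved as pr22152.c, is

        gcc -O2 -m32 -mmmx -msse2 -S pr22152.c

where -msse2 is needed because paddq on MMX registers is an SSE2
instruction.)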

we now generate:

.L3:
        addl    $1, %eax
        movq    (%edx), %mm0
        addl    $8, %edx
        paddq   (%ecx), %mm0
        addl    $8, %ecx
        cmpl    %eax, %ebx
        ja      .L3
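
That is, ivopts has strength-reduced the single index into three separate
induction variables: the counter in %eax plus one pointer each in %edx and
%ecx, and each one needs its own addl per iteration.  In source terms the
loop now looks roughly like this (a sketch, not an actual GCC dump; it
reuses the declarations from the testcase above):

  const __m64 *ap = a + 1;
  const __m64 *bp = b + 1;

  for (i = 1; i < count; i++)
    {
      sum = (__m64) __builtin_ia32_paddq ((__v1di) *ap, (__v1di) *bp);
      ap++;
      bp++;
    }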

and with -fno-ivopts we are back to optimal code:

.L3:
        movq    (%ebx,%eax,8), %mm0
        paddq   (%ecx,%eax,8), %mm0
        addl    $1, %eax
        cmpl    %eax, %edx
        ja      .L3
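
Here the single counter in %eax serves as a scaled index for both loads, so
the element offset is folded into the (%ebx,%eax,8) and (%ecx,%eax,8)
addressing modes for free: five instructions per iteration instead of
seven, with one addl instead of three.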

So, fixed.


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=22152
