[Bug c/31661] New: mmintrin calls are slower than plain C

effbiae at gmail dot com Sun, 22 Apr 2007 19:01:02 -0700

hi,

this thread:
 http://gcc.gnu.org/ml/gcc-help/2007-04/msg00201.html
details my problems.  (duplicated here)


i want to sum an array of longs using mmx.  i use the functions:
   _mm_set_pi32 and _m_paddd
but the resultant binary contains significantly less efficient code
than inline asm or even plain C ( for(i=0;i<n;i++)total+=a[i]; ).
here's the relevant function:

/*
 * Sum the first n elements of is[] with the MMX intrinsics from <mmintrin.h>.
 *
 * Elements are consumed W at a time (W is 2 in v.c): each pair is packed into
 * one __m64 and added lane-wise, as two 32-bit integers, to the accumulator q.
 * The two lanes of q are added together at the end to give the total.
 *
 * n has no declared type, so it is an int (K&R default); the return type is
 * likewise int.  I is the file's typedef for long.
 *
 * NOTE(review): the loop reads is[i+1], so n is expected to be a multiple of
 * W; with an odd n the last iteration reads is[n].
 */
simd_mmintrin(n, is)
I *is;
{   __m64 q,r;
  I i;
  /* NOTE(review): emms is issued here, before any MMX work; there is no
     matching _m_empty() after the loop. */
  _m_empty();
  /* Start with an all-zero accumulator. */
  q=_m_from_int(0);
  for (i=0; i < n; i+=W) {
      /* Pack two consecutive elements into the two 32-bit lanes of r. */
      r=_mm_set_pi32(is[i],is[i+1]);
      /* Packed 32-bit add: q += r in each lane. */
      q=_m_paddd(q,r);
  }
  /* Read the two lanes back through a union.  This assumes long is 32 bits
     wide, as on the i386 target of this report -- TODO confirm elsewhere. */
  union {long a[2];__m64 m;}u;
  u.m=q;
  return u.a[0]+u.a[1];
}

i have a script RUNME.sh:

$ sh RUNME.sh
---
expect: 199990000
impl: C (SISD)
199990000


real    0m0.604s
user    0m0.580s
sys     0m0.004s



---
expect: 199990000
impl: ASM (SIMD)
199990000



real    0m0.377s
user    0m0.360s
sys     0m0.008s



---
expect: 199990000
impl: MMINTRIN (SIMD)
199990000


real    0m1.235s
user    0m1.228s
sys     0m0.004s




$ cat RUNME.sh
#!/bin/sh
# Build v.c with MMX enabled, then time each implementation in turn.
# The last argument to ./v selects the implementation (0, 1 or 2).
repeats=4000        # number of times to repeat the test
vectorsize=10000    # size of the vector in 32 bit ints
gcc -O -mmmx v.c -o v
for which in 0 1 2
do
    time ./v $repeats $vectorsize $which
done


$ cat v.c
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <mmintrin.h>


typedef long I;typedef unsigned long J;
typedef char C;
#define IZ sizeof(I)
#define W 2



/*
 * Sum the first n elements of is[] using MMX intrinsics.
 *
 * Each iteration copies W consecutive elements (IZ*W bytes) into the vector r
 * with memcpy and adds it lane-wise to the accumulator q with _m_paddd.  The
 * two lanes of q are then added together to form the result.
 *
 * n has no declared type, so it is an int (K&R default); the return type is
 * likewise int.
 *
 * NOTE(review): __v2si is 8 bytes, so the memcpy size IZ*W and the I* view of
 * q below both assume sizeof(long) == 4, as on the i386 target of this
 * report -- TODO confirm before building elsewhere.
 * NOTE(review): n is expected to be a multiple of W; otherwise the last
 * memcpy reads past is[n-1].
 */
simd_mmintrin(n, is)
I *is;
{   __v2si q,r;
  I i;
  /* NOTE(review): emms is issued here, before any MMX work; there is no
     matching _m_empty() after the loop. */
  _m_empty();
  /* Start with an all-zero accumulator. */
  q=_m_from_int(0);
  for (i=0; i < n; i+=W) {
      /* Load is[i] and is[i+1] into r without going through an intrinsic. */
      memcpy(&r,is+i,IZ*W);
      /* Packed 32-bit add: q += r in each lane. */
      q=_m_paddd(q,r);
  }
  /* View the accumulator as two I values and add them.
     NOTE(review): this reads a vector object through an I* cast. */
  I*qq=(I*)&q;
  return qq[0]+qq[1];
}


/*
 * Sum the first n elements of is[] with hand-written MMX inline assembly.
 *
 * %mm0 is cleared and used as the accumulator; each iteration loads 8 bytes
 * starting at is[i] into %mm1 and adds them to %mm0 with paddd.  The
 * accumulator is finally stored to the scratch buffer r, and its two elements
 * r[0] and r[1] are added to form the result.
 *
 * n has no declared type, so it is an int (K&R default); the return type is
 * likewise int.
 *
 * NOTE(review): r is allocated on every call, never checked for NULL and
 * never freed.
 * NOTE(review): none of the asm statements declares %mm0/%mm1 as clobbered,
 * so the accumulator is carried across statements without the compiler
 * knowing about it.
 * NOTE(review): reading the result as r[0]+r[1] assumes sizeof(long) == 4,
 * as on the i386 target of this report -- TODO confirm elsewhere.
 */
simd_asm(n, is)
I *is;
{   I i,*r=malloc(IZ*W*8);
  /* Reset the MMX/x87 state, then zero the accumulator register. */
  asm("emms");
  asm("pxor %mm0,%mm0");
  for (i=0; i < n; i+=W) {
      /* The "m" operand names only is[i], but movq reads 8 bytes from that
         address, i.e. is[i] and is[i+1] when long is 32 bits. */
      asm("movq %0,%%mm1\n\t"
          "paddd %%mm1,%%mm0"
          :
          :"m"(is[i])           );
  }
  /* Store the 64-bit accumulator into the first 8 bytes of r. */
  asm("movq %%mm0,%0":"=m"(*(__m64*)r));
  return r[0]+r[1];
}


/*
 * Scalar reference implementation: add up is[0] .. is[n-1] one element at a
 * time and return the total.
 *
 * n has no declared type, so it is an int (K&R default); the total is
 * accumulated in an I and returned through the implicit int return type.
 */
sisd(n, is)
I *is;
{
  I total = 0;
  I pos = 0;

  while (pos < n) {
      total += is[pos];
      pos++;
  }
  return total;
}


main(c, v)
C **v;
{
  I n=atol(v[1]), z=atol(v[2]), m=atol(v[3]);
  I result, *is=malloc(IZ*(z*=2)), i;
  int(*fs[])()={sisd,simd_asm,simd_mmintrin,0};
  C*ss[]={"C (SISD)","ASM (SIMD)","MMINTRIN (SIMD)"};
  for(i=0;i<z;i++)is[i]=i;
  printf("\n\n---\nexpect: %d\n",(z)*(z-1)/2);
  printf("impl: %s\n",ss[m]);
  while (n--)
      result=fs[m](z, is);
  printf("%d\n",result);
}



[EMAIL PROTECTED] i]$ gcc -v
Using built-in specs.
Target: i386-redhat-linux
Configured with: ../configure --prefix=/usr --mandir=/usr/share/man
--infodir=/usr/share/info --enable-shared --enable-threads=posix
--enable-checking=release --with-system-zlib --enable-__cxa_atexit
--disable-libunwind-exceptions --enable-libgcj-multifile
--enable-languages=c,c++,objc,obj-c++,java,fortran,ada
--enable-java-awt=gtk --disable-dssi
--with-java-home=/usr/lib/jvm/java-1.4.2-gcj-1.4.2.0/jre
--with-cpu=generic --host=i386-redhat-linux
Thread model: posix
gcc version 4.1.0 20060304 (Red Hat 4.1.0-3)


-- 
           Summary: mmintrin calls are slower than plain C
           Product: gcc
           Version: 4.1.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: effbiae at gmail dot com
  GCC host triplet: fedora core 5; pentium III


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=31661

[Bug c/31661] New: mmintrin calls are slower than plain C

Reply via email to