hi, this thread: http://gcc.gnu.org/ml/gcc-help/2007-04/msg00201.html details my problems. (duplicated here)
i want to sum an array of longs using mmx. i use the functions: _mm_set_pi32 and _m_paddd but the resultant binary contains significantly less efficient code than inline asm or even plain C ( for(i=0;i<n;i++)total+=a[i]; ). here's the relevant function: simd_mmintrin(n, is) I *is; { __m64 q,r; I i; _m_empty(); q=_m_from_int(0); for (i=0; i < n; i+=W) { r=_mm_set_pi32(is[i],is[i+1]); q=_m_paddd(q,r); } union {long a[2];__m64 m;}u; u.m=q; return u.a[0]+u.a[1]; } i have a script RUNME.sh: $ sh RUNME.sh --- expect: 199990000 impl: C (SISD) 199990000 real 0m0.604s user 0m0.580s sys 0m0.004s --- expect: 199990000 impl: ASM (SIMD) 199990000 real 0m0.377s user 0m0.360s sys 0m0.008s --- expect: 199990000 impl: MMINTRIN (SIMD) 199990000 real 0m1.235s user 0m1.228s sys 0m0.004s $ cat RUNME.sh #!/bin/sh repeats=4000 # number of times to repeat the test vectorsize=10000 # size of the vector in 32 bit ints gcc -O -mmmx v.c -o v for which in 0 1 2; do time ./v $repeats $vectorsize $which; done $ cat v.c #include <string.h> #include <stdio.h> #include <stdlib.h> #include <assert.h> #include <mmintrin.h> typedef long I;typedef unsigned long J; typedef char C; #define IZ sizeof(I) #define W 2 simd_mmintrin(n, is) I *is; { __v2si q,r; I i; _m_empty(); q=_m_from_int(0); for (i=0; i < n; i+=W) { memcpy(&r,is+i,IZ*W); q=_m_paddd(q,r); } I*qq=(I*)&q; return qq[0]+qq[1]; } simd_asm(n, is) I *is; { I i,*r=malloc(IZ*W*8); asm("emms"); asm("pxor %mm0,%mm0"); for (i=0; i < n; i+=W) { asm("movq %0,%%mm1\n\t" "paddd %%mm1,%%mm0" : :"m"(is[i]) ); } asm("movq %%mm0,%0":"=m"(*(__m64*)r)); return r[0]+r[1]; } sisd(n, is) I *is; { I i = 0, j = 0; for (i = 0; i < n; i++) j += is[i]; return j; } main(c, v) C **v; { I n=atol(v[1]), z=atol(v[2]), m=atol(v[3]); I result, *is=malloc(IZ*(z*=2)), i; int(*fs[])()={sisd,simd_asm,simd_mmintrin,0}; C*ss[]={"C (SISD)","ASM (SIMD)","MMINTRIN (SIMD)"}; for(i=0;i<z;i++)is[i]=i; printf("\n\n---\nexpect: %d\n",(z)*(z-1)/2); printf("impl: %s\n",ss[m]); while (n--) 
result=fs[m](z, is); printf("%d\n",result); } [EMAIL PROTECTED] i]$ gcc -v Using built-in specs. Target: i386-redhat-linux Configured with: ../configure --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info --enable-shared --enable-threads=posix --enable-checking=release --with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions --enable-libgcj-multifile --enable-languages=c,c++,objc,obj-c++,java,fortran,ada --enable-java-awt=gtk --disable-dssi --with-java-home=/usr/lib/jvm/java-1.4.2-gcj-1.4.2.0/jre --with-cpu=generic --host=i386-redhat-linux Thread model: posix gcc version 4.1.0 20060304 (Red Hat 4.1.0-3) -- Summary: mmintrin calls are slower than plain C Product: gcc Version: 4.1.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: effbiae at gmail dot com GCC host triplet: fedora core 5; pentium III http://gcc.gnu.org/bugzilla/show_bug.cgi?id=31661