------- Comment #3 from pinskia at gmail dot com 2009-04-21 19:41 ------- Subject: Re: Non-optimal (or wrong) implementation of SSE intrinsics
GCC 4.4 and above supports different target options on the function level but not on a basic block level. So you can create an internal version for AVX. Sent from my iPhone On Apr 21, 2009, at 12:37 PM, "drepper at redhat dot com" <gcc-bugzi...@gcc.gnu.org > wrote: > > > ------- Comment #2 from drepper at redhat dot com 2009-04-21 19:37 > ------- > [I couldn't attach the code as an attachment, bugzilla has a bug.] > > The program below has to be compiled with -mavx to allow the AVX > intrinsics > being used. But this also triggers the use of the vmovss > instruction to > load the parameter for the sin() call from memory. > > (Forget the reference to memset in the original report, it's as > simple as > passing floating point parameters that triggers the problem.) > > #include <math.h> > #include <stdio.h> > #include <immintrin.h> > > > static unsigned int eax, ebx, ecx, edx; > > > static int > has_avx (void) > { > if ((ecx & (1 << 27)) == 0) > /* No OSXSAVE. */ > return 0; > > unsigned int feat_eax, feat_edx; > asm ("xgetbv" : "=a" (feat_eax), "=d" (feat_edx) : "c" (0)); > if ((feat_eax & 6) != 6) > return 0; > > return (ecx & (1 << 28)) != 0; > } > > > template <typename T, int N> > struct vec { > union { > T n[N]; > __v4sf f[N / (sizeof (__v4sf) / sizeof (T))]; > __v8sf fa[N / (sizeof (__v8sf) / sizeof (T))]; > }; > }; > > > template <typename T, int N> > T > optscalar(const vec<T,N> &src1, const vec<T,N> &src2) > { > T r = 0; > for (int i = 0; i < N; ++i) > r += src1[i] * src2[i]; > return r; > } > > > template <int N> > float > optscalar(const vec<float,N> &src1, const vec<float,N> &src2) > { > if (has_avx ()) > { > __m256 tmp = _mm256_setzero_ps (); > for (int i = 0; i < N / 8; ++i) > tmp = _mm256_add_ps (tmp, _mm256_mul_ps (src1.fa[i], > src2.fa[i])); > tmp = _mm256_hadd_ps (tmp, tmp); > tmp = _mm256_hadd_ps (tmp, tmp); > tmp = _mm256_hadd_ps (tmp, tmp); > union > { > __m256 v; > float a[8]; > } cvt = { tmp }; > return cvt.a[0]; > } > else > { > __m128 
tmp = _mm_setzero_ps (); > for (int i = 0; i < N / 4; ++i) > tmp = _mm_add_ps (tmp, _mm_mul_ps (src1.f[i], src2.f[i])); > tmp = _mm_hadd_ps (tmp, tmp); > tmp = _mm_hadd_ps (tmp, tmp); > return __builtin_ia32_vec_ext_v4sf (tmp, 0); > } > } > > > #define N 100000 > #define DEF(type) vec<type,N> v##type##1, v##type##2; type > type##res, type##cmp > DEF(float); > > float g; > > int > main () > { > float f = sinf (g); > printf ("%g\n", f); > > asm volatile ("cpuid" > : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) > : "0" (1)); > > float floatres = optscalar (vfloat1, vfloat2); > printf ("%g\n", floatres); > > return 0; > } > > > -- > > drepper at redhat dot com changed: > > What |Removed |Added > --- > --- > ---------------------------------------------------------------------- > Status|WAITING |UNCONFIRMED > > > http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39840 > -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39840