------- Comment #2 from drepper at redhat dot com 2009-04-21 19:37 ------- [I couldn't attach the code as an attachment, bugzilla has a bug.]
The program below has to be compiled with -mavx to allow the AVX intrinsics to be used. But this also triggers the use of the vmovss instruction to load the parameter for the sinf() call from memory. (Forget the reference to memset in the original report; simply passing floating-point parameters is enough to trigger the problem.) #include <math.h> #include <stdio.h> #include <immintrin.h> static unsigned int eax, ebx, ecx, edx; static int has_avx (void) { if ((ecx & (1 << 27)) == 0) /* No OSXSAVE. */ return 0; unsigned int feat_eax, feat_edx; asm ("xgetbv" : "=a" (feat_eax), "=d" (feat_edx) : "c" (0)); if ((feat_eax & 6) != 6) return 0; return (ecx & (1 << 28)) != 0; } template <typename T, int N> struct vec { union { T n[N]; __v4sf f[N / (sizeof (__v4sf) / sizeof (T))]; __v8sf fa[N / (sizeof (__v8sf) / sizeof (T))]; }; }; template <typename T, int N> T optscalar(const vec<T,N> &src1, const vec<T,N> &src2) { T r = 0; for (int i = 0; i < N; ++i) r += src1[i] * src2[i]; return r; } template <int N> float optscalar(const vec<float,N> &src1, const vec<float,N> &src2) { if (has_avx ()) { __m256 tmp = _mm256_setzero_ps (); for (int i = 0; i < N / 8; ++i) tmp = _mm256_add_ps (tmp, _mm256_mul_ps (src1.fa[i], src2.fa[i])); tmp = _mm256_hadd_ps (tmp, tmp); tmp = _mm256_hadd_ps (tmp, tmp); tmp = _mm256_hadd_ps (tmp, tmp); union { __m256 v; float a[8]; } cvt = { tmp }; return cvt.a[0]; } else { __m128 tmp = _mm_setzero_ps (); for (int i = 0; i < N / 4; ++i) tmp = _mm_add_ps (tmp, _mm_mul_ps (src1.f[i], src2.f[i])); tmp = _mm_hadd_ps (tmp, tmp); tmp = _mm_hadd_ps (tmp, tmp); return __builtin_ia32_vec_ext_v4sf (tmp, 0); } } #define N 100000 #define DEF(type) vec<type,N> v##type##1, v##type##2; type type##res, type##cmp DEF(float); float g; int main () { float f = sinf (g); printf ("%g\n", f); asm volatile ("cpuid" : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "0" (1)); float floatres = optscalar (vfloat1, vfloat2); printf ("%g\n", floatres); return 0; } -- drepper 
at redhat dot com changed: What |Removed |Added ---------------------------------------------------------------------------- Status|WAITING |UNCONFIRMED http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39840