[Bug middle-end/39840] Non-optimal (or wrong) implementation of SSE intrinsics

pinskia at gmail dot com Tue, 21 Apr 2009 12:42:17 -0700


------- Comment #3 from pinskia at gmail dot com  2009-04-21 19:41 -------
Subject: Re:  Non-optimal (or wrong) implementation of SSE intrinsics


GCC 4.4 and above supports different target options at the function  
level, but not at the basic-block level. So you can create an internal  
version of the function for AVX.

Sent from my iPhone

On Apr 21, 2009, at 12:37 PM, "drepper at redhat dot com"
<gcc-bugzi...@gcc.gnu.org 
 > wrote:

>
>
> ------- Comment #2 from drepper at redhat dot com  2009-04-21 19:37  
> -------
> [I couldn't attach the code as an attachment, bugzilla has a bug.]
>
> The program below has to be compiled with -mavx to allow the AVX  
> intrinsics
> to be used.  But this also triggers the use of the vmovss  
> instruction to
> load the parameter for the sinf() call from memory.
>
> (Forget the reference to memset in the original report; the problem  
> is triggered
> simply by passing floating-point parameters.)
>
> #include <math.h>
> #include <stdio.h>
> #include <immintrin.h>
>
>
> static unsigned int eax, ebx, ecx, edx;
>
>
> static int
> has_avx (void)
> {
>  if ((ecx & (1 << 27)) == 0)
>    /* No OSXSAVE.  */
>    return 0;
>
>  unsigned int feat_eax, feat_edx;
>  asm ("xgetbv" : "=a" (feat_eax), "=d" (feat_edx) : "c" (0));
>  if ((feat_eax & 6) != 6)
>    return 0;
>
>  return (ecx & (1 << 28)) != 0;
> }
>
>
> template <typename T, int N>
> struct vec {
>  union {
>    T n[N];
>    __v4sf f[N / (sizeof (__v4sf) / sizeof (T))];
>    __v8sf fa[N / (sizeof (__v8sf) / sizeof (T))];
>  };
> };
>
>
> template <typename T, int N>
> T
> optscalar(const vec<T,N> &src1, const vec<T,N> &src2)
> {
>  T r = 0;
>  for (int i = 0; i < N; ++i)
>    r += src1[i] * src2[i];
>  return r;
> }
>
>
> template <int N>
> float
> optscalar(const vec<float,N> &src1, const vec<float,N> &src2)
> {
>  if (has_avx ())
>    {
>      __m256 tmp = _mm256_setzero_ps ();
>      for (int i = 0; i < N / 8; ++i)
>        tmp = _mm256_add_ps (tmp, _mm256_mul_ps (src1.fa[i],  
> src2.fa[i]));
>      tmp = _mm256_hadd_ps (tmp, tmp);
>      tmp = _mm256_hadd_ps (tmp, tmp);
>      tmp = _mm256_hadd_ps (tmp, tmp);
>      union
>      {
>        __m256 v;
>        float a[8];
>      } cvt = { tmp };
>      return cvt.a[0];
>    }
>  else
>    {
>      __m128 tmp = _mm_setzero_ps ();
>      for (int i = 0; i < N / 4; ++i)
>        tmp = _mm_add_ps (tmp, _mm_mul_ps (src1.f[i], src2.f[i]));
>      tmp = _mm_hadd_ps (tmp, tmp);
>      tmp = _mm_hadd_ps (tmp, tmp);
>      return __builtin_ia32_vec_ext_v4sf (tmp, 0);
>    }
> }
>
>
> #define N 100000
> #define DEF(type) vec<type,N> v##type##1, v##type##2; type  
> type##res, type##cmp
> DEF(float);
>
> float g;
>
> int
> main ()
> {
>  float f = sinf  (g);
>  printf ("%g\n", f);
>
>  asm volatile ("cpuid"
>                : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
>                : "0" (1));
>
>  float floatres = optscalar (vfloat1, vfloat2);
>  printf ("%g\n", floatres);
>
>  return 0;
> }
>
>
> -- 
>
> drepper at redhat dot com changed:
>
>           What    |Removed                     |Added
> ----------------------------------------------------------------------------
>             Status|WAITING                     |UNCONFIRMED
>
>
> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39840
>


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39840

[Bug middle-end/39840] Non-optimal (or wrong) implementation of SSE intrinsics

Reply via email to