[Bug middle-end/39840] Non-optimal (or wrong) implementation of SSE intrinsics

drepper at redhat dot com Tue, 21 Apr 2009 12:38:04 -0700


------- Comment #2 from drepper at redhat dot com  2009-04-21 19:37 -------
[I couldn't attach the code as an attachment, bugzilla has a bug.]


The program below has to be compiled with -mavx to allow the AVX intrinsics
to be used.  But this also triggers the use of the vmovss instruction to
load the parameter for the sinf() call from memory.

(Forget the reference to memset in the original report; simply passing
floating-point parameters is enough to trigger the problem.)

#include <math.h>
#include <stdio.h>
#include <immintrin.h>


/* Register values written by the "cpuid" asm statement in main ();
   has_avx () reads ecx to test feature bits.  */
static unsigned int eax, ebx, ecx, edx;


/* Return nonzero when AVX can be used, zero otherwise.

   Relies on the file-level `ecx' already holding the value produced by
   the cpuid instruction (see main ()), so it must only be called after
   that has run.  */
static int
has_avx (void)
{
  const unsigned int osxsave_mask = 1u << 27;
  const unsigned int avx_mask = 1u << 28;
  /* Both of bits 1 and 2 of the low word returned by xgetbv must be set.  */
  const unsigned int required_state = 6;

  /* No OSXSAVE: xgetbv must not be executed.  */
  if (!(ecx & osxsave_mask))
    return 0;

  unsigned int xcr_lo, xcr_hi;
  asm ("xgetbv" : "=a" (xcr_lo), "=d" (xcr_hi) : "c" (0));

  const int state_enabled = (xcr_lo & required_state) == required_state;
  return state_enabled && (ecx & avx_mask) != 0;
}


/* Array of N elements of type T whose storage is overlaid, through an
   anonymous union, with arrays of 128-bit (__v4sf) and 256-bit (__v8sf)
   vectors so the same data can be fed to SSE and AVX intrinsics.  */
template <typename T, int N>
struct vec {
  union {
    /* Element-wise view.  */
    T n[N];
    /* 128-bit view: one entry per sizeof (__v4sf) / sizeof (T) elements;
       the integer division drops any partial trailing group.  */
    __v4sf f[N / (sizeof (__v4sf) / sizeof (T))];
    /* 256-bit view: one entry per sizeof (__v8sf) / sizeof (T) elements;
       the integer division drops any partial trailing group.  */
    __v8sf fa[N / (sizeof (__v8sf) / sizeof (T))];
  };
};


/* Generic scalar product of two vectors: returns the sum over all
   i in [0, N) of src1.n[i] * src2.n[i], accumulated in type T.

   The elements are read through the `n' member of the union because
   vec does not provide operator[].  */
template <typename T, int N>
T
optscalar(const vec<T,N> &src1, const vec<T,N> &src2)
{
  T r = 0;
  for (int i = 0; i < N; ++i)
    r += src1.n[i] * src2.n[i];
  return r;
}


/* Scalar product of two float vectors using SIMD intrinsics.

   When has_avx () reports AVX support the products are accumulated eight
   at a time in a 256-bit register, otherwise four at a time in a 128-bit
   register.  Elements beyond the last full SIMD group (when N is not a
   multiple of the group size) are added with a scalar loop.

   Returns the sum over all i in [0, N) of src1.n[i] * src2.n[i].  */
template <int N>
float
optscalar(const vec<float,N> &src1, const vec<float,N> &src2)
{
  if (has_avx ())
    {
      const int width = 8;
      __m256 acc = _mm256_setzero_ps ();
      for (int i = 0; i < N / width; ++i)
        acc = _mm256_add_ps (acc, _mm256_mul_ps (src1.fa[i], src2.fa[i]));

      /* _mm256_hadd_ps adds pairs within each 128-bit half separately.
         Two rounds leave the sum of the low four floats in every slot of
         the low half and the sum of the high four in every slot of the
         high half; a third round would only double those values, so the
         two halves are added explicitly instead.  */
      acc = _mm256_hadd_ps (acc, acc);
      acc = _mm256_hadd_ps (acc, acc);
      const __m128 both = _mm_add_ps (_mm256_castps256_ps128 (acc),
                                      _mm256_extractf128_ps (acc, 1));
      float r = _mm_cvtss_f32 (both);

      /* Elements not covered by a full 8-float group.  */
      for (int i = N / width * width; i < N; ++i)
        r += src1.n[i] * src2.n[i];
      return r;
    }
  else
    {
      const int width = 4;
      __m128 acc = _mm_setzero_ps ();
      for (int i = 0; i < N / width; ++i)
        acc = _mm_add_ps (acc, _mm_mul_ps (src1.f[i], src2.f[i]));

      /* Two horizontal adds reduce the four partial sums to one value
         replicated in every slot.  */
      acc = _mm_hadd_ps (acc, acc);
      acc = _mm_hadd_ps (acc, acc);
      float r = _mm_cvtss_f32 (acc);

      /* Elements not covered by a full 4-float group.  */
      for (int i = N / width * width; i < N; ++i)
        r += src1.n[i] * src2.n[i];
      return r;
    }
}


/* Number of elements in each test vector.  */
#define N 100000
/* Define two vectors v<type>1 and v<type>2 of N elements each, plus the
   scalars <type>res and <type>cmp.  */
#define DEF(type) vec<type,N> v##type##1, v##type##2; type type##res, type##cmp
DEF(float);

/* Argument passed to sinf () in main (); never assigned, so it keeps its
   zero initial value.  */
float g;

/* Reproducer driver.  The statement order is significant: the sinf ()
   call with a float argument comes first, then cpuid fills the
   file-level register variables, and only afterwards is optscalar ()
   (which consults has_avx ()) invoked.  */
int
main ()
{
  /* Passing a floating-point parameter is what the report is about.  */
  printf ("%g\n", sinf (g));

  /* Query CPUID leaf 1; has_avx () later inspects ecx.  */
  asm volatile ("cpuid"
                : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
                : "0" (1));

  printf ("%g\n", optscalar (vfloat1, vfloat2));

  return 0;
}


-- 

drepper at redhat dot com changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|WAITING                     |UNCONFIRMED


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39840

[Bug middle-end/39840] Non-optimal (or wrong) implementation of SSE intrinsics

Reply via email to