Re: [Beignet] [PATCH 4/4] Backend: Optimization internal math, use mad

Song, Ruiling Mon, 18 Jul 2016 23:22:21 -0700

This one plus the previous version three patches looks good. This is big 
performance improvement to the math functions.


Thanks!
Ruiling
> -----Original Message-----
> From: Beignet [mailto:[email protected]] On Behalf Of
> Grigore Lupescu
> Sent: Tuesday, July 19, 2016 4:38 AM
> To: [email protected]
> Subject: [Beignet] [PATCH 4/4] Backend: Optimization internal math, use mad
> 
> From: Grigore Lupescu <grigore.lupescu at intel.com>
> 
> Affected functions:
> __gen_ocl_internal_log
> __gen_ocl_internal_log10
> __gen_ocl_internal_log2
> __kernel_sinf
> __kernel_cosf
> __gen_ocl_internal_cbrt
> __gen_ocl_asin_util
> tan
> log1p
> 
> Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
> ---
>  backend/src/libocl/tmpl/ocl_math.tmpl.cl | 385 
> +++++++++++++------------------
>  1 file changed, 159 insertions(+), 226 deletions(-)
> 
> diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
> b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
> index 9ea0817..149ba01 100644
> --- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
> +++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
> @@ -164,7 +164,7 @@ OVERLOADABLE float
> __gen_ocl_internal_copysign(float x, float y) {
>    return ux.f;
>  }
> 
> -OVERLOADABLE float __gen_ocl_internal_log(float x) {
> +OVERLOADABLE float inline __gen_ocl_internal_log_valid(float x) {
>  /*
>   *  Conversion to float by Ian Lance Taylor, Cygnus Support, [email protected]
>   * ====================================================
> @@ -178,187 +178,105 @@ OVERLOADABLE float __gen_ocl_internal_log(float
> x) {
>   */
>    union { unsigned int i; float f; } u;
>    const float
> -  ln2_hi =   6.9313812256e-01,  /* 0x3f317180 */
> -  ln2_lo =   9.0580006145e-06,  /* 0x3717f7d1 */
> -  two25 =    3.355443200e+07, /* 0x4c000000 */
> +  ln2_hi = 6.9313812256e-01,  /* 0x3f317180 */
> +  ln2_lo = 9.0580006145e-06,  /* 0x3717f7d1 */
> +  two25 =  3.355443200e+07, /* 0x4c000000 */
>    Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
>    Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
>    Lg3 = 2.8571429849e-01, /* 3E924925 */
>    Lg4 = 2.2222198546e-01; /* 3E638E29 */
> 
>    const float zero   =  0.0;
> -  float hfsq,f,s,z,R,w,t1,t2,dk;
> -  int k,ix,i,j;
> +  float fsq, f, s, z, R, w, t1, t2, partial;
> +  int k, ix, i, j;
> 
>    u.f = x;  ix = u.i;
> -  k=0;
> -  if (ix < 0x00800000) {      /* x < 2**-126  */
> -      if ((ix&0x7fffffff)==0)
> -    return -two25/zero;   /* log(+-0)=-inf */
> -      if (ix<0) return (x-x)/zero;  /* log(-#) = NaN */
> -      return -INFINITY;  /* Gen does not support subnormal number now */
> -      //k -= 25; x *= two25; /* subnormal number, scale up x */
> -      //u.f = x;  ix = u.i;
> -  }
> -  if (ix >= 0x7f800000) return x+x;
> -  k += (ix>>23)-127;
> +  k = 0;
> +
> +  k += (ix>>23) - 127;
>    ix &= 0x007fffff;
> -  i = (ix+(0x95f64<<3))&0x800000;
> -  u.i = ix|(i^0x3f800000); x = u.f;
> +  i = (ix + (0x95f64<<3)) & 0x800000;
> +  u.i = ix | (i^0x3f800000); x = u.f;
>    k += (i>>23);
> -  f = x-(float)1.0;
> -  if((0x007fffff&(15+ix))<16) { /* |f| < 2**-20 */
> -      if(f==zero) {
> -        if(k==0) return zero;
> -        else {
> -          dk=(float)k; return dk*ln2_hi+dk*ln2_lo;
> -        }
> -      }
> -      R = f*f*((float)0.5-(float)0.33333333333333333*f);
> -      if(k==0)
> -        return f-R;
> -      else {
> -        dk=(float)k;  return dk*ln2_hi-((R-dk*ln2_lo)-f);
> -      }
> +  f = x - 1.0f;
> +  fsq = f * f;
> +
> +  if((0x007fffff & (15 + ix)) < 16) { /* |f| < 2**-20 */
> +      R = fsq * (0.5f - 0.33333333333333333f * f);
> +      return k * ln2_hi + k * ln2_lo + f - R;
>    }
> -  s = f/((float)2.0+f);
> -  dk = (float)k;
> -  z = s*s;
> -  i = ix-(0x6147a<<3);
> -  w = z*z;
> -  j = (0x6b851<<3)-ix;
> -  t1= w*(Lg2+w*Lg4);
> -  t2= z*(Lg1+w*Lg3);
> +
> +  s = f / (2.0f + f);
> +  z = s * s;
> +  i = ix - (0x6147a << 3);
> +  w = z * z;
> +  j = (0x6b851 << 3) - ix;
> +  t1= w * mad(w, Lg4, Lg2);
> +  t2= z * mad(w, Lg3, Lg1);
>    i |= j;
> -  R = t2+t1;
> -  if(i>0) {
> -      hfsq=(float)0.5*f*f;
> -      if(k==0) return f-(hfsq-s*(hfsq+R)); else
> -         return dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f);
> -  } else {
> -      if(k==0) return f-s*(f-R); else
> -         return dk*ln2_hi-((s*(f-R)-dk*ln2_lo)-f);
> -  }
> +  R = t2 + t1;
> +  partial = (i > 0) ? -mad(s, 0.5f * fsq, -0.5f * fsq) : (s * f);
> +
> +  return mad(s, R, f) - partial + k * ln2_hi + k * ln2_lo;;
>  }
> 
> +OVERLOADABLE float __gen_ocl_internal_log(float x)
> +{
> +  union { unsigned int i; float f; } u;
> +  u.f = x;
> +  int ix = u.i;
> 
> -OVERLOADABLE float __gen_ocl_internal_log10(float x) {
> -/*
> - *  Conversion to float by Ian Lance Taylor, Cygnus Support, [email protected]
> - * ====================================================
> - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
> - *
> - * Developed at SunPro, a Sun Microsystems, Inc. business.
> - * Permission to use, copy, modify, and distribute this
> - * software is freely granted, provided that this notice
> - * is preserved.
> - * ====================================================
> - */
> +  if (ix < 0 )
> +     return NAN;  /* log(-#) = NaN */
> +  if (ix >= 0x7f800000)
> +    return NAN;
> 
> -  union {float f; unsigned i; }u;
> +  return __gen_ocl_internal_log_valid(x);
> +}
> +
> +OVERLOADABLE float __gen_ocl_internal_log10(float x)
> +{
> +  union { float f; unsigned i; } u;
>    const float
> -  zero       = 0.0,
> -  two25      =  3.3554432000e+07, /* 0x4c000000 */
>    ivln10     =  4.3429449201e-01, /* 0x3ede5bd9 */
>    log10_2hi  =  3.0102920532e-01, /* 0x3e9a2080 */
>    log10_2lo  =  7.9034151668e-07; /* 0x355427db */
> 
> -  float y,z;
> -  int i,k,hx;
> +  float y, z;
> +  int i, k, hx;
> 
>    u.f = x; hx = u.i;
> -  k=0;
> -  if (hx < 0x00800000) {                  /* x < 2**-126  */
> -    if ((hx&0x7fffffff)==0)
> -      return -two25/zero;             /* log(+-0)=-inf */
> -    if (hx<0) return NAN;        /* log(-#) = NaN */
> -    return -INFINITY;      /* Gen does not support subnormal now */
> -    //k -= 25; x *= two25; /* subnormal number, scale up x */
> -    //u.f = x; hx = u.i;
> -  }
> -  if (hx >= 0x7f800000) return x+x;
> -  k += (hx>>23)-127;
> -  i  = ((unsigned)k&0x80000000)>>31;
> -  hx = (hx&0x007fffff)|((0x7f-i)<<23);
> -  y  = (float)(k+i);
> +
> +  if (hx<0)
> +    return NAN; /* log(-#) = NaN */
> +  if (hx >= 0x7f800000)
> +    return NAN;
> +
> +  k = (hx >> 23) - 127;
> +  i  = ((unsigned)k & 0x80000000) >> 31;
> +  hx = (hx&0x007fffff) | ((0x7f-i) << 23);
> +  y  = (float)(k + i);
>    u.i = hx; x = u.f;
> -  z  = y*log10_2lo + ivln10*__gen_ocl_internal_log(x);
> -  return  z+y*log10_2hi;
> +
> +  return  y * log10_2lo + y * log10_2hi + ivln10 *
> __gen_ocl_internal_log_valid(x);
>  }
> 
> 
> -OVERLOADABLE float __gen_ocl_internal_log2(float x) {
> -/*
> - *  Conversion to float by Ian Lance Taylor, Cygnus Support, [email protected]
> - *  adapted for log2 by Ulrich Drepper <[email protected]>
> - * ====================================================
> - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
> - *
> - * Developed at SunPro, a Sun Microsystems, Inc. business.
> - * Permission to use, copy, modify, and distribute this
> - * software is freely granted, provided that this notice
> - * is preserved.
> - * ====================================================
> - */
> +OVERLOADABLE float __gen_ocl_internal_log2(float x)
> +{
>    const float zero   =  0.0,
> -  ln2 = 0.69314718055994530942,
> -  two25 =    3.355443200e+07, /** 0x4c000000 */
> -  Lg1 = 6.6666668653e-01, /** 3F2AAAAB */
> -  Lg2 = 4.0000000596e-01, /** 3ECCCCCD */
> -  Lg3 = 2.8571429849e-01, /** 3E924925 */
> -  Lg4 = 2.2222198546e-01; /** 3E638E29 */
> -
> -  float hfsq,f,s,z,R,w,t1,t2,dk;
> -  int k,ix,i,j;
> +  invln2 = 0x1.715476p+0f;
> +  int ix;
> 
> -  union {float f; int i; }u;//GET_FLOAT_WORD(ix,x);
> +  union { float f; int i; } u;
>    u.f = x; ix = u.i;
> 
> -  k=0;
> -  if (ix < 0x00800000) {           /** x < 2**-126  */
> -      if ((ix&0x7fffffff)==0)
> -      return -two25/(x-x);        /** log(+-0)=-inf */
> -
> -      if (ix<0) return (x-x)/(x-x);    /** log(-#) = NaN */
> -      return -INFINITY;
> -      k -= 25; x *= two25; /** subnormal number, scale up x */
> -      u.f = x; ix = u.i; //GET_FLOAT_WORD(ix,x);
> -  }
> -
> -  if (ix >= 0x7f800000) return x+x;
> -
> -  k += (ix>>23)-127;
> -  ix &= 0x007fffff;
> -  i = (ix+(0x95f64<<3))&0x800000;
> -
> -  u.i = ix|(i^0x3f800000); x = u.f;//SET_FLOAT_WORD(x,ix|(i^0x3f800000));
> /** normalize x or x/2 */
> -  k += (i>>23);
> -  dk = (float)k;
> -  f = x-(float)1.0;
> -
> -  if((0x007fffff&(15+ix))<16) {    /** |f| < 2**-20 */
> -      if(f==zero) return dk;
> +  if (ix < 0)
> +     return NAN;    /** log(-#) = NaN */
> +  if (ix >= 0x7f800000)
> +     return NAN;
> 
> -      R = f*f*((float)0.5-(float)0.33333333333333333*f);
> -      return dk-(R-f)/ln2;
> -  }
> -
> -  s = f/((float)2.0+f);
> -  z = s*s;
> -  i = ix-(0x6147a<<3);
> -  w = z*z;
> -  j = (0x6b851<<3)-ix;
> -  t1= w*(Lg2+w*Lg4);
> -  t2= z*(Lg1+w*Lg3);
> -  i |= j;
> -  R = t2+t1;
> -
> -  if(i>0) {
> -      hfsq=(float)0.5*f*f;
> -      return dk-((hfsq-(s*(hfsq+R)))-f)/ln2;
> -  } else {
> -      return dk-((s*(f-R))-f)/ln2;
> -  }
> +  return invln2 * __gen_ocl_internal_log_valid(x);
>  }
> 
> 
> @@ -545,9 +463,13 @@ OVERLOADABLE float __kernel_sinf(float x)
>    float z,r,v;
>    z =  x*x;
>    v =  z*x;
> -  r =  S2+z*(S3+z*(S4));
> +  r = mad(z,
> +               mad(z,
> +                       mad(z, S4, S3),
> +               S2),
> +       S1);
> 
> -  return x+v*(S1+z*r);
> +  return mad(v, r, x);
>  }
> 
>  float __kernel_cosf(float x, float y)
> @@ -563,7 +485,9 @@ float __kernel_cosf(float x, float y)
>    GEN_OCL_GET_FLOAT_WORD(ix,x);
>    ix &= 0x7fffffff;     /* ix = |x|'s high word*/
>    z  = x*x;
> -  r = z*(C1+z*(C2+z*(C3)));
> +  r = z * mad(z,
> +               mad(z, C3, C2),
> +       C1);
> 
>    if(ix < 0x3e99999a)       /* if |x| < 0.3 */
>        return one - ((float)0.5*z - (z*r - x*y));
> @@ -671,24 +595,27 @@ float __kernel_tanf(float x, float y, int iy)
>              }
>          if(ix>=0x3f2ca140) {                    /* |x|>=0.6744 */
>              if(hx<0) {x = -x; y = -y;}
> -
> -
>              z = pio4-x;
>              w = pio4lo-y;
>              x = z+w; y = 0.0;
>          }
>          z       =  x*x;
>          w       =  z*z;
> -    /* Break x^5*(T[1]+x^2*T[2]+...) into
> -     *    x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
> -     *    x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
> -     */
> -
> -        r = T[1]+w*(T[3]+w*(T[5]+w*T[7]));
> -        v = z*(T[2]+w*(T[4]+w*T[6]));
> +             /* Break x^5*(T[1]+x^2*T[2]+...) into
> +              *    x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
> +              *    x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
> +              */
> +
> +        r = mad(w,
> +                     mad(w,
> +                                     mad(w, T[7],
> +                                     T[5]),
> +                             T[3]),
> +                     T[1]);
> +        v = z* mad(w, mad(w, T[6], T[4]), T[2]);
> 
>          s = z*x;
> -        r = y + z*(s*(r+v)+y);
> +        r = mad(z, mad(s, r + v, y), y);
>          r += T[0]*s;
>          w = x+r;
>          if(ix>=0x3f2ca140) {
> @@ -696,21 +623,8 @@ float __kernel_tanf(float x, float y, int iy)
>              return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r)));
>          }
>          if(iy==1) return w;
> -        else {          /* if allow error up to 2 ulp
> -                           simply return -1.0/(x+r) here */
> -     /*  compute -1.0/(x+r) accurately */
> -            float a,t;
> -            int i;
> -            z  = w;
> -            GEN_OCL_GET_FLOAT_WORD(i,z);
> -            GEN_OCL_SET_FLOAT_WORD(z,i&0xfffff000);
> -            v  = r-(z - x);     /* z+v = r+x */
> -            t = a  = -(float)1.0/w;     /* a = -1.0/w */
> -            GEN_OCL_GET_FLOAT_WORD(i,t);
> -            GEN_OCL_SET_FLOAT_WORD(t,i&0xfffff000);
> -            s  = (float)1.0+t*z;
> -            return t+a*(s+t*v);
> -        }
> +        else
> +             return -1.0/(x+r);
>  }
> 
>  OVERLOADABLE float tan(float x)
> @@ -897,44 +811,45 @@ OVERLOADABLE float lgamma(float x) {
>               switch (i) {
>               case 0:
>                       z = y * y;
> -                     p1 = a0 + z * a2;
> -                     p2 = z * (a1 + z * a3);
> -                     p = y * p1 + p2;
> +                     p1 = mad(z, a2, a0);
> +                     p2 = z * mad(z, a3, a1);
> +                     p = mad(y, p1, p2);
>                       r += (p - (float) 0.5 * y);
>                       break;
>               case 1:
>                       z = y * y;
>                       w = z * y;
> -                     p1 = t0 + w * t3;
> -                     p2 = t1 + w * t4;
> -                     p3 = t2 + w * t5;
> -                     p = z * p1 - (tt - w * (p2 + y * p3));
> +                     p1 = mad(w, t3, t0);
> +                     p2 = mad(w, t4, t1);
> +                     p3 = mad(w, t5, t2);
> +                     p = z * p1 - (tt - w * mad(y, p3, p2));
>                       r += (tf + p);
>                       break;
>               case 2:
> -                     p1 = y * (u0 + y * u1);
> -                     p2 = one + y * (v1 + y * v2);
> +                     p1 = y * mad(y, u1, u0);
> +                     p2 = one + y * mad(y, v2, v1);
>                       r += (-(float) 0.5 * y + p1 / p2);
>               }
>       } else if (ix < 0x41000000) {
>               i = (int) x;
>               t = zero;
>               y = x - (float) i;
> -             p = y * (s0 + y * (s1 + y * s2));
> -             q = one + y * (r1 + y * r2);
> +             p = y * mad(y, mad(y, s2, s1), s0);
> +             q = mad(y, mad(y, r2, r1), 1.0f);
>               r = .5f * y + p / q;
>               z = one;
> +
>               switch (i) {
>               case 7:
> -                     z *= (y + (float) 6.0);
> +                     z *= (y + 6.0f);
>               case 6:
> -                     z *= (y + (float) 5.0);
> +                     z *= (y + 5.0f);
>               case 5:
> -                     z *= (y + (float) 4.0);
> +                     z *= (y + 4.0f);
>               case 4:
> -                     z *= (y + (float) 3.0);
> +                     z *= (y + 3.0f);
>               case 3:
> -                     z *= (y + (float) 2.0);
> +                     z *= (y + 2.0f);
>                       r += native_log(z);
>                       break;
>               }
> @@ -943,7 +858,7 @@ OVERLOADABLE float lgamma(float x) {
>               t = native_log(x);
>               z = one / x;
>               y = z * z;
> -             w = w0 + z * (w1 + y * w2);
> +             w = mad(z, mad(y, w2, 21), w0);
>               r = (x - .5f) * (t - one) + w;
>       } else
>               r = x * (native_log(x) - one);
> @@ -1053,32 +968,32 @@ OVERLOADABLE float lgamma(float x) {
>               switch (i) {  \
>               case 0:  \
>                       z = y * y;  \
> -                     p1 = a0 + z * a2;  \
> -                     p2 = z * (a1 + z *a3);  \
> -                     p = y * p1 + p2;  \
> -                     r += (p - (float) 0.5 * y);  \
> +                     p1 = mad(z, a2, a0);  \
> +                     p2 = z * mad(z, a3, a1);  \
> +                     p = mad(y, p1, p2); \
> +                     r -= mad(y, 0.5f, -p);  \
>                       break;  \
>               case 1:  \
>                       z = y * y;  \
>                       w = z * y;  \
> -                     p1 = t0 + w * t3;  \
> -                     p2 = t1 + w * t4;  \
> -                     p3 = t2 + w * t5;  \
> -                     p = z * p1 - (tt - w * (p2 + y * p3));  \
> +                     p1 = mad(w, t3, t0); \
> +                     p2 = mad(w, t4, t1); \
> +                     p3 = mad(w, t5, t2); \
> +                     p = z * p1 + mad(w, mad(y, p3, p2), -tt); \
>                       r += (tf + p);  \
>                       break;  \
>               case 2:  \
> -                     p1 = y * (u0 + y * u1);  \
> -                     p2 = one + y * (v1 + y * v2);  \
> -                     r += (-(float) 0.5 * y + p1 / p2);  \
> +                     p1 = y * mad(y, u1, u0); \
> +                     p2 = mad(y, mad(y, v2, v1), 1.0f);  \
> +                     r -= mad(y, 0.5f, -p1 / p2);  \
>               }  \
>       } else if (ix < 0x41000000) {  \
>               i = (int) x;  \
>               t = zero;  \
>               y = x - (float) i;  \
> -             p = y * (s0 + y * (s1 + y * s2));  \
> -             q = one + y * (r1 + y * r2);  \
> -             r = .5f * y + p / q;  \
> +             p = y * mad(y, mad(y, s2, s1), s0); \
> +             q = mad(y, mad(y, r2, r1), 1.0f);  \
> +             r = mad(y, 0.5f, p / q); \
>               z = one;  \
>               switch (i) {  \
>               case 7:  \
> @@ -1099,10 +1014,10 @@ OVERLOADABLE float lgamma(float x) {
>               t = native_log(x);  \
>               z = one / x;  \
>               y = z * z;  \
> -             w = w0 + z * (w1 + y * w2);  \
> -             r = (x - .5f) * (t - one) + w;  \
> +             w = mad(z, mad(y, w2, w1), w0); \
> +             r = mad(x - 0.5f, t - 1.0f, w); \
>       } else  \
> -             r = x * (native_log(x) - one);  \
> +             r = mad(x, native_log(x), -1.0f); \
>       if (hx < 0)  \
>               r = nadj - r;  \
>       return r;
> @@ -1183,20 +1098,26 @@ OVERLOADABLE float log1p(float x) {
>        f = u-(float)1.0;
>    }
>    hfsq=(float)0.5*f*f;
> -  if(hu==0) { /* |f| < 2**-20 */
> -      if(f==zero) { if(k==0) return zero;
> -      else {c += k*ln2_lo; return k*ln2_hi+c;} }
> -      R = hfsq*((float)1.0-(float)0.66666666666666666*f);
> +  if(hu==0)
> +  { /* |f| < 2**-20 */
> +      if(f==zero)
> +      {
> +       if(k==0) return zero;
> +       else {c = mad(k , ln2_lo, c); return mad(k, ln2_hi, c);}
> +      }
> +      R = mad(hfsq, 1.0f, -0.66666666666666666f * f);
>        if(k==0) return f-R; else
> -             return k*ln2_hi-((R-(k*ln2_lo+c))-f);
> +       return k * ln2_hi - (R - mad(k, ln2_lo, c) - f);
>    }
>    s = f/((float)2.0+f);
>    z = s*s;
> -  R = z*(Lp1+z*(Lp2+z*(Lp3+z*Lp4)));
> -  if(k==0) return f-(hfsq-s*(hfsq+R)); else
> -     return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f);
> -
> +  R = z * mad(z, mad(z, mad(z, Lp4, Lp3), Lp2), Lp1);
> +  if(k==0)
> +       return f + mad(hfsq + R, s, -hfsq);
> +  else
> +       return k*ln2_hi-( (hfsq - mad(s, hfsq + R, mad(k, ln2_lo, c))) - f);
>  }
> +
>  OVERLOADABLE float logb(float x) {
>    if (__ocl_math_fastpath_flag)
>      return __gen_ocl_internal_fastpath_logb(x);
> @@ -1308,14 +1229,14 @@ OVERLOADABLE float
> __gen_ocl_internal_cbrt(float x) {
> 
>      /* new cbrt to 23 bits */
>    r=t*t/x;
> -  s=C+r*t;
> +  s=mad(r, t, C);
>    t*=G+F/(s+E+D/s);
>      /* one step newton iteration to 53 bits with error less than 0.667 ulps 
> */
>    s=t*t;    /* t*t is exact */
>    r=x/s;
>    w=t+t;
>    r=(r-t)/(w+r);  /* r-s is exact */
> -  t=t+t*r;
> +  t=mad(t, r, t);
> 
>      /* retore the sign bit */
>    GEN_OCL_GET_FLOAT_WORD(high,t);
> @@ -1367,10 +1288,22 @@ INLINE float __gen_ocl_asin_util(float x) {
>    qS4 =  7.70381505559019352791e-02;
> 
>    float t = x*x;
> -  float p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*pS4))));
> -  float q = 1.0+t*(qS1+t*(qS2+t*(qS3+t*qS4)));
> +  float p = t * mad(t,
> +                               mad(t,
> +                                       mad(t,
> +                                               mad(t, pS4, pS3),
> +                                       pS2),
> +                               pS1),
> +                       pS0);
> +  float q = mad(t,
> +                       mad(t,
> +                               mad(t,
> +                                       mad(t, qS4, qS3),
> +                               qS2),
> +                       qS1),
> +               1.0f);
>    float w = p / q;
> -  return x + x*w;
> +  return mad(x, w, x);
>  }
> 
>  OVERLOADABLE float __gen_ocl_internal_asin(float x) {
> --
> 2.5.0
> 
> _______________________________________________
> Beignet mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/beignet
_______________________________________________
Beignet mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/beignet

Re: [Beignet] [PATCH 4/4] Backend: Optimization internal math, use mad

Reply via email to