The patchset has been pushed, thanks.
> -----Original Message----- > From: Beignet [mailto:[email protected]] On Behalf Of > [email protected] > Sent: Tuesday, July 26, 2016 21:25 > To: [email protected] > Subject: [Beignet] [PATCH 4/4] Backend: Optimization internal math, use > mad > > From: Grigore Lupescu <grigore.lupescu at intel.com> > > Affected functions: > __gen_ocl_internal_log > __gen_ocl_internal_log10 > __gen_ocl_internal_log2 > __gen_ocl_internal_log_valid > __kernel_sinf > __kernel_cosf > __gen_ocl_internal_cbrt > __gen_ocl_internal_asinh > __gen_ocl_internal_atan > __gen_ocl_asin_util > tan > log1p > lgamma_r > lgamma > > Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com> > --- > backend/src/libocl/tmpl/ocl_math.tmpl.cl | 417 ++++++++++++---------------- > --- > 1 file changed, 164 insertions(+), 253 deletions(-) > > diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl > b/backend/src/libocl/tmpl/ocl_math.tmpl.cl > index c8969a1..0d2a57d 100644 > --- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl > +++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl > @@ -164,7 +164,7 @@ OVERLOADABLE float > __gen_ocl_internal_copysign(float x, float y) { > return ux.f; > } > > -OVERLOADABLE float __gen_ocl_internal_log(float x) { > +OVERLOADABLE float inline __gen_ocl_internal_log_valid(float x) { > /* > * Conversion to float by Ian Lance Taylor, Cygnus Support, [email protected] > * ==================================================== > @@ -178,187 +178,105 @@ OVERLOADABLE float > __gen_ocl_internal_log(float x) { > */ > union { unsigned int i; float f; } u; > const float > - ln2_hi = 6.9313812256e-01, /* 0x3f317180 */ > - ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */ > - two25 = 3.355443200e+07, /* 0x4c000000 */ > + ln2_hi = 6.9313812256e-01, /* 0x3f317180 */ ln2_lo = > + 9.0580006145e-06, /* 0x3717f7d1 */ > + two25 = 3.355443200e+07, /* 0x4c000000 */ > Lg1 = 6.6666668653e-01, /* 3F2AAAAB */ > Lg2 = 4.0000000596e-01, /* 3ECCCCCD */ > Lg3 = 2.8571429849e-01, /* 3E924925 */ > Lg4 = 2.2222198546e-01; /* 
3E638E29 */ > > const float zero = 0.0; > - float hfsq,f,s,z,R,w,t1,t2,dk; > - int k,ix,i,j; > + float fsq, f, s, z, R, w, t1, t2, partial; int k, ix, i, j; > > u.f = x; ix = u.i; > - k=0; > - if (ix < 0x00800000) { /* x < 2**-126 */ > - if ((ix&0x7fffffff)==0) > - return -two25/zero; /* log(+-0)=-inf */ > - if (ix<0) return (x-x)/zero; /* log(-#) = NaN */ > - return -INFINITY; /* Gen does not support subnormal number now */ > - //k -= 25; x *= two25; /* subnormal number, scale up x */ > - //u.f = x; ix = u.i; > - } > - if (ix >= 0x7f800000) return x+x; > - k += (ix>>23)-127; > + k = 0; > + > + k += (ix>>23) - 127; > ix &= 0x007fffff; > - i = (ix+(0x95f64<<3))&0x800000; > - u.i = ix|(i^0x3f800000); x = u.f; > + i = (ix + (0x95f64<<3)) & 0x800000; > + u.i = ix | (i^0x3f800000); x = u.f; > k += (i>>23); > - f = x-(float)1.0; > - if((0x007fffff&(15+ix))<16) { /* |f| < 2**-20 */ > - if(f==zero) { > - if(k==0) return zero; > - else { > - dk=(float)k; return dk*ln2_hi+dk*ln2_lo; > - } > - } > - R = f*f*((float)0.5-(float)0.33333333333333333*f); > - if(k==0) > - return f-R; > - else { > - dk=(float)k; return dk*ln2_hi-((R-dk*ln2_lo)-f); > - } > + f = x - 1.0f; > + fsq = f * f; > + > + if((0x007fffff & (15 + ix)) < 16) { /* |f| < 2**-20 */ > + R = fsq * (0.5f - 0.33333333333333333f * f); > + return k * ln2_hi + k * ln2_lo + f - R; > } > - s = f/((float)2.0+f); > - dk = (float)k; > - z = s*s; > - i = ix-(0x6147a<<3); > - w = z*z; > - j = (0x6b851<<3)-ix; > - t1= w*(Lg2+w*Lg4); > - t2= z*(Lg1+w*Lg3); > + > + s = f / (2.0f + f); > + z = s * s; > + i = ix - (0x6147a << 3); > + w = z * z; > + j = (0x6b851 << 3) - ix; > + t1= w * mad(w, Lg4, Lg2); > + t2= z * mad(w, Lg3, Lg1); > i |= j; > - R = t2+t1; > - if(i>0) { > - hfsq=(float)0.5*f*f; > - if(k==0) return f-(hfsq-s*(hfsq+R)); else > - return dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f); > - } else { > - if(k==0) return f-s*(f-R); else > - return dk*ln2_hi-((s*(f-R)-dk*ln2_lo)-f); > - } > + R = t2 + t1; > + partial = (i > 0) ? 
-mad(s, 0.5f * fsq, -0.5f * fsq) : (s * f); > + > + return mad(s, R, f) - partial + k * ln2_hi + k * ln2_lo;; > } > > +OVERLOADABLE float __gen_ocl_internal_log(float x) { > + union { unsigned int i; float f; } u; > + u.f = x; > + int ix = u.i; > > -OVERLOADABLE float __gen_ocl_internal_log10(float x) { > -/* > - * Conversion to float by Ian Lance Taylor, Cygnus Support, [email protected] > - * ==================================================== > - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. > - * > - * Developed at SunPro, a Sun Microsystems, Inc. business. > - * Permission to use, copy, modify, and distribute this > - * software is freely granted, provided that this notice > - * is preserved. > - * ==================================================== > - */ > + if (ix < 0 ) > + return NAN; /* log(-#) = NaN */ > + if (ix >= 0x7f800000) > + return NAN; > > - union {float f; unsigned i; }u; > + return __gen_ocl_internal_log_valid(x); } > + > +OVERLOADABLE float __gen_ocl_internal_log10(float x) { > + union { float f; unsigned i; } u; > const float > - zero = 0.0, > - two25 = 3.3554432000e+07, /* 0x4c000000 */ > ivln10 = 4.3429449201e-01, /* 0x3ede5bd9 */ > log10_2hi = 3.0102920532e-01, /* 0x3e9a2080 */ > log10_2lo = 7.9034151668e-07; /* 0x355427db */ > > - float y,z; > - int i,k,hx; > + float y, z; > + int i, k, hx; > > u.f = x; hx = u.i; > - k=0; > - if (hx < 0x00800000) { /* x < 2**-126 */ > - if ((hx&0x7fffffff)==0) > - return -two25/zero; /* log(+-0)=-inf */ > - if (hx<0) return NAN; /* log(-#) = NaN */ > - return -INFINITY; /* Gen does not support subnormal now */ > - //k -= 25; x *= two25; /* subnormal number, scale up x */ > - //u.f = x; hx = u.i; > - } > - if (hx >= 0x7f800000) return x+x; > - k += (hx>>23)-127; > - i = ((unsigned)k&0x80000000)>>31; > - hx = (hx&0x007fffff)|((0x7f-i)<<23); > - y = (float)(k+i); > + > + if (hx<0) > + return NAN; /* log(-#) = NaN */ > + if (hx >= 0x7f800000) > + return NAN; > + > + k = (hx >> 23) - 
127; > + i = ((unsigned)k & 0x80000000) >> 31; hx = (hx&0x007fffff) | > + ((0x7f-i) << 23); y = (float)(k + i); > u.i = hx; x = u.f; > - z = y*log10_2lo + ivln10*__gen_ocl_internal_log(x); > - return z+y*log10_2hi; > + > + return y * log10_2lo + y * log10_2hi + ivln10 * > + __gen_ocl_internal_log_valid(x); > } > > > -OVERLOADABLE float __gen_ocl_internal_log2(float x) { > -/* > - * Conversion to float by Ian Lance Taylor, Cygnus Support, [email protected] > - * adapted for log2 by Ulrich Drepper <[email protected]> > - * ==================================================== > - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. > - * > - * Developed at SunPro, a Sun Microsystems, Inc. business. > - * Permission to use, copy, modify, and distribute this > - * software is freely granted, provided that this notice > - * is preserved. > - * ==================================================== > - */ > +OVERLOADABLE float __gen_ocl_internal_log2(float x) { > const float zero = 0.0, > - ln2 = 0.69314718055994530942, > - two25 = 3.355443200e+07, /** 0x4c000000 */ > - Lg1 = 6.6666668653e-01, /** 3F2AAAAB */ > - Lg2 = 4.0000000596e-01, /** 3ECCCCCD */ > - Lg3 = 2.8571429849e-01, /** 3E924925 */ > - Lg4 = 2.2222198546e-01; /** 3E638E29 */ > - > - float hfsq,f,s,z,R,w,t1,t2,dk; > - int k,ix,i,j; > + invln2 = 0x1.715476p+0f; > + int ix; > > - union {float f; int i; }u;//GET_FLOAT_WORD(ix,x); > + union { float f; int i; } u; > u.f = x; ix = u.i; > > - k=0; > - if (ix < 0x00800000) { /** x < 2**-126 */ > - if ((ix&0x7fffffff)==0) > - return -two25/(x-x); /** log(+-0)=-inf */ > - > - if (ix<0) return (x-x)/(x-x); /** log(-#) = NaN */ > - return -INFINITY; > - k -= 25; x *= two25; /** subnormal number, scale up x */ > - u.f = x; ix = u.i; //GET_FLOAT_WORD(ix,x); > - } > - > - if (ix >= 0x7f800000) return x+x; > - > - k += (ix>>23)-127; > - ix &= 0x007fffff; > - i = (ix+(0x95f64<<3))&0x800000; > - > - u.i = ix|(i^0x3f800000); x = 
u.f;//SET_FLOAT_WORD(x,ix|(i^0x3f800000)); > /** normalize x or x/2 */ > - k += (i>>23); > - dk = (float)k; > - f = x-(float)1.0; > + if (ix < 0) > + return NAN; /** log(-#) = NaN */ > + if (ix >= 0x7f800000) > + return NAN; > > - if((0x007fffff&(15+ix))<16) { /** |f| < 2**-20 */ > - if(f==zero) return dk; > - > - R = f*f*((float)0.5-(float)0.33333333333333333*f); > - return dk-(R-f)/ln2; > - } > - > - s = f/((float)2.0+f); > - z = s*s; > - i = ix-(0x6147a<<3); > - w = z*z; > - j = (0x6b851<<3)-ix; > - t1= w*(Lg2+w*Lg4); > - t2= z*(Lg1+w*Lg3); > - i |= j; > - R = t2+t1; > - > - if(i>0) { > - hfsq=(float)0.5*f*f; > - return dk-((hfsq-(s*(hfsq+R)))-f)/ln2; > - } else { > - return dk-((s*(f-R))-f)/ln2; > - } > + return invln2 * __gen_ocl_internal_log_valid(x); > } > > > @@ -545,9 +463,9 @@ OVERLOADABLE float __kernel_sinf(float x) > float z,r,v; > z = x*x; > v = z*x; > - r = S2+z*(S3+z*(S4)); > + r = mad(z, mad(z, mad(z, S4, S3), S2), S1); > > - return x+v*(S1+z*r); > + return mad(v, r, x); > } > > float __kernel_cosf(float x, float y) > @@ -563,7 +481,7 @@ float __kernel_cosf(float x, float y) > GEN_OCL_GET_FLOAT_WORD(ix,x); > ix &= 0x7fffffff; /* ix = |x|'s high word*/ > z = x*x; > - r = z*(C1+z*(C2+z*(C3))); > + r = z * mad(z, mad(z, C3, C2), C1); > > if(ix < 0x3e99999a) /* if |x| < 0.3 */ > return one - ((float)0.5*z - (z*r - x*y)); @@ -671,24 +589,22 @@ float > __kernel_tanf(float x, float y, int iy) > } > if(ix>=0x3f2ca140) { /* |x|>=0.6744 */ > if(hx<0) {x = -x; y = -y;} > - > - > z = pio4-x; > w = pio4lo-y; > x = z+w; y = 0.0; > } > z = x*x; > w = z*z; > - /* Break x^5*(T[1]+x^2*T[2]+...) into > - * x^5(T[1]+x^4*T[3]+...+x^20*T[11]) + > - * x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12])) > - */ > + /* Break x^5*(T[1]+x^2*T[2]+...) 
into > + * x^5(T[1]+x^4*T[3]+...+x^20*T[11]) + > + * x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12])) > + */ > > - r = T[1]+w*(T[3]+w*(T[5]+w*T[7])); > - v = z*(T[2]+w*(T[4]+w*T[6])); > + r = mad(w, mad(w, mad(w, T[7], T[5]), T[3]), T[1]); > + v = z* mad(w, mad(w, T[6], T[4]), T[2]); > > s = z*x; > - r = y + z*(s*(r+v)+y); > + r = mad(z, mad(s, r + v, y), y); > r += T[0]*s; > w = x+r; > if(ix>=0x3f2ca140) { > @@ -696,21 +612,8 @@ float __kernel_tanf(float x, float y, int iy) > return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r))); > } > if(iy==1) return w; > - else { /* if allow error up to 2 ulp > - simply return -1.0/(x+r) here */ > - /* compute -1.0/(x+r) accurately */ > - float a,t; > - int i; > - z = w; > - GEN_OCL_GET_FLOAT_WORD(i,z); > - GEN_OCL_SET_FLOAT_WORD(z,i&0xfffff000); > - v = r-(z - x); /* z+v = r+x */ > - t = a = -(float)1.0/w; /* a = -1.0/w */ > - GEN_OCL_GET_FLOAT_WORD(i,t); > - GEN_OCL_SET_FLOAT_WORD(t,i&0xfffff000); > - s = (float)1.0+t*z; > - return t+a*(s+t*v); > - } > + else > + return -1.0/(x+r); > } > > OVERLOADABLE float tan(float x) > @@ -931,44 +834,46 @@ OVERLOADABLE float lgamma(float x) { > switch (i) { > case 0: > z = y * y; > - p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * > a10)))); > - p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + > z * > a11))))); > - p = y * p1 + p2; > + p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10, a8), > a6), a4), a2), a0); > + p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11, a9), > a7), a5), a3), a1); > + p = mad(y, p1, p2); > r += (p - (float) 0.5 * y); > break; > case 1: > z = y * y; > w = z * y; > - p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); > - p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); > - p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); > - p = z * p1 - (tt - w * (p2 + y * p3)); > + p1 = mad(w, mad(w, mad(w, mad(w, t12, t9), t6), t3), > t0); > + p2 = mad(w, mad(w, mad(w, mad(w, t13, t10), t7), > t4), t1); > + p3 = mad(w, mad(w, mad(w, mad(w, t14, t11), t8), > 
t5), t2); > + p = mad(p1, z, mad(w, mad(y, p3, p2), -tt)); > r += (tf + p); > break; > case 2: > - p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + > y * > u5))))); > - p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * > v5)))); > + p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5, u4), > u3), u2), u1), u0); > + p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5, v4), v3), > v2), v1), > +one); > r += (-(float) 0.5 * y + p1 / p2); > } > } else if (ix < 0x41000000) { > i = (int) x; > t = zero; > y = x - (float) i; > - p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + > y * > s6)))))); > - q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * > r6))))); > + > + p =y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6, s5), > s4), s3), s2), s1), s0); > + q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6, r5), r4), > r3), r2), > +r1), one); > r = .5f * y + p / q; > z = one; > + > switch (i) { > case 7: > - z *= (y + (float) 6.0); > + z *= (y + 6.0f); > case 6: > - z *= (y + (float) 5.0); > + z *= (y + 5.0f); > case 5: > - z *= (y + (float) 4.0); > + z *= (y + 4.0f); > case 4: > - z *= (y + (float) 3.0); > + z *= (y + 3.0f); > case 3: > - z *= (y + (float) 2.0); > + z *= (y + 2.0f); > r += native_log(z); > break; > } > @@ -977,7 +882,7 @@ OVERLOADABLE float lgamma(float x) { > t = native_log(x); > z = one / x; > y = z * z; > - w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * > w6))))); > + w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6, w5), > w4), w3), w2), > +w1), w0); > r = (x - .5f) * (t - one) + w; > } else > r = x * (native_log(x) - one); > @@ -1123,32 +1028,32 @@ OVERLOADABLE float lgamma(float x) { > switch (i) { \ > case 0: \ > z = y * y; \ > - p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * > a10)))); \ > - p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + > z * > a11))))); \ > - p = y * p1 + p2; \ > - r += (p - (float) 0.5 * y); \ > + p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10, a8), > a6), a4), a2), a0); \ > + p2 
= z * mad(z, mad(z, mad(z, mad(z, mad(z, a11, a9), > a7), a5), a3), a1); \ > + p = mad(y, p1, p2); \ > + r = r - mad(y, 0.5f, -p); \ > break; \ > case 1: \ > z = y * y; \ > w = z * y; \ > - p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); \ > - p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); \ > - p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); \ > - p = z * p1 - (tt - w * (p2 + y * p3)); \ > + p1 = mad(w, mad(w, mad(w, mad(w, t12, t9), t6), t3), > t0); \ > + p2 = mad(w, mad(w, mad(w, mad(w, t13, t10), t7), > t4), t1); \ > + p3 = mad(w, mad(w, mad(w, mad(w, t14, t11), t8), > t5), t2); \ > + p = z * p1 + mad(w, mad(y, p3, p2), -tt); \ > r += (tf + p); \ > break; \ > case 2: \ > - p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + > y * > u5))))); \ > - p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * > v5)))); \ > - r += (-(float) 0.5 * y + p1 / p2); \ > + p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5, u4), > u3), u2), u1), u0); \ > + p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5, v4), v3), > v2), v1), one); \ > + r = r + mad(y, -0.5f, p1 / p2); \ > } \ > } else if (ix < 0x41000000) { \ > i = (int) x; \ > t = zero; \ > y = x - (float) i; \ > - p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + > y * > s6)))))); \ > - q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * > r6))))); \ > - r = .5f * y + p / q; \ > + p = y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6, s5), > s4), s3), s2), s1), s0); \ > + q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6, r5), r4), > r3), r2), r1), one); \ > + r = mad(y, 0.5f, p / q); \ > z = one; \ > switch (i) { \ > case 7: \ > @@ -1169,10 +1074,10 @@ OVERLOADABLE float lgamma(float x) { > t = native_log(x); \ > z = one / x; \ > y = z * z; \ > - w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * > w6))))); \ > + w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6, w5), > w4), w3), w2), > +w1), w0); \ > r = (x - .5f) * (t - one) + w; \ > } else \ > - r = x * (native_log(x) - one); \ 
> + r = x * (native_log(x) - one); \ > if (hx < 0) \ > r = nadj - r; \ > return r; > @@ -1253,20 +1158,26 @@ OVERLOADABLE float log1p(float x) { > f = u-(float)1.0; > } > hfsq=(float)0.5*f*f; > - if(hu==0) { /* |f| < 2**-20 */ > - if(f==zero) { if(k==0) return zero; > - else {c += k*ln2_lo; return k*ln2_hi+c;} } > - R = hfsq*((float)1.0-(float)0.66666666666666666*f); > + if(hu==0) > + { /* |f| < 2**-20 */ > + if(f==zero) > + { > + if(k==0) return zero; > + else {c = mad(k , ln2_lo, c); return mad(k, ln2_hi, c);} > + } > + R = mad(hfsq, 1.0f, -0.66666666666666666f * f); > if(k==0) return f-R; else > - return k*ln2_hi-((R-(k*ln2_lo+c))-f); > + return k * ln2_hi - (R - mad(k, ln2_lo, c) - f); > } > s = f/((float)2.0+f); > z = s*s; > - R = z*(Lp1+z*(Lp2+z*(Lp3+z*Lp4))); > - if(k==0) return f-(hfsq-s*(hfsq+R)); else > - return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f); > - > + R = z * mad(z, mad(z, mad(z, Lp4, Lp3), Lp2), Lp1); > + if(k==0) > + return f + mad(hfsq + R, s, -hfsq); > + else > + return k*ln2_hi-( (hfsq - mad(s, hfsq + R, mad(k, ln2_lo, c))) - f); > } > + > OVERLOADABLE float logb(float x) { > if (__ocl_math_fastpath_flag) > return __gen_ocl_internal_fastpath_logb(x); > @@ -1378,14 +1289,14 @@ OVERLOADABLE float > __gen_ocl_internal_cbrt(float x) { > > /* new cbrt to 23 bits */ > r=t*t/x; > - s=C+r*t; > + s=mad(r, t, C); > t*=G+F/(s+E+D/s); > /* one step newton iteration to 53 bits with error less than 0.667 ulps > */ > s=t*t; /* t*t is exact */ > r=x/s; > w=t+t; > r=(r-t)/(w+r); /* r-s is exact */ > - t=t+t*r; > + t=mad(t, r, t); > > /* retore the sign bit */ > GEN_OCL_GET_FLOAT_WORD(high,t); > @@ -1437,10 +1348,10 @@ INLINE float __gen_ocl_asin_util(float x) { > qS4 = 7.70381505559019352791e-02; > > float t = x*x; > - float p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*pS4)))); > - float q = 1.0+t*(qS1+t*(qS2+t*(qS3+t*qS4))); > + float p = t * mad(t, mad(t, mad(t, mad(t, pS4, pS3), pS2), pS1), > + pS0); float q = mad(t, mad(t, mad(t, mad(t, qS4, qS3), qS2), qS1), 
> + 1.0f); > float w = p / q; > - return x + x*w; > + return mad(x, w, x); > } > > OVERLOADABLE float __gen_ocl_internal_asin(float x) { @@ -1538,8 +1449,8 > @@ OVERLOADABLE float __gen_ocl_internal_atan(float x) { > z = x*x; > w = z*z; > /* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */ > - s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*aT[6]))); > - s2 = w*(aT[1]+w*(aT[3]+w*(aT[5]))); > + s1 = z * mad(w, mad(w, mad(w, aT[6], aT[4]), aT[2]), aT[0]); > + s2 = w * mad(w, mad(w, aT[5], aT[3]), aT[1]); > if (id<0) return x - x*(s1+s2); > else { > z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x); @@ -1829,15 +1740,15 > @@ > sb7 = -2.2440952301e+01; /* 0xc1b38712 */ > return x + efx*x; > } > z = x*x; > - r = pp0+z*(pp1+z*(pp2+z*(pp3+z*pp4))); > - s = one+z*(qq1+z*(qq2+z*(qq3+z*(qq4+z*qq5)))); > - y = r/s; > - return x + x*y; > + r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0); > + s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5,qq4), qq3), qq2), qq1), > one); > + y = r / s; > + return mad(x, y, x); > } > if(ix < 0x3fa00000) { /* 0.84375 <= |x| < 1.25 */ > s = __gen_ocl_internal_fabs(x)-one; > - P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6))))); > - Q = one+s*(qa1+s*(qa2+s*(qa3+s*(qa4+s*(qa5+s*qa6))))); > + P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4), > pa3), pa2), pa1), pa0); > + Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4), > +qa3), qa2), qa1), one); > if(hx>=0) return erx + P/Q; else return -erx - P/Q; > } > if (ix >= 0x40c00000) { /* inf>|x|>=6 */ > @@ -1846,15 +1757,15 @@ sb7 = -2.2440952301e+01; /* 0xc1b38712 */ > x = __gen_ocl_internal_fabs(x); > s = one/(x*x); > if(ix< 0x4036DB6E) { /* |x| < 1/0.35 */ > - R=ra0+s*(ra1+s*(ra2+s*(ra3+s*(ra4+s*( > - ra5+s*(ra6+s*ra7)))))); > - S=one+s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*( > - sa5+s*(sa6+s*(sa7+s*sa8))))))); > + R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, > + ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0); > + S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, 
mad(s, > + sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1), one); > } else { /* |x| >= 1/0.35 */ > - R=rb0+s*(rb1+s*(rb2+s*(rb3+s*(rb4+s*( > - rb5+s*rb6))))); > - S=one+s*(sb1+s*(sb2+s*(sb3+s*(sb4+s*( > - sb5+s*(sb6+s*sb7)))))); > + R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, > + rb6, rb5), rb4), rb3), rb2), rb1), rb0); > + S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, > + sb7, sb6), sb5), sb4), sb3), sb2), sb1), one); > } > GEN_OCL_GET_FLOAT_WORD(ix,x); > GEN_OCL_SET_FLOAT_WORD(z,ix&0xfffff000); > @@ -1949,8 +1860,8 @@ sb7 = -2.2440952301e+01; /* 0xc1b38712 */ > if(ix < 0x23800000) /* |x|<2**-56 */ > return one-x; > z = x*x; > - r = pp0+z*(pp1+z*(pp2+z*(pp3+z*pp4))); > - s = one+z*(qq1+z*(qq2+z*(qq3+z*(qq4+z*qq5)))); > + r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0); > + s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5, qq4), qq3), qq2), > +qq1), one); > y = r/s; > if(hx < 0x3e800000) { /* x<1/4 */ > return one-(x+x*y); > @@ -1962,8 +1873,8 @@ sb7 = -2.2440952301e+01; /* 0xc1b38712 */ > } > if(ix < 0x3fa00000) { /* 0.84375 <= |x| < 1.25 */ > s = __gen_ocl_internal_fabs(x)-one; > - P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6))))); > - Q = one+s*(qa1+s*(qa2+s*(qa3+s*(qa4+s*(qa5+s*qa6))))); > + P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4), > pa3), pa2), pa1), pa0); > + Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4), > +qa3), qa2), qa1), one); > if(hx>=0) { > z = one-erx; return z - P/Q; > } else { > @@ -1974,16 +1885,16 @@ sb7 = -2.2440952301e+01; /* 0xc1b38712 */ > x = __gen_ocl_internal_fabs(x); > s = one/(x*x); > if(ix< 0x4036DB6D) { /* |x| < 1/.35 ~ 2.857143*/ > - R=ra0+s*(ra1+s*(ra2+s*(ra3+s*(ra4+s*( > - ra5+s*(ra6+s*ra7)))))); > - S=one+s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*( > - sa5+s*(sa6+s*(sa7+s*sa8))))))); > + R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, > + ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0); > + S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, > mad(s, > + sa8, sa7), sa6), sa5), sa4), 
sa3), sa2), sa1), > one); > } else { /* |x| >= 1/.35 ~ 2.857143 */ > if(hx<0&&ix>=0x40c00000) return two-tiny;/* x < -6 */ > - R=rb0+s*(rb1+s*(rb2+s*(rb3+s*(rb4+s*( > - rb5+s*rb6))))); > - S=one+s*(sb1+s*(sb2+s*(sb3+s*(sb4+s*( > - sb5+s*(sb6+s*sb7)))))); > + R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, > + rb6, rb5), rb4), rb3), rb2), rb1), rb0); > + S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, > + sb7, sb6), sb5), sb4), sb3), sb2), sb1), one); > } > GEN_OCL_GET_FLOAT_WORD(ix,x); > GEN_OCL_SET_FLOAT_WORD(z,ix&0xffffe000); > @@ -2224,7 +2135,7 @@ OVERLOADABLE float > __gen_ocl_internal_asinh(float x){ > } else { > float xa = __gen_ocl_internal_fabs(x); > if (ix>0x40000000) {/* 2**14 > |x| > 2.0 */ > - w = > __gen_ocl_internal_log(2.0f*xa+one/(__gen_ocl_sqrt(xa*xa+one)+xa)); > + w = __gen_ocl_internal_log(mad(xa, 2.0f, one / > + (__gen_ocl_sqrt(mad(xa, xa, one)) + xa))); > } else { /* 2.0 > |x| > 2**-14 */ > float t = xa*xa; > w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t))); > -- > 2.5.0 > > _______________________________________________ > Beignet mailing list > [email protected] > https://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/beignet
