This one plus the previous version three patches looks good. This is big performance improvement to the math functions.
Thanks! Ruiling > -----Original Message----- > From: Beignet [mailto:[email protected]] On Behalf Of > Grigore Lupescu > Sent: Tuesday, July 19, 2016 4:38 AM > To: [email protected] > Subject: [Beignet] [PATCH 4/4] Backend: Optimization internal math, use mad > > From: Grigore Lupescu <grigore.lupescu at intel.com> > > Affected functions: > __gen_ocl_internal_log > __gen_ocl_internal_log10 > __gen_ocl_internal_log2 > __kernel_sinf > __kernel_cosf > __gen_ocl_internal_cbrt > __gen_ocl_asin_util > tan > log1p > > Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com> > --- > backend/src/libocl/tmpl/ocl_math.tmpl.cl | 385 > +++++++++++++------------------ > 1 file changed, 159 insertions(+), 226 deletions(-) > > diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl > b/backend/src/libocl/tmpl/ocl_math.tmpl.cl > index 9ea0817..149ba01 100644 > --- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl > +++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl > @@ -164,7 +164,7 @@ OVERLOADABLE float > __gen_ocl_internal_copysign(float x, float y) { > return ux.f; > } > > -OVERLOADABLE float __gen_ocl_internal_log(float x) { > +OVERLOADABLE float inline __gen_ocl_internal_log_valid(float x) { > /* > * Conversion to float by Ian Lance Taylor, Cygnus Support, [email protected] > * ==================================================== > @@ -178,187 +178,105 @@ OVERLOADABLE float __gen_ocl_internal_log(float > x) { > */ > union { unsigned int i; float f; } u; > const float > - ln2_hi = 6.9313812256e-01, /* 0x3f317180 */ > - ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */ > - two25 = 3.355443200e+07, /* 0x4c000000 */ > + ln2_hi = 6.9313812256e-01, /* 0x3f317180 */ > + ln2_lo = 9.0580006145e-06, /* 0x3717f7d1 */ > + two25 = 3.355443200e+07, /* 0x4c000000 */ > Lg1 = 6.6666668653e-01, /* 3F2AAAAB */ > Lg2 = 4.0000000596e-01, /* 3ECCCCCD */ > Lg3 = 2.8571429849e-01, /* 3E924925 */ > Lg4 = 2.2222198546e-01; /* 3E638E29 */ > > const float zero = 0.0; > - float hfsq,f,s,z,R,w,t1,t2,dk; > - int k,ix,i,j; > + float fsq, f, s, z, R, w, t1, t2, partial; > + int k, ix, i, j; > > u.f = x; ix = u.i; > - k=0; > - if (ix < 0x00800000) { /* x < 2**-126 */ > - if ((ix&0x7fffffff)==0) > - return -two25/zero; /* log(+-0)=-inf */ > - if (ix<0) return (x-x)/zero; /* log(-#) = NaN */ > - return -INFINITY; /* Gen does not support subnormal number now */ > - //k -= 25; x *= two25; /* subnormal number, scale up x */ > - //u.f = x; ix = u.i; > - } > - if (ix >= 0x7f800000) return x+x; > - k += (ix>>23)-127; > + k = 0; > + > + k += (ix>>23) - 127; > ix &= 0x007fffff; > - i = (ix+(0x95f64<<3))&0x800000; > - u.i = ix|(i^0x3f800000); x = u.f; > + i = (ix + (0x95f64<<3)) & 0x800000; > + u.i = ix | (i^0x3f800000); x = u.f; > k += (i>>23); > - f = x-(float)1.0; > - if((0x007fffff&(15+ix))<16) { /* |f| < 2**-20 */ > - if(f==zero) { > - if(k==0) return zero; > - else { > - dk=(float)k; return dk*ln2_hi+dk*ln2_lo; > - } > - } > - R = f*f*((float)0.5-(float)0.33333333333333333*f); > - if(k==0) > - return f-R; > - else { > - dk=(float)k; return dk*ln2_hi-((R-dk*ln2_lo)-f); > - } > + f = x - 1.0f; > + fsq = f * f; > + > + if((0x007fffff & (15 + ix)) < 16) { /* |f| < 2**-20 */ > + R = fsq * (0.5f - 0.33333333333333333f * f); > + return k * ln2_hi + k * ln2_lo + f - R; > } > - s = f/((float)2.0+f); > - dk = (float)k; > - z = s*s; > - i = ix-(0x6147a<<3); > - w = z*z; > - j = (0x6b851<<3)-ix; > - t1= w*(Lg2+w*Lg4); > - t2= z*(Lg1+w*Lg3); > + > + s = f / (2.0f + f); > + z = s * s; > + i = ix - (0x6147a << 3); > + w = z * z; > + j = (0x6b851 << 3) - ix; > + t1= w * mad(w, Lg4, Lg2); > + t2= z * mad(w, Lg3, Lg1); > i |= j; > - R = t2+t1; > - if(i>0) { > - hfsq=(float)0.5*f*f; > - if(k==0) return f-(hfsq-s*(hfsq+R)); else > - return dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f); > - } else { > - if(k==0) return f-s*(f-R); else > - return dk*ln2_hi-((s*(f-R)-dk*ln2_lo)-f); > - } > + R = t2 + t1; > + partial = (i > 0) ? -mad(s, 0.5f * fsq, -0.5f * fsq) : (s * f); > + > + return mad(s, R, f) - partial + k * ln2_hi + k * ln2_lo;; > } > > +OVERLOADABLE float __gen_ocl_internal_log(float x) > +{ > + union { unsigned int i; float f; } u; > + u.f = x; > + int ix = u.i; > > -OVERLOADABLE float __gen_ocl_internal_log10(float x) { > -/* > - * Conversion to float by Ian Lance Taylor, Cygnus Support, [email protected] > - * ==================================================== > - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. > - * > - * Developed at SunPro, a Sun Microsystems, Inc. business. > - * Permission to use, copy, modify, and distribute this > - * software is freely granted, provided that this notice > - * is preserved. > - * ==================================================== > - */ > + if (ix < 0 ) > + return NAN; /* log(-#) = NaN */ > + if (ix >= 0x7f800000) > + return NAN; > > - union {float f; unsigned i; }u; > + return __gen_ocl_internal_log_valid(x); > +} > + > +OVERLOADABLE float __gen_ocl_internal_log10(float x) > +{ > + union { float f; unsigned i; } u; > const float > - zero = 0.0, > - two25 = 3.3554432000e+07, /* 0x4c000000 */ > ivln10 = 4.3429449201e-01, /* 0x3ede5bd9 */ > log10_2hi = 3.0102920532e-01, /* 0x3e9a2080 */ > log10_2lo = 7.9034151668e-07; /* 0x355427db */ > > - float y,z; > - int i,k,hx; > + float y, z; > + int i, k, hx; > > u.f = x; hx = u.i; > - k=0; > - if (hx < 0x00800000) { /* x < 2**-126 */ > - if ((hx&0x7fffffff)==0) > - return -two25/zero; /* log(+-0)=-inf */ > - if (hx<0) return NAN; /* log(-#) = NaN */ > - return -INFINITY; /* Gen does not support subnormal now */ > - //k -= 25; x *= two25; /* subnormal number, scale up x */ > - //u.f = x; hx = u.i; > - } > - if (hx >= 0x7f800000) return x+x; > - k += (hx>>23)-127; > - i = ((unsigned)k&0x80000000)>>31; > - hx = (hx&0x007fffff)|((0x7f-i)<<23); > - y = (float)(k+i); > + > + if (hx<0) > + return NAN; /* log(-#) = NaN */ > + if (hx >= 0x7f800000) > + return NAN; > + > + k = (hx >> 23) - 127; > + i = ((unsigned)k & 0x80000000) >> 31; > + hx = (hx&0x007fffff) | ((0x7f-i) << 23); > + y = (float)(k + i); > u.i = hx; x = u.f; > - z = y*log10_2lo + ivln10*__gen_ocl_internal_log(x); > - return z+y*log10_2hi; > + > + return y * log10_2lo + y * log10_2hi + ivln10 * > __gen_ocl_internal_log_valid(x); > } > > > -OVERLOADABLE float __gen_ocl_internal_log2(float x) { > -/* > - * Conversion to float by Ian Lance Taylor, Cygnus Support, [email protected] > - * adapted for log2 by Ulrich Drepper <[email protected]> > - * ==================================================== > - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. > - * > - * Developed at SunPro, a Sun Microsystems, Inc. business. > - * Permission to use, copy, modify, and distribute this > - * software is freely granted, provided that this notice > - * is preserved. > - * ==================================================== > - */ > +OVERLOADABLE float __gen_ocl_internal_log2(float x) > +{ > const float zero = 0.0, > - ln2 = 0.69314718055994530942, > - two25 = 3.355443200e+07, /** 0x4c000000 */ > - Lg1 = 6.6666668653e-01, /** 3F2AAAAB */ > - Lg2 = 4.0000000596e-01, /** 3ECCCCCD */ > - Lg3 = 2.8571429849e-01, /** 3E924925 */ > - Lg4 = 2.2222198546e-01; /** 3E638E29 */ > - > - float hfsq,f,s,z,R,w,t1,t2,dk; > - int k,ix,i,j; > + invln2 = 0x1.715476p+0f; > + int ix; > > - union {float f; int i; }u;//GET_FLOAT_WORD(ix,x); > + union { float f; int i; } u; > u.f = x; ix = u.i; > > - k=0; > - if (ix < 0x00800000) { /** x < 2**-126 */ > - if ((ix&0x7fffffff)==0) > - return -two25/(x-x); /** log(+-0)=-inf */ > - > - if (ix<0) return (x-x)/(x-x); /** log(-#) = NaN */ > - return -INFINITY; > - k -= 25; x *= two25; /** subnormal number, scale up x */ > - u.f = x; ix = u.i; //GET_FLOAT_WORD(ix,x); > - } > - > - if (ix >= 0x7f800000) return x+x; > - > - k += (ix>>23)-127; > - ix &= 0x007fffff; > - i = (ix+(0x95f64<<3))&0x800000; > - > - u.i = ix|(i^0x3f800000); x = u.f;//SET_FLOAT_WORD(x,ix|(i^0x3f800000)); > /** normalize x or x/2 */ > - k += (i>>23); > - dk = (float)k; > - f = x-(float)1.0; > - > - if((0x007fffff&(15+ix))<16) { /** |f| < 2**-20 */ > - if(f==zero) return dk; > + if (ix < 0) > + return NAN; /** log(-#) = NaN */ > + if (ix >= 0x7f800000) > + return NAN; > > - R = f*f*((float)0.5-(float)0.33333333333333333*f); > - return dk-(R-f)/ln2; > - } > - > - s = f/((float)2.0+f); > - z = s*s; > - i = ix-(0x6147a<<3); > - w = z*z; > - j = (0x6b851<<3)-ix; > - t1= w*(Lg2+w*Lg4); > - t2= z*(Lg1+w*Lg3); > - i |= j; > - R = t2+t1; > - > - if(i>0) { > - hfsq=(float)0.5*f*f; > - return dk-((hfsq-(s*(hfsq+R)))-f)/ln2; > - } else { > - return dk-((s*(f-R))-f)/ln2; > - } > + return invln2 * __gen_ocl_internal_log_valid(x); > } > > > @@ -545,9 +463,13 @@ OVERLOADABLE float __kernel_sinf(float x) > float z,r,v; > z = x*x; > v = z*x; > - r = S2+z*(S3+z*(S4)); > + r = mad(z, > + mad(z, > + mad(z, S4, S3), > + S2), > + S1); > > - return x+v*(S1+z*r); > + return mad(v, r, x); > } > > float __kernel_cosf(float x, float y) > @@ -563,7 +485,9 @@ float __kernel_cosf(float x, float y) > GEN_OCL_GET_FLOAT_WORD(ix,x); > ix &= 0x7fffffff; /* ix = |x|'s high word*/ > z = x*x; > - r = z*(C1+z*(C2+z*(C3))); > + r = z * mad(z, > + mad(z, C3, C2), > + C1); > > if(ix < 0x3e99999a) /* if |x| < 0.3 */ > return one - ((float)0.5*z - (z*r - x*y)); > @@ -671,24 +595,27 @@ float __kernel_tanf(float x, float y, int iy) > } > if(ix>=0x3f2ca140) { /* |x|>=0.6744 */ > if(hx<0) {x = -x; y = -y;} > - > - > z = pio4-x; > w = pio4lo-y; > x = z+w; y = 0.0; > } > z = x*x; > w = z*z; > - /* Break x^5*(T[1]+x^2*T[2]+...) into > - * x^5(T[1]+x^4*T[3]+...+x^20*T[11]) + > - * x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12])) > - */ > - > - r = T[1]+w*(T[3]+w*(T[5]+w*T[7])); > - v = z*(T[2]+w*(T[4]+w*T[6])); > + /* Break x^5*(T[1]+x^2*T[2]+...) into > + * x^5(T[1]+x^4*T[3]+...+x^20*T[11]) + > + * x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12])) > + */ > + > + r = mad(w, > + mad(w, > + mad(w, T[7], > + T[5]), > + T[3]), > + T[1]); > + v = z* mad(w, mad(w, T[6], T[4]), T[2]); > > s = z*x; > - r = y + z*(s*(r+v)+y); > + r = mad(z, mad(s, r + v, y), y); > r += T[0]*s; > w = x+r; > if(ix>=0x3f2ca140) { > @@ -696,21 +623,8 @@ float __kernel_tanf(float x, float y, int iy) > return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r))); > } > if(iy==1) return w; > - else { /* if allow error up to 2 ulp > - simply return -1.0/(x+r) here */ > - /* compute -1.0/(x+r) accurately */ > - float a,t; > - int i; > - z = w; > - GEN_OCL_GET_FLOAT_WORD(i,z); > - GEN_OCL_SET_FLOAT_WORD(z,i&0xfffff000); > - v = r-(z - x); /* z+v = r+x */ > - t = a = -(float)1.0/w; /* a = -1.0/w */ > - GEN_OCL_GET_FLOAT_WORD(i,t); > - GEN_OCL_SET_FLOAT_WORD(t,i&0xfffff000); > - s = (float)1.0+t*z; > - return t+a*(s+t*v); > - } > + else > + return -1.0/(x+r); > } > > OVERLOADABLE float tan(float x) > @@ -897,44 +811,45 @@ OVERLOADABLE float lgamma(float x) { > switch (i) { > case 0: > z = y * y; > - p1 = a0 + z * a2; > - p2 = z * (a1 + z * a3); > - p = y * p1 + p2; > + p1 = mad(z, a2, a0); > + p2 = z * mad(z, a3, a1); > + p = mad(y, p1, p2); > r += (p - (float) 0.5 * y); > break; > case 1: > z = y * y; > w = z * y; > - p1 = t0 + w * t3; > - p2 = t1 + w * t4; > - p3 = t2 + w * t5; > - p = z * p1 - (tt - w * (p2 + y * p3)); > + p1 = mad(w, t3, t0); > + p2 = mad(w, t4, t1); > + p3 = mad(w, t5, t2); > + p = z * p1 - (tt - w * mad(y, p3, p2)); > r += (tf + p); > break; > case 2: > - p1 = y * (u0 + y * u1); > - p2 = one + y * (v1 + y * v2); > + p1 = y * mad(y, u1, u0); > + p2 = one + y * mad(y, v2, v1); > r += (-(float) 0.5 * y + p1 / p2); > } > } else if (ix < 0x41000000) { > i = (int) x; > t = zero; > y = x - (float) i; > - p = y * (s0 + y * (s1 + y * s2)); > - q = one + y * (r1 + y * r2); > + p = y * mad(y, mad(y, s2, s1), s0); > + q = mad(y, mad(y, r2, r1), 1.0f); > r = .5f * y + p / q; > z = one; > + > switch (i) { > case 7: > - z *= (y + (float) 6.0); > + z *= (y + 6.0f); > case 6: > - z *= (y + (float) 5.0); > + z *= (y + 5.0f); > case 5: > - z *= (y + (float) 4.0); > + z *= (y + 4.0f); > case 4: > - z *= (y + (float) 3.0); > + z *= (y + 3.0f); > case 3: > - z *= (y + (float) 2.0); > + z *= (y + 2.0f); > r += native_log(z); > break; > } > @@ -943,7 +858,7 @@ OVERLOADABLE float lgamma(float x) { > t = native_log(x); > z = one / x; > y = z * z; > - w = w0 + z * (w1 + y * w2); > + w = mad(z, mad(y, w2, 21), w0); > r = (x - .5f) * (t - one) + w; > } else > r = x * (native_log(x) - one); > @@ -1053,32 +968,32 @@ OVERLOADABLE float lgamma(float x) { > switch (i) { \ > case 0: \ > z = y * y; \ > - p1 = a0 + z * a2; \ > - p2 = z * (a1 + z *a3); \ > - p = y * p1 + p2; \ > - r += (p - (float) 0.5 * y); \ > + p1 = mad(z, a2, a0); \ > + p2 = z * mad(z, a3, a1); \ > + p = mad(y, p1, p2); \ > + r -= mad(y, 0.5f, -p); \ > break; \ > case 1: \ > z = y * y; \ > w = z * y; \ > - p1 = t0 + w * t3; \ > - p2 = t1 + w * t4; \ > - p3 = t2 + w * t5; \ > - p = z * p1 - (tt - w * (p2 + y * p3)); \ > + p1 = mad(w, t3, t0); \ > + p2 = mad(w, t4, t1); \ > + p3 = mad(w, t5, t2); \ > + p = z * p1 + mad(w, mad(y, p3, p2), -tt); \ > r += (tf + p); \ > break; \ > case 2: \ > - p1 = y * (u0 + y * u1); \ > - p2 = one + y * (v1 + y * v2); \ > - r += (-(float) 0.5 * y + p1 / p2); \ > + p1 = y * mad(y, u1, u0); \ > + p2 = mad(y, mad(y, v2, v1), 1.0f); \ > + r -= mad(y, 0.5f, -p1 / p2); \ > } \ > } else if (ix < 0x41000000) { \ > i = (int) x; \ > t = zero; \ > y = x - (float) i; \ > - p = y * (s0 + y * (s1 + y * s2)); \ > - q = one + y * (r1 + y * r2); \ > - r = .5f * y + p / q; \ > + p = y * mad(y, mad(y, s2, s1), s0); \ > + q = mad(y, mad(y, r2, r1), 1.0f); \ > + r = mad(y, 0.5f, p / q); \ > z = one; \ > switch (i) { \ > case 7: \ > @@ -1099,10 +1014,10 @@ OVERLOADABLE float lgamma(float x) { > t = native_log(x); \ > z = one / x; \ > y = z * z; \ > - w = w0 + z * (w1 + y * w2); \ > - r = (x - .5f) * (t - one) + w; \ > + w = mad(z, mad(y, w2, w1), w0); \ > + r = mad(x - 0.5f, t - 1.0f, w); \ > } else \ > - r = x * (native_log(x) - one); \ > + r = mad(x, native_log(x), -1.0f); \ > if (hx < 0) \ > r = nadj - r; \ > return r; > @@ -1183,20 +1098,26 @@ OVERLOADABLE float log1p(float x) { > f = u-(float)1.0; > } > hfsq=(float)0.5*f*f; > - if(hu==0) { /* |f| < 2**-20 */ > - if(f==zero) { if(k==0) return zero; > - else {c += k*ln2_lo; return k*ln2_hi+c;} } > - R = hfsq*((float)1.0-(float)0.66666666666666666*f); > + if(hu==0) > + { /* |f| < 2**-20 */ > + if(f==zero) > + { > + if(k==0) return zero; > + else {c = mad(k , ln2_lo, c); return mad(k, ln2_hi, c);} > + } > + R = mad(hfsq, 1.0f, -0.66666666666666666f * f); > if(k==0) return f-R; else > - return k*ln2_hi-((R-(k*ln2_lo+c))-f); > + return k * ln2_hi - (R - mad(k, ln2_lo, c) - f); > } > s = f/((float)2.0+f); > z = s*s; > - R = z*(Lp1+z*(Lp2+z*(Lp3+z*Lp4))); > - if(k==0) return f-(hfsq-s*(hfsq+R)); else > - return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f); > - > + R = z * mad(z, mad(z, mad(z, Lp4, Lp3), Lp2), Lp1); > + if(k==0) > + return f + mad(hfsq + R, s, -hfsq); > + else > + return k*ln2_hi-( (hfsq - mad(s, hfsq + R, mad(k, ln2_lo, c))) - f); > } > + > OVERLOADABLE float logb(float x) { > if (__ocl_math_fastpath_flag) > return __gen_ocl_internal_fastpath_logb(x); > @@ -1308,14 +1229,14 @@ OVERLOADABLE float > __gen_ocl_internal_cbrt(float x) { > > /* new cbrt to 23 bits */ > r=t*t/x; > - s=C+r*t; > + s=mad(r, t, C); > t*=G+F/(s+E+D/s); > /* one step newton iteration to 53 bits with error less than 0.667 ulps > */ > s=t*t; /* t*t is exact */ > r=x/s; > w=t+t; > r=(r-t)/(w+r); /* r-s is exact */ > - t=t+t*r; > + t=mad(t, r, t); > > /* retore the sign bit */ > GEN_OCL_GET_FLOAT_WORD(high,t); > @@ -1367,10 +1288,22 @@ INLINE float __gen_ocl_asin_util(float x) { > qS4 = 7.70381505559019352791e-02; > > float t = x*x; > - float p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*pS4)))); > - float q = 1.0+t*(qS1+t*(qS2+t*(qS3+t*qS4))); > + float p = t * mad(t, > + mad(t, > + mad(t, > + mad(t, pS4, pS3), > + pS2), > + pS1), > + pS0); > + float q = mad(t, > + mad(t, > + mad(t, > + mad(t, qS4, qS3), > + qS2), > + qS1), > + 1.0f); > float w = p / q; > - return x + x*w; > + return mad(x, w, x); > } > > OVERLOADABLE float __gen_ocl_internal_asin(float x) { > -- > 2.5.0 > > _______________________________________________ > Beignet mailing list > [email protected] > https://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/beignet
