Hi!,
El Mon, Jan 04, 2016 at 06:33:59PM -0800, Ganesh Ajjanagadde escribio:
> This exploits an approach based on the sieve of Eratosthenes, a popular
> method for generating prime numbers.
>
> Tables are identical to previous ones.
>
> Tested with FATE with/without --enable-hardcoded-tables.
>
> Sample benchmark (Haswell, GNU/Linux+gcc):
> prev:
> 7860100 decicycles in cbrt_tableinit, 1 runs, 0 skips
> 7777490 decicycles in cbrt_tableinit, 2 runs, 0 skips
> [...]
> 7582339 decicycles in cbrt_tableinit, 256 runs, 0 skips
> 7563556 decicycles in cbrt_tableinit, 512 runs, 0 skips
>
> new:
> 2099480 decicycles in cbrt_tableinit, 1 runs, 0 skips
> 2044470 decicycles in cbrt_tableinit, 2 runs, 0 skips
> [...]
> 1796544 decicycles in cbrt_tableinit, 256 runs, 0 skips
> 1791631 decicycles in cbrt_tableinit, 512 runs, 0 skips
>
See attached code, function "test1", based on an approximation of:
(i+1)^(1/3) ~= i^(1/3) * ( 1 + 1/(3i) - 1/(9i) + 5/(81i) - .... )
Generated values are the same as original floats (max error in double
is < 4*10^-10), it is faster (and I think, simpler) than your version.
Perhaps altering the constants it could be made faster still, but it is
currently dominated by de division in the main loop.
Daniel.
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
void test1(float *out)
{
int i;
double cb;
// Really small values are filled with original function:
for(i=0; i<64; i++)
out[i] = cbrt(i) * i;
// "cb" is the cube-root approximation to "i"
cb = 4;
for(i=64; i<343; i++)
{
double s, r, t;
out[i] = cb * i;
// For small values, we use 5 terms:
r = 1.0/(3*i); // 0 A , 1 M , 1 D
t = r; //
s = 1.0 + r; // 1 A , 1 M , 1 D
r = r * r; // 1 A , 2 M , 1 D
s = s - r; // 2 A , 2 M , 1 D
r = r * t; // 2 A , 3 M , 1 D
s = s + 1.6666155 *r; // 3 A , 4 M , 1 D
r = r * t; // 3 A , 5 M , 1 D
s = s - 3.289935 *r; // 4 A , 6 M , 1 D
cb = cb * s; // 4 A , 7 M , 1 D
}
cb = 7;
for(i=343; i<8192; i++)
{
double s, r, t;
out[i] = cb * i;
// For big values, we use 4 terms:
r = 1.0/(3*i); // 0 A , 1 M , 1 D
t = r; //
s = 1.0 + r; // 1 A , 1 M , 1 D
r = r * r; // 1 A , 2 M , 1 D
s = s - r; // 2 A , 2 M , 1 D
r = r * t; // 2 A , 3 M , 1 D
s = s + 1.6644 *r; // 3 A , 4 M , 1 D
cb = cb * s; // 3 A , 5 M , 1 D
}
}
void test2(float *out)
{
int i;
for(i=0; i<8192; i++)
out[i] = cbrt(i) * i;
}
static double get_time(void)
{
struct timeval t1;
gettimeofday(&t1,0);
return t1.tv_sec * 1000000.0 + t1.tv_usec;
}
int main()
{
int i;
double t1, t2, t3;
char *orig, *fast;
orig = malloc(sizeof(float) * 8192);
fast = malloc(sizeof(float) * 8192);
// Repeat 200 times
for(i=0; i<200; i++)
{
t1 = get_time();
test1((float *)fast);
t2 = get_time();
test2((float *)orig);
t3 = get_time();
printf("Orig: %f\tFast: %f\n", (t3-t2), (t2-t1));
}
// Compare
for(i=0; i<sizeof(float)*8192; i++)
{
if( orig[i] != fast[i] )
printf("error at byte %d (word %d) [%d != %d]\n", i, (int)(i/sizeof(float)),
orig[i], fast[i]);
}
return 0;
}
_______________________________________________
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel