Benchmark computing A = A ** B 1000 times, with A = (1.000002, 1.000004, ...) and B = 1.0102003.

Performance results on Intel Core i7 5600U BDW (GPU HD5500 GT2):

INTERNAL [Result: 0.388 Mpow/sec]
... 1.189, 1.229, 1.277, 1.337, 1.412, ...

NATIVE [Result: 13.306 Mpow/sec]
... 1.000, 1.000, 1.000, 1.000, 1.306, ...
Signed-off-by: Grigore Lupescu <[email protected]> --- benchmark/benchmark_math.cpp | 28 +++++++++++++++------------- kernels/bench_math.cl | 21 ++++++++++++++++----- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/benchmark/benchmark_math.cpp b/benchmark/benchmark_math.cpp index b93a4f3..a92b39e 100644 --- a/benchmark/benchmark_math.cpp +++ b/benchmark/benchmark_math.cpp @@ -1,6 +1,5 @@ #include "utests/utest_helper.hpp" #include <sys/time.h> - #include <cstdint> #include <cstdlib> #include <cstring> @@ -8,23 +7,27 @@ #include "utest_helper.hpp" #include <sys/time.h> -double benchmark_math_exp(void) +double benchmark_math_pow(void) { double elapsed = 0; struct timeval start,stop; const size_t global_size = 1024 * 1024; - const size_t local_size = 128; - const uint32_t reduce_loop = 10000; + const size_t local_size = 64; + + /* computes recursive base = base ** pwr, loop times */ + cl_float base = 1.000002; + cl_float pwr = 1.0102003; + uint32_t loop = 1000; /* Input set will be generated */ float* src = (float*)calloc(sizeof(float), global_size); OCL_ASSERT(src != NULL); for(uint32_t i = 0; i < global_size; i++) - src[i] = i % local_size; + src[i] = base + i * (base - 1); /* Setup kernel and buffers */ OCL_CREATE_KERNEL_FROM_FILE("bench_math", - "bench_math_exp"); + "bench_math_pow"); OCL_CREATE_BUFFER(buf[0], 0, (global_size) * sizeof(float), NULL); OCL_CREATE_BUFFER(buf[1], 0, (global_size) * sizeof(float), NULL); @@ -38,7 +41,8 @@ double benchmark_math_exp(void) OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); - OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop); + OCL_SET_ARG(2, sizeof(cl_float), &pwr); + OCL_SET_ARG(3, sizeof(cl_uint), &loop); /* Measure performance */ gettimeofday(&start,0); @@ -49,12 +53,10 @@ double benchmark_math_exp(void) /* Check results */ OCL_MAP_BUFFER(1); - for(uint32_t i = 0; i < global_size; i += local_size){ - //printf(" %f", ((float*)buf_data[1])[i]); - //OCL_ASSERT( 
((float*)buf_data[1])[i] == (float)i ); - } + for(uint32_t i = 0; i < local_size; i ++) + printf("\t%.3f", ((float*)buf_data[1])[i]); OCL_UNMAP_BUFFER(1); - return BANDWIDTH(global_size * reduce_loop, elapsed); + return BANDWIDTH(global_size * loop, elapsed); } -MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_exp, "Mflops/sec"); +MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_pow, "Mpow/sec"); diff --git a/kernels/bench_math.cl b/kernels/bench_math.cl index 75da4d2..1174387 100644 --- a/kernels/bench_math.cl +++ b/kernels/bench_math.cl @@ -1,13 +1,24 @@ -kernel void bench_math_exp( +#define USE_GEN_NATIVE_POW 0 + +/* computes recursive base = base ** pwr, loop times + * base = src[get_global_id(0)] */ +kernel void bench_math_pow( global float *src, global float *dst, - uint reduce_loop) + float pwr, + uint loop) { float val = src[get_global_id(0)]; - float result = exp(result); + float result = pow(val, pwr); - for(; reduce_loop > 0; reduce_loop--) - result = exp(result); + for(; loop > 0; loop--){ +#if USE_GEN_NATIVE_POW + result = native_powr(result, pwr); +#else + result = pow(result, pwr); +#endif + } dst[get_global_id(0)] = result; } + -- 2.5.0 _______________________________________________ Beignet mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/beignet
