Signed-off-by: rander <[email protected]>
---
 benchmark/CMakeLists.txt     |   6 +-
 benchmark/benchmark_math.cpp |  69 ++++++++++++-----------
 kernels/bench_math.cl        | 127 ++++++++++++++++++++++---------------------
 3 files changed, 106 insertions(+), 96 deletions(-)

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index f9b246b..e92b269 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -1,5 +1,7 @@
+cmake_minimum_required(VERSION 3.1)
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}
                     ${CMAKE_CURRENT_SOURCE_DIR}/../utests
+                    ${CMAKE_CURRENT_SOURCE_DIR}/../
                     ${CMAKE_CURRENT_SOURCE_DIR}/../include)
 
 
@@ -23,13 +25,13 @@ set (benchmark_sources
   benchmark_math.cpp)
 
 
-SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}")
+SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK -std=c++11 ${CMAKE_CXX_FLAGS}")
 SET(CMAKE_C_FLAGS "-DBUILD_BENCHMARK ${CMAKE_C_FLAGS}")
 
 ADD_LIBRARY(benchmarks SHARED ${ADDMATHFUNC} ${benchmark_sources})
 
 #TARGET_LINK_LIBRARIES(benchmarks cl m ${OPENGL_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-TARGET_LINK_LIBRARIES(benchmarks cl m)
+TARGET_LINK_LIBRARIES(benchmarks OpenCL pthread m)
 
 ADD_EXECUTABLE(benchmark_run benchmark_run.cpp)
 TARGET_LINK_LIBRARIES(benchmark_run benchmarks)
diff --git a/benchmark/benchmark_math.cpp b/benchmark/benchmark_math.cpp
index 72bc316..b47814a 100644
--- a/benchmark/benchmark_math.cpp
+++ b/benchmark/benchmark_math.cpp
@@ -8,119 +8,122 @@
 #include <sys/time.h>
 
 double benchmark_generic_math(const char* str_filename,
-                              const char* str_kernel)
+                              const char* str_kernel,
+                              float base,
+                              float max)
 {
   double elapsed = 0;
   struct timeval start,stop;
   const size_t global_size = 1024 * 1024;
-  const size_t local_size = 64;
+  const size_t local_size = 256;
 
   /* Compute math OP, loop times on global size */
-  cl_float base = 1.000002;
   cl_float pwr = 1.0102003;
-  uint32_t loop = 1000;
+  uint32_t loop = 128;
 
-  /* Input set will be generated */
-  float* src = (float*)calloc(sizeof(float), global_size);
-  OCL_ASSERT(src != NULL);
-  for(uint32_t i = 0; i < global_size; i++)
-    src[i] = base + i * (base - 1);
+  float step;
+
+  step = (max - base) / loop;
 
   /* Setup kernel and buffers */
   OCL_CALL(cl_kernel_init, str_filename, str_kernel, SOURCE, "");
 
-  OCL_CREATE_BUFFER(buf[0], 0, (global_size) * sizeof(float), NULL);
   OCL_CREATE_BUFFER(buf[1], 0, (global_size) * sizeof(float), NULL);
 
-  OCL_MAP_BUFFER(0);
-  memcpy(buf_data[0], src, global_size * sizeof(float));
-  OCL_UNMAP_BUFFER(0);
-
   globals[0] = global_size;
   locals[0] = local_size;
 
-  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
-  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
-  OCL_SET_ARG(2, sizeof(cl_float), &pwr);
-  OCL_SET_ARG(3, sizeof(cl_uint), &loop);
+  OCL_SET_ARG(0, sizeof(float), &base);
+  OCL_SET_ARG(1, sizeof(float), &step);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(3, sizeof(cl_float), &pwr);
+  OCL_SET_ARG(4, sizeof(cl_uint), &loop);
+
+  OCL_NDRANGE(1);
+  clFinish(queue);
 
   /* Measure performance */
   gettimeofday(&start,0);
   OCL_NDRANGE(1);
+  OCL_NDRANGE(1);
+  OCL_NDRANGE(1);
+  OCL_NDRANGE(1);
   clFinish(queue);
   gettimeofday(&stop,0);
   elapsed = time_subtract(&stop, &start, 0);
 
   /* Show compute results */
+#if SHOWRESULT
   OCL_MAP_BUFFER(1);
   for(uint32_t i = 0; i < global_size; i += 8192)
     printf("\t%.3f", ((float*)buf_data[1])[i]);
   OCL_UNMAP_BUFFER(1);
-
-  return BANDWIDTH(global_size * loop, elapsed);
+#endif
+  printf("-----------------------");
+  return BANDWIDTH(global_size * loop, elapsed / 4.0);
 }
 
 double benchmark_math_pow(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_pow");
+  return benchmark_generic_math("bench_math.cl", "bench_math_pow", 1.0, 128.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_pow, "Mop/s");
 
 double benchmark_math_exp2(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_exp2");
+  return benchmark_generic_math("bench_math.cl", "bench_math_exp2", 0.1, 8.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_exp2, "Mop/s");
 
 double benchmark_math_exp(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_exp");
+  return benchmark_generic_math("bench_math.cl", "bench_math_exp", 0.1, 4.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_exp, "Mop/s");
 
 double benchmark_math_exp10(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_exp10");
+  return benchmark_generic_math("bench_math.cl", "bench_math_exp10", 0.1, 4.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_exp10, "Mop/s");
 
 double benchmark_math_log2(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_log2");
+  return benchmark_generic_math("bench_math.cl", "bench_math_log2", 0.008, 1.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_log2, "Mop/s");
 
 double benchmark_math_log(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_log");
+  return benchmark_generic_math("bench_math.cl", "bench_math_log", 0.008, 1.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_log, "Mop/s");
 
 double benchmark_math_log10(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_log10");
+  return benchmark_generic_math("bench_math.cl", "bench_math_log10", 0.008, 0.01);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_log10, "Mop/s");
 
 double benchmark_math_sqrt(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_sqrt");
+  return benchmark_generic_math("bench_math.cl", "bench_math_sqrt", 0.1, 65537.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_sqrt, "Mop/s");
 
 double benchmark_math_sin(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_sin");
+  return benchmark_generic_math("bench_math.cl", "bench_math_sin", 0.001, 5000.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_sin, "Mop/s");
 
 double benchmark_math_cos(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_cos");
+  return benchmark_generic_math("bench_math.cl", "bench_math_cos", 0.001, 5000.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_cos, "Mop/s");
 
 double benchmark_math_tan(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_tan");
+  return benchmark_generic_math("bench_math.cl", "bench_math_tan", 0.001, 5000.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_tan, "Mop/s");
 
 double benchmark_math_asin(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_asin");
+  return benchmark_generic_math("bench_math.cl", "bench_math_asin", 0.001, 1.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_asin, "Mop/s");
 
 double benchmark_math_acos(void){
-  return benchmark_generic_math("bench_math.cl", "bench_math_acos");
+  return benchmark_generic_math("bench_math.cl", "bench_math_acos", 0.001, 1.0);
 }
 MAKE_BENCHMARK_FROM_FUNCTION(benchmark_math_acos, "Mop/s");
diff --git a/kernels/bench_math.cl b/kernels/bench_math.cl
index 8d85d51..f0c1a81 100644
--- a/kernels/bench_math.cl
+++ b/kernels/bench_math.cl
@@ -3,19 +3,20 @@
 
 /* benchmark pow performance */
 kernel void bench_math_pow(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_powr(result, pwr); /* calls native */
+    result += native_powr((src + step*loop), pwr); /* calls native */
 #else
-    result = pow(result, pwr); /* calls internal slow */
+    result += pow((src + step*loop), pwr); /* calls internal slow */
 #endif
   }
   dst[get_global_id(0)] = result;
@@ -23,15 +24,16 @@ kernel void bench_math_pow(
 
 /* benchmark exp2 performance, exp2 is native */
 kernel void bench_math_exp2(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
-    result = exp2(result) * 0.1f;
+    result += exp2((src + step*loop));
 
   dst[get_global_id(0)] = result;
 }
@@ -39,21 +41,22 @@ kernel void bench_math_exp2(
 /* benchmark exp performance */
 /* calls internal fast (native) if (x > -0x1.6p1 && x < 0x1.6p1) */
 kernel void bench_math_exp(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_exp((float)-0x1.6p1 - result * 0.1f); /* calls native */
+    result += native_exp((float)-0x1.6p1 - (src + step*loop)); /* calls native */
 #elif defined(BENCHMARK_INTERNAL_FAST)
-    result = exp((float)-0x1.6p1 + result * 0.1f); /* calls internal fast */
+    result += exp((float)-0x1.6p1 + (src + step*loop)); /* calls internal fast */
 #else
-    result = exp((float)-0x1.6p1 - result * 0.1f); /* calls internal slow */
+    result += exp((float)-0x1.6p1 - (src + step*loop)); /* calls internal slow */
 #endif
   }
 
@@ -63,21 +66,22 @@ kernel void bench_math_exp(
 /* benchmark exp10 performance */
 /* calls internal fast (native) if (x < -0x1.4p+5) || (x > +0x1.4p+5)  */
 kernel void bench_math_exp10(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_exp10((float)0x1.4p+5 + result * 0.1f); /* calls native */
+    result += native_exp10((float)0x1.4p+5 + (src +  step*loop)); /* calls native */
 #elif defined(BENCHMARK_INTERNAL_FAST)
-    result = exp10((float)-0x1.4p+5 - result * 0.1f); /* calls internal fast */
+    result += exp10((float)-0x1.4p+5 - (src +  step*loop)); /* calls internal fast */
 #else
-    result = exp10((float)-0x1.2p+5 - result * 0.1f); /* calls internal slow */
+    result += exp10((float)-0x1.2p+5 + (src +  step*loop)); /* calls internal slow */
 #endif
   }
 
@@ -87,21 +91,22 @@ kernel void bench_math_exp10(
 /* benchmark log2 performance */
 /* calls internal fast (native) if (x > 0x1.1p0)  */
 kernel void bench_math_log2(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_log2((float)0x1.1p0 + result * 0.0001f); /* calls native */
+    result += native_log2((float)0x1.1p0 + (src +  step*loop)); /* calls native */
 #elif defined(BENCHMARK_INTERNAL_FAST)
-    result = log2((float)0x1.1p0 + result * 0.0001f); /* calls internal fast */
+    result += log2((float)0x1.1p0 + (src +  step*loop)); /* calls internal fast */
 #else
-    result = log2((float)0x1.1p0 - result * 0.0001f); /* calls internal slow */
+    result += log2((float)0x1.1p0 - (src +  step*loop)); /* calls internal slow */
 #endif
   }
 
@@ -111,21 +116,22 @@ kernel void bench_math_log2(
 /* benchmark log performance */
 /* calls internal fast (native) if (x > 0x1.1p0)  */
 kernel void bench_math_log(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_log((float)0x1.1p0 + result * 0.0001f); /* calls native */
+    result += native_log((float)0x1.1p0 + (src +  step*loop)); /* calls native */
 #elif defined(BENCHMARK_INTERNAL_FAST)
-    result = log((float)0x1.1p0 + result * 0.0001f); /* calls internal fast */
+    result += log((float)0x1.1p0 + (src +  step*loop)); /* calls internal fast */
 #else
-    result = log((float)0x1.1p0 - result * 0.0001f); /* calls internal slow */
+    result += log((float)0x1.1p0 - (src +  step*loop)); /* calls internal slow */
 #endif
   }
 
@@ -135,21 +141,22 @@ kernel void bench_math_log(
 /* benchmark log10 performance */
 /* calls internal fast (native) if (x > 0x1.1p0)  */
 kernel void bench_math_log10(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_log10((float)0x1.1p0 + result * 0.0001f); /* calls native */
+    result += native_log10((float)0x1.1p0 + (src +  step*loop)); /* calls native */
 #elif defined(BENCHMARK_INTERNAL_FAST)
-    result = log10((float)0x1.1p0 + result * 0.0001f); /* calls internal fast */
+    result += log10((float)0x1.1p0 + (src +  step*loop)); /* calls internal fast */
 #else
-    result = log10((float)0x1.1p0 - result * 0.0001f); /* calls internal slow */
+    result += log10((float)0x1.1p0 - (src +  step*loop)); /* calls internal slow */
 #endif
   }
 
@@ -158,38 +165,36 @@ kernel void bench_math_log10(
 
 /* benchmark sqrt performance */
 kernel void bench_math_sqrt(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
-    result = sqrt(result) + sqrt(pwr + result);
+    result += sqrt((src +  step*loop));
 
   dst[get_global_id(0)] = result;
 }
 
 /* benchmark sin performance */
 kernel void bench_math_sin(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_sin(result); /* calls native */
+    result += native_sin((src +  step*loop)); /* calls native */
 #else
-    result = sin(result);      /* calls internal, random complexity */
-    //result = sin(0.1f + result); /* calls internal, (1) no reduction */
-    //result = sin(2.f + result); /* calls internal, (2) fast reduction */
-    //result = sin(4001 + result); /* calls internal, (3) slow reduction */
-    result *= 0x1p-16;
+    result += sin((src +  step*loop)); /* calls internal, random complexity */
 #endif
   }
 
@@ -198,23 +203,20 @@ kernel void bench_math_sin(
 
 /* benchmark cos performance */
 kernel void bench_math_cos(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_cos(result); /* calls native */
+    result += native_cos((src +  step*loop)); /* calls native */
 #else
-    result = cos(result);      /* calls internal, random complexity */
-    //result = cos(0.1f + result); /* calls internal, (1) no reduction */
-    //result = cos(2.f + result); /* calls internal, (2) fast reduction */
-    //result = cos(4001.f + result); /* calls internal, (3) slow reduction */
-    result *= 0x1p-16;
+    result += cos((src +  step*loop)); /* calls internal, random complexity */
 #endif
   }
   dst[get_global_id(0)] = result;
@@ -222,19 +224,20 @@ kernel void bench_math_cos(
 
 /* benchmark native tan performance */
 kernel void bench_math_tan(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
   {
 #if defined(BENCHMARK_NATIVE)
-    result = native_tan(result); /* calls native */
+    result += native_tan((src +  step*loop)); /* calls native */
 #else
-    result = tan(result); /* calls internal slow */
+    result += tan((src +  step*loop)); /* calls internal slow */
 #endif
   }
 
@@ -243,30 +246,32 @@ kernel void bench_math_tan(
 
 /* benchmark asin performance */
 kernel void bench_math_asin(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
-    result = asin(pwr - 1);
+    result += asin((src +  step*loop));
 
   dst[get_global_id(0)] = result;
 }
 
 /* benchmark acos performance */
 kernel void bench_math_acos(
-  global float *src,
+  float src,
+  float step,
   global float *dst,
   float pwr,
   uint loop)
 {
-  float result = src[get_global_id(0)];
+  float result = 0;
 
   for(; loop > 0; loop--)
-    result = acos(pwr - 1);
+    result += acos((src +  step*loop));
 
   dst[get_global_id(0)] = result;
 }
-- 
2.7.4

_______________________________________________
Beignet mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/beignet

Reply via email to