Signed-off-by: Grigore Lupescu <[email protected]> Add benchmark for workgroup reduce min/max
Signed-off-by: Grigore Lupescu <[email protected]> --- benchmark/CMakeLists.txt | 3 +- benchmark/benchmark_workgroup_reduce.cpp | 212 +++++++++++++++++++++++++++++++ kernels/bench_workgroup_reduce.cl | 55 ++++++++ 3 files changed, 269 insertions(+), 1 deletion(-) create mode 100644 benchmark/benchmark_workgroup_reduce.cpp create mode 100644 kernels/bench_workgroup_reduce.cl diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index dd33829..a6539d9 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -18,7 +18,8 @@ set (benchmark_sources benchmark_copy_buffer_to_image.cpp benchmark_copy_image_to_buffer.cpp benchmark_copy_buffer.cpp - benchmark_copy_image.cpp) + benchmark_copy_image.cpp + benchmark_workgroup_reduce.cpp) SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}") diff --git a/benchmark/benchmark_workgroup_reduce.cpp b/benchmark/benchmark_workgroup_reduce.cpp new file mode 100644 index 0000000..c93ef26 --- /dev/null +++ b/benchmark/benchmark_workgroup_reduce.cpp @@ -0,0 +1,212 @@ +#include <cstdint> +#include <cstdlib> +#include <cstring> +#include <iostream> +#include "utest_helper.hpp" +#include <sys/time.h> + +double benchmark_workgroup_add_uint(void) +{ + double elapsed = 0; + struct timeval start,stop; + const size_t global_size = 512 * 256; + const size_t local_size = 128; + const uint32_t reduce_loop = 10000; + + /* Input set will be generated */ + uint32_t* src = (uint32_t*)calloc(sizeof(uint32_t), global_size); + OCL_ASSERT(src != NULL); + for(uint32_t i = 0; i < global_size; i++){ + src[i] = (i / local_size); + } + + /* Setup kernel and buffers */ + OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup_reduce", + "bench_workgroup_reduce_add_uint"); + + OCL_CREATE_BUFFER(buf[0], 0, (global_size) * sizeof(uint32_t), NULL); + OCL_CREATE_BUFFER(buf[1], 0, (global_size) * sizeof(uint32_t), NULL); + + OCL_MAP_BUFFER(0); + memcpy(buf_data[0], src, global_size * sizeof(uint32_t)); + OCL_UNMAP_BUFFER(0); + + globals[0] = global_size; + locals[0] = local_size; + + OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); + OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); + OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop); + + /* Measure performance */ + gettimeofday(&start,0); + OCL_NDRANGE(1); + clFinish(queue); + gettimeofday(&stop,0); + elapsed = time_subtract(&stop, &start, 0); + + /* Check results */ + OCL_MAP_BUFFER(1); + for(uint32_t i = 0; i < global_size; i += local_size){ + //printf(" %u", ((uint32_t*)buf_data[1])[i]); + OCL_ASSERT( ((uint32_t*)buf_data[1])[i] == i ); + } + OCL_UNMAP_BUFFER(1); + + return BANDWIDTH(global_size * reduce_loop, elapsed); +} +MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_add_uint, "M/sec"); + +double benchmark_workgroup_min_uint(void) +{ + double elapsed = 0; + struct timeval start,stop; + const size_t global_size = 512 * 256; + const size_t local_size = 128; + const uint32_t reduce_loop = 10000; + + /* Input set will be generated */ + uint32_t* src = (uint32_t*)calloc(sizeof(uint32_t), global_size); + OCL_ASSERT(src != NULL); + for(uint32_t i = 0; i < global_size; i++){ + src[i] = i; + } + + /* Setup kernel and buffers */ + OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup_reduce", + "bench_workgroup_reduce_min_uint"); + + OCL_CREATE_BUFFER(buf[0], 0, (global_size) * sizeof(uint32_t), NULL); + OCL_CREATE_BUFFER(buf[1], 0, (global_size) * sizeof(uint32_t), NULL); + + OCL_MAP_BUFFER(0); + memcpy(buf_data[0], src, global_size * sizeof(uint32_t)); + OCL_UNMAP_BUFFER(0); + + globals[0] = global_size; + locals[0] = local_size; + + OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); + OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); + OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop); + + /* Measure performance */ + gettimeofday(&start,0); + OCL_NDRANGE(1); + clFinish(queue); + gettimeofday(&stop,0); + elapsed = time_subtract(&stop, &start, 0); + + /* Check results */ + OCL_MAP_BUFFER(1); + for(uint32_t i = local_size/2; i < global_size; i += local_size){ + //printf(" %u", ((uint32_t*)buf_data[1])[i]); + OCL_ASSERT( ((uint32_t*)buf_data[1])[i] == (src[i] - (local_size / 2)) ); + } + OCL_UNMAP_BUFFER(1); + + return BANDWIDTH(global_size * reduce_loop, elapsed); +} +MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_min_uint, "M/sec"); + +double benchmark_workgroup_add_float(void) +{ + double elapsed = 0; + struct timeval start,stop; + const size_t global_size = 512 * 256; + const size_t local_size = 128; + const uint32_t reduce_loop = 10000; + + /* Input set will be generated */ + float* src = (float*)calloc(sizeof(float), global_size); + OCL_ASSERT(src != NULL); + for(uint32_t i = 0; i < global_size; i++) + src[i] = (i / local_size); + + /* Setup kernel and buffers */ + OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup_reduce", + "bench_workgroup_reduce_add_float"); + + OCL_CREATE_BUFFER(buf[0], 0, (global_size) * sizeof(float), NULL); + OCL_CREATE_BUFFER(buf[1], 0, (global_size) * sizeof(float), NULL); + + OCL_MAP_BUFFER(0); + memcpy(buf_data[0], src, global_size * sizeof(float)); + OCL_UNMAP_BUFFER(0); + + globals[0] = global_size; + locals[0] = local_size; + + OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); + OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); + OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop); + + /* Measure performance */ + gettimeofday(&start,0); + OCL_NDRANGE(1); + clFinish(queue); + gettimeofday(&stop,0); + elapsed = time_subtract(&stop, &start, 0); + + /* Check results */ + OCL_MAP_BUFFER(1); + for(uint32_t i = 0; i < global_size; i += local_size){ + //printf(" %f", ((float*)buf_data[1])[i]); + OCL_ASSERT( ((float*)buf_data[1])[i] == (float)i ); + } + OCL_UNMAP_BUFFER(1); + + return BANDWIDTH(global_size * reduce_loop, elapsed); +} +MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_add_float, "M/sec"); + +double benchmark_workgroup_min_float(void) +{ + double elapsed = 0; + struct timeval start,stop; + const size_t global_size = 512 * 256; + const size_t local_size = 128; + const uint32_t reduce_loop = 10000; + + /* Input set will be generated */ + float* src = (float*)calloc(sizeof(float), global_size); + OCL_ASSERT(src != NULL); + for(uint32_t i = 0; i < global_size; i++) + src[i] = 1.0f * i + 1; + + /* Setup kernel and buffers */ + OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup_reduce", + "bench_workgroup_reduce_min_float"); + + OCL_CREATE_BUFFER(buf[0], 0, (global_size) * sizeof(float), NULL); + OCL_CREATE_BUFFER(buf[1], 0, (global_size) * sizeof(float), NULL); + + OCL_MAP_BUFFER(0); + memcpy(buf_data[0], src, global_size * sizeof(float)); + OCL_UNMAP_BUFFER(0); + + globals[0] = global_size; + locals[0] = local_size; + + OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); + OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); + OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop); + + /* Measure performance */ + gettimeofday(&start,0); + OCL_NDRANGE(1); + clFinish(queue); + gettimeofday(&stop,0); + elapsed = time_subtract(&stop, &start, 0); + + /* Check results */ + OCL_MAP_BUFFER(1); + for(uint32_t i = local_size/2; i < global_size; i += local_size){ + //printf(" %f", ((float*)buf_data[1])[i]); + OCL_ASSERT( ((float*)buf_data[1])[i] == (src[i] - (local_size / 2)) ); + } + OCL_UNMAP_BUFFER(1); + + return BANDWIDTH(global_size * reduce_loop, elapsed); +} +MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_min_float, "M/sec"); diff --git a/kernels/bench_workgroup_reduce.cl b/kernels/bench_workgroup_reduce.cl new file mode 100644 index 0000000..ba1c709 --- /dev/null +++ b/kernels/bench_workgroup_reduce.cl @@ -0,0 +1,55 @@ +kernel void bench_workgroup_reduce_add_uint( + global uint *src, + global uint *dst, + uint reduce_loop) +{ + uint val = src[get_global_id(0)]; + uint sum = work_group_reduce_add(val); + + for(; reduce_loop > 0; reduce_loop--) + sum = work_group_reduce_add(val); + + dst[get_global_id(0)] = sum; +} + +kernel void bench_workgroup_reduce_min_uint( + global uint *src, + global uint *dst, + uint reduce_loop) +{ + uint val = src[get_global_id(0)]; + uint min = work_group_reduce_min(val); + + for(; reduce_loop > 0; reduce_loop--) + min = work_group_reduce_min(val); + + dst[get_global_id(0)] = min; +} + +kernel void bench_workgroup_reduce_add_float( + global float *src, + global float *dst, + uint reduce_loop) +{ + float val = src[get_global_id(0)]; + float sum = work_group_reduce_add(val); + + for(; reduce_loop > 0; reduce_loop--) + sum = work_group_reduce_add(val); + + dst[get_global_id(0)] = sum; +} + +kernel void bench_workgroup_reduce_min_float( + global float *src, + global float *dst, + uint reduce_loop) +{ + float val = src[get_global_id(0)]; + float min = work_group_reduce_min(val); + + for(; reduce_loop > 0; reduce_loop--) + min = work_group_reduce_min(val); + + dst[get_global_id(0)] = min; +} -- 2.5.0 _______________________________________________ Beignet mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/beignet
