From: Pan Xiuli <[email protected]> V2: Rename test case to buffer block read/write test
Signed-off-by: Pan Xiuli <[email protected]> --- kernels/compiler_subgroup_buffer_block_read.cl | 31 ++++ kernels/compiler_subgroup_buffer_block_write.cl | 27 ++++ utests/CMakeLists.txt | 2 + utests/compiler_subgroup_buffer_block_read.cpp | 194 ++++++++++++++++++++++++ utests/compiler_subgroup_buffer_block_write.cpp | 194 ++++++++++++++++++++++++ 5 files changed, 448 insertions(+) create mode 100644 kernels/compiler_subgroup_buffer_block_read.cl create mode 100644 kernels/compiler_subgroup_buffer_block_write.cl create mode 100644 utests/compiler_subgroup_buffer_block_read.cpp create mode 100644 utests/compiler_subgroup_buffer_block_write.cpp diff --git a/kernels/compiler_subgroup_buffer_block_read.cl b/kernels/compiler_subgroup_buffer_block_read.cl new file mode 100644 index 0000000..9edaa2e --- /dev/null +++ b/kernels/compiler_subgroup_buffer_block_read.cl @@ -0,0 +1,31 @@ +__kernel void compiler_subgroup_buffer_block_read1(global uint *src, global uint *dst) +{ + int id = get_global_id(0); + global uint * p = src + get_sub_group_id() * get_max_sub_group_size(); + uint tmp = intel_sub_group_block_read(p); + dst[id] = tmp; +} + +__kernel void compiler_subgroup_buffer_block_read2(global uint *src, global uint2 *dst) +{ + int id = get_global_id(0); + global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*2; + uint2 tmp = intel_sub_group_block_read2(p); + dst[id] = tmp; +} + +__kernel void compiler_subgroup_buffer_block_read4(global uint *src, global uint4 *dst) +{ + int id = get_global_id(0); + global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*4; + uint4 tmp = intel_sub_group_block_read4(p); + dst[id] = tmp; +} + +__kernel void compiler_subgroup_buffer_block_read8(global uint *src, global uint8 *dst) +{ + int id = get_global_id(0); + global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*8; + uint8 tmp = intel_sub_group_block_read8(p); + dst[id] = tmp; +} diff --git a/kernels/compiler_subgroup_buffer_block_write.cl b/kernels/compiler_subgroup_buffer_block_write.cl new file mode 100644 index 0000000..f735855 --- /dev/null +++ b/kernels/compiler_subgroup_buffer_block_write.cl @@ -0,0 +1,27 @@ +__kernel void compiler_subgroup_buffer_block_write1(global uint *src, global uint *dst) +{ + int id = get_global_id(0); + global uint * p = dst + get_sub_group_id() * get_max_sub_group_size(); + intel_sub_group_block_write(p,src[id]); +} + +__kernel void compiler_subgroup_buffer_block_write2(global uint2 *src, global uint *dst) +{ + int id = get_global_id(0); + global uint * p = dst + get_sub_group_id() * get_max_sub_group_size()*2; + intel_sub_group_block_write2(p,src[id]); +} + +__kernel void compiler_subgroup_buffer_block_write4(global uint4 *src, global uint *dst) +{ + int id = get_global_id(0); + global uint * p = dst + get_sub_group_id() * get_max_sub_group_size()*4; + intel_sub_group_block_write4(p,src[id]); +} + +__kernel void compiler_subgroup_buffer_block_write8(global uint8 *src, global uint *dst) +{ + int id = get_global_id(0); + global uint * p = dst + get_sub_group_id() * get_max_sub_group_size()*8; + intel_sub_group_block_write8(p,src[id]); +} diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt index e721179..8765775 100644 --- a/utests/CMakeLists.txt +++ b/utests/CMakeLists.txt @@ -170,6 +170,8 @@ set (utests_sources compiler_subgroup_reduce.cpp compiler_subgroup_scan_exclusive.cpp compiler_subgroup_scan_inclusive.cpp + compiler_subgroup_buffer_block_read.cpp + compiler_subgroup_buffer_block_write.cpp compiler_async_stride_copy.cpp compiler_insn_selection_min.cpp compiler_insn_selection_max.cpp diff --git a/utests/compiler_subgroup_buffer_block_read.cpp b/utests/compiler_subgroup_buffer_block_read.cpp new file mode 100644 index 0000000..334ec76 --- /dev/null +++ b/utests/compiler_subgroup_buffer_block_read.cpp @@ -0,0 +1,194 @@ +#include <cstdint> +#include <cstring> +#include <iostream> +#include "utest_helper.hpp" + +using namespace std; + +/* set to 1 for debug, output of input-expected data */ +#define DEBUG_STDOUT 0 + +/* NDRANGE */ +#define WG_GLOBAL_SIZE 32 +#define WG_LOCAL_SIZE 32 +/* + * Generic compute-expected function for buffer block read + */ +template<class T> +static void compute_expected(T* input, + T* expected, + size_t VEC_SIZE, + size_t SIMD_SIZE) +{ + for(uint32_t i = 0; i < SIMD_SIZE; i++) + for(uint32_t j = 0; j < VEC_SIZE; j++) + expected[i * VEC_SIZE + j] = input[SIMD_SIZE * j + i]; +} + +/* + * Generic input-expected generate function for block read + */ +template<class T> +static void generate_data(T* &input, + T* &expected, + size_t VEC_SIZE, + size_t SIMD_SIZE) +{ + /* allocate input and expected arrays */ + input = new T[WG_GLOBAL_SIZE * VEC_SIZE]; + expected = new T[WG_GLOBAL_SIZE * VEC_SIZE]; + + /* base value for all data types */ + T base_val = (long)7 << (sizeof(T) * 5 - 3); + + /* seed for random inputs */ + srand (time(NULL)); + + /* generate inputs and expected values */ + for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += SIMD_SIZE) + { +#if DEBUG_STDOUT + cout << endl << "IN: " << endl; +#endif + SIMD_SIZE = (gid + SIMD_SIZE) > WG_GLOBAL_SIZE ? WG_GLOBAL_SIZE - gid : SIMD_SIZE; + + /* input values */ + for(uint32_t lid = 0; lid < SIMD_SIZE; lid++) + { + for(uint32_t vsz = 0; vsz < VEC_SIZE; vsz++) + { + /* initially 0, augment after */ + input[(gid + lid)*VEC_SIZE + vsz] = 0; + + /* check all data types, test ideal for QWORD types */ + input[(gid + lid)*VEC_SIZE + vsz] += ((rand() % 2 - 1) * base_val); + /* add trailing random bits, tests GENERAL cases */ + input[(gid + lid)*VEC_SIZE + vsz] += (rand() % 112); + +#if DEBUG_STDOUT + /* output generated input */ + cout << setw(4) << input[(gid + lid)*VEC_SIZE + vsz] << ", " ; + if((lid + 1) % 8 == 0) + cout << endl; +#endif + } + } + + /* expected values */ + compute_expected(input + gid * VEC_SIZE, expected + gid * VEC_SIZE, VEC_SIZE, SIMD_SIZE); + +#if DEBUG_STDOUT + /* output expected input */ + cout << endl << "EXP: " << endl; + for(uint32_t lid = 0; lid < SIMD_SIZE ; lid++){ + for(uint32_t vsz = 0; vsz < VEC_SIZE; vsz++) + cout << setw(4) << expected[(gid + lid)*VEC_SIZE + vsz] << ", " ; + if((lid + 1) % 8 == 0) + cout << endl; + } + cout << endl; +#endif + + } +} + +/* + * Generic subgroup utest function for buffer block read + */ +template<class T> +static void subgroup_generic(T* input, + T* expected, + size_t VEC_SIZE) +{ + /* get simd size */ + globals[0] = WG_GLOBAL_SIZE; + locals[0] = WG_LOCAL_SIZE; + size_t SIMD_SIZE = 0; + OCL_CALL(clGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL); + + size_t buf_sz = VEC_SIZE * WG_GLOBAL_SIZE; + /* input and expected data */ + generate_data(input, expected, VEC_SIZE, SIMD_SIZE); + + /* prepare input for datatype */ + OCL_CREATE_BUFFER(buf[0], 0, buf_sz * sizeof(T), NULL); + OCL_CREATE_BUFFER(buf[1], 0, buf_sz * sizeof(T), NULL); + OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); + OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); + + /* set input data for GPU */ + OCL_MAP_BUFFER(0); + memcpy(buf_data[0], input, buf_sz* sizeof(T)); + OCL_UNMAP_BUFFER(0); + + /* run the kernel on GPU */ + OCL_NDRANGE(1); + + /* check if mismatch */ + OCL_MAP_BUFFER(1); + uint32_t mismatches = 0; + + for (uint32_t i = 0; i < buf_sz; i++) + if(((T *)buf_data[1])[i] != *(expected + i)) + { + /* found mismatch, increment */ + mismatches++; + +#if DEBUG_STDOUT + /* output mismatch */ + cout << "Err at " << i << ", " << + ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl; +#endif + } + +#if DEBUG_STDOUT + /* output mismatch count */ + cout << "mismatches " << mismatches << endl; +#endif + + OCL_UNMAP_BUFFER(1); + + OCL_ASSERT(mismatches == 0); + free(input); + free(expected); +} + +/* + * subgroup buffer block read + */ +void compiler_subgroup_buffer_block_read1(void) +{ + cl_uint *input = NULL; + cl_uint *expected = NULL; + OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_read", + "compiler_subgroup_buffer_block_read1"); + subgroup_generic(input, expected, 1); +} +MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read1); +void compiler_subgroup_buffer_block_read2(void) +{ + cl_uint *input = NULL; + cl_uint *expected = NULL; + OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_read", + "compiler_subgroup_buffer_block_read2"); + subgroup_generic(input, expected, 2); +} +MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read2); +void compiler_subgroup_buffer_block_read4(void) +{ + cl_uint *input = NULL; + cl_uint *expected = NULL; + OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_read", + "compiler_subgroup_buffer_block_read4"); + subgroup_generic(input, expected, 4); +} +MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read4); +void compiler_subgroup_buffer_block_read8(void) +{ + cl_uint *input = NULL; + cl_uint *expected = NULL; + OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_read", + "compiler_subgroup_buffer_block_read8"); + subgroup_generic(input, expected, 8); +} +MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_read8); diff --git a/utests/compiler_subgroup_buffer_block_write.cpp b/utests/compiler_subgroup_buffer_block_write.cpp new file mode 100644 index 0000000..fb50a94 --- /dev/null +++ b/utests/compiler_subgroup_buffer_block_write.cpp @@ -0,0 +1,194 @@ +#include <cstdint> +#include <cstring> +#include <iostream> +#include "utest_helper.hpp" + +using namespace std; + +/* set to 1 for debug, output of input-expected data */ +#define DEBUG_STDOUT 0 + +/* NDRANGE */ +#define WG_GLOBAL_SIZE 32 +#define WG_LOCAL_SIZE 32 +/* + * Generic input-expected generate function for block write + */ +template<class T> +static void compute_expected(T* input, + T* expected, + size_t VEC_SIZE, + size_t SIMD_SIZE) +{ + for(uint32_t i = 0; i < SIMD_SIZE; i++) + for(uint32_t j = 0; j < VEC_SIZE; j++) + expected[SIMD_SIZE * j + i] = input[i * VEC_SIZE + j]; +} + +/* + * Generic compute-expected function for buffer block write + */ +template<class T> +static void generate_data(T* &input, + T* &expected, + size_t VEC_SIZE, + size_t SIMD_SIZE) +{ + /* allocate input and expected arrays */ + input = new T[WG_GLOBAL_SIZE * VEC_SIZE]; + expected = new T[WG_GLOBAL_SIZE * VEC_SIZE]; + + /* base value for all data types */ + T base_val = (long)7 << (sizeof(T) * 5 - 3); + + /* seed for random inputs */ + srand (time(NULL)); + + /* generate inputs and expected values */ + for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += SIMD_SIZE) + { +#if DEBUG_STDOUT + cout << endl << "IN: " << endl; +#endif + SIMD_SIZE = (gid + SIMD_SIZE) > WG_GLOBAL_SIZE ? WG_GLOBAL_SIZE - gid : SIMD_SIZE; + + /* input values */ + for(uint32_t lid = 0; lid < SIMD_SIZE; lid++) + { + for(uint32_t vsz = 0; vsz < VEC_SIZE; vsz++) + { + /* initially 0, augment after */ + input[(gid + lid)*VEC_SIZE + vsz] = 0; + + /* check all data types, test ideal for QWORD types */ + input[(gid + lid)*VEC_SIZE + vsz] += ((rand() % 2 - 1) * base_val); + /* add trailing random bits, tests GENERAL cases */ + input[(gid + lid)*VEC_SIZE + vsz] += (rand() % 112); + +#if DEBUG_STDOUT + /* output generated input */ + cout << setw(4) << input[(gid + lid)*VEC_SIZE + vsz] << ", " ; + if((lid + 1) % 8 == 0) + cout << endl; +#endif + } + } + + /* expected values */ + compute_expected(input + gid * VEC_SIZE, expected + gid * VEC_SIZE, VEC_SIZE, SIMD_SIZE); + +#if DEBUG_STDOUT + /* output expected input */ + cout << endl << "EXP: " << endl; + for(uint32_t lid = 0; lid < SIMD_SIZE ; lid++){ + for(uint32_t vsz = 0; vsz < VEC_SIZE; vsz++) + cout << setw(4) << expected[(gid + lid)*VEC_SIZE + vsz] << ", " ; + if((lid + 1) % 8 == 0) + cout << endl; + } + cout << endl; +#endif + + } +} + +/* + * Generic subgroup utest function for buffer block write + */ +template<class T> +static void subgroup_generic(T* input, + T* expected, + size_t VEC_SIZE) +{ + /* get simd size */ + globals[0] = WG_GLOBAL_SIZE; + locals[0] = WG_LOCAL_SIZE; + size_t SIMD_SIZE = 0; + OCL_CALL(clGetKernelSubGroupInfoKHR,kernel,device,CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,sizeof(size_t)*1,locals,sizeof(size_t),&SIMD_SIZE,NULL); + + size_t buf_sz = VEC_SIZE * WG_GLOBAL_SIZE; + /* input and expected data */ + generate_data(input, expected, VEC_SIZE, SIMD_SIZE); + + /* prepare input for datatype */ + OCL_CREATE_BUFFER(buf[0], 0, buf_sz * sizeof(T), NULL); + OCL_CREATE_BUFFER(buf[1], 0, buf_sz * sizeof(T), NULL); + OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); + OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); + + /* set input data for GPU */ + OCL_MAP_BUFFER(0); + memcpy(buf_data[0], input, buf_sz* sizeof(T)); + OCL_UNMAP_BUFFER(0); + + /* run the kernel on GPU */ + OCL_NDRANGE(1); + + /* check if mismatch */ + OCL_MAP_BUFFER(1); + uint32_t mismatches = 0; + + for (uint32_t i = 0; i < buf_sz; i++) + if(((T *)buf_data[1])[i] != *(expected + i)) + { + /* found mismatch, increment */ + mismatches++; + +#if DEBUG_STDOUT + /* output mismatch */ + cout << "Err at " << i << ", " << + ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl; +#endif + } + +#if DEBUG_STDOUT + /* output mismatch count */ + cout << "mismatches " << mismatches << endl; +#endif + + OCL_UNMAP_BUFFER(1); + + OCL_ASSERT(mismatches == 0); + free(input); + free(expected); +} + +/* + * subgroup buffer block write + */ +void compiler_subgroup_buffer_block_write1(void) +{ + cl_uint *input = NULL; + cl_uint *expected = NULL; + OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_write", + "compiler_subgroup_buffer_block_write1"); + subgroup_generic(input, expected, 1); +} +MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write1); +void compiler_subgroup_buffer_block_write2(void) +{ + cl_uint *input = NULL; + cl_uint *expected = NULL; + OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_write", + "compiler_subgroup_buffer_block_write2"); + subgroup_generic(input, expected, 2); +} +MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write2); +void compiler_subgroup_buffer_block_write4(void) +{ + cl_uint *input = NULL; + cl_uint *expected = NULL; + OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_write", + "compiler_subgroup_buffer_block_write4"); + subgroup_generic(input, expected, 4); +} +MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write4); +void compiler_subgroup_buffer_block_write8(void) +{ + cl_uint *input = NULL; + cl_uint *expected = NULL; + OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_buffer_block_write", + "compiler_subgroup_buffer_block_write8"); + subgroup_generic(input, expected, 8); +} +MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_buffer_block_write8); -- 2.7.4 _______________________________________________ Beignet mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/beignet
