2/2 is OK On Mon, 2014-05-12 at 12:41 +0800, [email protected] wrote: > From: Luo <[email protected]> > > seperate the kernel code from host code to make it clean; build the > kernels offline by gbe_bin_generator to improve the performance. > --- > src/CMakeLists.txt | 23 ++- > src/cl_context.h | 24 ++- > src/cl_gt_device.h | 23 ++- > src/cl_mem.c | 214 > ++++++--------------- > src/kernels/cl_internal_copy_buf_align1.cl | 8 - > src/kernels/cl_internal_copy_buf_align16.cl | 2 +- > src/kernels/cl_internal_copy_buf_align4.cl | 2 +- > src/kernels/cl_internal_copy_buf_rect.cl | 15 ++ > .../cl_internal_copy_buf_unalign_dst_offset.cl | 2 +- > .../cl_internal_copy_buf_unalign_same_offset.cl | 2 +- > .../cl_internal_copy_buf_unalign_src_offset.cl | 2 +- > src/kernels/cl_internal_copy_buffer_to_image_2d.cl | 18 ++ > src/kernels/cl_internal_copy_buffer_to_image_3d.cl | 19 ++ > src/kernels/cl_internal_copy_image_2d_to_2d.cl | 21 ++ > src/kernels/cl_internal_copy_image_2d_to_3d.cl | 22 +++ > src/kernels/cl_internal_copy_image_2d_to_buffer.cl | 19 ++ > src/kernels/cl_internal_copy_image_3d_to_2d.cl | 22 +++ > src/kernels/cl_internal_copy_image_3d_to_3d.cl | 23 +++ > src/kernels/cl_internal_copy_image_3d_to_buffer.cl | 22 +++ > 19 files changed, 308 insertions(+), 175 deletions(-) > delete mode 100644 src/kernels/cl_internal_copy_buf_align1.cl > create mode 100644 src/kernels/cl_internal_copy_buf_rect.cl > create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_2d.cl > create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_3d.cl > create mode 100644 src/kernels/cl_internal_copy_image_2d_to_2d.cl > create mode 100644 src/kernels/cl_internal_copy_image_2d_to_3d.cl > create mode 100644 src/kernels/cl_internal_copy_image_2d_to_buffer.cl > create mode 100644 src/kernels/cl_internal_copy_image_3d_to_2d.cl > create mode 100644 src/kernels/cl_internal_copy_image_3d_to_3d.cl > create mode 100644 src/kernels/cl_internal_copy_image_3d_to_buffer.cl > > diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt > index 8164a44..ecc04ab 100644 > --- a/src/CMakeLists.txt > +++ b/src/CMakeLists.txt > @@ -17,11 +17,30 @@ foreach (KF ${KERNEL_FILES}) > endforeach (KF) > endmacro (MakeKernelBinStr) > > +macro (MakeBuiltInKernelStr KERNEL_PATH KERNEL_FILES) > + set (output_file ${KERNEL_PATH}/${BUILT_IN_NAME}.cl) > + set (file_content) > + file (REMOVE ${output_file}) > + foreach (KF ${KERNEL_NAMES}) > + set (input_file ${KERNEL_PATH}/${KF}.cl) > + file(READ ${input_file} file_content ) > + STRING(REGEX REPLACE ";" "\\\\;" file_content "${file_content}") > + file(APPEND ${output_file} ${file_content}) > + endforeach (KF) > +endmacro (MakeBuiltInKernelStr) > + > set (KERNEL_STR_FILES) > -set (KERNEL_NAMES cl_internal_copy_buf_align1 cl_internal_copy_buf_align4 > +set (KERNEL_NAMES cl_internal_copy_buf_align4 > cl_internal_copy_buf_align16 cl_internal_copy_buf_unalign_same_offset > -cl_internal_copy_buf_unalign_dst_offset > cl_internal_copy_buf_unalign_src_offset) > +cl_internal_copy_buf_unalign_dst_offset > cl_internal_copy_buf_unalign_src_offset > +cl_internal_copy_buf_rect cl_internal_copy_image_2d_to_2d > cl_internal_copy_image_3d_to_2d > +cl_internal_copy_image_2d_to_3d cl_internal_copy_image_3d_to_3d > +cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_3d_to_buffer > +cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d) > +set (BUILT_IN_NAME cl_internal_built_in_kernel) > +MakeBuiltInKernelStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" > "${KERNEL_NAMES}") > MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}") > +MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${BUILT_IN_NAME}") > > set(OPENCL_SRC > ${KERNEL_STR_FILES} > diff --git a/src/cl_context.h b/src/cl_context.h > index 782a9af..24281be 100644 > --- a/src/cl_context.h > +++ b/src/cl_context.h > @@ -46,14 +46,22 @@ enum _cl_internal_ker_type { > CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET, > CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET, > CL_ENQUEUE_COPY_BUFFER_RECT, > - CL_ENQUEUE_COPY_IMAGE_0, //copy image 2d to image 2d > - CL_ENQUEUE_COPY_IMAGE_1, //copy image 3d to image 2d > - CL_ENQUEUE_COPY_IMAGE_2, //copy image 2d to image 3d > - CL_ENQUEUE_COPY_IMAGE_3, //copy image 3d to image 3d > - CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0, //copy image 2d to buffer > - CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_1, //copy image 3d tobuffer > - CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0, //copy buffer to image 2d > - CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_1, //copy buffer to image 3d > + CL_ENQUEUE_COPY_IMAGE_2D_TO_2D, //copy image 2d to image 2d > + CL_ENQUEUE_COPY_IMAGE_3D_TO_2D, //copy image 3d to image 2d > + CL_ENQUEUE_COPY_IMAGE_2D_TO_3D, //copy image 2d to image 3d > + CL_ENQUEUE_COPY_IMAGE_3D_TO_3D, //copy image 3d to image 3d > + CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER, //copy image 2d to buffer > + CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER, //copy image 3d tobuffer > + CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D, //copy buffer to image 2d > + CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D, //copy buffer to image 3d > + CL_ENQUEUE_FILL_BUFFER_UNALIGN, //fill buffer with 1 aligne pattern, > pattern size=1 > + CL_ENQUEUE_FILL_BUFFER_ALIGN2, //fill buffer with 2 aligne pattern, > pattern size=2 > + CL_ENQUEUE_FILL_BUFFER_ALIGN4, //fill buffer with 4 aligne pattern, > pattern size=4 > + CL_ENQUEUE_FILL_BUFFER_ALIGN8_8, //fill buffer with 8 aligne pattern, > pattern size=8 > + CL_ENQUEUE_FILL_BUFFER_ALIGN8_16, //fill buffer with 16 aligne pattern, > pattern size=16 > + CL_ENQUEUE_FILL_BUFFER_ALIGN8_32, //fill buffer with 16 aligne pattern, > pattern size=32 > + CL_ENQUEUE_FILL_BUFFER_ALIGN8_64, //fill buffer with 16 aligne pattern, > pattern size=64 > + CL_ENQUEUE_FILL_BUFFER_ALIGN128, //fill buffer with 128 aligne > pattern, pattern size=128 > CL_INTERNAL_KERNEL_MAX > }; > > diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h > index 7e45b4e..8690190 100644 > --- a/src/cl_gt_device.h > +++ b/src/cl_gt_device.h > @@ -75,7 +75,28 @@ DECL_INFO_STRING(version, LIBCL_VERSION_STRING) > DECL_INFO_STRING(profile, "FULL_PROFILE") > DECL_INFO_STRING(opencl_c_version, LIBCL_C_VERSION_STRING) > DECL_INFO_STRING(extensions, "") > -DECL_INFO_STRING(built_in_kernels, "") > +DECL_INFO_STRING(built_in_kernels, "__cl_copy_region_align4;" > + "__cl_copy_region_align16;" > + "__cl_cpy_region_unalign_same_offset;" > + "__cl_copy_region_unalign_dst_offset;" > + "__cl_copy_region_unalign_src_offset;" > + "__cl_copy_buffer_rect;" > + "__cl_copy_image_2d_to_2d;" > + "__cl_copy_image_3d_to_2d;" > + "__cl_copy_image_2d_to_3d;" > + "__cl_copy_image_3d_to_3d;" > + "__cl_copy_image_2d_to_buffer;" > + "__cl_copy_image_3d_to_buffer;" > + "__cl_copy_buffer_to_image_2d;" > + "__cl_copy_buffer_to_image_3d;" > + "__cl_fill_region_unalign;" > + "__cl_fill_region_align2;" > + "__cl_fill_region_align4;" > + "__cl_fill_region_align8_2;" > + "__cl_fill_region_align8_4;" > + "__cl_fill_region_align8_8;" > + "__cl_fill_region_align8_16;" > + "__cl_fill_region_align128;") > DECL_INFO_STRING(driver_version, LIBCL_DRIVER_VERSION_STRING) > #undef DECL_INFO_STRING > > diff --git a/src/cl_mem.c b/src/cl_mem.c > index 5faef4b..7eaf95f 100644 > --- a/src/cl_mem.c > +++ b/src/cl_mem.c > @@ -937,33 +937,19 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem > src_buf, cl_mem dst_buf, > global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; > global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1]; > global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2]; > - cl_int index = CL_ENQUEUE_COPY_BUFFER_RECT; > cl_int src_offset = src_origin[2]*src_slice_pitch + > src_origin[1]*src_row_pitch + src_origin[0]; > cl_int dst_offset = dst_origin[2]*dst_slice_pitch + > dst_origin[1]*dst_row_pitch + dst_origin[0]; > > - static const char *str_kernel = > - "kernel void __cl_cpy_buffer_rect ( \n" > - " global char* src, global char* dst, \n" > - " unsigned int region0, unsigned int region1, unsigned int > region2, \n" > - " unsigned int src_offset, unsigned int dst_offset, \n" > - " unsigned int src_row_pitch, unsigned int src_slice_pitch, \n" > - " unsigned int dst_row_pitch, unsigned int dst_slice_pitch) { \n" > - " int i = get_global_id(0); \n" > - " int j = get_global_id(1); \n" > - " int k = get_global_id(2); \n" > - " if((i >= region0) || (j>= region1) || (k>=region2)) \n" > - " return; \n" > - " src_offset += k * src_slice_pitch + j * src_row_pitch + i; \n" > - " dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; \n" > - " dst[dst_offset] = src[src_offset]; \n" > - "}"; > - > - > /* We use one kernel to copy the data. The kernel is lazily created. */ > assert(src_buf->ctx == dst_buf->ctx); > > /* setup the kernel and run. */ > - ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, NULL); > + extern char cl_internal_copy_buf_rect_str[]; > + extern int cl_internal_copy_buf_rect_str_size; > + > + ker = cl_context_get_static_kernel_form_bin(queue->ctx, > CL_ENQUEUE_COPY_BUFFER_RECT, > + cl_internal_copy_buf_rect_str, > (size_t)cl_internal_copy_buf_rect_str_size, NULL); > + > if (!ker) > return CL_OUT_OF_RESOURCES; > > @@ -992,8 +978,6 @@ cl_mem_kernel_copy_image(cl_command_queue queue, struct > _cl_mem_image* src_image > size_t global_off[] = {0,0,0}; > size_t global_sz[] = {1,1,1}; > size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2}; > - cl_int index = CL_ENQUEUE_COPY_IMAGE_0; > - char option[40] = ""; > uint32_t fixupDataType; > uint32_t savedIntelFmt; > > @@ -1003,15 +987,6 @@ cl_mem_kernel_copy_image(cl_command_queue queue, struct > _cl_mem_image* src_image > global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1]; > global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2]; > > - if(src_image->image_type == CL_MEM_OBJECT_IMAGE3D) { > - strcat(option, "-D SRC_IMAGE_3D"); > - index += 1; > - } > - if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) { > - strcat(option, " -D DST_IMAGE_3D"); > - index += 2; > - } > - > switch (src_image->fmt.image_channel_data_type) { > case CL_SNORM_INT8: > case CL_UNORM_INT8: fixupDataType = CL_UNSIGNED_INT8; break; > @@ -1034,54 +1009,41 @@ cl_mem_kernel_copy_image(cl_command_queue queue, > struct _cl_mem_image* src_image > src_image->intel_fmt = cl_image_get_intel_format(&fmt); > dst_image->intel_fmt = src_image->intel_fmt; > } > - static const char *str_kernel = > - "#ifdef SRC_IMAGE_3D \n" > - " #define SRC_IMAGE_TYPE image3d_t \n" > - " #define SRC_COORD_TYPE int4 \n" > - "#else \n" > - " #define SRC_IMAGE_TYPE image2d_t \n" > - " #define SRC_COORD_TYPE int2 \n" > - "#endif \n" > - "#ifdef DST_IMAGE_3D \n" > - " #define DST_IMAGE_TYPE image3d_t \n" > - " #define DST_COORD_TYPE int4 \n" > - "#else \n" > - " #define DST_IMAGE_TYPE image2d_t \n" > - " #define DST_COORD_TYPE int2 \n" > - "#endif \n" > - "kernel void __cl_copy_image ( \n" > - " __read_only SRC_IMAGE_TYPE src_image, __write_only > DST_IMAGE_TYPE dst_image, \n" > - " unsigned int region0, unsigned int region1, unsigned int > region2, \n" > - " unsigned int src_origin0, unsigned int src_origin1, unsigned > int src_origin2, \n" > - " unsigned int dst_origin0, unsigned int dst_origin1, unsigned > int dst_origin2) { \n" > - " int i = get_global_id(0); \n" > - " int j = get_global_id(1); \n" > - " int k = get_global_id(2); \n" > - " int4 color; \n" > - " const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | > CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; \n" > - " SRC_COORD_TYPE src_coord; \n" > - " DST_COORD_TYPE dst_coord; \n" > - " if((i >= region0) || (j>= region1) || (k>=region2)) \n" > - " return; \n" > - " src_coord.x = src_origin0 + i; \n" > - " src_coord.y = src_origin1 + j; \n" > - "#ifdef SRC_IMAGE_3D \n" > - " src_coord.z = src_origin2 + k; \n" > - "#endif \n" > - " dst_coord.x = dst_origin0 + i; \n" > - " dst_coord.y = dst_origin1 + j; \n" > - "#ifdef DST_IMAGE_3D \n" > - " dst_coord.z = dst_origin2 + k; \n" > - "#endif \n" > - " color = read_imagei(src_image, sampler, src_coord); \n" > - " write_imagei(dst_image, dst_coord, color); \n" > - "}"; > > /* We use one kernel to copy the data. The kernel is lazily created. */ > assert(src_image->base.ctx == dst_image->base.ctx); > > /* setup the kernel and run. */ > - ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, option); > + if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D) { > + if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) { > + extern char cl_internal_copy_image_2d_to_2d_str[]; > + extern int cl_internal_copy_image_2d_to_2d_str_size; > + > + ker = cl_context_get_static_kernel_form_bin(queue->ctx, > CL_ENQUEUE_COPY_IMAGE_2D_TO_2D, > + cl_internal_copy_image_2d_to_2d_str, > (size_t)cl_internal_copy_image_2d_to_2d_str_size, NULL); > + }else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) { > + extern char cl_internal_copy_image_2d_to_3d_str[]; > + extern int cl_internal_copy_image_2d_to_3d_str_size; > + > + ker = cl_context_get_static_kernel_form_bin(queue->ctx, > CL_ENQUEUE_COPY_IMAGE_2D_TO_3D, > + cl_internal_copy_image_2d_to_3d_str, > (size_t)cl_internal_copy_image_2d_to_3d_str_size, NULL); > + } > + }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE3D) { > + if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) { > + extern char cl_internal_copy_image_3d_to_2d_str[]; > + extern int cl_internal_copy_image_3d_to_2d_str_size; > + > + ker = cl_context_get_static_kernel_form_bin(queue->ctx, > CL_ENQUEUE_COPY_IMAGE_3D_TO_2D, > + cl_internal_copy_image_3d_to_2d_str, > (size_t)cl_internal_copy_image_3d_to_2d_str_size, NULL); > + }else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) { > + extern char cl_internal_copy_image_3d_to_3d_str[]; > + extern int cl_internal_copy_image_3d_to_3d_str_size; > + > + ker = cl_context_get_static_kernel_form_bin(queue->ctx, > CL_ENQUEUE_COPY_IMAGE_3D_TO_3D, > + cl_internal_copy_image_3d_to_3d_str, > (size_t)cl_internal_copy_image_3d_to_3d_str_size, NULL); > + } > + } > + > if (!ker) { > ret = CL_OUT_OF_RESOURCES; > goto fail; > @@ -1117,8 +1079,6 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, > struct _cl_mem_image* image, > size_t global_off[] = {0,0,0}; > size_t global_sz[] = {1,1,1}; > size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2}; > - cl_int index = CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_0; > - char option[40] = ""; > uint32_t intel_fmt, bpp; > cl_image_format fmt; > size_t origin0, region0; > @@ -1129,42 +1089,6 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, > struct _cl_mem_image* image, > global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1]; > global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2]; > > - if(image->image_type == CL_MEM_OBJECT_IMAGE3D) { > - strcat(option, "-D IMAGE_3D"); > - index += 1; > - } > - > - static const char *str_kernel = > - "#ifdef IMAGE_3D \n" > - " #define IMAGE_TYPE image3d_t \n" > - " #define COORD_TYPE int4 \n" > - "#else \n" > - " #define IMAGE_TYPE image2d_t \n" > - " #define COORD_TYPE int2 \n" > - "#endif \n" > - "kernel void __cl_copy_image_to_buffer ( \n" > - " __read_only IMAGE_TYPE image, global uchar* buffer, \n" > - " unsigned int region0, unsigned int region1, unsigned int > region2, \n" > - " unsigned int src_origin0, unsigned int src_origin1, unsigned > int src_origin2, \n" > - " unsigned int dst_offset) { \n" > - " int i = get_global_id(0); \n" > - " int j = get_global_id(1); \n" > - " int k = get_global_id(2); \n" > - " uint4 color; \n" > - " const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | > CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; \n" > - " COORD_TYPE src_coord; \n" > - " if((i >= region0) || (j>= region1) || (k>=region2)) \n" > - " return; \n" > - " src_coord.x = src_origin0 + i; \n" > - " src_coord.y = src_origin1 + j; \n" > - "#ifdef IMAGE_3D \n" > - " src_coord.z = src_origin2 + k; \n" > - "#endif \n" > - " color = read_imageui(image, sampler, src_coord); \n" > - " dst_offset += (k * region1 + j) * region0 + i; \n" > - " buffer[dst_offset] = color.x; \n" > - "}"; > - > /* We use one kernel to copy the data. The kernel is lazily created. */ > assert(image->base.ctx == buffer->ctx); > > @@ -1180,7 +1104,20 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, > struct _cl_mem_image* image, > global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; > > /* setup the kernel and run. */ > - ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, option); > + if(image->image_type == CL_MEM_OBJECT_IMAGE2D) { > + extern char cl_internal_copy_image_2d_to_buffer_str[]; > + extern int cl_internal_copy_image_2d_to_buffer_str_size; > + > + ker = cl_context_get_static_kernel_form_bin(queue->ctx, > CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER, > + cl_internal_copy_image_2d_to_buffer_str, > (size_t)cl_internal_copy_image_2d_to_buffer_str_size, NULL); > + }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) { > + extern char cl_internal_copy_image_3d_to_buffer_str[]; > + extern int cl_internal_copy_image_3d_to_buffer_str_size; > + > + ker = cl_context_get_static_kernel_form_bin(queue->ctx, > CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER, > + cl_internal_copy_image_3d_to_buffer_str, > (size_t)cl_internal_copy_image_3d_to_buffer_str_size, NULL); > + } > + > if (!ker) { > ret = CL_OUT_OF_RESOURCES; > goto fail; > @@ -1216,8 +1153,6 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, > cl_mem buffer, struct _cl_me > size_t global_off[] = {0,0,0}; > size_t global_sz[] = {1,1,1}; > size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2}; > - cl_int index = CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0; > - char option[40] = ""; > uint32_t intel_fmt, bpp; > cl_image_format fmt; > size_t origin0, region0; > @@ -1228,41 +1163,6 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, > cl_mem buffer, struct _cl_me > global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1]; > global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2]; > > - if(image->image_type == CL_MEM_OBJECT_IMAGE3D) { > - strcat(option, "-D IMAGE_3D"); > - index += 1; > - } > - > - static const char *str_kernel = > - "#ifdef IMAGE_3D \n" > - " #define IMAGE_TYPE image3d_t \n" > - " #define COORD_TYPE int4 \n" > - "#else \n" > - " #define IMAGE_TYPE image2d_t \n" > - " #define COORD_TYPE int2 \n" > - "#endif \n" > - "kernel void __cl_copy_image_to_buffer ( \n" > - " __read_only IMAGE_TYPE image, global uchar* buffer, \n" > - " unsigned int region0, unsigned int region1, unsigned int > region2, \n" > - " unsigned int dst_origin0, unsigned int dst_origin1, unsigned > int dst_origin2, \n" > - " unsigned int src_offset) { \n" > - " int i = get_global_id(0); \n" > - " int j = get_global_id(1); \n" > - " int k = get_global_id(2); \n" > - " uint4 color = (uint4)(0); \n" > - " COORD_TYPE dst_coord; \n" > - " if((i >= region0) || (j>= region1) || (k>=region2)) \n" > - " return; \n" > - " dst_coord.x = dst_origin0 + i; \n" > - " dst_coord.y = dst_origin1 + j; \n" > - "#ifdef IMAGE_3D \n" > - " dst_coord.z = dst_origin2 + k; \n" > - "#endif \n" > - " src_offset += (k * region1 + j) * region0 + i; \n" > - " color.x = buffer[src_offset]; \n" > - " write_imageui(image, dst_coord, color); \n" > - "}"; > - > /* We use one kernel to copy the data. The kernel is lazily created. */ > assert(image->base.ctx == buffer->ctx); > > @@ -1278,7 +1178,19 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, > cl_mem buffer, struct _cl_me > global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; > > /* setup the kernel and run. */ > - ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, option); > + if(image->image_type == CL_MEM_OBJECT_IMAGE2D) { > + extern char cl_internal_copy_buffer_to_image_2d_str[]; > + extern int cl_internal_copy_buffer_to_image_2d_str_size; > + > + ker = cl_context_get_static_kernel_form_bin(queue->ctx, > CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D, > + cl_internal_copy_buffer_to_image_2d_str, > (size_t)cl_internal_copy_buffer_to_image_2d_str_size, NULL); > + }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) { > + extern char cl_internal_copy_buffer_to_image_3d_str[]; > + extern int cl_internal_copy_buffer_to_image_3d_str_size; > + > + ker = cl_context_get_static_kernel_form_bin(queue->ctx, > CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D, > + cl_internal_copy_buffer_to_image_3d_str, > (size_t)cl_internal_copy_buffer_to_image_3d_str_size, NULL); > + } > if (!ker) > return CL_OUT_OF_RESOURCES; > > diff --git a/src/kernels/cl_internal_copy_buf_align1.cl > b/src/kernels/cl_internal_copy_buf_align1.cl > deleted file mode 100644 > index cd3ec7b..0000000 > --- a/src/kernels/cl_internal_copy_buf_align1.cl > +++ /dev/null > @@ -1,8 +0,0 @@ > -kernel void __cl_cpy_region_align1 ( global char* src, unsigned int > src_offset, > - global char* dst, unsigned int > dst_offset, > - unsigned int size) > -{ > - int i = get_global_id(0); > - if (i < size) > - dst[i+dst_offset] = src[i+src_offset]; > -} > diff --git a/src/kernels/cl_internal_copy_buf_align16.cl > b/src/kernels/cl_internal_copy_buf_align16.cl > index 75b1a4a..1abb4e9 100644 > --- a/src/kernels/cl_internal_copy_buf_align16.cl > +++ b/src/kernels/cl_internal_copy_buf_align16.cl > @@ -1,4 +1,4 @@ > -kernel void __cl_cpy_region_align16 ( global float* src, unsigned int > src_offset, > +kernel void __cl_copy_region_align16 ( global float* src, unsigned int > src_offset, > global float* dst, unsigned int > dst_offset, > unsigned int size) > { > diff --git a/src/kernels/cl_internal_copy_buf_align4.cl > b/src/kernels/cl_internal_copy_buf_align4.cl > index 44a0f81..27174ca 100644 > --- a/src/kernels/cl_internal_copy_buf_align4.cl > +++ b/src/kernels/cl_internal_copy_buf_align4.cl > @@ -1,4 +1,4 @@ > -kernel void __cl_cpy_region_align4 ( global float* src, unsigned int > src_offset, > +kernel void __cl_copy_region_align4 ( global float* src, unsigned int > src_offset, > global float* dst, unsigned int > dst_offset, > unsigned int size) > { > diff --git a/src/kernels/cl_internal_copy_buf_rect.cl > b/src/kernels/cl_internal_copy_buf_rect.cl > new file mode 100644 > index 0000000..71e7484 > --- /dev/null > +++ b/src/kernels/cl_internal_copy_buf_rect.cl > @@ -0,0 +1,15 @@ > +kernel void __cl_copy_buffer_rect ( global char* src, global char* dst, > + unsigned int region0, unsigned int > region1, unsigned int region2, > + unsigned int src_offset, unsigned > int dst_offset, > + unsigned int src_row_pitch, > unsigned int src_slice_pitch, > + unsigned int dst_row_pitch, > unsigned int dst_slice_pitch) > +{ > + int i = get_global_id(0); > + int j = get_global_id(1); > + int k = get_global_id(2); > + if((i >= region0) || (j>= region1) || (k>=region2)) > + return; > + src_offset += k * src_slice_pitch + j * src_row_pitch + i; > + dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; > + dst[dst_offset] = src[src_offset]; > +} > diff --git a/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl > b/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl > index 13f4162..e02d0e5 100644 > --- a/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl > +++ b/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl > @@ -1,4 +1,4 @@ > -kernel void __cl_cpy_region_unalign_dst_offset ( global int* src, unsigned > int src_offset, > +kernel void __cl_copy_region_unalign_dst_offset ( global int* src, unsigned > int src_offset, > global int* dst, unsigned int > dst_offset, > unsigned int size, > unsigned int first_mask, unsigned int > last_mask, > diff --git a/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl > b/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl > index 8510246..83b6e97 100644 > --- a/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl > +++ b/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl > @@ -1,4 +1,4 @@ > -kernel void __cl_cpy_region_unalign_same_offset ( global int* src, unsigned > int src_offset, > +kernel void __cl_copy_region_unalign_same_offset ( global int* src, unsigned > int src_offset, > global int* dst, unsigned int > dst_offset, > unsigned int size, > unsigned int first_mask, unsigned int > last_mask) > diff --git a/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl > b/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl > index f98368a..ce0aa1d 100644 > --- a/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl > +++ b/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl > @@ -1,4 +1,4 @@ > -kernel void __cl_cpy_region_unalign_src_offset ( global int* src, unsigned > int src_offset, > +kernel void __cl_copy_region_unalign_src_offset ( global int* src, unsigned > int src_offset, > global int* dst, unsigned int > dst_offset, > unsigned int size, > unsigned int first_mask, unsigned int > last_mask, > diff --git a/src/kernels/cl_internal_copy_buffer_to_image_2d.cl > b/src/kernels/cl_internal_copy_buffer_to_image_2d.cl > new file mode 100644 > index 0000000..a218b58 > --- /dev/null > +++ b/src/kernels/cl_internal_copy_buffer_to_image_2d.cl > @@ -0,0 +1,18 @@ > +kernel void __cl_copy_buffer_to_image_2d(__read_only image2d_t image, global > uchar* buffer, > + unsigned int region0, unsigned int > region1, unsigned int region2, > + unsigned int dst_origin0, unsigned > int dst_origin1, unsigned int dst_origin2, > + unsigned int src_offset) > +{ > + int i = get_global_id(0); > + int j = get_global_id(1); > + int k = get_global_id(2); > + uint4 color = (uint4)(0); > + int2 dst_coord; > + if((i >= region0) || (j>= region1) || (k>=region2)) > + return; > + dst_coord.x = dst_origin0 + i; > + dst_coord.y = dst_origin1 + j; > + src_offset += (k * region1 + j) * region0 + i; > + color.x = buffer[src_offset]; > + write_imageui(image, dst_coord, color); > +} > diff --git a/src/kernels/cl_internal_copy_buffer_to_image_3d.cl > b/src/kernels/cl_internal_copy_buffer_to_image_3d.cl > new file mode 100644 > index 0000000..84d3b27 > --- /dev/null > +++ b/src/kernels/cl_internal_copy_buffer_to_image_3d.cl > @@ -0,0 +1,19 @@ > +kernel void __cl_copy_buffer_to_image_3d(__read_only image3d_t image, global > uchar* buffer, > + unsigned int region0, unsigned int > region1, unsigned int region2, > + unsigned int dst_origin0, unsigned > int dst_origin1, unsigned int dst_origin2, > + unsigned int src_offset) > +{ > + int i = get_global_id(0); > + int j = get_global_id(1); > + int k = get_global_id(2); > + uint4 color = (uint4)(0); > + int4 dst_coord; > + if((i >= region0) || (j>= region1) || (k>=region2)) > + return; > + dst_coord.x = dst_origin0 + i; > + dst_coord.y = dst_origin1 + j; > + dst_coord.z = dst_origin2 + k; > + src_offset += (k * region1 + j) * region0 + i; > + color.x = buffer[src_offset]; > + write_imageui(image, dst_coord, color); > +} > diff --git a/src/kernels/cl_internal_copy_image_2d_to_2d.cl > b/src/kernels/cl_internal_copy_image_2d_to_2d.cl > new file mode 100644 > index 0000000..c5eaab1 > --- /dev/null > +++ b/src/kernels/cl_internal_copy_image_2d_to_2d.cl > @@ -0,0 +1,21 @@ > +kernel void __cl_copy_image_2d_to_2d(__read_only image2d_t src_image, > __write_only image2d_t dst_image, > + unsigned int region0, unsigned int region1, > unsigned int region2, > + unsigned int src_origin0, unsigned int > src_origin1, unsigned int src_origin2, > + unsigned int dst_origin0, unsigned int > dst_origin1, unsigned int dst_origin2) > +{ > + int i = get_global_id(0); > + int j = get_global_id(1); > + int k = get_global_id(2); > + int4 color; > + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | > CLK_FILTER_NEAREST; > + int2 src_coord; > + int2 dst_coord; > + if((i >= region0) || (j>= region1) || (k>=region2)) > + return; > + src_coord.x = src_origin0 + i; > + src_coord.y = src_origin1 + j; > + dst_coord.x = dst_origin0 + i; > + dst_coord.y = dst_origin1 + j; > + color = read_imagei(src_image, sampler, src_coord); > + write_imagei(dst_image, dst_coord, color); > +} > diff --git a/src/kernels/cl_internal_copy_image_2d_to_3d.cl > b/src/kernels/cl_internal_copy_image_2d_to_3d.cl > new file mode 100644 > index 0000000..4c73a74 > --- /dev/null > +++ b/src/kernels/cl_internal_copy_image_2d_to_3d.cl > @@ -0,0 +1,22 @@ > +kernel void __cl_copy_image_2d_to_3d(__read_only image2d_t src_image, > __write_only image3d_t dst_image, > + unsigned int region0, unsigned int > region1, unsigned int region2, > + unsigned int src_origin0, unsigned > int src_origin1, unsigned int src_origin2, > + unsigned int dst_origin0, unsigned > int dst_origin1, unsigned int dst_origin2) > +{ > + int i = get_global_id(0); > + int j = get_global_id(1); > + int k = get_global_id(2); > + int4 color; > + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | > CLK_FILTER_NEAREST; > + int2 src_coord; > + int4 dst_coord; > + if((i >= region0) || (j>= region1) || (k>=region2)) > + return; > + src_coord.x = src_origin0 + i; > + src_coord.y = src_origin1 + j; > + dst_coord.x = dst_origin0 + i; > + dst_coord.y = dst_origin1 + j; > + dst_coord.z = dst_origin2 + k; > + color = read_imagei(src_image, sampler, src_coord); > + write_imagei(dst_image, dst_coord, color); > +} > diff --git a/src/kernels/cl_internal_copy_image_2d_to_buffer.cl > b/src/kernels/cl_internal_copy_image_2d_to_buffer.cl > new file mode 100644 > index 0000000..b6c352e > --- /dev/null > +++ b/src/kernels/cl_internal_copy_image_2d_to_buffer.cl > @@ -0,0 +1,19 @@ > +kernel void __cl_copy_image_2d_to_buffer( __read_only image2d_t image, > global uchar* buffer, > + unsigned int region0, unsigned int > region1, unsigned int region2, > + unsigned int src_origin0, unsigned > int src_origin1, unsigned int src_origin2, > + unsigned int dst_offset) > +{ > + int i = get_global_id(0); > + int j = get_global_id(1); > + int k = get_global_id(2); > + uint4 color; > + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | > CLK_FILTER_NEAREST; > + int2 src_coord; > + if((i >= region0) || (j>= region1) || (k>=region2)) > + return; > + src_coord.x = src_origin0 + i; > + src_coord.y = src_origin1 + j; > + color = read_imageui(image, sampler, src_coord); > + dst_offset += (k * region1 + j) * region0 + i; > + buffer[dst_offset] = color.x; > +} > diff --git a/src/kernels/cl_internal_copy_image_3d_to_2d.cl > b/src/kernels/cl_internal_copy_image_3d_to_2d.cl > new file mode 100644 > index 0000000..e0effa0 > --- /dev/null > +++ b/src/kernels/cl_internal_copy_image_3d_to_2d.cl > @@ -0,0 +1,22 @@ > +kernel void __cl_copy_image_3d_to_2d(__read_only image3d_t src_image, > __write_only image2d_t dst_image, > + unsigned int region0, unsigned int region1, > unsigned int region2, > + unsigned int src_origin0, unsigned int > src_origin1, unsigned int src_origin2, > + unsigned int dst_origin0, unsigned int > dst_origin1, unsigned int dst_origin2) > +{ > + int i = get_global_id(0); > + int j = get_global_id(1); > + int k = get_global_id(2); > + int4 color; > + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | > CLK_FILTER_NEAREST; > + int4 src_coord; > + int2 dst_coord; > + if((i >= region0) || (j>= region1) || (k>=region2)) > + return; > + src_coord.x = src_origin0 + i; > + src_coord.y = src_origin1 + j; > + src_coord.z = src_origin2 + k; > + dst_coord.x = dst_origin0 + i; > + dst_coord.y = dst_origin1 + j; > + color = read_imagei(src_image, sampler, src_coord); > + write_imagei(dst_image, dst_coord, color); > +} > diff --git a/src/kernels/cl_internal_copy_image_3d_to_3d.cl > b/src/kernels/cl_internal_copy_image_3d_to_3d.cl > new file mode 100644 > index 0000000..de80a0a > --- /dev/null > +++ b/src/kernels/cl_internal_copy_image_3d_to_3d.cl > @@ -0,0 +1,23 @@ > +kernel void __cl_copy_image_3d_to_3d(__read_only image3d_t src_image, > __write_only image3d_t dst_image, > + unsigned int region0, unsigned int region1, > unsigned int region2, > + unsigned int src_origin0, unsigned int > src_origin1, unsigned int src_origin2, > + unsigned int dst_origin0, unsigned int > dst_origin1, unsigned int dst_origin2) > +{ > + int i = get_global_id(0); > + int j = get_global_id(1); > + int k = get_global_id(2); > + int4 color; > + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | > CLK_FILTER_NEAREST; > + int4 src_coord; > + int4 dst_coord; > + if((i >= region0) || (j>= region1) || (k>=region2)) > + return; > + src_coord.x = src_origin0 + i; > + src_coord.y = src_origin1 + j; > + src_coord.z = src_origin2 + k; > + dst_coord.x = dst_origin0 + i; > + dst_coord.y = dst_origin1 + j; > + dst_coord.z = dst_origin2 + k; > + color = read_imagei(src_image, sampler, src_coord); > + write_imagei(dst_image, dst_coord, color); > +} > diff --git a/src/kernels/cl_internal_copy_image_3d_to_buffer.cl > b/src/kernels/cl_internal_copy_image_3d_to_buffer.cl > new file mode 100644 > index 0000000..dcfc8a2 > --- /dev/null > +++ b/src/kernels/cl_internal_copy_image_3d_to_buffer.cl > @@ -0,0 +1,22 @@ > +#define IMAGE_TYPE image3d_t > +#define COORD_TYPE int4 > +kernel void __cl_copy_image_3d_to_buffer ( __read_only IMAGE_TYPE image, > global uchar* buffer, > + unsigned int region0, unsigned int > region1, unsigned int region2, > + unsigned int src_origin0, unsigned > int src_origin1, unsigned int src_origin2, > + unsigned int dst_offset) > +{ > + int i = get_global_id(0); > + int j = get_global_id(1); > + int k = get_global_id(2); > + uint4 color; > + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | > CLK_FILTER_NEAREST; > + COORD_TYPE src_coord; > + if((i >= region0) || (j>= region1) || (k>=region2)) > + return; > + src_coord.x = src_origin0 + i; > + src_coord.y = src_origin1 + j; > + src_coord.z = src_origin2 + k; > + color = read_imageui(image, sampler, src_coord); > + dst_offset += (k * region1 + j) * region0 + i; > + buffer[dst_offset] = color.x; > +}
_______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
