I dig into this patch carefully, and found one serious issue in this patch. You should also take into account the local_sz[0] and the region[0]. Either divide it by 4 on the kernel or on the host side. I recommend you to divide them on CPU side. And you need to check whether region[0] is multipe of 4 as well.
There is another issue beyond this patch but is related to cl_mem_copy_buffer_rect(), the task dimension is hard coded to 1 which is incorrect. It could be 1D/2D/3D rect. We should check whether the region[2] and region[3] are zero to determine the actual dimension. On Wed, Jul 16, 2014 at 12:47:02AM +0000, Lv, Meng wrote: > > > -----Original Message----- > From: Zhigang Gong [mailto:[email protected]] > Sent: Tuesday, July 15, 2014 5:55 PM > To: Lv, Meng > Cc: [email protected] > Subject: Re: [Beignet] [PATCH] [PATCH_V2]improve the clEnqueueCopyBufferRect > performance in some cases > > On Tue, Jul 15, 2014 at 12:22:55PM +0800, Lv Meng wrote: > > Signed-off-by: Lv Meng <[email protected]> > > --- > > src/CMakeLists.txt | 3 ++- > > src/cl_context.h | 1 + > > src/cl_mem.c | 31 > > ++++++++++++++++++++++--- > > src/kernels/cl_internal_copy_buf_rect_align4.cl | 15 ++++++++++++ > > 4 files changed, 46 insertions(+), 4 deletions(-) create mode 100644 > > src/kernels/cl_internal_copy_buf_rect_align4.cl > > > > diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index > > 46426d9..dff8fdf 100644 > > --- a/src/CMakeLists.txt > > +++ b/src/CMakeLists.txt > > @@ -41,7 +41,8 @@ set (KERNEL_STR_FILES) set (KERNEL_NAMES > > cl_internal_copy_buf_align4 > > cl_internal_copy_buf_align16 cl_internal_copy_buf_unalign_same_offset > > cl_internal_copy_buf_unalign_dst_offset > > cl_internal_copy_buf_unalign_src_offset > > -cl_internal_copy_buf_rect cl_internal_copy_image_1d_to_1d > > cl_internal_copy_image_2d_to_2d > > +cl_internal_copy_buf_rect cl_internal_copy_buf_rect_align4 > > +cl_internal_copy_image_1d_to_1d cl_internal_copy_image_2d_to_2d > > cl_internal_copy_image_3d_to_2d cl_internal_copy_image_2d_to_3d > > cl_internal_copy_image_3d_to_3d cl_internal_copy_image_2d_to_buffer > > cl_internal_copy_image_3d_to_buffer > > cl_internal_copy_buffer_to_image_2d > > cl_internal_copy_buffer_to_image_3d > > diff --git a/src/cl_context.h b/src/cl_context.h index > > 75afbf6..f8342d3 100644 > > --- a/src/cl_context.h > > +++ b/src/cl_context.h > > @@ -47,6 +47,7 @@ enum _cl_internal_ker_type { > > CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET, > > CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET, > > CL_ENQUEUE_COPY_BUFFER_RECT, > > + CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4, > > CL_ENQUEUE_COPY_IMAGE_1D_TO_1D, //copy image 1d to image 1d > > CL_ENQUEUE_COPY_IMAGE_2D_TO_2D, //copy image 2d to image 2d > > CL_ENQUEUE_COPY_IMAGE_3D_TO_2D, //copy image 3d to image 2d > > diff --git a/src/cl_mem.c b/src/cl_mem.c index 70bc3eb..c125f62 100644 > > --- a/src/cl_mem.c > > +++ b/src/cl_mem.c > > @@ -1399,6 +1399,16 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, > > cl_mem src_buf, cl_mem dst_buf, > > size_t global_off[] = {0,0,0}; > > size_t global_sz[] = {1,1,1}; > > size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_1}; > > + // the src and dst mem rect is continuous, the copy is degraded to > > + buf copy if((region[0] == dst_row_pitch) && (region[0] == > > + src_row_pitch) && (region[1] * src_row_pitch == src_slice_pitch) && > > (region[1] * dst_row_pitch == dst_slice_pitch)){ > > + cl_int src_offset = src_origin[2]*src_slice_pitch + > > src_origin[1]*src_row_pitch + src_origin[0]; > > + cl_int dst_offset = dst_origin[2]*dst_slice_pitch + > > dst_origin[1]*dst_row_pitch + dst_origin[0]; > > + cl_int size = region[0]*region[1]*region[2]; > > + ret = cl_mem_copy(queue, src_buf, dst_buf,src_offset, dst_offset, > > size); > > + return ret; > > + } > > + > > if(region[1] == 1) local_sz[1] = 1; > > if(region[2] == 1) local_sz[2] = 1; > > global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * > > local_sz[0]; @@ -1411,11 +1421,26 @@ > > cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem > > dst_buf, > > assert(src_buf->ctx == dst_buf->ctx); > > > > /* setup the kernel and run. */ > > - extern char cl_internal_copy_buf_rect_str[]; > > - extern size_t cl_internal_copy_buf_rect_str_size; > > > > - ker = cl_context_get_static_kernel_from_bin(queue->ctx, > > CL_ENQUEUE_COPY_BUFFER_RECT, > > + if( (src_offset % 4== 0) && (dst_offset % 4== 0) && (src_row_pitch > > + % 4== 0) && (dst_row_pitch % 4== 0) && (src_slice_pitch % 4== 0) && > > (dst_slice_pitch % 4== 0) && (global_sz[0] % 4 == 0) ){ > > + extern char cl_internal_copy_buf_rect_align4_str[]; > > + extern size_t cl_internal_copy_buf_rect_align4_str_size; > > + global_sz[0] /= 4; > > + src_offset /= 4; > > + dst_offset /= 4; > > + src_row_pitch /= 4; > > + dst_row_pitch /= 4; > > + src_slice_pitch /= 4; > > + dst_slice_pitch /= 4; > > + ker = cl_context_get_static_kernel_from_bin(queue->ctx, > > CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4, > > + cl_internal_copy_buf_rect_align4_str, > > + (size_t)cl_internal_copy_buf_rect_align4_str_size, NULL); }else{ > > + extern char cl_internal_copy_buf_rect_str[]; > > + extern size_t cl_internal_copy_buf_rect_str_size; > > + ker = cl_context_get_static_kernel_from_bin(queue->ctx, > > + CL_ENQUEUE_COPY_BUFFER_RECT, > > cl_internal_copy_buf_rect_str, > > (size_t)cl_internal_copy_buf_rect_str_size, NULL); > > + } > > > > if (!ker) > > return CL_OUT_OF_RESOURCES; > > diff --git a/src/kernels/cl_internal_copy_buf_rect_align4.cl > > b/src/kernels/cl_internal_copy_buf_rect_align4.cl > > new file mode 100644 > > index 0000000..fbfe7b2 > > --- /dev/null > > +++ b/src/kernels/cl_internal_copy_buf_rect_align4.cl > > @@ -0,0 +1,15 @@ > > +kernel void __cl_copy_buffer_rect_align4 ( global int* src, global int* > > dst, > > + unsigned int region0, unsigned > > int region1, unsigned int region2, > > + unsigned int src_offset, > > unsigned int dst_offset, > > + unsigned int src_row_pitch, > > unsigned int src_slice_pitch, > > + unsigned int dst_row_pitch, > > +unsigned int dst_slice_pitch) { > > + int i = get_global_id(0); > > + int j = get_global_id(1); > > + int k = get_global_id(2); > > + if((i >= region0) || (j>= region1) || (k>=region2)) > > + return; > > + src_offset += k * src_slice_pitch + j * src_row_pitch + i; > > + dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; > > + dst[dst_offset] = src[src_offset]; > > +} > This kernel is the same as __cl_copy_buffer_rect(). Is it expected? > This kernel is different from __cl_copy_buffer_rect(), the src and dst in > this kernel is 4byte aligned, and which in __cl_copy_buffer_rect() is 1 byte > aligned. > > -- > > 1.8.3.2 > > > > _______________________________________________ > > Beignet mailing list > > [email protected] > > http://lists.freedesktop.org/mailman/listinfo/beignet > _______________________________________________ > Beignet mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
