This version LGTM, will push latter thanks.
On Fri, Feb 13, 2015 at 11:33:44AM +0800, Chuanbo Weng wrote: > We can change the image_channel_order to CL_RGBA and > image_channel_data_type to CL_UNSIGNED_INT32 for some special > case, thus 16 bytes can be read by one work item. Bandwidth is > fully used. > > v2: > Now we just optimize for IMAGE2D, so add judgement to not affect > other image type's code path. > > Signed-off-by: Chuanbo Weng <[email protected]> > --- > src/CMakeLists.txt | 2 +- > src/cl_context.h | 1 + > src/cl_mem.c | 44 > ++++++++++++++++++---- > .../cl_internal_copy_image_2d_to_buffer_align16.cl | 19 ++++++++++ > 4 files changed, 57 insertions(+), 9 deletions(-) > create mode 100644 src/kernels/cl_internal_copy_image_2d_to_buffer_align16.cl > > diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt > index 939f58d..d4181d8 100644 > --- a/src/CMakeLists.txt > +++ b/src/CMakeLists.txt > @@ -49,7 +49,7 @@ cl_internal_copy_image_3d_to_2d > cl_internal_copy_image_2d_to_3d cl_internal_copy > cl_internal_copy_image_2d_to_2d_array > cl_internal_copy_image_1d_array_to_1d_array > cl_internal_copy_image_2d_array_to_2d_array > cl_internal_copy_image_2d_array_to_2d > cl_internal_copy_image_2d_array_to_3d cl_internal_copy_image_3d_to_2d_array > -cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_3d_to_buffer > +cl_internal_copy_image_2d_to_buffer > cl_internal_copy_image_2d_to_buffer_align16 > cl_internal_copy_image_3d_to_buffer > cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d > cl_internal_fill_buf_align8 cl_internal_fill_buf_align4 > cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign > diff --git a/src/cl_context.h b/src/cl_context.h > index 2ea0a73..fdbfd2a 100644 > --- a/src/cl_context.h > +++ b/src/cl_context.h > @@ -60,6 +60,7 @@ enum _cl_internal_ker_type { > CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_3D, //copy image 2d array to image > 3d > CL_ENQUEUE_COPY_IMAGE_3D_TO_2D_ARRAY, //copy image 3d to image 2d > array > CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER, //copy image 2d to buffer > + CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16, > CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER, //copy image 3d tobuffer > CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D, //copy buffer to image 2d > CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D, //copy buffer to image 3d > diff --git a/src/cl_mem.c b/src/cl_mem.c > index e58a183..b41ec14 100644 > --- a/src/cl_mem.c > +++ b/src/cl_mem.c > @@ -1714,6 +1714,10 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, > struct _cl_mem_image* image, > uint32_t intel_fmt, bpp; > cl_image_format fmt; > size_t origin0, region0; > + size_t kn_dst_offset; > + int align16 = 0; > + size_t align_size = 1; > + size_t w_saved; > > if(region[1] == 1) local_sz[1] = 1; > if(region[2] == 1) local_sz[2] = 1; > @@ -1724,24 +1728,48 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, > struct _cl_mem_image* image, > /* We use one kernel to copy the data. The kernel is lazily created. */ > assert(image->base.ctx == buffer->ctx); > > - fmt.image_channel_order = CL_R; > - fmt.image_channel_data_type = CL_UNSIGNED_INT8; > intel_fmt = image->intel_fmt; > bpp = image->bpp; > - image->intel_fmt = cl_image_get_intel_format(&fmt); > - image->w = image->w * image->bpp; > - image->bpp = 1; > + w_saved = image->w; > region0 = region[0] * bpp; > - origin0 = src_origin[0] * bpp; > + kn_dst_offset = dst_offset; > + if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * > image->bpp) % 16 == 0) && > + ((src_origin[0] * bpp) % 16 == 0) && (region0 % 16 == 0) && > (dst_offset % 16 == 0)){ > + fmt.image_channel_order = CL_RGBA; > + fmt.image_channel_data_type = CL_UNSIGNED_INT32; > + align16 = 1; > + align_size = 16; > + } > + else{ > + fmt.image_channel_order = CL_R; > + fmt.image_channel_data_type = CL_UNSIGNED_INT8; > + align_size = 1; > + } > + image->intel_fmt = cl_image_get_intel_format(&fmt); > + image->w = (image->w * image->bpp) / align_size; > + image->bpp = align_size; > + region0 = (region[0] * bpp) / align_size; > + origin0 = (src_origin[0] * bpp) / align_size; > + kn_dst_offset /= align_size; > global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; > > /* setup the kernel and run. */ > if(image->image_type == CL_MEM_OBJECT_IMAGE2D) { > + if(align16){ > + extern char cl_internal_copy_image_2d_to_buffer_align16_str[]; > + extern size_t cl_internal_copy_image_2d_to_buffer_align16_str_size; > + > + ker = cl_context_get_static_kernel_from_bin(queue->ctx, > CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16, > + cl_internal_copy_image_2d_to_buffer_align16_str, > + > (size_t)cl_internal_copy_image_2d_to_buffer_align16_str_size, NULL); > + } > + else{ > extern char cl_internal_copy_image_2d_to_buffer_str[]; > extern size_t cl_internal_copy_image_2d_to_buffer_str_size; > > ker = cl_context_get_static_kernel_from_bin(queue->ctx, > CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER, > cl_internal_copy_image_2d_to_buffer_str, > (size_t)cl_internal_copy_image_2d_to_buffer_str_size, NULL); > + } > }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) { > extern char cl_internal_copy_image_3d_to_buffer_str[]; > extern size_t cl_internal_copy_image_3d_to_buffer_str_size; > @@ -1763,7 +1791,7 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, > struct _cl_mem_image* image, > cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0); > cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]); > cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]); > - cl_kernel_set_arg(ker, 8, sizeof(cl_int), &dst_offset); > + cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_dst_offset); > > ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, > local_sz); > > @@ -1771,7 +1799,7 @@ fail: > > image->intel_fmt = intel_fmt; > image->bpp = bpp; > - image->w = image->w / bpp; > + image->w = w_saved; > > return ret; > } > diff --git a/src/kernels/cl_internal_copy_image_2d_to_buffer_align16.cl > b/src/kernels/cl_internal_copy_image_2d_to_buffer_align16.cl > new file mode 100644 > index 0000000..a32e5f2 > --- /dev/null > +++ b/src/kernels/cl_internal_copy_image_2d_to_buffer_align16.cl > @@ -0,0 +1,19 @@ > +kernel void __cl_copy_image_2d_to_buffer_align16( __read_only image2d_t > image, global uint4* buffer, > + unsigned int region0, unsigned int > region1, unsigned int region2, > + unsigned int src_origin0, unsigned > int src_origin1, unsigned int src_origin2, > + unsigned int dst_offset) > +{ > + > + int i = get_global_id(0); > + int j = get_global_id(1); > + if((i >= region0) || (j>= region1)) > + return; > + uint4 color; > + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | > CLK_FILTER_NEAREST; > + int2 src_coord; > + src_coord.x = src_origin0 + i; > + src_coord.y = src_origin1 + j; > + color = read_imageui(image, sampler, src_coord); > + > + *(buffer + dst_offset + region0*j + i) = color; > +} > -- > 1.9.1 > > _______________________________________________ > Beignet mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
