Hi, Junyan,

    Have you reviewed this patch?

-----Original Message-----
From: Lv, Meng 
Sent: Monday, May 19, 2014 3:45 PM
To: Yang, Rong R
Cc: [email protected]; Luo, Xionghu
Subject: RE: [Beignet] [PATCH] [PATCH]improve the clEnqueueCopyBufferRect 
performance in some cases


RONG, 麻烦你先CHECK下 LUO的PATCH,我V2的PATCH会按照他的格式来做
-----Original Message-----
From: Luo, Xionghu
Sent: Monday, May 19, 2014 3:35 PM
To: Lv, Meng
Cc: [email protected]
Subject: RE: [Beignet] [PATCH] [PATCH]improve the clEnqueueCopyBufferRect 
performance in some cases

Could you please move the kernel source code to file instead of staying in host 
code?
You can refer to my pending patch "move enqueue_copy_image kernels outside of 
runtime code", thanks.

Luo Xionghu
Best Regards

-----Original Message-----
From: Beignet [mailto:[email protected]] On Behalf Of Yang, 
Rong R
Sent: Monday, May 19, 2014 3:14 PM
To: Lv, Meng; [email protected]
Cc: Lv, Meng
Subject: Re: [Beignet] [PATCH] [PATCH]improve the clEnqueueCopyBufferRect 
performance in some cases

The one index indicate to one kernel string, because you add a new kernel for 
CL_ENQUEUE_COPY_BUFFER_RECT, you should also add a new index for it.

And the file mode change 100644 => 100755, I think it is not necessary.

-----Original Message-----
From: Beignet [mailto:[email protected]] On Behalf Of Lv 
Meng
Sent: Monday, May 05, 2014 10:50 AM
To: [email protected]
Cc: Lv, Meng
Subject: [Beignet] [PATCH] [PATCH]improve the clEnqueueCopyBufferRect 
performance in some cases

Signed-off-by: Lv Meng <[email protected]>
---
 src/cl_mem.c | 80 ++++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 59 insertions(+), 21 deletions(-)  mode change 100644 => 
100755 src/cl_mem.c

diff --git a/src/cl_mem.c b/src/cl_mem.c old mode 100644 new mode 100755 index 
44482f7..92f51d0
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -911,6 +911,17 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem 
src_buf, cl_mem dst_buf,
   size_t global_off[] = {0,0,0};
   size_t global_sz[] = {1,1,1};
   size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_1};
+
+  // the src and dst mem rect is continuous, the copy is degraded to 
+ buf copy  if((region[0] == dst_row_pitch) && (region[0] ==
+ src_row_pitch)  && (region[1] * src_row_pitch == src_slice_pitch) && 
(region[1] * dst_row_pitch == dst_slice_pitch)){
+    cl_int src_offset = src_origin[2]*src_slice_pitch + 
src_origin[1]*src_row_pitch + src_origin[0];
+    cl_int dst_offset = dst_origin[2]*dst_slice_pitch + 
dst_origin[1]*dst_row_pitch + dst_origin[0];
+    cl_int size = region[0]*region[1]*region[2];
+    ret = cl_mem_copy(queue, src_buf, dst_buf,src_offset, dst_offset, size);
+    return ret;
+  }
+
   if(region[1] == 1) local_sz[1] = 1;
   if(region[2] == 1) local_sz[2] = 1;
   global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; 
@@ -919,30 +930,57 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem 
src_buf, cl_mem dst_buf,
   cl_int index = CL_ENQUEUE_COPY_BUFFER_RECT;
   cl_int src_offset = src_origin[2]*src_slice_pitch + 
src_origin[1]*src_row_pitch + src_origin[0];
   cl_int dst_offset = dst_origin[2]*dst_slice_pitch + 
dst_origin[1]*dst_row_pitch + dst_origin[0];
-
-  static const char *str_kernel =
-      "kernel void __cl_cpy_buffer_rect ( \n"
-      "       global char* src, global char* dst, \n"
-      "       unsigned int region0, unsigned int region1, unsigned int 
region2, \n"
-      "       unsigned int src_offset, unsigned int dst_offset, \n"
-      "       unsigned int src_row_pitch, unsigned int src_slice_pitch, \n"
-      "       unsigned int dst_row_pitch, unsigned int dst_slice_pitch) { \n"
-      "  int i = get_global_id(0); \n"
-      "  int j = get_global_id(1); \n"
-      "  int k = get_global_id(2); \n"
-      "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
-      "    return; \n"
-      "  src_offset += k * src_slice_pitch + j * src_row_pitch + i; \n"
-      "  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; \n"
-      "  dst[dst_offset] = src[src_offset]; \n"
-      "}";
-
-
   /* We use one kernel to copy the data. The kernel is lazily created. */
   assert(src_buf->ctx == dst_buf->ctx);
+  if( (src_offset % 4== 0) && (dst_offset % 4== 0) && (src_row_pitch % 4== 0) 
&& (dst_row_pitch % 4== 0)
+  && (src_slice_pitch % 4== 0) && (dst_slice_pitch % 4== 0) && (global_sz[0] % 
4 == 0) ){      
+    global_sz[0] /= 4;
+    src_offset /= 4;
+    dst_offset /= 4;
+    src_row_pitch /= 4;
+    dst_row_pitch /= 4;
+    src_slice_pitch /= 4;
+    dst_slice_pitch /= 4;
+    static const char *str_intkernel =
+        "kernel void __cl_cpy_buffer_rect ( \n"
+        "              global int* src, global int* dst, \n"
+        "              unsigned int region0, unsigned int region1, unsigned 
int region2, \n"
+        "              unsigned int src_offset, unsigned int dst_offset, \n"
+        "              unsigned int src_row_pitch, unsigned int 
src_slice_pitch, \n"
+        "              unsigned int dst_row_pitch, unsigned int 
dst_slice_pitch) { \n"
+        "  int i = get_global_id(0); \n"
+        "  int j = get_global_id(1); \n"
+        "  int k = get_global_id(2); \n"
+        "  region0 >>= 2; \n"
+        "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
+        "       return; \n"
+        "  src_offset += k * src_slice_pitch + j * src_row_pitch + i; \n"
+        "  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; \n"
+        "  dst[dst_offset] = src[src_offset]; \n"
+        "}";
+    /* setup the kernel and run. */
+    ker = cl_context_get_static_kernel(queue->ctx, index, 
+ str_intkernel, NULL);  } else {
+    static const char *str_kernel =
+        "kernel void __cl_cpy_buffer_rect ( \n"
+        "       global char* src, global char* dst, \n"
+        "       unsigned int region0, unsigned int region1, unsigned int 
region2, \n"
+        "       unsigned int src_offset, unsigned int dst_offset, \n"
+        "       unsigned int src_row_pitch, unsigned int src_slice_pitch, \n"
+        "       unsigned int dst_row_pitch, unsigned int dst_slice_pitch) { \n"
+        "  int i = get_global_id(0); \n"
+        "  int j = get_global_id(1); \n"
+        "  int k = get_global_id(2); \n"
+        "  if((i >= region0) || (j>= region1) || (k>=region2)) \n"
+        "    return; \n"
+        "  src_offset += k * src_slice_pitch + j * src_row_pitch + i; \n"
+        "  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; \n"
+        "  dst[dst_offset] = src[src_offset]; \n"
+        "}";
+    /* setup the kernel and run. */
+    ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, 
+ NULL);  }
 
-  /* setup the kernel and run. */
-  ker = cl_context_get_static_kernel(queue->ctx, index, str_kernel, NULL);
   if (!ker)
     return CL_OUT_OF_RESOURCES;
 
--
1.8.3.2

_______________________________________________
Beignet mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/beignet
_______________________________________________
Beignet mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/beignet
_______________________________________________
Beignet mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/beignet

Reply via email to