if environment variable OCL_OUTPUT_KERNEL_PERF is set non-zero, then after the executable program exits, beignet will output the time information of each kernel executed.
Signed-off-by:Yongjia Zhang<[email protected]> --- src/CMakeLists.txt | 3 +- src/cl_api.c | 23 ++++- src/cl_command_queue.c | 5 +- src/performance.c | 254 +++++++++++++++++++++++++++++++++++++++++++++++++ src/performance.h | 12 +++ 5 files changed, 294 insertions(+), 3 deletions(-) create mode 100644 src/performance.c create mode 100644 src/performance.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 95ff56f..4c34235 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -47,7 +47,8 @@ set(OPENCL_SRC intel/intel_batchbuffer.c intel/intel_driver.c x11/dricommon.c - x11/va_dri2.c) + x11/va_dri2.c + performance.c) if (EGL_FOUND AND MESA_SOURCE_FOUND) set (OPENCL_SRC ${OPENCL_SRC} cl_mem_gl.c cl_gl_api.c x11/mesa_egl_extension.c x11/mesa_egl_res_share.c intel/intel_dri_resource_sharing.c) diff --git a/src/cl_api.c b/src/cl_api.c index 9638994..f670f13 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -40,6 +40,8 @@ #include <assert.h> #include <unistd.h> +#include "performance.h" + #ifndef CL_VERSION_1_2 #define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2) #define CL_DEVICE_TYPE_CUSTOM (1 << 4) @@ -293,6 +295,7 @@ clCreateContext(const cl_context_properties * properties, pfn_notify, user_data, &err); + initialize_env_var(); error: if (errcode_ret) *errcode_ret = err; @@ -1676,6 +1679,10 @@ clEnqueueCopyBuffer(cl_command_queue command_queue, err = cl_command_queue_flush(command_queue); } + + if(b_output_kernel_perf) + time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", command_queue); + return 0; error: @@ -1777,6 +1784,9 @@ clEnqueueCopyBufferRect(cl_command_queue command_queue, err = cl_command_queue_flush(command_queue); } + if(b_output_kernel_perf) + time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", command_queue); + error: return err; } @@ -2016,6 +2026,9 @@ clEnqueueCopyImage(cl_command_queue command_queue, err = cl_command_queue_flush(command_queue); } + if(b_output_kernel_perf) + time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", command_queue); + error: return err; } @@ -2077,6 +2090,9 @@ clEnqueueCopyImageToBuffer(cl_command_queue command_queue, err = cl_command_queue_flush(command_queue); } + if(b_output_kernel_perf) + time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", command_queue); + error: return err; } @@ -2138,6 +2154,9 @@ clEnqueueCopyBufferToImage(cl_command_queue command_queue, err = cl_command_queue_flush(command_queue); } + if(b_output_kernel_perf) + time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", command_queue); + error: return err; } @@ -2526,7 +2545,9 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue, err = cl_command_queue_flush(command_queue); } - + + if(b_output_kernel_perf) + time_end(command_queue->ctx, cl_kernel_get_name(kernel), command_queue); error: return err; } diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 4ac2e11..7eff14c 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -28,6 +28,7 @@ #include "cl_alloc.h" #include "cl_driver.h" #include "cl_khr_icd.h" +#include "performance.h" #include <assert.h> #include <stdio.h> @@ -376,6 +377,8 @@ cl_command_queue_ND_range(cl_command_queue queue, const size_t *global_wk_sz, const size_t *local_wk_sz) { + if(b_output_kernel_perf) + time_start(queue->ctx, cl_kernel_get_name(k), queue); const int32_t ver = cl_driver_get_ver(queue->ctx->drv); cl_int err = CL_SUCCESS; @@ -408,7 +411,7 @@ cl_command_queue_ND_range(cl_command_queue queue, TRY (cl_fulsim_read_all_surfaces, queue, k); } #endif /* USE_FULSIM */ - + error: return err; } diff --git a/src/performance.c b/src/performance.c new file mode 100644 index 0000000..c4f40a6 --- /dev/null +++ b/src/performance.c @@ -0,0 +1,254 @@ +#include <performance.h> +#include <string.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/time.h> +#include <pthread.h> + +#define MAX_KERNEL_NAME_LENGTH 100 +#define MAX_KERNEL_EXECUTION_COUNT 100000 + +typedef struct kernel_storage_node +{ + char kernel_name[MAX_KERNEL_NAME_LENGTH]; + float kernel_times[MAX_KERNEL_EXECUTION_COUNT]; + int current_count; + float kernel_sum_time; + struct kernel_storage_node *next; +} kernel_storage_node; + +typedef struct context_storage_node +{ + uint64_t context_id; + kernel_storage_node *kernels_storage; + char max_time_kernel_name[MAX_KERNEL_NAME_LENGTH]; + float kernel_max_time; + int kernel_count; + struct context_storage_node *next; +} context_storage_node; + +typedef struct storage +{ + context_storage_node * context_storage; +} storage; + + + +static storage record; +static int atexit_registered = 0; + + +static context_storage_node * prev_context_pointer = NULL; +static kernel_storage_node * prev_kernel_pointer = NULL; + +static context_storage_node * find_context(cl_context context) +{ + if(NULL != prev_context_pointer ) + { + if(prev_context_pointer->context_id == (uint64_t)context) + return prev_context_pointer; + } + + if(NULL == record.context_storage) + { + record.context_storage = (context_storage_node *) malloc(sizeof(context_storage_node)); + record.context_storage->context_id = (uint64_t)context; + record.context_storage->kernels_storage = NULL; + record.context_storage->kernel_max_time = 0.0f; + record.context_storage->next = NULL; + record.context_storage->kernel_count = 0; + return record.context_storage; + } + + context_storage_node *pre = record.context_storage; + context_storage_node *cur = record.context_storage; + while(NULL !=cur && (uint64_t)context != cur->context_id ) + { + pre = cur; + cur = cur->next; + } + if(NULL != cur) + return cur; + + pre->next = (context_storage_node *)malloc(sizeof(context_storage_node)); + pre = pre->next; + pre->context_id = (uint64_t)context; + pre->kernels_storage = NULL; + pre->kernel_max_time = 0.0f; + pre->next = NULL; + pre->kernel_count = 0; + return pre; +} + +static kernel_storage_node * find_kernel(context_storage_node *p_context, const char *kernel_name) +{ + if(NULL != prev_kernel_pointer && NULL != prev_context_pointer && + p_context == prev_context_pointer && + !strcmp(kernel_name, prev_kernel_pointer->kernel_name)) + return prev_kernel_pointer; + + if(NULL == p_context) + return NULL; + + if(NULL == p_context->kernels_storage) + { + p_context->kernels_storage = (kernel_storage_node *)malloc(sizeof(kernel_storage_node)); + p_context->kernel_count++; + strcpy(p_context->kernels_storage->kernel_name,kernel_name); + p_context->kernels_storage->current_count = 0; + p_context->kernels_storage->kernel_sum_time = 0.0f; + p_context->kernels_storage->next = NULL; + return p_context->kernels_storage; + } + kernel_storage_node *pre = p_context->kernels_storage; + kernel_storage_node *cur = p_context->kernels_storage; + while(NULL != cur && strcmp(cur->kernel_name, kernel_name)) + { + pre = cur; + cur = cur->next; + } + if(NULL != cur) + { + return cur; + } + p_context->kernel_count++; + pre->next = (kernel_storage_node *)malloc(sizeof(kernel_storage_node)); + pre = pre->next; + pre->current_count = 0; + pre->kernel_sum_time = 0.0f; + pre->next = NULL; + strcpy(pre->kernel_name, kernel_name); + return pre; +} + +static void free_storage() +{ + context_storage_node *p_context = record.context_storage; + while(NULL != p_context) + { + context_storage_node *p_tmp_context = p_context->next; + kernel_storage_node *p_kernel = p_context->kernels_storage; + while(NULL != p_kernel) + { + kernel_storage_node *p_tmp_kernel = p_kernel->next; + free(p_kernel); + p_kernel = p_tmp_kernel; + } + free(p_context); + p_context = p_tmp_context; + } +} + +typedef struct time_element +{ + char kernel_name[MAX_KERNEL_NAME_LENGTH]; + float kernel_sum_time; +} time_element; + +static int cmp(const void *a, const void *b) +{ + if(((time_element *)a)->kernel_sum_time < ((time_element *)b)->kernel_sum_time) + return 1; + else if(((time_element *)a)->kernel_sum_time > ((time_element *)b)->kernel_sum_time) + return -1; + else + return 0; +} + +static void print_time_info() +{ + context_storage_node *p_context = record.context_storage; + if(NULL == p_context) + { + printf("Nothing to output !\n"); + return; + } + + int tmp_context_id = 0; + while(NULL != p_context) + { + printf("[------------ CONTEXT %4d ------------]\n", tmp_context_id++); + printf(" ->>>> KERNELS TIME SUMMARY <<<<-\n"); + kernel_storage_node *p_kernel = p_context->kernels_storage; + kernel_storage_node *p_tmp_kernel = p_kernel; + time_element *te = (time_element *)malloc(sizeof(time_element)*p_context->kernel_count); + int i = 0; + while(NULL != p_tmp_kernel) + { + strcpy(te[i].kernel_name, p_tmp_kernel->kernel_name); + te[i++].kernel_sum_time = p_tmp_kernel->kernel_sum_time; + p_tmp_kernel = p_tmp_kernel->next; + } + float sum_time = 0.0f; + qsort((void *)te, p_context->kernel_count, sizeof(time_element), cmp); + for(i=0; i<p_context->kernel_count; ++i) + { + sum_time += te[i].kernel_sum_time; + printf(" [Kernel Name : %s Time : %.2f]\n", te[i].kernel_name, te[i].kernel_sum_time); + } + free(te); + printf(" Total : %.2f\n", sum_time); + p_tmp_kernel = p_kernel; + printf("\n ->>>> KERNELS TIME DETAIL <<<<-\n"); + while(NULL != p_kernel) + { + printf(" [Kernel Name : %s Time : %.2f]\n", p_kernel->kernel_name, p_kernel->kernel_sum_time); + for(i=0; i!=p_kernel->current_count; ++i) + printf(" Execution Round %d : %.2f\n", i+1, p_kernel->kernel_times[i]); + p_kernel = p_kernel->next; + } + printf("[------------ CONTEXT ENDS------------]\n\n"); + p_context = p_context->next; + } + free_storage(); +} + + +static void insert(cl_context context, const char *kernel_name, float time) +{ + if(!atexit_registered) + { + atexit_registered = 1; + atexit(print_time_info); + } + context_storage_node *p_context = find_context(context); + kernel_storage_node *p_kernel = find_kernel(p_context, kernel_name); + prev_context_pointer = p_context; + prev_kernel_pointer = p_kernel; + p_kernel->kernel_times[p_kernel->current_count++] = time; + p_kernel->kernel_sum_time += time; + if(p_kernel->kernel_sum_time > p_context->kernel_max_time) + { + p_context->kernel_max_time = p_kernel->kernel_sum_time; + strcpy(p_context->max_time_kernel_name, kernel_name); + } +} + + +static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; +int b_output_kernel_perf = 0; +static struct timeval start, end; + +void initialize_env_var() +{ + char *env = getenv("OCL_OUTPUT_KERNEL_PERF"); + if(NULL == env || !strcmp(env,"0")) + b_output_kernel_perf = 0; + else + b_output_kernel_perf = 1; +} + +void time_start(cl_context context, const char * kernel_name, cl_command_queue cq) +{ + pthread_mutex_lock(&mutex); + gettimeofday(&start, NULL); +} + +void time_end(cl_context context, const char * kernel_name, cl_command_queue cq) +{ + clFinish(cq); + gettimeofday(&end, NULL); + float t = (end.tv_sec - start.tv_sec)*1000 + (end.tv_usec - start.tv_usec)/1000.0f; + insert(context, kernel_name, t); + pthread_mutex_unlock(&mutex); +} diff --git a/src/performance.h b/src/performance.h new file mode 100644 index 0000000..c747743 --- /dev/null +++ b/src/performance.h @@ -0,0 +1,12 @@ +#ifndef __PERFORMANCE_H__ +#define __PERFORMANCE_H__ +#include "CL/cl.h" + + +extern int b_output_kernel_perf; +void time_start(cl_context context, const char * kernel_name, cl_command_queue cq); +void time_end(cl_context context, const char * kernel_name, cl_command_queue cq); +void initialize_env_var(); + + +#endif -- 1.8.3.2 _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
