From: Junyan He <[email protected]> The profiling feature is now all supported. We use drm_intel_reg_read to get the current time of GPU when the event is queued and submitted, and use PIPI_CONTROL cmd to get the executing time of the GPU for kernel start and end. One trivial problem is that: The GPU timer counter is 36 bits with resolution of 80ns, so 2^36*80 = 5500s, about half an hour. Some test may last about 2~5 min and if it starts at about half an hour, this may cause a wrap back problem and cause the case fail.
Signed-off-by: Junyan He <[email protected]> --- src/cl_api.c | 78 +++++++++++++++++++++++++++++++++++++++---------- src/cl_driver.h | 8 +++-- src/cl_driver_defs.c | 3 +- src/cl_enqueue.c | 19 ++++++++---- src/cl_enqueue.h | 2 +- src/cl_event.c | 27 ++++++++++------- src/cl_event.h | 3 +- src/intel/intel_gpgpu.c | 36 +++++++++++++++++++++-- 8 files changed, 137 insertions(+), 39 deletions(-) diff --git a/src/cl_api.c b/src/cl_api.c index 0978129..35d0984 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -70,6 +70,13 @@ handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list, cl_event e; if(event != NULL || status == CL_ENQUEUE_EXECUTE_DEFER) { e = cl_event_new(queue->ctx, queue, type, event!=NULL); + + /* if need profiling, add the submit timestamp here. */ + if (e->type != CL_COMMAND_USER && + e->queue->props & CL_QUEUE_PROFILING_ENABLE) { + cl_event_get_timestamp(e, CL_PROFILING_COMMAND_QUEUED); + } + if(event != NULL) *event = e; if(status == CL_ENQUEUE_EXECUTE_DEFER) { @@ -1270,16 +1277,25 @@ clGetEventProfilingInfo(cl_event event, goto error; } - if ((param_name != CL_PROFILING_COMMAND_QUEUED && - param_name != CL_PROFILING_COMMAND_SUBMIT && - param_name != CL_PROFILING_COMMAND_START && - param_name != CL_PROFILING_COMMAND_END) || - (param_value && param_value_size < sizeof(cl_ulong))) { + if (param_value && param_value_size < sizeof(cl_ulong)) { err = CL_INVALID_VALUE; goto error; } - err = cl_event_profiling(event, param_name, &ret_val); + if (param_name == CL_PROFILING_COMMAND_QUEUED) { + ret_val = event->timestamp[0]; + } else if (param_name == CL_PROFILING_COMMAND_SUBMIT) { + ret_val = event->timestamp[1]; + } else if (param_name == CL_PROFILING_COMMAND_START) { + err = cl_event_get_timestamp(event, CL_PROFILING_COMMAND_START); + ret_val = event->timestamp[2]; + } else if (param_name == CL_PROFILING_COMMAND_END) { + err = cl_event_get_timestamp(event, CL_PROFILING_COMMAND_END); + ret_val = event->timestamp[3]; + } else { + err = CL_INVALID_VALUE; + goto error; + } if (err == CL_SUCCESS) { if (param_value) @@ -1354,7 +1370,7 @@ clEnqueueReadBuffer(cl_command_queue command_queue, if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(data); + err = cl_enqueue_handle(event ? *event : NULL, data); if(event) cl_event_set_status(*event, CL_COMPLETE); } @@ -1437,7 +1453,7 @@ clEnqueueReadBufferRect(cl_command_queue command_queue, if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, event, data, CL_COMMAND_READ_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(data); + err = cl_enqueue_handle(event ? *event : NULL, data); if(event) cl_event_set_status(*event, CL_COMPLETE); } @@ -1487,7 +1503,7 @@ clEnqueueWriteBuffer(cl_command_queue command_queue, if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, event, data, CL_COMMAND_WRITE_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(data); + err = cl_enqueue_handle(event ? *event : NULL, data); if(event) cl_event_set_status(*event, CL_COMPLETE); } @@ -1570,7 +1586,7 @@ clEnqueueWriteBufferRect(cl_command_queue command_queue, if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, event, data, CL_COMMAND_WRITE_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(data); + err = cl_enqueue_handle(event ? *event : NULL, data); if(event) cl_event_set_status(*event, CL_COMPLETE); } @@ -1649,6 +1665,11 @@ clEnqueueCopyBuffer(cl_command_queue command_queue, if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, event, data, CL_COMMAND_COPY_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) { + if (event && (*event)->type != CL_COMMAND_USER + && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) { + cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT); + } + err = cl_command_queue_flush(command_queue); } return 0; @@ -1740,6 +1761,11 @@ clEnqueueCopyBufferRect(cl_command_queue command_queue, if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, event, data, CL_COMMAND_COPY_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) { + if (event && (*event)->type != CL_COMMAND_USER + && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) { + cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT); + } + err = cl_command_queue_flush(command_queue); } @@ -1818,7 +1844,7 @@ clEnqueueReadImage(cl_command_queue command_queue, if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, event, data, CL_COMMAND_READ_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(data); + err = cl_enqueue_handle(event ? *event : NULL, data); if(event) cl_event_set_status(*event, CL_COMPLETE); } @@ -1897,7 +1923,7 @@ clEnqueueWriteImage(cl_command_queue command_queue, if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, event, data, CL_COMMAND_WRITE_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(data); + err = cl_enqueue_handle(event ? *event : NULL, data); if(event) cl_event_set_status(*event, CL_COMPLETE); } @@ -1974,6 +2000,11 @@ clEnqueueCopyImage(cl_command_queue command_queue, if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, event, data, CL_COMMAND_COPY_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) { + if (event && (*event)->type != CL_COMMAND_USER + && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) { + cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT); + } + err = cl_command_queue_flush(command_queue); } @@ -2030,6 +2061,11 @@ clEnqueueCopyImageToBuffer(cl_command_queue command_queue, if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, event, data, CL_COMMAND_COPY_IMAGE_TO_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) { + if (event && (*event)->type != CL_COMMAND_USER + && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) { + cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT); + } + err = cl_command_queue_flush(command_queue); } @@ -2086,6 +2122,11 @@ clEnqueueCopyBufferToImage(cl_command_queue command_queue, if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, event, data, CL_COMMAND_COPY_BUFFER_TO_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) { + if (event && (*event)->type != CL_COMMAND_USER + && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) { + cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT); + } + err = cl_command_queue_flush(command_queue); } @@ -2217,7 +2258,7 @@ clEnqueueMapBuffer(cl_command_queue command_queue, if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, event, data, CL_COMMAND_MAP_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(data); + err = cl_enqueue_handle(event ? *event : NULL, data); if(event) cl_event_set_status(*event, CL_COMPLETE); } @@ -2313,7 +2354,7 @@ clEnqueueMapImage(cl_command_queue command_queue, if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, event, data, CL_COMMAND_MAP_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(data); + err = cl_enqueue_handle(event ? *event : NULL, data); if(event) cl_event_set_status(*event, CL_COMPLETE); } @@ -2350,7 +2391,7 @@ clEnqueueUnmapMemObject(cl_command_queue command_queue, if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, event, data, CL_COMMAND_UNMAP_MEM_OBJECT) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(data); + err = cl_enqueue_handle(event ? *event : NULL, data); if(event) cl_event_set_status(*event, CL_COMPLETE); } @@ -2456,6 +2497,11 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue, if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, event, data, CL_COMMAND_NDRANGE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) { + if (event && (*event)->type != CL_COMMAND_USER + && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) { + cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT); + } + err = cl_command_queue_flush(command_queue); } @@ -2535,7 +2581,7 @@ clEnqueueNativeKernel(cl_command_queue command_queue, if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, event, data, CL_COMMAND_NATIVE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(data); + err = cl_enqueue_handle(event ? *event : NULL, data); if(event) cl_event_set_status(*event, CL_COMPLETE); } diff --git a/src/cl_driver.h b/src/cl_driver.h index 8efe1e7..a34c22e 100644 --- a/src/cl_driver.h +++ b/src/cl_driver.h @@ -193,8 +193,12 @@ typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event); extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete; /* Get a event time stamp */ -typedef void (cl_gpgpu_event_get_timestamp_cb)(cl_gpgpu_event, int, uint64_t*); -extern cl_gpgpu_event_get_timestamp_cb *cl_gpgpu_event_get_timestamp; +typedef void (cl_gpgpu_event_get_exec_timestamp_cb)(cl_gpgpu_event, int, uint64_t*); +extern cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp; + +/* Get current GPU time stamp */ +typedef void (cl_gpgpu_event_get_gpu_cur_timestamp_cb)(cl_gpgpu, uint64_t*); +extern cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp; /* Will spawn all threads */ typedef void (cl_gpgpu_walker_cb)(cl_gpgpu, diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c index 54fa62e..b46799a 100644 --- a/src/cl_driver_defs.c +++ b/src/cl_driver_defs.c @@ -80,5 +80,6 @@ LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL; LOCAL cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending = NULL; LOCAL cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume = NULL; LOCAL cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete = NULL; -LOCAL cl_gpgpu_event_get_timestamp_cb *cl_gpgpu_event_get_timestamp = NULL; +LOCAL cl_gpgpu_event_get_exec_timestamp_cb *cl_gpgpu_event_get_exec_timestamp = NULL; +LOCAL cl_gpgpu_event_get_gpu_cur_timestamp_cb *cl_gpgpu_event_get_gpu_cur_timestamp = NULL; diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c index 070fd98..330d230 100644 --- a/src/cl_enqueue.c +++ b/src/cl_enqueue.c @@ -16,16 +16,18 @@ * * Author: Rong Yang <[email protected]> */ +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include <pthread.h> #include "cl_enqueue.h" #include "cl_image.h" #include "cl_driver.h" +#include "cl_event.h" +#include "cl_command_queue.h" #include "cl_utils.h" -#include <stdio.h> -#include <string.h> -#include <assert.h> -#include <pthread.h> cl_int cl_enqueue_read_buffer(enqueue_data* data) { @@ -376,8 +378,15 @@ cl_int cl_enqueue_native_kernel(enqueue_data *data) error: return err; } -cl_int cl_enqueue_handle(enqueue_data* data) + +cl_int cl_enqueue_handle(cl_event event, enqueue_data* data) { + /* if need profiling, add the submit timestamp here. */ + if (event && event->type != CL_COMMAND_USER + && event->queue->props & CL_QUEUE_PROFILING_ENABLE) { + cl_event_get_timestamp(event, CL_PROFILING_COMMAND_SUBMIT); + } + switch(data->type) { case EnqueueReadBuffer: return cl_enqueue_read_buffer(data); diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h index b412d58..1d3ae5f 100644 --- a/src/cl_enqueue.h +++ b/src/cl_enqueue.h @@ -64,5 +64,5 @@ typedef struct _enqueue_data { } enqueue_data; /* Do real enqueue commands */ -cl_int cl_enqueue_handle(enqueue_data* data); +cl_int cl_enqueue_handle(cl_event event, enqueue_data* data); #endif /* __CL_ENQUEUE_H__ */ diff --git a/src/cl_event.c b/src/cl_event.c index 028dfb6..f838a3a 100644 --- a/src/cl_event.c +++ b/src/cl_event.c @@ -380,7 +380,7 @@ void cl_event_set_status(cl_event event, cl_int status) if(status <= CL_COMPLETE) { if(event->enqueue_cb) { - cl_enqueue_handle(&event->enqueue_cb->data); + cl_enqueue_handle(event, &event->enqueue_cb->data); if(event->gpgpu_event) cl_gpgpu_event_update_status(event->gpgpu_event, 1); //now set complet, need refine event->status = status; //Change the event status after enqueue and befor unlock @@ -496,22 +496,29 @@ cl_int cl_event_marker(cl_command_queue queue, cl_event* event) return CL_SUCCESS; } -cl_int cl_event_profiling(cl_event event, cl_profiling_info param_name, cl_ulong *ret_val) +cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name) { + cl_ulong ret_val = 0; + GET_QUEUE_THREAD_GPGPU(event->queue); + if (!event->gpgpu_event) { - /* Some event like read buffer do not need GPU involved, so - we just return all the profiling to 0 now. */ - *ret_val = 0; + cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val); + event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val; return CL_SUCCESS; } - if(param_name == CL_PROFILING_COMMAND_START || - param_name == CL_PROFILING_COMMAND_QUEUED || - param_name == CL_PROFILING_COMMAND_SUBMIT) { - cl_gpgpu_event_get_timestamp(event->gpgpu_event, 0, ret_val); + if(param_name == CL_PROFILING_COMMAND_SUBMIT || + param_name == CL_PROFILING_COMMAND_QUEUED) { + cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val); + event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val; + return CL_SUCCESS; + } else if(param_name == CL_PROFILING_COMMAND_START) { + cl_gpgpu_event_get_exec_timestamp(event->gpgpu_event, 0, &ret_val); + event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val; return CL_SUCCESS; } else if (param_name == CL_PROFILING_COMMAND_END) { - cl_gpgpu_event_get_timestamp(event->gpgpu_event, 1, ret_val); + cl_gpgpu_event_get_exec_timestamp(event->gpgpu_event, 1, &ret_val); + event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val; return CL_SUCCESS; } else { return CL_INVALID_VALUE; diff --git a/src/cl_event.h b/src/cl_event.h index 722486a..3c61110 100644 --- a/src/cl_event.h +++ b/src/cl_event.h @@ -68,6 +68,7 @@ struct _cl_event { enqueue_callback* enqueue_cb; /* This event's enqueue */ enqueue_callback* waits_head; /* The head of enqueues list wait on this event */ cl_bool emplict; /* Identify this event whether created by api emplict*/ + cl_ulong timestamp[4];/* The time stamps for profiling. */ }; /* Create a new event object */ @@ -91,6 +92,6 @@ void cl_event_update_status(cl_event); /* Create the marker event */ cl_int cl_event_marker(cl_command_queue, cl_event*); /* Do the event profiling */ -cl_int cl_event_profiling(cl_event event, cl_profiling_info param_name, cl_ulong *ret_val); +cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name); #endif /* __CL_EVENT_H__ */ diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 7be9059..b1597ac 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -51,6 +51,8 @@ #define MO_RETAIN_BIT (1 << 28) #define SAMPLER_STATE_SIZE (16) +#define TIMESTAMP_ADDR 0x2358 + /* Stores both binding tables and surface states */ typedef struct surface_heap { uint32_t binding_table[256]; @@ -1041,15 +1043,42 @@ intel_gpgpu_event_delete(intel_event_t *event) cl_free(event); } +/* We want to get the current time of GPU. */ +static void +intel_gpgpu_event_get_gpu_cur_timestamp(intel_gpgpu_t* gpgpu, uint64_t* ret_ts) +{ + uint64_t result = 0; + drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr; + + drm_intel_reg_read(bufmgr, TIMESTAMP_ADDR, &result); + result = result & 0xFFFFFFFFF0000000; + result = result >> 28; + result *= 80; + + *ret_ts = result; + return; +} + +/* Get the GPU execute time. */ static void -intel_gpgpu_event_get_timestamp(intel_event_t *event, int index, uint64_t* ret_ts) +intel_gpgpu_event_get_exec_timestamp(intel_event_t *event, + int index, uint64_t* ret_ts) { + uint64_t result = 0; + assert(event->ts_buf != NULL); assert(index == 0 || index == 1); drm_intel_gem_bo_map_gtt(event->ts_buf); uint64_t* ptr = event->ts_buf->virtual; + result = ptr[index]; + + /* According to BSpec, the timestamp counter should be 36 bits, + but comparing to the timestamp counter from IO control reading, + we find the first 4 bits seems to be fake. In order to keep the + timestamp counter conformable, we just skip the first 4 bits. */ + result = ((result & 0x0FFFFFFFF) << 4) * 80; //convert to nanoseconds + *ret_ts = result; - *ret_ts = ptr[index] * 80; //convert to nanoseconds drm_intel_gem_bo_unmap_gtt(event->ts_buf); } @@ -1080,6 +1109,7 @@ intel_set_gpgpu_callbacks(void) cl_gpgpu_event_pending = (cl_gpgpu_event_pending_cb *)intel_gpgpu_event_pending; cl_gpgpu_event_resume = (cl_gpgpu_event_resume_cb *)intel_gpgpu_event_resume; cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb *)intel_gpgpu_event_delete; - cl_gpgpu_event_get_timestamp = (cl_gpgpu_event_get_timestamp_cb *)intel_gpgpu_event_get_timestamp; + cl_gpgpu_event_get_exec_timestamp = (cl_gpgpu_event_get_exec_timestamp_cb *)intel_gpgpu_event_get_exec_timestamp; + cl_gpgpu_event_get_gpu_cur_timestamp = (cl_gpgpu_event_get_gpu_cur_timestamp_cb *)intel_gpgpu_event_get_gpu_cur_timestamp; } -- 1.8.3.2 _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
