Pushed, thanks.
On Fri, Oct 18, 2013 at 10:19:51AM +0800, [email protected] wrote: > From: Junyan He <[email protected]> > > We use PIPE_CONTROL to get the time stamps from GPU just after batch > start and before batch flush. Using the first one to calculate the > CL_PROFILING_COMMAND_START time and using the second one to calculate > the CL_PROFILING_COMMAND_END time. > There are 2 limitations here: > 1. The end time stamp is just before the FLUSH, so the Flush time > is not included, which costs some accuracy. Because > we do not know which event will be used to do the profiling > when it is created, adding another flush for end time stamp may > add some overhead. > 2. The time of CPU and GPU cannot be synchronized correctly now. So the > time of CL_PROFILING_COMMAND_QUEUED and CL_PROFILING_COMMAND_SUBMIT, > which happen on the CPU side, cannot be calculated correctly with the > same base time of GPU. So we just simply set them to > CL_PROFILING_COMMAND_START now. For the Event not involving GPU > operations such as ReadBuffer, all the times are 0 now. 
> > Signed-off-by: Junyan He <[email protected]> > --- > src/cl_command_queue_gen7.c | 5 +++- > src/cl_driver.h | 5 +++- > src/cl_driver_defs.c | 1 + > src/cl_event.c | 22 ++++++++++++++++ > src/cl_event.h | 2 ++ > src/intel/intel_defines.h | 4 +++ > src/intel/intel_gpgpu.c | 60 > ++++++++++++++++++++++++++++++++++++++++++- > 7 files changed, 96 insertions(+), 3 deletions(-) > > diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c > index be7bcef..65f8e17 100644 > --- a/src/cl_command_queue_gen7.c > +++ b/src/cl_command_queue_gen7.c > @@ -287,7 +287,10 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, > } > > /* Setup the kernel */ > - cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32); > + if (queue->props & CL_QUEUE_PROFILING_ENABLE) > + cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, > 1); > + else > + cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, > 0); > > /* Bind user buffers */ > cl_command_queue_bind_surface(queue, ker); > diff --git a/src/cl_driver.h b/src/cl_driver.h > index 100b38d..5ed4fb1 100644 > --- a/src/cl_driver.h > +++ b/src/cl_driver.h > @@ -129,7 +129,7 @@ typedef void (cl_gpgpu_set_scratch_cb)(cl_gpgpu, uint32_t > per_thread_size); > extern cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch; > > /* Configure internal state */ > -typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, > uint32_t size_cs_entry); > +typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, > uint32_t size_cs_entry, int profiling); > extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init; > > /* Set the buffer object where to report performance counters */ > @@ -191,6 +191,9 @@ extern cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume; > typedef void (cl_gpgpu_event_delete_cb)(cl_gpgpu_event); > extern cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete; > > +/* Get a event time stamp */ > +typedef void (cl_gpgpu_event_get_timestamp_cb)(cl_gpgpu_event, int, > 
uint64_t*); > +extern cl_gpgpu_event_get_timestamp_cb *cl_gpgpu_event_get_timestamp; > > /* Will spawn all threads */ > typedef void (cl_gpgpu_walker_cb)(cl_gpgpu, > diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c > index ac4ff7a..fe38ba2 100644 > --- a/src/cl_driver_defs.c > +++ b/src/cl_driver_defs.c > @@ -78,4 +78,5 @@ LOCAL cl_gpgpu_event_update_status_cb > *cl_gpgpu_event_update_status = NULL; > LOCAL cl_gpgpu_event_pending_cb *cl_gpgpu_event_pending = NULL; > LOCAL cl_gpgpu_event_resume_cb *cl_gpgpu_event_resume = NULL; > LOCAL cl_gpgpu_event_delete_cb *cl_gpgpu_event_delete = NULL; > +LOCAL cl_gpgpu_event_get_timestamp_cb *cl_gpgpu_event_get_timestamp = NULL; > > diff --git a/src/cl_event.c b/src/cl_event.c > index 918e245..212f1ee 100644 > --- a/src/cl_event.c > +++ b/src/cl_event.c > @@ -490,3 +490,25 @@ cl_int cl_event_marker(cl_command_queue queue, cl_event* > event) > cl_event_set_status(*event, CL_COMPLETE); > return CL_SUCCESS; > } > + > +cl_int cl_event_profiling(cl_event event, cl_profiling_info param_name, > cl_ulong *ret_val) > +{ > + if (!event->gpgpu_event) { > + /* Some event like read buffer do not need GPU involved, so > + we just return all the profiling to 0 now. 
*/ > + *ret_val = 0; > + return CL_SUCCESS; > + } > + > + if(param_name == CL_PROFILING_COMMAND_START || > + param_name == CL_PROFILING_COMMAND_QUEUED || > + param_name == CL_PROFILING_COMMAND_SUBMIT) { > + cl_gpgpu_event_get_timestamp(event->gpgpu_event, 0, ret_val); > + return CL_SUCCESS; > + } else if (param_name == CL_PROFILING_COMMAND_END) { > + cl_gpgpu_event_get_timestamp(event->gpgpu_event, 1, ret_val); > + return CL_SUCCESS; > + } else { > + return CL_INVALID_VALUE; > + } > +} > diff --git a/src/cl_event.h b/src/cl_event.h > index 7dde24b..722486a 100644 > --- a/src/cl_event.h > +++ b/src/cl_event.h > @@ -90,5 +90,7 @@ void cl_event_set_status(cl_event, cl_int); > void cl_event_update_status(cl_event); > /* Create the marker event */ > cl_int cl_event_marker(cl_command_queue, cl_event*); > +/* Do the event profiling */ > +cl_int cl_event_profiling(cl_event event, cl_profiling_info param_name, > cl_ulong *ret_val); > #endif /* __CL_EVENT_H__ */ > > diff --git a/src/intel/intel_defines.h b/src/intel/intel_defines.h > index 19bdbed..e5015ec 100644 > --- a/src/intel/intel_defines.h > +++ b/src/intel/intel_defines.h > @@ -62,6 +62,7 @@ > #define CMD_MEDIA_GATEWAY_STATE CMD(2, 0, 3) > #define CMD_MEDIA_STATE_FLUSH CMD(2, 0, 4) > #define CMD_GPGPU_WALKER CMD(2, 1, 5) > +#define CMD_PIPE_CONTROL CMD(3, 2, 0) > > #define CMD_LOAD_REGISTER_IMM (0x22 << 23) > > @@ -300,6 +301,9 @@ > #define GEN7_PIPE_CONTROL_INSTRUCTION_GFX 0x3 > #define GEN7_PIPE_CONTROL_OPCODE_3D_CONTROL 0x2 > #define GEN7_PIPE_CONTROL_SUBOPCODE_3D_CONTROL 0x0 > +#define GEN7_PIPE_CONTROL_WRITE_TIMESTAMP (3 << 14) > +#define GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE (1 << 2) > + > > #define GEN_MAPFILTER_NEAREST 0x0 > #define GEN_MAPFILTER_LINEAR 0x1 > diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c > index 21cf09b..f3de186 100644 > --- a/src/intel/intel_gpgpu.c > +++ b/src/intel/intel_gpgpu.c > @@ -60,6 +60,7 @@ typedef struct surface_heap { > typedef struct intel_event { > 
intel_batchbuffer_t *batch; > drm_intel_bo* buffer; > + drm_intel_bo* ts_buf; > int status; > } intel_event_t; > > @@ -98,6 +99,7 @@ struct intel_gpgpu > struct { drm_intel_bo *bo; } perf_b; > struct { drm_intel_bo *bo; } scratch_b; > struct { drm_intel_bo *bo; } constant_b; > + struct { drm_intel_bo *bo; } time_stamp_b; /* time stamp buffer */ > > uint32_t per_thread_scratch; > struct { > @@ -123,6 +125,8 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu) > { > if (gpgpu == NULL) > return; > + if(gpgpu->time_stamp_b.bo) > + drm_intel_bo_unreference(gpgpu->time_stamp_b.bo); > if (gpgpu->surface_heap_b.bo) > drm_intel_bo_unreference(gpgpu->surface_heap_b.bo); > if (gpgpu->idrt_b.bo) > @@ -280,6 +284,21 @@ static const uint32_t gpgpu_l3_config_reg2[] = { > 0x00204080, 0x00244890, 0x00284490, 0x002444A0 > }; > > +/* Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer. */ > +static void > +intel_gpgpu_write_timestamp(intel_gpgpu_t *gpgpu, int idx) > +{ > + BEGIN_BATCH(gpgpu->batch, 5); > + OUT_BATCH(gpgpu->batch, CMD_PIPE_CONTROL | (5-2)); > + OUT_BATCH(gpgpu->batch, GEN7_PIPE_CONTROL_WRITE_TIMESTAMP); > + OUT_RELOC(gpgpu->batch, gpgpu->time_stamp_b.bo, > + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, > + GEN7_PIPE_CONTROL_GLOBAL_GTT_WRITE | idx * sizeof(uint64_t)); > + OUT_BATCH(gpgpu->batch, 0); > + OUT_BATCH(gpgpu->batch, 0); > + ADVANCE_BATCH(); > +} > + > static void > intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu) > { > @@ -345,11 +364,19 @@ intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu) > OUT_BATCH(gpgpu->batch, 0); > ADVANCE_BATCH(gpgpu->batch); > } > + > + /* Insert PIPE_CONTROL for time stamp of start*/ > + if (gpgpu->time_stamp_b.bo) > + intel_gpgpu_write_timestamp(gpgpu, 0); > } > > static void > intel_gpgpu_batch_end(intel_gpgpu_t *gpgpu, int32_t flush_mode) > { > + /* Insert PIPE_CONTROL for time stamp of end*/ > + if (gpgpu->time_stamp_b.bo) > + intel_gpgpu_write_timestamp(gpgpu, 1); > + > /* Insert the performance counter command */ 
> if (gpgpu->perf_b.bo) { > BEGIN_BATCH(gpgpu->batch, 3); > @@ -394,7 +421,8 @@ intel_gpgpu_flush(intel_gpgpu_t *gpgpu) > static void > intel_gpgpu_state_init(intel_gpgpu_t *gpgpu, > uint32_t max_threads, > - uint32_t size_cs_entry) > + uint32_t size_cs_entry, > + int profiling) > { > drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr; > drm_intel_bo *bo; > @@ -410,6 +438,16 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu, > gpgpu->urb.size_cs_entry = size_cs_entry; > gpgpu->max_threads = max_threads; > > + /* Set the profile buffer*/ > + if(gpgpu->time_stamp_b.bo) > + dri_bo_unreference(gpgpu->time_stamp_b.bo); > + gpgpu->time_stamp_b.bo = NULL; > + if (profiling) { > + bo = dri_bo_alloc(gpgpu->drv->bufmgr, "timestamp query", 4096, 4096); > + assert(bo); > + gpgpu->time_stamp_b.bo = bo; > + } > + > /* Constant URB buffer */ > if(gpgpu->curbe_b.bo) > dri_bo_unreference(gpgpu->curbe_b.bo); > @@ -926,6 +964,11 @@ intel_gpgpu_event_new(intel_gpgpu_t *gpgpu) > if(event->buffer != NULL) > drm_intel_bo_reference(event->buffer); > > + if(gpgpu->time_stamp_b.bo) { > + event->ts_buf = gpgpu->time_stamp_b.bo; > + drm_intel_bo_reference(event->ts_buf); > + } > + > exit: > return event; > error: > @@ -988,9 +1031,23 @@ intel_gpgpu_event_delete(intel_event_t *event) > assert(event->batch == NULL); //This command must have been flushed. 
> if(event->buffer) > drm_intel_bo_unreference(event->buffer); > + if(event->ts_buf) > + drm_intel_bo_unreference(event->ts_buf); > cl_free(event); > } > > +static void > +intel_gpgpu_event_get_timestamp(intel_event_t *event, int index, uint64_t* > ret_ts) > +{ > + assert(event->ts_buf != NULL); > + assert(index == 0 || index == 1); > + drm_intel_gem_bo_map_gtt(event->ts_buf); > + uint64_t* ptr = event->ts_buf->virtual; > + > + *ret_ts = ptr[index] * 80; //convert to nanoseconds > + drm_intel_gem_bo_unmap_gtt(event->ts_buf); > +} > + > LOCAL void > intel_set_gpgpu_callbacks(void) > { > @@ -1018,5 +1075,6 @@ intel_set_gpgpu_callbacks(void) > cl_gpgpu_event_pending = (cl_gpgpu_event_pending_cb > *)intel_gpgpu_event_pending; > cl_gpgpu_event_resume = (cl_gpgpu_event_resume_cb > *)intel_gpgpu_event_resume; > cl_gpgpu_event_delete = (cl_gpgpu_event_delete_cb > *)intel_gpgpu_event_delete; > + cl_gpgpu_event_get_timestamp = (cl_gpgpu_event_get_timestamp_cb > *)intel_gpgpu_event_get_timestamp; > } > > -- > 1.7.9.5 > > _______________________________________________ > Beignet mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
