HSW's surface cache control has changed; correct it. Also fix the scratch size calculation, and disable the exec flag for SLM. Once the kernel's command-parser support is finished, this workaround needs to be removed entirely.
Signed-off-by: Yang Rong <[email protected]> --- src/cl_command_queue.c | 4 +-- src/cl_command_queue_gen7.c | 4 +-- src/cl_driver.h | 19 +++++++++++++- src/cl_driver_defs.c | 1 + src/intel/intel_gpgpu.c | 61 ++++++++++++++++++++++++++++----------------- 5 files changed, 61 insertions(+), 28 deletions(-) diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index a2109d7..e6553ec 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -157,9 +157,9 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k) offset = gbe_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i); if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) { struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem; - cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, cc_llc_l3); + cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, buffer->sub_offset, cl_gpgpu_get_cache_ctrl()); } else { - cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, cc_llc_l3); + cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, 0, cl_gpgpu_get_cache_ctrl()); } } diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index c9818e6..3401baa 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -31,7 +31,7 @@ #include <string.h> #define MAX_GROUP_SIZE_IN_HALFSLICE 512 -static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+32; } +static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+128; } /* "Varing" payload is the part of the curbe that changes accross threads in the * same work group. 
Right now, it consists in local IDs and block IPs @@ -244,7 +244,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker) assert(offset >= 0); stack_sz *= gbe_kernel_get_simd_width(ker->opaque); stack_sz *= device->max_compute_unit; - cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc_llc_l3); + cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cl_gpgpu_get_cache_ctrl()); } LOCAL cl_int diff --git a/src/cl_driver.h b/src/cl_driver.h index 3e01c92..2bca443 100644 --- a/src/cl_driver.h +++ b/src/cl_driver.h @@ -59,7 +59,7 @@ typedef enum cl_gpgpu_tiling { GPGPU_TILE_Y = 2, } cl_gpgpu_tiling; -/* Cache control options */ +/* Cache control options for gen7 */ typedef enum cl_cache_control { cc_gtt = 0x0, cc_l3 = 0x1, @@ -67,6 +67,20 @@ typedef enum cl_cache_control { cc_llc_l3 = 0x3 } cl_cache_control; +/* L3 Cache control options for gen75 */ +typedef enum cl_l3_cache_control { + l3cc_uc = 0x0, + l3cc_ec = 0x1 +} cl_l3_cache_control; + +/* LLCCC Cache control options for gen75 */ +typedef enum cl_llccc_cache_control { + llccc_pte = 0x0<<1, + llccc_uc = 0x1<<1, + llccc_ec = 0x2<<1, + llccc_ucllc = 0x3<<1 +} cl_llccc_cache_control; + typedef enum gpu_command_status { command_queued = 3, command_submitted = 2, @@ -106,6 +120,9 @@ extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf; typedef void (cl_gpgpu_bind_sampler_cb)(cl_gpgpu, uint32_t *samplers, size_t sampler_sz); extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler; +/* get the default cache control value. 
*/ +typedef uint32_t (cl_gpgpu_get_cache_ctrl_cb)(); +extern cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl; /* Set a 2d texture */ typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state, uint32_t id, diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c index 95a1a03..ab3af49 100644 --- a/src/cl_driver_defs.c +++ b/src/cl_driver_defs.c @@ -64,6 +64,7 @@ LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL; LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL; LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL; LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL; +LOCAL cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl = NULL; LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL; LOCAL cl_gpgpu_alloc_constant_buffer_cb * cl_gpgpu_alloc_constant_buffer = NULL; LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL; diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index bde9bd5..20b832a 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -121,8 +121,8 @@ typedef struct intel_gpgpu intel_gpgpu_t; typedef void (intel_gpgpu_set_L3_t)(intel_gpgpu_t *gpgpu, uint32_t use_slm); intel_gpgpu_set_L3_t *intel_gpgpu_set_L3 = NULL; -typedef uint32_t (get_scratch_index_t)(uint32_t size); -get_scratch_index_t *get_scratch_index = NULL; +typedef uint32_t (intel_gpgpu_get_scratch_index_t)(uint32_t size); +intel_gpgpu_get_scratch_index_t *intel_gpgpu_get_scratch_index = NULL; static void intel_gpgpu_sync(void *buf) @@ -194,10 +194,22 @@ intel_gpgpu_select_pipeline(intel_gpgpu_t *gpgpu) ADVANCE_BATCH(gpgpu->batch); } +static uint32_t +intel_gpgpu_get_cache_ctrl_gen7() +{ + return cc_llc_l3; +} + +static uint32_t +intel_gpgpu_get_cache_ctrl_gen75() +{ + return llccc_ec | l3cc_ec; +} + static void intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu) { - const uint32_t def_cc = cc_llc_l3; /* default Cache Control value */ + const uint32_t def_cc = cl_gpgpu_get_cache_ctrl(); /* default Cache Control value */ 
BEGIN_BATCH(gpgpu->batch, 10); OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 8); /* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */ @@ -233,12 +245,12 @@ intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu) ADVANCE_BATCH(gpgpu->batch); } -uint32_t get_scratch_index_gen7(uint32_t size) { +uint32_t intel_gpgpu_get_scratch_index_gen7(uint32_t size) { return size / 1024 - 1; } -uint32_t get_scratch_index_gen75(uint32_t size) { - size = size >> 12; +uint32_t intel_gpgpu_get_scratch_index_gen75(uint32_t size) { + size = size >> 11; uint32_t index = 0; while((size >>= 1) > 0) index++; //get leading one @@ -256,7 +268,7 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu) OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8-2)); if(gpgpu->per_thread_scratch > 0) { - scratch_index = get_scratch_index(gpgpu->per_thread_scratch); + scratch_index = intel_gpgpu_get_scratch_index(gpgpu->per_thread_scratch); OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, @@ -356,11 +368,9 @@ intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu) static void intel_gpgpu_set_L3_gen7(intel_gpgpu_t *gpgpu, uint32_t use_slm) { - /* still set L3 in batch buffer for fulsim. */ BEGIN_BATCH(gpgpu->batch, 9); OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET); - OUT_BATCH(gpgpu->batch, 0x00730000); OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ @@ -377,7 +387,7 @@ intel_gpgpu_set_L3_gen7(intel_gpgpu_t *gpgpu, uint32_t use_slm) OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]); else OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]); - ADVANCE_BATCH(gpgpu->batch); + ADVANCE_BATCH(gpgpu->batch); intel_gpgpu_pipe_control(gpgpu); } @@ -411,25 +421,29 @@ static void intel_gpgpu_set_L3_gen75(intel_gpgpu_t *gpgpu, uint32_t use_slm) { /* still set L3 in batch buffer for fulsim. 
*/ - BEGIN_BATCH(gpgpu->batch, 6); + BEGIN_BATCH(gpgpu->batch, 9); + OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ + OUT_BATCH(gpgpu->batch, GEN7_L3_SQC_REG1_ADDRESS_OFFSET); + OUT_BATCH(gpgpu->batch, 0x00610000); + OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG2_ADDRESS_OFFSET); + if (use_slm) - OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[8]); + OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[12]); else OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[4]); OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */ OUT_BATCH(gpgpu->batch, GEN7_L3_CNTL_REG3_ADDRESS_OFFSET); if (use_slm) - OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[8]); + OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[12]); else OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]); ADVANCE_BATCH(gpgpu->batch); - //To set L3 in HSW, enable the flag I915_EXEC_ENABLE_SLM flag when exec - if(use_slm) - gpgpu->batch->enable_slm = 1; + //if(use_slm) + // gpgpu->batch->enable_slm = 1; intel_gpgpu_pipe_control(gpgpu); } @@ -614,7 +628,7 @@ intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size) ss2->ss2.width = s & 0x7f; /* bits 6:0 of sz */ ss2->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */ ss2->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */ - ss2->ss5.cache_control = cc_llc_l3; + ss2->ss5.cache_control = cl_gpgpu_get_cache_ctrl(); heap->binding_table[2] = offsetof(surface_heap_t, surface) + 2* sizeof(gen7_surface_state_t); if(gpgpu->constant_b.bo) @@ -652,7 +666,7 @@ intel_gpgpu_map_address_space(intel_gpgpu_t *gpgpu) ss1->ss2.height = ss0->ss2.height = 16383; /* bits 20:7 of sz */ ss0->ss3.depth = 1023; /* bits 30:21 of sz */ ss1->ss3.depth = 1023; /* bits 30:21 of sz */ - ss1->ss5.cache_control = ss0->ss5.cache_control = cc_llc_l3; + ss1->ss5.cache_control = ss0->ss5.cache_control = cl_gpgpu_get_cache_ctrl(); heap->binding_table[0] = offsetof(surface_heap_t, surface); heap->binding_table[1] 
= sizeof(gen7_surface_state_t) + offsetof(surface_heap_t, surface); } @@ -702,7 +716,7 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu, ss->ss4.not_str_buf.rt_view_extent = depth - 1; ss->ss4.not_str_buf.min_array_element = 0; ss->ss3.pitch = pitch - 1; - ss->ss5.cache_control = cc_llc_l3; + ss->ss5.cache_control = cl_gpgpu_get_cache_ctrl(); if (tiling == GPGPU_TILE_X) { ss->ss0.tiled_surface = 1; ss->ss0.tile_walk = I965_TILEWALK_XMAJOR; @@ -743,7 +757,7 @@ intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu, ss->ss4.not_str_buf.rt_view_extent = depth - 1; ss->ss4.not_str_buf.min_array_element = 0; ss->ss3.pitch = pitch - 1; - ss->ss5.cache_control = cc_llc_l3; + ss->ss5.cache_control = cl_gpgpu_get_cache_ctrl(); ss->ss7.shader_r = I965_SURCHAN_SELECT_RED; ss->ss7.shader_g = I965_SURCHAN_SELECT_GREEN; ss->ss7.shader_b = I965_SURCHAN_SELECT_BLUE; @@ -1208,7 +1222,8 @@ intel_set_gpgpu_callbacks(int device_id) if (IS_HASWELL(device_id)) { cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75; intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen75; - get_scratch_index = get_scratch_index_gen75; + cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen75; + intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen75; } else if (IS_IVYBRIDGE(device_id)) { cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7; @@ -1216,9 +1231,9 @@ intel_set_gpgpu_callbacks(int device_id) intel_gpgpu_set_L3 = intel_gpgpu_set_L3_baytrail; else intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen7; - get_scratch_index = get_scratch_index_gen7; + cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen7; + intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen7; } else assert(0); } - -- 1.8.3.2 _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
