Does this mean Beignet can launch 560-work-item work groups (7 hardware threads per EU * 10 EUs * SIMD8) on Haswell, as requested here: https://software.intel.com/en-us/forums/topic/516373 ?
On Mon, Jun 30, 2014 at 7:32 PM, Song, Ruiling <[email protected]> wrote: > Looks good to me > > -----Original Message----- > From: Beignet [mailto:[email protected]] On Behalf Of > Zhigang Gong > Sent: Tuesday, July 01, 2014 8:31 AM > To: [email protected] > Cc: Zhigang Gong > Subject: [Beignet] [PATCH v3] runtime: fix max group size calculation > issue. > > From: Zhigang Gong <[email protected]> > > If the kernel doesn't use slm/barrier, there is no hard limitation for the > max group size. And if the max work group size is more than 1024, the > original 64 urb entry count will not be sufficient to hold all the curbe > payload. Change the entry count to max thread count to fix this potential > issue. > > I found this bug when I tried to run phoronix test suite's juliagpu test > case on my MBA. > > v2: > refine the max kernel work group size calculation mechanism. > the wg_sz should not be a device's member variable, it should be a > variable derived from kernel and device's attribute at runtime. > also fix wrong configuration for IVB GT1. > > v3: > Add an important max thread limitation in the GPGPU_WALKER command. > For non-Baytrail, the max thread depth * max thread height * max thread > width should be no more than 64 (under either simd16 or simd8), no matter whether > SLM/barrier is used. We overlooked that limitation before; thus a > simd8 kernel which uses work group size 1024 will exceed this limitation and > half of the threads will not be executed at all. 
> > Signed-off-by: Zhigang Gong <[email protected]> > --- > src/cl_command_queue_gen7.c | 6 ----- > src/cl_device_id.c | 63 > +++++++++++++++++++++++++++++++-------------- > src/cl_device_id.h | 12 ++++----- > src/cl_kernel.c | 2 +- > src/intel/intel_gpgpu.c | 3 ++- > 5 files changed, 53 insertions(+), 33 deletions(-) > > diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c > index 9af4829..5b80d74 100644 > --- a/src/cl_command_queue_gen7.c > +++ b/src/cl_command_queue_gen7.c > @@ -304,12 +304,6 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, > kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz; > kernel.curbe_sz = cst_sz; > > - /* Barrier and SLM must fit into a single half slice */ > - if(kernel.use_slm > 0 && simd_sz == 8 && local_sz > > MAX_GROUP_SIZE_IN_HALFSLICE){ > - fprintf(stderr, "Beignet: Work group CAN NOT large than %d when using > barrier or local momery.\n", MAX_GROUP_SIZE_IN_HALFSLICE); > - return CL_OUT_OF_RESOURCES; > - } > - > if (scratch_sz > ker->program->ctx->device->scratch_mem_size) { > fprintf(stderr, "Beignet: Out of scratch memory %d.\n", scratch_sz); > return CL_OUT_OF_RESOURCES; > diff --git a/src/cl_device_id.c b/src/cl_device_id.c index > 0263f02..4db580e 100644 > --- a/src/cl_device_id.c > +++ b/src/cl_device_id.c > @@ -40,21 +40,19 @@ static struct _cl_device_id intel_ivb_gt2_device = { > INIT_ICD(dispatch) > .max_compute_unit = 16, > .max_thread_per_unit = 8, > - .max_work_item_sizes = {512, 512, 512}, > + .max_work_item_sizes = {1024, 1024, 1024}, > .max_work_group_size = 1024, > .max_clock_frequency = 1000, > - .wg_sz = 1024, > #include "cl_gen7_device.h" > }; > > static struct _cl_device_id intel_ivb_gt1_device = { > INIT_ICD(dispatch) > - .max_compute_unit = 8, > - .max_thread_per_unit = 8, > + .max_compute_unit = 6, > + .max_thread_per_unit = 6, > .max_work_item_sizes = {512, 512, 512}, > .max_work_group_size = 512, > .max_clock_frequency = 1000, > - .wg_sz = 512, > #include 
"cl_gen7_device.h" > }; > > @@ -63,9 +61,8 @@ static struct _cl_device_id intel_baytrail_t_device = { > .max_compute_unit = 4, > .max_thread_per_unit = 8, > .max_work_item_sizes = {512, 512, 512}, > - .max_work_group_size = 256, > + .max_work_group_size = 512, > .max_clock_frequency = 1000, > - .wg_sz = 256, > #include "cl_gen7_device.h" > }; > > @@ -74,10 +71,9 @@ static struct _cl_device_id intel_hsw_gt1_device = { > INIT_ICD(dispatch) > .max_compute_unit = 10, > .max_thread_per_unit = 7, > - .max_work_item_sizes = {512, 512, 512}, > - .max_work_group_size = 512, > + .max_work_item_sizes = {1024, 1024, 1024}, .max_work_group_size = > + 1024, > .max_clock_frequency = 1000, > - .wg_sz = 512, > #include "cl_gen75_device.h" > }; > > @@ -85,10 +81,9 @@ static struct _cl_device_id intel_hsw_gt2_device = { > INIT_ICD(dispatch) > .max_compute_unit = 20, > .max_thread_per_unit = 7, > - .max_work_item_sizes = {512, 512, 512}, > + .max_work_item_sizes = {1024, 1024, 1024}, > .max_work_group_size = 1024, > .max_clock_frequency = 1000, > - .wg_sz = 1024, > #include "cl_gen75_device.h" > }; > > @@ -96,10 +91,9 @@ static struct _cl_device_id intel_hsw_gt3_device = { > INIT_ICD(dispatch) > .max_compute_unit = 40, > .max_thread_per_unit = 7, > - .max_work_item_sizes = {512, 512, 512}, > + .max_work_item_sizes = {1024, 1024, 1024}, > .max_work_group_size = 1024, > .max_clock_frequency = 1000, > - .wg_sz = 2048, > #include "cl_gen75_device.h" > }; > > @@ -465,6 +459,26 @@ cl_device_get_version(cl_device_id device, cl_int > *ver) > _DECL_FIELD(FIELD) > > #include "cl_kernel.h" > +#include "cl_program.h" > + > +LOCAL size_t > +cl_get_kernel_max_wg_sz(cl_kernel kernel) { > + size_t work_group_size; > + int simd_width = interp_kernel_get_simd_width(kernel->opaque); > + int vendor_id = kernel->program->ctx->device->vendor_id; > + if (!interp_kernel_use_slm(kernel->opaque)) { > + if (!IS_BAYTRAIL_T(vendor_id) || simd_width == 16) > + work_group_size = simd_width * 64; > + else > + 
work_group_size = kernel->program->ctx->device->max_compute_unit * > + > +kernel->program->ctx->device->max_thread_per_unit * simd_width; > + } else > + work_group_size = kernel->program->ctx->device->max_work_group_size / > + (16 / simd_width); > + return work_group_size; > +} > + > LOCAL cl_int > cl_get_kernel_workgroup_info(cl_kernel kernel, > cl_device_id device, @@ -484,13 +498,24 @@ > cl_get_kernel_workgroup_info(cl_kernel kernel, > > CHECK_KERNEL(kernel); > switch (param_name) { > - DECL_FIELD(WORK_GROUP_SIZE, device->wg_sz) > + case CL_KERNEL_WORK_GROUP_SIZE: > + { > + if (param_value && param_value_size < sizeof(size_t)) > + return CL_INVALID_VALUE; > + if (param_value_size_ret != NULL) > + *param_value_size_ret = sizeof(size_t); > + if (param_value) { > + size_t work_group_size = cl_get_kernel_max_wg_sz(kernel); > + *(size_t*)param_value = work_group_size; > + return CL_SUCCESS; > + } > + } > DECL_FIELD(PREFERRED_WORK_GROUP_SIZE_MULTIPLE, > device->preferred_wg_sz_mul) > case CL_KERNEL_LOCAL_MEM_SIZE: > - { > - size_t local_mem_sz = interp_kernel_get_slm_size(kernel->opaque) > + kernel->local_mem_sz; > - _DECL_FIELD(local_mem_sz) > - } > + { > + size_t local_mem_sz = interp_kernel_get_slm_size(kernel->opaque) + > kernel->local_mem_sz; > + _DECL_FIELD(local_mem_sz) > + } > DECL_FIELD(COMPILE_WORK_GROUP_SIZE, kernel->compile_wg_sz) > DECL_FIELD(PRIVATE_MEM_SIZE, kernel->stack_size) > default: > diff --git a/src/cl_device_id.h b/src/cl_device_id.h index > 422ef37..c4f8227 100644 > --- a/src/cl_device_id.h > +++ b/src/cl_device_id.h > @@ -25,11 +25,11 @@ struct _cl_device_id { > DEFINE_ICD(dispatch) > cl_device_type device_type; > cl_uint vendor_id; > - cl_uint max_compute_unit; > - cl_uint max_thread_per_unit; > - cl_uint max_work_item_dimensions; > - size_t max_work_item_sizes[3]; > - size_t max_work_group_size; > + cl_uint max_compute_unit; // maximum EU number > + cl_uint max_thread_per_unit; // maximum EU threads per EU. 
> + cl_uint max_work_item_dimensions; // should be 3. > + size_t max_work_item_sizes[3]; // equal to maximum work group > size. > + size_t max_work_group_size; // maximum work group size > under simd16 mode. > cl_uint preferred_vector_width_char; > cl_uint preferred_vector_width_short; > cl_uint preferred_vector_width_int; > @@ -101,7 +101,6 @@ struct _cl_device_id { > size_t driver_version_sz; > size_t built_in_kernels_sz; > /* Kernel specific info that we're assigning statically */ > - size_t wg_sz; > size_t preferred_wg_sz_mul; > /* SubDevice specific info */ > cl_device_id parent_device; > @@ -137,6 +136,7 @@ extern cl_int cl_get_kernel_workgroup_info(cl_kernel > kernel, > size_t * > param_value_size_ret); > /* Returns the Gen device ID */ > extern cl_int cl_device_get_version(cl_device_id device, cl_int *ver); > +extern size_t cl_get_kernel_max_wg_sz(cl_kernel); > > #endif /* __CL_DEVICE_ID_H__ */ > > diff --git a/src/cl_kernel.c b/src/cl_kernel.c index 5d0b36e..aad3c04 > 100644 > --- a/src/cl_kernel.c > +++ b/src/cl_kernel.c > @@ -410,7 +410,7 @@ cl_kernel_work_group_sz(cl_kernel ker, > for (i = 1; i < wk_dim; ++i) > sz *= local_wk_sz[i]; > > - if (sz > ker->program->ctx->device->max_work_group_size) { > + if (sz > cl_get_kernel_max_wg_sz(ker)) { > err = CL_INVALID_WORK_ITEM_SIZE; > goto error; > } > diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index > 3b89539..d403aa0 100644 > --- a/src/intel/intel_gpgpu.c > +++ b/src/intel/intel_gpgpu.c > @@ -577,7 +577,7 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu, > gpgpu->sampler_bitmap = ~((1 << max_sampler_n) - 1); > > /* URB */ > - gpgpu->urb.num_cs_entries = 64; > + gpgpu->urb.num_cs_entries = max_threads; > gpgpu->urb.size_cs_entry = size_cs_entry; > gpgpu->max_threads = max_threads; > > @@ -1113,6 +1113,7 @@ intel_gpgpu_walker(intel_gpgpu_t *gpgpu, > BEGIN_BATCH(gpgpu->batch, 11); > OUT_BATCH(gpgpu->batch, CMD_GPGPU_WALKER | 9); > OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 > */ > + 
assert(thread_n <= 64); > if (simd_sz == 16) > OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread > max */ > else > -- > 1.8.3.2 > > _______________________________________________ > Beignet mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/beignet > _______________________________________________ > Beignet mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/beignet > -- Allan MacKinnon :: [email protected] Pixel I/O, LLC :: www.pixel.io :: @pixelio
_______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
