I pushed this patch to openacc-gcc-7-branch to allow the runtime to dynamically select a suitable num_workers for OpenACC parallel regions which do not make use of the num_workers clause. As with num_gangs, the default num_workers is now influenced by the GOMP_OPENACC_DIM environment variable. Before, the default num_workers was hard-coded to 32, and quite a few test cases needed to be recompiled with -fopenacc-dim because of the lack of hardware resources.
I'll try to add some test cases later, but ultimately this functionality should be transparent to the end user. In fact, it generally improves usability because it doesn't require the end user to rebuild their program multiple times to find the optimal num_workers. That's because the aforementioned GOMP_OPENACC_DIM environment variable can be used at run time. Cesar
2017-08-11 Cesar Philippidis <ce...@codesourcery.com> gcc/ * config/nvptx/nvptx.c (PTX_GANG_DEFAULT): Delete define. (PTX_DEFAULT_RUNTIME_DIM): New define. (nvptx_goacc_validate_dims): Use it to allow the runtime to dynamically allocate num_workers and num_gangs. (nvptx_dim_limit): Don't impose an arbitrary num_workers. libgomp/ * plugin/plugin-nvptx.c (nvptx_exec): Dynamically allocate default num_workers. diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index e250ebd7179..dfb27efe704 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -4852,7 +4852,7 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget), /* Define dimension sizes for known hardware. */ #define PTX_VECTOR_LENGTH 32 #define PTX_WORKER_LENGTH 32 -#define PTX_GANG_DEFAULT 0 /* Defer to runtime. */ +#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */ /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */ @@ -4923,9 +4923,9 @@ nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level) { dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH; if (dims[GOMP_DIM_WORKER] < 0) - dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH; + dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM; if (dims[GOMP_DIM_GANG] < 0) - dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT; + dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM; changed = true; } @@ -4939,9 +4939,6 @@ nvptx_dim_limit (int axis) { switch (axis) { - case GOMP_DIM_WORKER: - return PTX_WORKER_LENGTH; - case GOMP_DIM_VECTOR: return PTX_VECTOR_LENGTH; diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 10f000ab3c1..94abfe2036f 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -726,6 +726,20 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, seen_zero = 1; } + /* Calculate the optimal number of gangs for the current device. 
*/ + int reg_used = targ_fn->regs_per_thread; + int reg_per_warp = ((reg_used * warp_size + reg_unit_size - 1) + / reg_unit_size) * reg_unit_size; + int threads_per_sm = (rf_size / reg_per_warp / reg_granularity) + * reg_granularity * warp_size; + int threads_per_block = threads_per_sm > block_size + ? block_size : threads_per_sm; + + threads_per_block /= warp_size; + + if (threads_per_sm > cpu_size) + threads_per_sm = cpu_size; + /* See if the user provided GOMP_OPENACC_DIM environment variable to specify runtime defaults. */ static int default_dims[GOMP_DIM_MAX]; @@ -765,7 +779,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, int gang = 0, worker = 32, vector = 32; gang = (cpu_size / block_size) * dev_size; - worker = block_size / warp_size; vector = warp_size; /* If the user hasn't specified the number of gangs, determine @@ -775,7 +788,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, /* The worker size must not exceed the hardware. */ if (default_dims[GOMP_DIM_WORKER] < 1 || (default_dims[GOMP_DIM_WORKER] > worker && gang)) - default_dims[GOMP_DIM_WORKER] = worker; + default_dims[GOMP_DIM_WORKER] = -1; /* The vector size must exactly match the hardware. */ if (default_dims[GOMP_DIM_VECTOR] < 1 || (default_dims[GOMP_DIM_VECTOR] != vector && gang)) @@ -788,16 +801,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, } pthread_mutex_unlock (&ptx_dev_lock); - /* Calculate the optimal number of gangs for the current device. 
*/ - int reg_used = targ_fn->regs_per_thread; - int reg_per_warp = ((reg_used * warp_size + reg_unit_size - 1) - / reg_unit_size) * reg_unit_size; - int threads_per_sm = (rf_size / reg_per_warp / reg_granularity) - * reg_granularity * warp_size; - - if (threads_per_sm > cpu_size) - threads_per_sm = cpu_size; - if (seen_zero) { for (i = 0; i != GOMP_DIM_MAX; i++) @@ -817,6 +820,8 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, : 2 * dev_size; break; case GOMP_DIM_WORKER: + dims[i] = threads_per_block; + break; case GOMP_DIM_VECTOR: dims[i] = warp_size; break; @@ -830,11 +835,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, launch the offloaded kernel. */ if (dims[GOMP_DIM_WORKER] > 1) { - int threads_per_block = threads_per_sm > block_size - ? block_size : threads_per_sm; - - threads_per_block /= warp_size; - if (reg_granularity > 0 && dims[GOMP_DIM_WORKER] > threads_per_block) GOMP_PLUGIN_fatal ("The Nvidia accelerator has insufficient resources " "to launch '%s'; recompile the program with "