I pushed this patch to openacc-gcc-7-branch to allow the runtime to dynamically select a suitable num_workers for OpenACC parallel regions which do not make use of the num_workers clause. As with num_gangs, the default num_workers is now influenced by the GOMP_OPENACC_DIM environment variable. Before, the default num_workers was hard-coded to 32, and quite a few test cases needed to be recompiled with -fopenacc-dim because of the lack of hardware resources.
I'll try to add some test cases later, but ultimately this functionality should be transparent to the end user. In fact, it generally improves usability because it doesn't require the end user to rebuild their program multiple times to find the optimal num_workers. That's because the aforementioned GOMP_OPENACC_DIM environment variable can be used at run time. Cesar
2017-08-11 Cesar Philippidis <ce...@codesourcery.com> gcc/ * config/nvptx/nvptx.c (PTX_GANG_DEFAULT): Delete define. (PTX_DEFAULT_RUNTIME_DIM): New define. (nvptx_goacc_validate_dims): Use it to allow the runtime to dynamically allocate num_workers and num_gangs. (nvptx_dim_limit): Don't impose an arbitrary num_workers. libgomp/ * plugin/plugin-nvptx.c (nvptx_exec): Dynamically allocate default num_workers. diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index e250ebd7179..dfb27efe704 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -4852,7 +4852,7 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget), /* Define dimension sizes for known hardware. */ #define PTX_VECTOR_LENGTH 32 #define PTX_WORKER_LENGTH 32 -#define PTX_GANG_DEFAULT 0 /* Defer to runtime. */ +#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime. */ /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */ @@ -4923,9 +4923,9 @@ nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level) { dims[GOMP_DIM_VECTOR] = PTX_VECTOR_LENGTH; if (dims[GOMP_DIM_WORKER] < 0) - dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH; + dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM; if (dims[GOMP_DIM_GANG] < 0) - dims[GOMP_DIM_GANG] = PTX_GANG_DEFAULT; + dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM; changed = true; } @@ -4939,9 +4939,6 @@ nvptx_dim_limit (int axis) { switch (axis) { - case GOMP_DIM_WORKER: - return PTX_WORKER_LENGTH; - case GOMP_DIM_VECTOR: return PTX_VECTOR_LENGTH; diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 10f000ab3c1..94abfe2036f 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -726,6 +726,20 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, seen_zero = 1; } + /* Calculate the optimal number of gangs for the current device. 
*/ + int reg_used = targ_fn->regs_per_thread; + int reg_per_warp = ((reg_used * warp_size + reg_unit_size - 1) + / reg_unit_size) * reg_unit_size; + int threads_per_sm = (rf_size / reg_per_warp / reg_granularity) + * reg_granularity * warp_size; + int threads_per_block = threads_per_sm > block_size + ? block_size : threads_per_sm; + + threads_per_block /= warp_size; + + if (threads_per_sm > cpu_size) + threads_per_sm = cpu_size; + /* See if the user provided GOMP_OPENACC_DIM environment variable to specify runtime defaults. */ static int default_dims[GOMP_DIM_MAX]; @@ -765,7 +779,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, int gang = 0, worker = 32, vector = 32; gang = (cpu_size / block_size) * dev_size; - worker = block_size / warp_size; vector = warp_size; /* If the user hasn't specified the number of gangs, determine @@ -775,7 +788,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, /* The worker size must not exceed the hardware. */ if (default_dims[GOMP_DIM_WORKER] < 1 || (default_dims[GOMP_DIM_WORKER] > worker && gang)) - default_dims[GOMP_DIM_WORKER] = worker; + default_dims[GOMP_DIM_WORKER] = -1; /* The vector size must exactly match the hardware. */ if (default_dims[GOMP_DIM_VECTOR] < 1 || (default_dims[GOMP_DIM_VECTOR] != vector && gang)) @@ -788,16 +801,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, } pthread_mutex_unlock (&ptx_dev_lock); - /* Calculate the optimal number of gangs for the current device. 
*/ - int reg_used = targ_fn->regs_per_thread; - int reg_per_warp = ((reg_used * warp_size + reg_unit_size - 1) - / reg_unit_size) * reg_unit_size; - int threads_per_sm = (rf_size / reg_per_warp / reg_granularity) - * reg_granularity * warp_size; - - if (threads_per_sm > cpu_size) - threads_per_sm = cpu_size; - if (seen_zero) { for (i = 0; i != GOMP_DIM_MAX; i++) @@ -817,6 +820,8 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, : 2 * dev_size; break; case GOMP_DIM_WORKER: + dims[i] = threads_per_block; + break; case GOMP_DIM_VECTOR: dims[i] = warp_size; break; @@ -830,11 +835,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, launch the offloaded kernel. */ if (dims[GOMP_DIM_WORKER] > 1) { - int threads_per_block = threads_per_sm > block_size - ? block_size : threads_per_sm; - - threads_per_block /= warp_size; - if (reg_granularity > 0 && dims[GOMP_DIM_WORKER] > threads_per_block) GOMP_PLUGIN_fatal ("The Nvidia accelerator has insufficient resources " "to launch '%s'; recompile the program with "