On 08/08/2018 08:19 AM, Tom de Vries wrote:
> On Wed, Aug 08, 2018 at 07:09:16AM -0700, Cesar Philippidis wrote:
>> On 08/07/2018 06:52 AM, Cesar Philippidis wrote:
Thanks for the review. This version should address all of the following
remarks. However, one thing to note ...
>> [nvptx] Use CUDA driver API to select default runtime launch geometry
>>
>> 2018-08-YY Cesar Philippidis <[email protected]>
>>
>> libgomp/
>> plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
>> (cuDriverGetVersion): Declare.
>> (cuOccupancyMaxPotentialBlockSizeWithFlags): Declare.
>> plugin/plugin-nvptx.c (CUDA_ONE_CALL): Add entries for
>> cuDriverGetVersion and cuOccupancyMaxPotentialBlockSize.
>> (ptx_device): Add driver_version member.
>> (nvptx_open_device): Initialize it.
>> (nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
>> default num_gangs and num_workers when the driver supports it.
>> ---
>> libgomp/plugin/cuda-lib.def | 2 ++
>> libgomp/plugin/cuda/cuda.h | 4 ++++
>> libgomp/plugin/plugin-nvptx.c | 40 +++++++++++++++++++++++++++++++++++++++-
>> 3 files changed, 45 insertions(+), 1 deletion(-)
>>
>> diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
>> index be8e3b3..f2433e1 100644
>> --- a/libgomp/plugin/cuda-lib.def
>> +++ b/libgomp/plugin/cuda-lib.def
>> @@ -2,6 +2,7 @@ CUDA_ONE_CALL (cuCtxCreate)
>> CUDA_ONE_CALL (cuCtxDestroy)
>> CUDA_ONE_CALL (cuCtxGetCurrent)
>> CUDA_ONE_CALL (cuCtxGetDevice)
>> +CUDA_ONE_CALL (cuDriverGetVersion)
>
> Don't use cuDriverGetVersion.
>
>> CUDA_ONE_CALL (cuCtxPopCurrent)
>> CUDA_ONE_CALL (cuCtxPushCurrent)
>> CUDA_ONE_CALL (cuCtxSynchronize)
>> @@ -39,6 +40,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
>> CUDA_ONE_CALL (cuModuleLoad)
>> CUDA_ONE_CALL (cuModuleLoadData)
>> CUDA_ONE_CALL (cuModuleUnload)
>> +CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize)
>
> Use CUDA_ONE_CALL_MAYBE_NULL.
>
>> CUDA_ONE_CALL (cuStreamCreate)
>> CUDA_ONE_CALL (cuStreamDestroy)
>> CUDA_ONE_CALL (cuStreamQuery)
>> diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
>> index 4799825..3a790e6 100644
>> --- a/libgomp/plugin/cuda/cuda.h
>> +++ b/libgomp/plugin/cuda/cuda.h
>> @@ -44,6 +44,7 @@ typedef void *CUevent;
>> typedef void *CUfunction;
>> typedef void *CUlinkState;
>> typedef void *CUmodule;
>> +typedef size_t (*CUoccupancyB2DSize)(int);
>> typedef void *CUstream;
>>
>> typedef enum {
>> @@ -123,6 +124,7 @@ CUresult cuCtxSynchronize (void);
>> CUresult cuDeviceGet (CUdevice *, int);
>> CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice);
>> CUresult cuDeviceGetCount (int *);
>> +CUresult cuDriverGetVersion(int *);
>> CUresult cuEventCreate (CUevent *, unsigned);
>> #define cuEventDestroy cuEventDestroy_v2
>> CUresult cuEventDestroy (CUevent);
>> @@ -170,6 +172,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *,
>> CUmodule, const char *);
>> CUresult cuModuleLoad (CUmodule *, const char *);
>> CUresult cuModuleLoadData (CUmodule *, const void *);
>> CUresult cuModuleUnload (CUmodule);
>> +CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
>> + CUoccupancyB2DSize, size_t, int);
>> CUresult cuStreamCreate (CUstream *, unsigned);
>> #define cuStreamDestroy cuStreamDestroy_v2
>> CUresult cuStreamDestroy (CUstream);
>> diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
>> index 825470a..b0ccf0b 100644
>> --- a/libgomp/plugin/plugin-nvptx.c
>> +++ b/libgomp/plugin/plugin-nvptx.c
>> @@ -376,6 +376,7 @@ struct ptx_device
>> int max_threads_per_block;
>> int max_threads_per_multiprocessor;
>> int default_dims[GOMP_DIM_MAX];
>> + int driver_version;
>>
>> struct ptx_image_data *images; /* Images loaded on device. */
>> pthread_mutex_t image_lock; /* Lock for above list. */
>> @@ -687,6 +688,7 @@ nvptx_open_device (int n)
>> ptx_dev->ord = n;
>> ptx_dev->dev = dev;
>> ptx_dev->ctx_shared = false;
>> + ptx_dev->driver_version = 0;
>>
>> r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
>> if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
>> @@ -780,6 +782,9 @@ nvptx_open_device (int n)
>> for (int i = 0; i != GOMP_DIM_MAX; i++)
>> ptx_dev->default_dims[i] = 0;
>>
>> + CUDA_CALL_ERET (NULL, cuDriverGetVersion, &pi);
>> + ptx_dev->driver_version = pi;
>> +
>> ptx_dev->images = NULL;
>> pthread_mutex_init (&ptx_dev->image_lock, NULL);
>>
>> @@ -1173,11 +1178,44 @@ nvptx_exec (void (*fn), size_t mapnum, void
>> **hostaddrs, void **devaddrs,
>>
>> {
>> bool default_dim_p[GOMP_DIM_MAX];
>> + int vectors = nvthd->ptx_dev->default_dims[GOMP_DIM_VECTOR];
>> + int workers = nvthd->ptx_dev->default_dims[GOMP_DIM_WORKER];
>> + int gangs = nvthd->ptx_dev->default_dims[GOMP_DIM_GANG];
>> +
is that I modified the default value for vectors as follows:

+ int vectors = default_dim_p[GOMP_DIM_VECTOR]
+   ? 0 : dims[GOMP_DIM_VECTOR];

Technically, trunk only supports warp-sized vectors, but the fallback
code already checks whether the vector length was left at its default,
like so:

+ if (default_dim_p[GOMP_DIM_VECTOR])
+   dims[GOMP_DIM_VECTOR]
+     = MIN (dims[GOMP_DIM_VECTOR],
+            (targ_fn->max_threads_per_block / warp_size
+             * warp_size));

Therefore, I made the cuOccupancyMaxPotentialBlockSize code path behave
the same way. If you want, I can resubmit a patch without that change,
though.
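
To make the resulting geometry concrete, here is a small standalone
sketch of the arithmetic the occupancy path in the attached patch ends
up doing. The grids/blocks/warp_size numbers below are made up for
illustration, not taken from a real device or kernel:

  #include <stdio.h>

  int
  main (void)
  {
    /* Hypothetical values; in the plugin, grids and blocks come from
       cuOccupancyMaxPotentialBlockSize for the actual kernel.  */
    int grids = 16, blocks = 256, warp_size = 32;
    int gangs = 0, workers = 0, vectors = 0;  /* all left to default */

    if (gangs == 0)
      gangs = 2 * grids * (blocks / warp_size);  /* 2 * 16 * 8 = 256 */
    if (vectors == 0)
      vectors = warp_size;                       /* 32 */
    if (workers == 0)
      workers = blocks / vectors;                /* 256 / 32 = 8 */

    printf ("gangs=%d workers=%d vectors=%d\n", gangs, workers, vectors);
    return 0;
  }
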
>> + /* The CUDA driver occupancy calculator is only available on
>> + CUDA version 6.5 (6050) and newer. */
>> +#if (CUDA_VERSION >= 6050)
>> + if (nvthd->ptx_dev->driver_version > 6050)
>
> Use CUDA_CALL_EXISTS.
>
>> + {
>> + int grids, blocks;
>> + CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
>> + &blocks, function, NULL, 0,
>> + dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
>> + GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
>> + "grid = %d, block = %d\n", grids, blocks);
>> +
>> + /* Keep the num_gangs proportional to the block size. The
>> + constant factor 2 is there to prevent threads from
>> + idling when there is sufficient work for them. */
>
> typo: sufficient -> insufficient
>
> Also, reformulate the first part of rationale comment to say: "Keep the
> num_gangs proportional to the block size, in order to ..." or some such, along
> the lines of what you mentioned here (
> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg00289.html ).
>
>> + if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_GANG) == 0)
>> + gangs = 2 * grids * (blocks / warp_size);
>> +
>> + if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_WORKER) == 0)
>> + workers = blocks / vectors;
>> + }
>> +#endif
>> +
>> for (i = 0; i != GOMP_DIM_MAX; i++)
>> {
>> default_dim_p[i] = !dims[i];
>> if (default_dim_p[i])
>> - dims[i] = nvthd->ptx_dev->default_dims[i];
>> + switch (i)
>> + {
>> + case GOMP_DIM_GANG: dims[i] = gangs; break;
>> + case GOMP_DIM_WORKER: dims[i] = workers; break;
>> + case GOMP_DIM_VECTOR: dims[i] = vectors; break;
>> + default: GOMP_PLUGIN_fatal ("invalid dim");
>> + }
>> }
>>
>
> The new default calculation is not cleanly separated from the fallback default
> calculation. I've already shown you how that should be done: (
> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg00027.html ).
Is this patch OK for trunk? I tested it with CUDA 5.5, 8.0 and 9.0, with
and without --without-cuda-driver.
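
FWIW, the CUDA 5.5 runs exercise the fallback path, since
cuOccupancyMaxPotentialBlockSize only appeared in CUDA 6.5, so the
CUDA_CALL_EXISTS test should come back false there. A minimal sketch of
the clamping that path applies, using made-up device limits rather than
real queried values:

  #define MIN(X, Y) ((X) < (Y) ? (X) : (Y))

  /* Clamp defaulted vector/worker sizes the way the fallback path does;
     the limits passed in are hypothetical for this sketch
     (e.g. max_threads_per_block = 1024, warp_size = 32).  */
  static void
  clamp_defaults (int *vectors, int *workers,
                  int max_threads_per_block, int warp_size)
  {
    *vectors = MIN (*vectors, max_threads_per_block / warp_size * warp_size);
    *workers = MIN (*workers, max_threads_per_block / *vectors);
  }
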
Thanks,
Cesar
[nvptx] Use CUDA driver API to select default runtime launch geometry
PR target/85590
libgomp/
plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
(cuOccupancyMaxPotentialBlockSize): Declare.
plugin/cuda-lib.def (cuOccupancyMaxPotentialBlockSize): New
CUDA_ONE_CALL_MAYBE_NULL.
plugin/plugin-nvptx.c (CUDA_VERSION < 6050): Define
CUoccupancyB2DSize and declare
cuOccupancyMaxPotentialBlockSize.
(nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
default num_gangs and num_workers when the driver supports it.
---
libgomp/plugin/cuda-lib.def | 1 +
libgomp/plugin/cuda/cuda.h | 3 ++
libgomp/plugin/plugin-nvptx.c | 77 +++++++++++++++++++++++++++++------
3 files changed, 69 insertions(+), 12 deletions(-)
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index 29028b504a0..b2a4c2154eb 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -41,6 +41,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
CUDA_ONE_CALL (cuModuleLoad)
CUDA_ONE_CALL (cuModuleLoadData)
CUDA_ONE_CALL (cuModuleUnload)
+CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
CUDA_ONE_CALL (cuStreamCreate)
CUDA_ONE_CALL (cuStreamDestroy)
CUDA_ONE_CALL (cuStreamQuery)
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 4799825bda2..b4c1b29c5d8 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -44,6 +44,7 @@ typedef void *CUevent;
typedef void *CUfunction;
typedef void *CUlinkState;
typedef void *CUmodule;
+typedef size_t (*CUoccupancyB2DSize)(int);
typedef void *CUstream;
typedef enum {
@@ -170,6 +171,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
CUresult cuModuleLoad (CUmodule *, const char *);
CUresult cuModuleLoadData (CUmodule *, const void *);
CUresult cuModuleUnload (CUmodule);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+ CUoccupancyB2DSize, size_t, int);
CUresult cuStreamCreate (CUstream *, unsigned);
#define cuStreamDestroy cuStreamDestroy_v2
CUresult cuStreamDestroy (CUstream);
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 6799a264976..9a4bc11e5fe 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -61,9 +61,12 @@ CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
#else
+typedef size_t (*CUoccupancyB2DSize)(int);
CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+ CUoccupancyB2DSize, size_t, int);
#endif
#define DO_PRAGMA(x) _Pragma (#x)
@@ -1200,21 +1203,71 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
{
bool default_dim_p[GOMP_DIM_MAX];
for (i = 0; i != GOMP_DIM_MAX; i++)
+ default_dim_p[i] = !dims[i];
+
+ if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
{
- default_dim_p[i] = !dims[i];
- if (default_dim_p[i])
- dims[i] = nvthd->ptx_dev->default_dims[i];
- }
+ for (i = 0; i != GOMP_DIM_MAX; i++)
+ {
+ if (default_dim_p[i])
+ dims[i] = nvthd->ptx_dev->default_dims[i];
+ }
- if (default_dim_p[GOMP_DIM_VECTOR])
- dims[GOMP_DIM_VECTOR]
- = MIN (dims[GOMP_DIM_VECTOR],
- (targ_fn->max_threads_per_block / warp_size * warp_size));
+ if (default_dim_p[GOMP_DIM_VECTOR])
+ dims[GOMP_DIM_VECTOR]
+ = MIN (dims[GOMP_DIM_VECTOR],
+ (targ_fn->max_threads_per_block / warp_size
+ * warp_size));
- if (default_dim_p[GOMP_DIM_WORKER])
- dims[GOMP_DIM_WORKER]
- = MIN (dims[GOMP_DIM_WORKER],
- targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
+ if (default_dim_p[GOMP_DIM_WORKER])
+ dims[GOMP_DIM_WORKER]
+ = MIN (dims[GOMP_DIM_WORKER],
+ targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
+ }
+ else
+ {
+ int vectors = default_dim_p[GOMP_DIM_VECTOR]
+ ? 0 : dims[GOMP_DIM_VECTOR];
+ int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
+ int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
+ int grids, blocks;
+
+ CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
+ &blocks, function, NULL, 0,
+ dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
+ GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
+ "grid = %d, block = %d\n", grids, blocks);
+
+ /* Keep the num_gangs proportional to the block size. In
+ the case where the block size is limited by shared-memory
+ or register-file capacity, this keeps the runtime from
+ excessively over-assigning gangs to the multiprocessor
+ units, which would otherwise cause their state to be
+ swapped out more than necessary. The constant factor 2 is
+ there to prevent threads from idling when there is
+ insufficient work for them. */
+ if (gangs == 0)
+ gangs = 2 * grids * (blocks / warp_size);
+
+ if (vectors == 0)
+ vectors = warp_size;
+
+ if (workers == 0)
+ workers = blocks / vectors;
+
+ for (i = 0; i != GOMP_DIM_MAX; i++)
+ {
+ default_dim_p[i] = !dims[i];
+ if (default_dim_p[i])
+ switch (i)
+ {
+ case GOMP_DIM_GANG: dims[i] = gangs; break;
+ case GOMP_DIM_WORKER: dims[i] = workers; break;
+ case GOMP_DIM_VECTOR: dims[i] = vectors; break;
+ default: GOMP_PLUGIN_fatal ("invalid dim");
+ }
+ }
+ }
}
}
--
2.17.1