On 08/08/2018 08:19 AM, Tom de Vries wrote:
> On Wed, Aug 08, 2018 at 07:09:16AM -0700, Cesar Philippidis wrote:
>> On 08/07/2018 06:52 AM, Cesar Philippidis wrote:
Thanks for the review. This version should address all of the following
remarks. However, one thing to note ...
>> [nvptx] Use CUDA driver API to select default runtime launch geometry
>>
>> 2018-08-YY Cesar Philippidis <[email protected]>
>>
>> libgomp/
>> plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
>> (cuDriverGetVersion): Declare.
>> (cuOccupancyMaxPotentialBlockSizeWithFlags): Declare.
>> plugin/plugin-nvptx.c (CUDA_ONE_CALL): Add entries for
>> cuDriverGetVersion and cuOccupancyMaxPotentialBlockSize.
>> (ptx_device): Add driver_version member.
>> (nvptx_open_device): Initialize it.
>> (nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
>> default num_gangs and num_workers when the driver supports it.
>> ---
>> libgomp/plugin/cuda-lib.def | 2 ++
>> libgomp/plugin/cuda/cuda.h | 4 ++++
>> libgomp/plugin/plugin-nvptx.c | 40 +++++++++++++++++++++++++++++++++++++++-
>> 3 files changed, 45 insertions(+), 1 deletion(-)
>>
>> diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
>> index be8e3b3..f2433e1 100644
>> --- a/libgomp/plugin/cuda-lib.def
>> +++ b/libgomp/plugin/cuda-lib.def
>> @@ -2,6 +2,7 @@ CUDA_ONE_CALL (cuCtxCreate)
>> CUDA_ONE_CALL (cuCtxDestroy)
>> CUDA_ONE_CALL (cuCtxGetCurrent)
>> CUDA_ONE_CALL (cuCtxGetDevice)
>> +CUDA_ONE_CALL (cuDriverGetVersion)
>
> Don't use cuDriverGetVersion.
>
>> CUDA_ONE_CALL (cuCtxPopCurrent)
>> CUDA_ONE_CALL (cuCtxPushCurrent)
>> CUDA_ONE_CALL (cuCtxSynchronize)
>> @@ -39,6 +40,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
>> CUDA_ONE_CALL (cuModuleLoad)
>> CUDA_ONE_CALL (cuModuleLoadData)
>> CUDA_ONE_CALL (cuModuleUnload)
>> +CUDA_ONE_CALL (cuOccupancyMaxPotentialBlockSize)
>
> Use CUDA_ONE_CALL_MAYBE_NULL.
>
>> CUDA_ONE_CALL (cuStreamCreate)
>> CUDA_ONE_CALL (cuStreamDestroy)
>> CUDA_ONE_CALL (cuStreamQuery)
>> diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
>> index 4799825..3a790e6 100644
>> --- a/libgomp/plugin/cuda/cuda.h
>> +++ b/libgomp/plugin/cuda/cuda.h
>> @@ -44,6 +44,7 @@ typedef void *CUevent;
>> typedef void *CUfunction;
>> typedef void *CUlinkState;
>> typedef void *CUmodule;
>> +typedef size_t (*CUoccupancyB2DSize)(int);
>> typedef void *CUstream;
>>
>> typedef enum {
>> @@ -123,6 +124,7 @@ CUresult cuCtxSynchronize (void);
>> CUresult cuDeviceGet (CUdevice *, int);
>> CUresult cuDeviceGetAttribute (int *, CUdevice_attribute, CUdevice);
>> CUresult cuDeviceGetCount (int *);
>> +CUresult cuDriverGetVersion(int *);
>> CUresult cuEventCreate (CUevent *, unsigned);
>> #define cuEventDestroy cuEventDestroy_v2
>> CUresult cuEventDestroy (CUevent);
>> @@ -170,6 +172,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *,
>> CUmodule, const char *);
>> CUresult cuModuleLoad (CUmodule *, const char *);
>> CUresult cuModuleLoadData (CUmodule *, const void *);
>> CUresult cuModuleUnload (CUmodule);
>> +CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
>> + CUoccupancyB2DSize, size_t, int);
>> CUresult cuStreamCreate (CUstream *, unsigned);
>> #define cuStreamDestroy cuStreamDestroy_v2
>> CUresult cuStreamDestroy (CUstream);
>> diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
>> index 825470a..b0ccf0b 100644
>> --- a/libgomp/plugin/plugin-nvptx.c
>> +++ b/libgomp/plugin/plugin-nvptx.c
>> @@ -376,6 +376,7 @@ struct ptx_device
>> int max_threads_per_block;
>> int max_threads_per_multiprocessor;
>> int default_dims[GOMP_DIM_MAX];
>> + int driver_version;
>>
>> struct ptx_image_data *images; /* Images loaded on device. */
>> pthread_mutex_t image_lock; /* Lock for above list. */
>> @@ -687,6 +688,7 @@ nvptx_open_device (int n)
>> ptx_dev->ord = n;
>> ptx_dev->dev = dev;
>> ptx_dev->ctx_shared = false;
>> + ptx_dev->driver_version = 0;
>>
>> r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
>> if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
>> @@ -780,6 +782,9 @@ nvptx_open_device (int n)
>> for (int i = 0; i != GOMP_DIM_MAX; i++)
>> ptx_dev->default_dims[i] = 0;
>>
>> + CUDA_CALL_ERET (NULL, cuDriverGetVersion, &pi);
>> + ptx_dev->driver_version = pi;
>> +
>> ptx_dev->images = NULL;
>> pthread_mutex_init (&ptx_dev->image_lock, NULL);
>>
>> @@ -1173,11 +1178,44 @@ nvptx_exec (void (*fn), size_t mapnum, void
>> **hostaddrs, void **devaddrs,
>>
>> {
>> bool default_dim_p[GOMP_DIM_MAX];
>> + int vectors = nvthd->ptx_dev->default_dims[GOMP_DIM_VECTOR];
>> + int workers = nvthd->ptx_dev->default_dims[GOMP_DIM_WORKER];
>> + int gangs = nvthd->ptx_dev->default_dims[GOMP_DIM_GANG];
>> +
is that I modified the default value for vectors as follows:

+ int vectors = default_dim_p[GOMP_DIM_VECTOR]
+   ? 0 : dims[GOMP_DIM_VECTOR];

Technically, trunk only supports warp-sized vectors, but the fallback
code already checks whether the vector length was left at its default,
like so:

+ if (default_dim_p[GOMP_DIM_VECTOR])
+   dims[GOMP_DIM_VECTOR]
+     = MIN (dims[GOMP_DIM_VECTOR],
+            (targ_fn->max_threads_per_block / warp_size
+             * warp_size));

Therefore, I made the cuOccupancyMaxPotentialBlockSize code path behave
the same way. If you want, I can resubmit a patch without that change,
though.
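
To make the resulting geometry concrete, here is a small standalone
sketch of the arithmetic the occupancy path in the attached patch ends
up doing. The grids/blocks/warp_size numbers below are made up for
illustration, not taken from a real device or kernel:

  #include <stdio.h>

  int
  main (void)
  {
    /* Hypothetical values; in the plugin, grids and blocks come from
       cuOccupancyMaxPotentialBlockSize for the actual kernel.  */
    int grids = 16, blocks = 256, warp_size = 32;
    int gangs = 0, workers = 0, vectors = 0;  /* all left to default */

    if (gangs == 0)
      gangs = 2 * grids * (blocks / warp_size);  /* 2 * 16 * 8 = 256 */
    if (vectors == 0)
      vectors = warp_size;                       /* 32 */
    if (workers == 0)
      workers = blocks / vectors;                /* 256 / 32 = 8 */

    printf ("gangs=%d workers=%d vectors=%d\n", gangs, workers, vectors);
    return 0;
  }
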
>> + /* The CUDA driver occupancy calculator is only available on
>> + CUDA version 6.5 (6050) and newer. */
>> +#if (CUDA_VERSION >= 6050)
>> + if (nvthd->ptx_dev->driver_version > 6050)
>
> Use CUDA_CALL_EXISTS.
>
>> + {
>> + int grids, blocks;
>> + CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
>> + &blocks, function, NULL, 0,
>> + dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
>> + GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
>> + "grid = %d, block = %d\n", grids, blocks);
>> +
>> + /* Keep the num_gangs proportional to the block size. The
>> + constant factor 2 is there to prevent threads from
>> + idling when there is sufficient work for them. */
>
> typo: sufficient -> insufficient
>
> Also, reformulate the first part of rationale comment to say: "Keep the
> num_gangs proportional to the block size, in order to ..." or some such, along
> the lines of what you mentioned here (
> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg00289.html ).
>
>> + if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_GANG) == 0)
>> + gangs = 2 * grids * (blocks / warp_size);
>> +
>> + if (GOMP_PLUGIN_acc_default_dim (GOMP_DIM_WORKER) == 0)
>> + workers = blocks / vectors;
>> + }
>> +#endif
>> +
>> for (i = 0; i != GOMP_DIM_MAX; i++)
>> {
>> default_dim_p[i] = !dims[i];
>> if (default_dim_p[i])
>> - dims[i] = nvthd->ptx_dev->default_dims[i];
>> + switch (i)
>> + {
>> + case GOMP_DIM_GANG: dims[i] = gangs; break;
>> + case GOMP_DIM_WORKER: dims[i] = workers; break;
>> + case GOMP_DIM_VECTOR: dims[i] = vectors; break;
>> + default: GOMP_PLUGIN_fatal ("invalid dim");
>> + }
>> }
>>
>
> The new default calculation is not cleanly separated from the fallback default
> calculation. I've already shown you how that should be done: (
> https://gcc.gnu.org/ml/gcc-patches/2018-08/msg00027.html ).
Is this patch OK for trunk? I tested it with CUDA 5.5, 8.0 and 9.0, with
and without --without-cuda-driver.
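
FWIW, the CUDA 5.5 runs exercise the fallback path, since
cuOccupancyMaxPotentialBlockSize only appeared in CUDA 6.5, so the
CUDA_CALL_EXISTS test should come back false there. A minimal sketch of
the clamping that path applies, using made-up device limits rather than
real queried values:

  #define MIN(X, Y) ((X) < (Y) ? (X) : (Y))

  /* Clamp defaulted vector/worker sizes the way the fallback path does;
     the limits passed in are hypothetical for this sketch
     (e.g. max_threads_per_block = 1024, warp_size = 32).  */
  static void
  clamp_defaults (int *vectors, int *workers,
                  int max_threads_per_block, int warp_size)
  {
    *vectors = MIN (*vectors, max_threads_per_block / warp_size * warp_size);
    *workers = MIN (*workers, max_threads_per_block / *vectors);
  }
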
Thanks,
Cesar
[nvptx] Use CUDA driver API to select default runtime launch geometry
PR target/85590
libgomp/
plugin/cuda/cuda.h (CUoccupancyB2DSize): New typedef.
(cuOccupancyMaxPotentialBlockSize): Declare.
plugin/cuda-lib.def (cuOccupancyMaxPotentialBlockSize): New
CUDA_ONE_CALL_MAYBE_NULL.
plugin/plugin-nvptx.c (CUDA_VERSION < 6050): Define
CUoccupancyB2DSize and declare
cuOccupancyMaxPotentialBlockSize.
(nvptx_exec): Use cuOccupancyMaxPotentialBlockSize to set the
default num_gangs and num_workers when the driver supports it.
---
libgomp/plugin/cuda-lib.def | 1 +
libgomp/plugin/cuda/cuda.h | 3 ++
libgomp/plugin/plugin-nvptx.c | 77 +++++++++++++++++++++++++++++------
3 files changed, 69 insertions(+), 12 deletions(-)
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index 29028b504a0..b2a4c2154eb 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -41,6 +41,7 @@ CUDA_ONE_CALL (cuModuleGetGlobal)
CUDA_ONE_CALL (cuModuleLoad)
CUDA_ONE_CALL (cuModuleLoadData)
CUDA_ONE_CALL (cuModuleUnload)
+CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
CUDA_ONE_CALL (cuStreamCreate)
CUDA_ONE_CALL (cuStreamDestroy)
CUDA_ONE_CALL (cuStreamQuery)
diff --git a/libgomp/plugin/cuda/cuda.h b/libgomp/plugin/cuda/cuda.h
index 4799825bda2..b4c1b29c5d8 100644
--- a/libgomp/plugin/cuda/cuda.h
+++ b/libgomp/plugin/cuda/cuda.h
@@ -44,6 +44,7 @@ typedef void *CUevent;
typedef void *CUfunction;
typedef void *CUlinkState;
typedef void *CUmodule;
+typedef size_t (*CUoccupancyB2DSize)(int);
typedef void *CUstream;
typedef enum {
@@ -170,6 +171,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr *, size_t *, CUmodule, const char *);
CUresult cuModuleLoad (CUmodule *, const char *);
CUresult cuModuleLoadData (CUmodule *, const void *);
CUresult cuModuleUnload (CUmodule);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+ CUoccupancyB2DSize, size_t, int);
CUresult cuStreamCreate (CUstream *, unsigned);
#define cuStreamDestroy cuStreamDestroy_v2
CUresult cuStreamDestroy (CUstream);
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 6799a264976..9a4bc11e5fe 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -61,9 +61,12 @@ CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
#else
+typedef size_t (*CUoccupancyB2DSize)(int);
CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
const char *, unsigned, CUjit_option *, void **);
CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+ CUoccupancyB2DSize, size_t, int);
#endif
#define DO_PRAGMA(x) _Pragma (#x)
@@ -1200,21 +1203,71 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
{
bool default_dim_p[GOMP_DIM_MAX];
for (i = 0; i != GOMP_DIM_MAX; i++)
+ default_dim_p[i] = !dims[i];
+
+ if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
{
- default_dim_p[i] = !dims[i];
- if (default_dim_p[i])
- dims[i] = nvthd->ptx_dev->default_dims[i];
- }
+ for (i = 0; i != GOMP_DIM_MAX; i++)
+ {
+ if (default_dim_p[i])
+ dims[i] = nvthd->ptx_dev->default_dims[i];
+ }
- if (default_dim_p[GOMP_DIM_VECTOR])
- dims[GOMP_DIM_VECTOR]
- = MIN (dims[GOMP_DIM_VECTOR],
- (targ_fn->max_threads_per_block / warp_size * warp_size));
+ if (default_dim_p[GOMP_DIM_VECTOR])
+ dims[GOMP_DIM_VECTOR]
+ = MIN (dims[GOMP_DIM_VECTOR],
+ (targ_fn->max_threads_per_block / warp_size
+ * warp_size));
- if (default_dim_p[GOMP_DIM_WORKER])
- dims[GOMP_DIM_WORKER]
- = MIN (dims[GOMP_DIM_WORKER],
- targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
+ if (default_dim_p[GOMP_DIM_WORKER])
+ dims[GOMP_DIM_WORKER]
+ = MIN (dims[GOMP_DIM_WORKER],
+ targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
+ }
+ else
+ {
+ int vectors = default_dim_p[GOMP_DIM_VECTOR]
+ ? 0 : dims[GOMP_DIM_VECTOR];
+ int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
+ int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
+ int grids, blocks;
+
+ CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
+ &blocks, function, NULL, 0,
+ dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
+ GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
+ "grid = %d, block = %d\n", grids, blocks);
+
+ /* Keep the num_gangs proportional to the block size. In
+ the case where the block size is limited by shared-memory
+ or register-file capacity, this keeps the runtime from
+ excessively over-assigning gangs to the multiprocessor
+ units, which would otherwise cause their state to be
+ swapped out more than necessary. The constant factor 2 is
+ there to prevent threads from idling when there is
+ insufficient work for them. */
+ if (gangs == 0)
+ gangs = 2 * grids * (blocks / warp_size);
+
+ if (vectors == 0)
+ vectors = warp_size;
+
+ if (workers == 0)
+ workers = blocks / vectors;
+
+ for (i = 0; i != GOMP_DIM_MAX; i++)
+ {
+ default_dim_p[i] = !dims[i];
+ if (default_dim_p[i])
+ switch (i)
+ {
+ case GOMP_DIM_GANG: dims[i] = gangs; break;
+ case GOMP_DIM_WORKER: dims[i] = workers; break;
+ case GOMP_DIM_VECTOR: dims[i] = vectors; break;
+ default: GOMP_PLUGIN_fatal ("invalid dim");
+ }
+ }
+ }
}
}
--
2.17.1