This is a minimal patch for NVPTX OpenMP offloading, using Jakub's initial implementation. It allows to successfully run '#pragma omp target', without any parallel execution: 1 team of 1 thread is spawned on the device, and target regions with '#pragma omp parallel' will fail with a link error.
* plugin/plugin-nvptx.c (nvptx_host2dev): Allow NULL 'nvthd'. (nvptx_dev2host): Ditto. (GOMP_OFFLOAD_get_caps): Add GOMP_OFFLOAD_CAP_OPENMP_400. (GOMP_OFFLOAD_run): New. --- libgomp/plugin/plugin-nvptx.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 52c49c7..a3eaafa 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -1052,7 +1052,7 @@ nvptx_host2dev (void *d, const void *h, size_t s) GOMP_PLUGIN_fatal ("invalid size"); #ifndef DISABLE_ASYNC - if (nvthd->current_stream != nvthd->ptx_dev->null_stream) + if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream) { CUevent *e; @@ -1117,7 +1117,7 @@ nvptx_dev2host (void *h, const void *d, size_t s) GOMP_PLUGIN_fatal ("invalid size"); #ifndef DISABLE_ASYNC - if (nvthd->current_stream != nvthd->ptx_dev->null_stream) + if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream) { CUevent *e; @@ -1451,7 +1451,7 @@ GOMP_OFFLOAD_get_name (void) unsigned int GOMP_OFFLOAD_get_caps (void) { - return GOMP_OFFLOAD_CAP_OPENACC_200; + return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400; } int @@ -1788,3 +1788,27 @@ GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream) { return nvptx_set_cuda_stream (async, stream); } + +void +GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars) +{ + CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn; + CUresult r; + struct ptx_device *ptx_dev = ptx_devices[ord]; + const char *maybe_abort_msg = "(perhaps abort was called)"; + void *args = &tgt_vars; + + r = cuLaunchKernel (function, + 1, 1, 1, + 1, 1, 1, + 0, ptx_dev->null_stream->stream, &args, 0); + if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r)); + + r = cuCtxSynchronize (); + if (r == CUDA_ERROR_LAUNCH_FAILED) + GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r), + maybe_abort_msg); + else if (r != CUDA_SUCCESS) + GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r)); +}