https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88939
--- Comment #1 from Tom de Vries <vries at gcc dot gnu.org> ---
The usual fix for this sort of problem is to move the map_pop to before the GOMP_PLUGIN_fatal:
...
@@ -1365,6 +1365,7 @@ nvptx_exec
   if (async < acc_async_noval)
     {
       r = CUDA_CALL_NOCHECK (cuStreamSynchronize, dev_str->stream);
+      map_pop (dev_str);
       if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
@@ -1392,6 +1393,7 @@ nvptx_exec
     }
 #else
   r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
+  map_pop (dev_str);
   if (r == CUDA_ERROR_LAUNCH_FAILED)
     GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
@@ -1401,11 +1403,6 @@ nvptx_exec
   GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__,
                     targ_fn->launch->fn);
-
-#ifndef DISABLE_ASYNC
-  if (async < acc_async_noval)
-#endif
-    map_pop (dev_str);
 }

 void * openacc_get_current_cuda_context (void);
...
but then we run into the same CUDA_ERROR_ILLEGAL_INSTRUCTION when calling cuMemFree to free the device pointer:
...
libgomp: cuStreamSynchronize error: an illegal instruction was encountered
libgomp: cuMemFree error: an illegal instruction was encountered
...
This happens because the CUDA error leaves the process in an inconsistent state, and any further CUDA calls in the process will return the same error.

We could do:
...
@@ -237,7 +237,7 @@ cuda_map_create (size_t size)
 static void
 cuda_map_destroy (struct cuda_map *map)
 {
-  CUDA_CALL_ASSERT (cuMemFree, map->d);
+  CUDA_CALL_NOCHECK (cuMemFree, map->d);
   free (map);
 }
...
but that's just a workaround.