Hi,
I wrote a patch that called some function in the common libgomp code
from GOMP_OFFLOAD_fini_device, and found that it hung due to the fact that:
- gomp_target_fini locks devices[*].lock while calling
GOMP_OFFLOAD_fini_device, and
- the function call that I added also locked that same lock, and
- that lock is not recursive.
Given that gomp_target_fini is called at exit, I decided to move the
function call to a separate function (let's call it pre_fini_device),
and register it with atexit.
[ Other ways to handle this problem are:
- add a new plugin function GOMP_OFFLOAD_pre_fini_device, or
- to make the lock recursive
]
Then I ran into the problem that pre_fini_device was called after
GOMP_OFFLOAD_fini_device, due to the fact that:
- atexit (gomp_target_fini) is called at the end of gomp_target_init,
and
- the atexit (pre_fini_device) happens on the first plugin call, which
is the current_device.get_num_devices_func () call earlier in
gomp_target_init.
I fixed this by moving the atexit to the start of gomp_target_init.
I tested this on nvptx, and found that some cuda cleanup is no longer
needed (or possible), presumably because the cuda runtime itself
registers a cleanup at exit, which is now called before gomp_target_fini
instead of after.
This patch contains:
- the move of atexit (gomp_target_fini) from end to start of
gomp_target_init, and
- handling of the new situation in plugin-nvptx.c. I suspect the code
can be simplified by assuming that cuda_alive is always false.
Tested on x86_64 with nvptx accelerator.
Is moving the atexit (gomp_target_fini) to the start of gomp_target_init
a good idea? Any other comments?
Thanks,
- Tom
Call atexit (gomp_target_fini) at start of gomp_target_init
---
libgomp/plugin/plugin-nvptx.c | 39 ++++++++++++++++++++++++++-------------
libgomp/target.c | 15 ++++++++++++---
2 files changed, 38 insertions(+), 16 deletions(-)
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 71630b57355..0bf523409ab 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -484,7 +484,7 @@ init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
}
static bool
-fini_streams_for_device (struct ptx_device *ptx_dev)
+fini_streams_for_device (struct ptx_device *ptx_dev, bool cuda_alive)
{
free (ptx_dev->async_streams.arr);
@@ -494,18 +494,22 @@ fini_streams_for_device (struct ptx_device *ptx_dev)
struct ptx_stream *s = ptx_dev->active_streams;
ptx_dev->active_streams = ptx_dev->active_streams->next;
- ret &= map_fini (s);
-
- CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
- if (r != CUDA_SUCCESS)
+ if (cuda_alive)
{
- GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
- ret = false;
+ ret &= map_fini (s);
+
+ CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
+ if (r != CUDA_SUCCESS)
+ {
+ GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
+ ret = false;
+ }
}
free (s);
}
- ret &= map_fini (ptx_dev->null_stream);
+ if (cuda_alive)
+ ret &= map_fini (ptx_dev->null_stream);
free (ptx_dev->null_stream);
return ret;
}
@@ -813,17 +817,17 @@ nvptx_open_device (int n)
}
static bool
-nvptx_close_device (struct ptx_device *ptx_dev)
+nvptx_close_device (struct ptx_device *ptx_dev, bool cuda_alive)
{
if (!ptx_dev)
return true;
- if (!fini_streams_for_device (ptx_dev))
+ if (!fini_streams_for_device (ptx_dev, cuda_alive))
return false;
pthread_mutex_destroy (&ptx_dev->image_lock);
- if (!ptx_dev->ctx_shared)
+ if (cuda_alive && !ptx_dev->ctx_shared)
CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
free (ptx_dev);
@@ -1729,8 +1733,17 @@ GOMP_OFFLOAD_fini_device (int n)
if (ptx_devices[n] != NULL)
{
- if (!nvptx_attach_host_thread_to_device (n)
- || !nvptx_close_device (ptx_devices[n]))
+ bool failed = false;
+ CUdevice dev;
+ CUresult r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
+ bool cuda_alive = r != CUDA_ERROR_DEINITIALIZED;
+ if (cuda_alive && !nvptx_attach_host_thread_to_device (n))
+ failed = true;
+
+ if (!failed && !nvptx_close_device (ptx_devices[n], cuda_alive))
+ failed = true;
+
+ if (failed)
{
pthread_mutex_unlock (&ptx_dev_lock);
return false;
diff --git a/libgomp/target.c b/libgomp/target.c
index 8ac05e8c641..829c2de1624 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -2651,6 +2651,18 @@ gomp_target_init (void)
num_devices = 0;
devices = NULL;
+ /* Register gomp_target_fini for execution at exit before calling a plugin
+ function, to make sure that happens before a plugin calls atexit.
+ That way, the functions registered by a plugin will be executed before
+ gomp_target_fini. This gives those functions the possibility to implement
+ cleanups that require locking and unlocking devices[*].lock. These
+ cleanups cannot happen during fini_device_func because:
+ - gomp_target_fini locks devices[*].lock while calling fini_device_func,
+ and
+ - the lock is not recursive. */
+ if (atexit (gomp_target_fini) != 0)
+ gomp_fatal ("atexit failed");
+
cur = OFFLOAD_TARGETS;
if (*cur)
do
@@ -2737,9 +2749,6 @@ gomp_target_init (void)
if (devices[i].capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
goacc_register (&devices[i]);
}
-
- if (atexit (gomp_target_fini) != 0)
- gomp_fatal ("atexit failed");
}
#else /* PLUGIN_SUPPORT */