RFC: libgomp target plugins and atexit

Tom de Vries Fri, 17 Nov 2017 01:28:58 -0800

Hi,

I wrote a patch that called some function in the common libgomp codefrom GOMP_OFFLOAD_fini_device, and found that it hung due to the fact that:

- gomp_target_fini locks devices[*].lock while calling
  GOMP_OFFLOAD_fini_device, and
- the function call that I added also locked that same lock, and
- that lock is not recursive.

Given that gomp_target_fini is called at exit, I decided to move thefunction call to a separate function (let's call it pre_fini_device),and register it with atexit.


[ Other ways to handle this problem are:
- add a new plugin function GOMP_OFFLOAD_pre_fini_device, or
- to make the lock recursive
]

Then I ran into the problem that pre_fini_device was called afterGOMP_OFFLOAD_fini_device, due to the fact that:

- atexit (gomp_target_fini) is called at the end of gomp_target_init,
  and
- the atexit (pre_fini_device) happens on the first plugin call, which
  is the current_device.get_num_devices_func () call earlier in
  gomp_target_init.

I fixed this by moving the atexit to the start of gomp_target_init.

I tested this on nvptx, and found that some cuda cleanup is no longerneeded (or possible), presumably because the cuda runtime itselfregisters a cleanup at exit, which is now called before gomp_target_finiinstead of after.


This patch contains:
- the move of atexit (gomp_target_fini) from end to start of
  gomp_target_init, and
- handling of the new situation in plugin-nvptx.c. I suspect the code
  can be simplified by assuming that cuda_alive is always false.

Tested on x86_64 with nvptx accelerator.

Is moving the atexit (gomp_target_fini) to the start of gomp_target_inita good idea? Any other comments?


Thanks,
- Tom

Call atexit (gomp_target_fini) at start of gomp_target_init

---
 libgomp/plugin/plugin-nvptx.c | 39 ++++++++++++++++++++++++++-------------
 libgomp/target.c              | 15 ++++++++++++---
 2 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 71630b57355..0bf523409ab 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -484,7 +484,7 @@ init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
 }
 
 static bool
-fini_streams_for_device (struct ptx_device *ptx_dev)
+fini_streams_for_device (struct ptx_device *ptx_dev, bool cuda_alive)
 {
   free (ptx_dev->async_streams.arr);
 
@@ -494,18 +494,22 @@ fini_streams_for_device (struct ptx_device *ptx_dev)
       struct ptx_stream *s = ptx_dev->active_streams;
       ptx_dev->active_streams = ptx_dev->active_streams->next;
 
-      ret &= map_fini (s);
-
-      CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
-      if (r != CUDA_SUCCESS)
+      if (cuda_alive)
 	{
-	  GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
-	  ret = false;
+	  ret &= map_fini (s);
+
+	  CUresult r = CUDA_CALL_NOCHECK (cuStreamDestroy, s->stream);
+	  if (r != CUDA_SUCCESS)
+	    {
+	      GOMP_PLUGIN_error ("cuStreamDestroy error: %s", cuda_error (r));
+	      ret = false;
+	    }
 	}
       free (s);
     }
 
-  ret &= map_fini (ptx_dev->null_stream);
+  if (cuda_alive)
+    ret &= map_fini (ptx_dev->null_stream);
   free (ptx_dev->null_stream);
   return ret;
 }
@@ -813,17 +817,17 @@ nvptx_open_device (int n)
 }
 
 static bool
-nvptx_close_device (struct ptx_device *ptx_dev)
+nvptx_close_device (struct ptx_device *ptx_dev, bool cuda_alive)
 {
   if (!ptx_dev)
     return true;
 
-  if (!fini_streams_for_device (ptx_dev))
+  if (!fini_streams_for_device (ptx_dev, cuda_alive))
     return false;
   
   pthread_mutex_destroy (&ptx_dev->image_lock);
 
-  if (!ptx_dev->ctx_shared)
+  if (cuda_alive && !ptx_dev->ctx_shared)
     CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
 
   free (ptx_dev);
@@ -1729,8 +1733,17 @@ GOMP_OFFLOAD_fini_device (int n)
 
   if (ptx_devices[n] != NULL)
     {
-      if (!nvptx_attach_host_thread_to_device (n)
-	  || !nvptx_close_device (ptx_devices[n]))
+      bool failed = false;
+      CUdevice dev;
+      CUresult r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
+      bool cuda_alive = r != CUDA_ERROR_DEINITIALIZED;
+      if (cuda_alive && !nvptx_attach_host_thread_to_device (n))
+	failed = true;
+
+      if (!failed && !nvptx_close_device (ptx_devices[n], cuda_alive))
+	failed = true;
+
+      if (failed)
 	{
 	  pthread_mutex_unlock (&ptx_dev_lock);
 	  return false;
diff --git a/libgomp/target.c b/libgomp/target.c
index 8ac05e8c641..829c2de1624 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -2651,6 +2651,18 @@ gomp_target_init (void)
   num_devices = 0;
   devices = NULL;
 
+  /* Register gomp_target_fini for execution at exit before calling a plugin
+     function, to make sure that happens before a plugin calls atexit.
+     That way, the functions registered by a plugin will be executed before
+     gomp_target_fini.  This gives those functions the possibility to implement
+     cleanups that require locking and unlocking devices[*].lock.  These
+     cleanups cannot happen during fini_device_func because:
+     - gomp_target_fini locks devices[*].lock while calling fini_device_func,
+       and
+     - the lock is not recursive.  */
+  if (atexit (gomp_target_fini) != 0)
+    gomp_fatal ("atexit failed");
+
   cur = OFFLOAD_TARGETS;
   if (*cur)
     do
@@ -2737,9 +2749,6 @@ gomp_target_init (void)
       if (devices[i].capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
 	goacc_register (&devices[i]);
     }
-
-  if (atexit (gomp_target_fini) != 0)
-    gomp_fatal ("atexit failed");
 }
 
 #else /* PLUGIN_SUPPORT */

RFC: libgomp target plugins and atexit

Reply via email to