While tuning the performance of nvptx OpenACC offloading earlier this
year, Tom fixed a bug in og7 that prevented Nvidia's nvprof profiling
tool from working with CUDA 9. Tom posted more details on the patch at
<https://gcc.gnu.org/ml/gcc-patches/2018-02/msg01269.html>, and that
explanation is still relevant.

Note that this issue was triggered by the new OpenACC profiling API in
og7, which has not landed in trunk yet. However, it's probably a good
idea to get this patch committed independently from that huge profiling
patch series.

Is this OK for trunk? I bootstrapped and regtested this for x86_64 Linux
with nvptx offloading.

Thanks,
Cesar
[OpenACC] Fix hang when running oacc exec with CUDA 9.0 nvprof

2018-XX-YY  Tom de Vries  <tdevr...@suse.de>
	    Cesar Philippidis  <ce...@codesourcery.com>

	libgomp/
	* oacc-init.c (acc_init_state_lock, acc_init_state, acc_init_thread):
	New variables.
	(acc_init_1): Set acc_init_thread to pthread_self ().  Set
	acc_init_state to initializing at the start, and to initialized at the
	end.
	(self_initializing_p): New function.
	(acc_get_device_type): Return acc_device_none if called by thread that
	is currently executing acc_init_1.

(cherry picked from openacc-gcc-7-branch commits
81904b675f6298a9c26c71391909ce362990a11f, bfc999c)
---
 libgomp/oacc-init.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c
index 8db24b17d29..8842e7218cb 100644
--- a/libgomp/oacc-init.c
+++ b/libgomp/oacc-init.c
@@ -40,6 +40,11 @@
 
 static gomp_mutex_t acc_device_lock;
 
+static gomp_mutex_t acc_init_state_lock;
+static enum { uninitialized, initializing, initialized } acc_init_state
+  = uninitialized;
+static pthread_t acc_init_thread;
+
 /* A cached version of the dispatcher for the global "current" accelerator type,
    e.g. used as the default when creating new host threads.  This is the
    device-type equivalent of goacc_device_num (which specifies which device to
@@ -215,6 +220,11 @@ acc_init_1 (acc_device_t d)
   struct gomp_device_descr *base_dev, *acc_dev;
   int ndevs;
 
+  gomp_mutex_lock (&acc_init_state_lock);
+  acc_init_state = initializing;
+  acc_init_thread = pthread_self ();
+  gomp_mutex_unlock (&acc_init_state_lock);
+
   base_dev = resolve_device (d, true);
 
   ndevs = base_dev->get_num_devices_func ();
@@ -234,6 +244,10 @@ acc_init_1 (acc_device_t d)
   gomp_init_device (acc_dev);
   gomp_mutex_unlock (&acc_dev->lock);
 
+  gomp_mutex_lock (&acc_init_state_lock);
+  acc_init_state = initialized;
+  gomp_mutex_unlock (&acc_init_state_lock);
+
   return base_dev;
 }
 
@@ -528,6 +542,17 @@ acc_set_device_type (acc_device_t d)
 
 ialias (acc_set_device_type)
 
+static bool
+self_initializing_p (void)
+{
+  bool res;
+  gomp_mutex_lock (&acc_init_state_lock);
+  res = (acc_init_state == initializing
+	 && pthread_equal (acc_init_thread, pthread_self ()));
+  gomp_mutex_unlock (&acc_init_state_lock);
+  return res;
+}
+
 acc_device_t
 acc_get_device_type (void)
 {
@@ -537,6 +562,15 @@ acc_get_device_type (void)
 
   if (thr && thr->base_dev)
     res = acc_device_type (thr->base_dev->type);
+  else if (self_initializing_p ())
+    /* The CUDA libaccinj64.so version 9.0+ calls acc_get_device_type during the
+       acc_ev_device_init_start event callback, which is dispatched during
+       acc_init_1.  Trying to lock acc_device_lock during such a call (as we do
+       in the else clause below) will result in deadlock, since the lock has
+       already been taken by the acc_init_1 caller.  We work around this problem
+       by using the acc_get_device_type property "If the device type has not yet
+       been selected, the value acc_device_none may be returned".  */
+    ;
   else
     {
       gomp_init_targets_once ();
-- 
2.17.1

Reply via email to