Hi Chung-Lin!

On Tue, 25 Sep 2018 21:10:47 +0800, Chung-Lin Tang <chunglin_t...@mentor.com> 
wrote:
> --- a/libgomp/oacc-async.c
> +++ b/libgomp/oacc-async.c

> +attribute_hidden struct goacc_asyncqueue *
> +lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async)
> +{
> +  /* The special value acc_async_noval (-1) maps to the thread-specific
> +     default async stream.  */
> +  if (async == acc_async_noval)
> +    async = thr->default_async;
> +
> +  if (async == acc_async_sync)
> +    return NULL;
> +
> +  if (async < 0)
> +    gomp_fatal ("bad async %d", async);
> +
> +  struct gomp_device_descr *dev = thr->dev;
> +
> +  if (!create
> +      && (async >= dev->openacc.async.nasyncqueue
> +       || !dev->openacc.async.asyncqueue[async]))
> +    return NULL;
> +

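Doesn't this last block (the "!create" early-return check) also have to be
done while holding the lock you're taking below?  It reads "nasyncqueue" and
"asyncqueue[async]", which another thread may concurrently be resizing via
"gomp_realloc" inside the locked region.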

> +  gomp_mutex_lock (&dev->openacc.async.lock);
> +  if (async >= dev->openacc.async.nasyncqueue)
> +    {
> +      int diff = async + 1 - dev->openacc.async.nasyncqueue;
> +      dev->openacc.async.asyncqueue
> +     = gomp_realloc (dev->openacc.async.asyncqueue,
> +                     sizeof (goacc_aq) * (async + 1));
> +      memset (dev->openacc.async.asyncqueue + dev->openacc.async.nasyncqueue,
> +           0, sizeof (goacc_aq) * diff);
> +      dev->openacc.async.nasyncqueue = async + 1;
> +    }
> +
> +  if (!dev->openacc.async.asyncqueue[async])
> +    {
> +      dev->openacc.async.asyncqueue[async] = 
> dev->openacc.async.construct_func ();
> +
> +      /* Link new async queue into active list.  */
> +      goacc_aq_list n = gomp_malloc (sizeof (struct goacc_asyncqueue_list));
> +      n->aq = dev->openacc.async.asyncqueue[async];
> +      n->next = dev->openacc.async.active;
> +      dev->openacc.async.active = n;
> +    }
> +  gomp_mutex_unlock (&dev->openacc.async.lock);
> +  return dev->openacc.async.asyncqueue[async];
> +}

And then, some more concerns, as encoded in the following patch (but
please also continue reading below):

commit d2d6aaeca840debbec14e421be705ef56d444ac7
Author: Thomas Schwinge <tho...@codesourcery.com>
Date:   Wed Dec 12 15:57:30 2018 +0100

    into async re-work: locking concerns
---
 libgomp/oacc-async.c          | 18 +++++++++++++++---
 libgomp/plugin/plugin-nvptx.c |  6 ++++++
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git libgomp/oacc-async.c libgomp/oacc-async.c
index 89a405ebcdb1..68e4e65e8182 100644
--- libgomp/oacc-async.c
+++ libgomp/oacc-async.c
@@ -84,17 +84,21 @@ lookup_goacc_asyncqueue (struct goacc_thread *thr, bool 
create, int async)
   if (id < 0)
     return NULL;
 
+  struct goacc_asyncqueue *ret = NULL;
+
   struct gomp_device_descr *dev = thr->dev;
 
+  gomp_mutex_lock (&dev->openacc.async.lock);
+
   if (!create
       && (id >= dev->openacc.async.nasyncqueue
          || !dev->openacc.async.asyncqueue[id]))
-    return NULL;
+    goto out;
 
-  gomp_mutex_lock (&dev->openacc.async.lock);
   if (id >= dev->openacc.async.nasyncqueue)
     {
       int diff = id + 1 - dev->openacc.async.nasyncqueue;
+      // TODO gomp_realloc might call "gomp_fatal" with 
"&dev->openacc.async.lock" locked.  Might cause deadlock?
       dev->openacc.async.asyncqueue
        = gomp_realloc (dev->openacc.async.asyncqueue,
                        sizeof (goacc_aq) * (id + 1));
@@ -105,16 +109,23 @@ lookup_goacc_asyncqueue (struct goacc_thread *thr, bool 
create, int async)
 
   if (!dev->openacc.async.asyncqueue[id])
     {
+      //TODO We have "&dev->openacc.async.lock" locked here, and if 
"openacc.async.construct_func" calls "GOMP_PLUGIN_fatal" (via 
"CUDA_CALL_ASSERT", for example), that might cause deadlock?
+      //TODO Change the interface to emit an error in the plugin, but then 
"return NULL", and we catch that here, unlock, and bail out?
       dev->openacc.async.asyncqueue[id] = dev->openacc.async.construct_func ();
 
       /* Link new async queue into active list.  */
+      // TODO gomp_malloc might call "gomp_fatal" with 
"&dev->openacc.async.lock" locked.  Might cause deadlock?
       goacc_aq_list n = gomp_malloc (sizeof (struct goacc_asyncqueue_list));
       n->aq = dev->openacc.async.asyncqueue[id];
       n->next = dev->openacc.async.active;
       dev->openacc.async.active = n;
     }
+  ret = dev->openacc.async.asyncqueue[id];
+
+ out:
   gomp_mutex_unlock (&dev->openacc.async.lock);
-  return dev->openacc.async.asyncqueue[id];
+
+  return ret;
 }
 
 /* Return the asyncqueue to be used for OpenACC async-argument ASYNC.  This
@@ -305,6 +316,7 @@ goacc_fini_asyncqueues (struct gomp_device_descr *devicep)
       goacc_aq_list next;
       for (goacc_aq_list l = devicep->openacc.async.active; l; l = next)
        {
+         //TODO Can/should/must we "synchronize" here (how?), so as to make 
sure that no other operation on this asyncqueue is going on while/after we've 
destructed it here?
          ret &= devicep->openacc.async.destruct_func (l->aq);
          next = l->next;
          free (l);
diff --git libgomp/plugin/plugin-nvptx.c libgomp/plugin/plugin-nvptx.c
index 577ed39ef3f6..872e91f05e78 100644
--- libgomp/plugin/plugin-nvptx.c
+++ libgomp/plugin/plugin-nvptx.c
@@ -1389,6 +1389,7 @@ GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue 
*aq)
   if (r == CUDA_ERROR_NOT_READY)
     return 0;
 
+  //TODO Is this safe to call, or might this cause deadlock if something's 
locked?
   GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
   return -1;
 }
@@ -1396,6 +1397,7 @@ GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue 
*aq)
 void
 GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
 {
+  //TODO Is this safe to call, or might this cause deadlock if something's 
locked?
   CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
 }
 
@@ -1404,6 +1406,7 @@ GOMP_OFFLOAD_openacc_async_serialize (struct 
goacc_asyncqueue *aq1,
                                      struct goacc_asyncqueue *aq2)
 {
   CUevent e;
+  //TODO Are these safe to call, or might this cause deadlock if something's 
locked?
   CUDA_CALL_ASSERT (cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
   CUDA_CALL_ASSERT (cuEventRecord, e, aq1->cuda_stream);
   CUDA_CALL_ASSERT (cuStreamWaitEvent, aq2->cuda_stream, e, 0);
@@ -1413,6 +1416,7 @@ static void
 cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
 {
   if (res != CUDA_SUCCESS)
+    //TODO Is this safe to call, or might this cause deadlock if something's 
locked?
     GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
   struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
   cb->fn (cb->ptr);
@@ -1424,10 +1428,12 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct 
goacc_asyncqueue *aq,
                                           void (*callback_fn)(void *),
                                           void *userptr)
 {
+  //TODO Is this safe to call, or might this cause deadlock if something's 
locked?
   struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
   b->fn = callback_fn;
   b->ptr = userptr;
   b->aq = aq;
+  //TODO Is this safe to call, or might this cause deadlock if something's 
locked?
   CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
                    cuda_callback_wrapper, (void *) b, 0);
 }


But then, I wonder whether we could skip all that locking if we moved the
"asyncqueue"s from "acc_dispatch_t" into "goacc_thread"?

commit c9282e058f67cb8f8ca1720d7f9e3fe0c04b6c89
Author: Thomas Schwinge <tho...@codesourcery.com>
Date:   Thu Dec 13 18:00:16 2018 +0100

    [TODO] into async re-work: move "asyncqueue"s from "acc_dispatch_t" into 
"goacc_thread"?
---
 libgomp/libgomp.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git libgomp/libgomp.h libgomp/libgomp.h
index 574fcd1ee4ad..09852589d2f1 100644
--- libgomp/libgomp.h
+++ libgomp/libgomp.h
@@ -949,6 +949,11 @@ typedef struct acc_dispatch_t
   __typeof (GOMP_OFFLOAD_openacc_exec) *exec_func;
 
   struct {
+    //TODO Why do these live in the "device" data structure, and not in the 
"per-thread" data structure?
+    //TODO Aren't they meant to be separate per thread?
+    //TODO That is, as far as I remember right now, OpenACC explicitly states 
that an asyncqueue doesn't entail any synchronization between different host 
threads.
+    //TODO Verify OpenACC.
+    //TODO With that moved into "goacc_thread", we could get rid of all the 
locking needed here?
     /* Once created and put into the "active" list, asyncqueues are then never
        destructed and removed from the "active" list, other than if the TODO
        device is shut down.  */

At this point, I will again state (as in that other email) that my
understanding of OpenACC is that an async queue does not entail any
inter-thread synchronization, so it would seem reasonable for all async
queues to be separate per thread.


Grüße
 Thomas

Reply via email to