Hi Chung-Lin! On Tue, 25 Sep 2018 21:10:47 +0800, Chung-Lin Tang <chunglin_t...@mentor.com> wrote: > --- a/libgomp/oacc-async.c > +++ b/libgomp/oacc-async.c
> +attribute_hidden struct goacc_asyncqueue * > +lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async) > +{ > + /* The special value acc_async_noval (-1) maps to the thread-specific > + default async stream. */ > + if (async == acc_async_noval) > + async = thr->default_async; > + > + if (async == acc_async_sync) > + return NULL; > + > + if (async < 0) > + gomp_fatal ("bad async %d", async); > + > + struct gomp_device_descr *dev = thr->dev; > + > + if (!create > + && (async >= dev->openacc.async.nasyncqueue > + || !dev->openacc.async.asyncqueue[async])) > + return NULL; > + Doesn't this last block also have to be included in the lock you're taking below? > + gomp_mutex_lock (&dev->openacc.async.lock); > + if (async >= dev->openacc.async.nasyncqueue) > + { > + int diff = async + 1 - dev->openacc.async.nasyncqueue; > + dev->openacc.async.asyncqueue > + = gomp_realloc (dev->openacc.async.asyncqueue, > + sizeof (goacc_aq) * (async + 1)); > + memset (dev->openacc.async.asyncqueue + dev->openacc.async.nasyncqueue, > + 0, sizeof (goacc_aq) * diff); > + dev->openacc.async.nasyncqueue = async + 1; > + } > + > + if (!dev->openacc.async.asyncqueue[async]) > + { > + dev->openacc.async.asyncqueue[async] = > dev->openacc.async.construct_func (); > + > + /* Link new async queue into active list. 
*/ > + goacc_aq_list n = gomp_malloc (sizeof (struct goacc_asyncqueue_list)); > + n->aq = dev->openacc.async.asyncqueue[async]; > + n->next = dev->openacc.async.active; > + dev->openacc.async.active = n; > + } > + gomp_mutex_unlock (&dev->openacc.async.lock); > + return dev->openacc.async.asyncqueue[async]; > +} And then, some more concerns, as encoded in the following patch (but please also continue reading below): commit d2d6aaeca840debbec14e421be705ef56d444ac7 Author: Thomas Schwinge <tho...@codesourcery.com> Date: Wed Dec 12 15:57:30 2018 +0100 into async re-work: locking concerns --- libgomp/oacc-async.c | 18 +++++++++++++++--- libgomp/plugin/plugin-nvptx.c | 6 ++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git libgomp/oacc-async.c libgomp/oacc-async.c index 89a405ebcdb1..68e4e65e8182 100644 --- libgomp/oacc-async.c +++ libgomp/oacc-async.c @@ -84,17 +84,21 @@ lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async) if (id < 0) return NULL; + struct goacc_asyncqueue *ret = NULL; + struct gomp_device_descr *dev = thr->dev; + gomp_mutex_lock (&dev->openacc.async.lock); + if (!create && (id >= dev->openacc.async.nasyncqueue || !dev->openacc.async.asyncqueue[id])) - return NULL; + goto out; - gomp_mutex_lock (&dev->openacc.async.lock); if (id >= dev->openacc.async.nasyncqueue) { int diff = id + 1 - dev->openacc.async.nasyncqueue; + // TODO gomp_realloc might call "gomp_fatal" with "&dev->openacc.async.lock" locked. Might cause deadlock? dev->openacc.async.asyncqueue = gomp_realloc (dev->openacc.async.asyncqueue, sizeof (goacc_aq) * (id + 1)); @@ -105,16 +109,23 @@ lookup_goacc_asyncqueue (struct goacc_thread *thr, bool create, int async) if (!dev->openacc.async.asyncqueue[id]) { + //TODO We have "&dev->openacc.async.lock" locked here, and if "openacc.async.construct_func" calls "GOMP_PLUGIN_fatal" (via "CUDA_CALL_ASSERT", for example), that might cause deadlock? 
+ //TODO Change the interface to emit an error in the plugin, but then "return NULL", and we catch that here, unlock, and bail out? dev->openacc.async.asyncqueue[id] = dev->openacc.async.construct_func (); /* Link new async queue into active list. */ + // TODO gomp_malloc might call "gomp_fatal" with "&dev->openacc.async.lock" locked. Might cause deadlock? goacc_aq_list n = gomp_malloc (sizeof (struct goacc_asyncqueue_list)); n->aq = dev->openacc.async.asyncqueue[id]; n->next = dev->openacc.async.active; dev->openacc.async.active = n; } + ret = dev->openacc.async.asyncqueue[id]; + + out: gomp_mutex_unlock (&dev->openacc.async.lock); - return dev->openacc.async.asyncqueue[id]; + + return ret; } /* Return the asyncqueue to be used for OpenACC async-argument ASYNC. This @@ -305,6 +316,7 @@ goacc_fini_asyncqueues (struct gomp_device_descr *devicep) goacc_aq_list next; for (goacc_aq_list l = devicep->openacc.async.active; l; l = next) { + //TODO Can/should/must we "synchronize" here (how?), so as to make sure that no other operation on this asyncqueue is going on while/after we've destructed it here? ret &= devicep->openacc.async.destruct_func (l->aq); next = l->next; free (l); diff --git libgomp/plugin/plugin-nvptx.c libgomp/plugin/plugin-nvptx.c index 577ed39ef3f6..872e91f05e78 100644 --- libgomp/plugin/plugin-nvptx.c +++ libgomp/plugin/plugin-nvptx.c @@ -1389,6 +1389,7 @@ GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq) if (r == CUDA_ERROR_NOT_READY) return 0; + //TODO Is this safe to call, or might this cause deadlock if something's locked? GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r)); return -1; } @@ -1396,6 +1397,7 @@ GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq) void GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq) { + //TODO Is this safe to call, or might this cause deadlock if something's locked? 
CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream); } @@ -1404,6 +1406,7 @@ GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1, struct goacc_asyncqueue *aq2) { CUevent e; + //TODO Are these safe to call, or might this cause deadlock if something's locked? CUDA_CALL_ASSERT (cuEventCreate, &e, CU_EVENT_DISABLE_TIMING); CUDA_CALL_ASSERT (cuEventRecord, e, aq1->cuda_stream); CUDA_CALL_ASSERT (cuStreamWaitEvent, aq2->cuda_stream, e, 0); @@ -1413,6 +1416,7 @@ static void cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr) { if (res != CUDA_SUCCESS) + //TODO Is this safe to call, or might this cause deadlock if something's locked? GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res)); struct nvptx_callback *cb = (struct nvptx_callback *) ptr; cb->fn (cb->ptr); @@ -1424,10 +1428,12 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq, void (*callback_fn)(void *), void *userptr) { + //TODO Is this safe to call, or might this cause deadlock if something's locked? struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b)); b->fn = callback_fn; b->ptr = userptr; b->aq = aq; + //TODO Is this safe to call, or might this cause deadlock if something's locked? CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream, cuda_callback_wrapper, (void *) b, 0); } But then, I wonder if we couldn't skip all that locking, if we moved the "asyncqueue"s from "acc_dispatch_t" into "goacc_thread"? commit c9282e058f67cb8f8ca1720d7f9e3fe0c04b6c89 Author: Thomas Schwinge <tho...@codesourcery.com> Date: Thu Dec 13 18:00:16 2018 +0100 [TODO] into async re-work: move "asyncqueue"s from "acc_dispatch_t" into "goacc_thread"? 
--- libgomp/libgomp.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git libgomp/libgomp.h libgomp/libgomp.h index 574fcd1ee4ad..09852589d2f1 100644 --- libgomp/libgomp.h +++ libgomp/libgomp.h @@ -949,6 +949,11 @@ typedef struct acc_dispatch_t __typeof (GOMP_OFFLOAD_openacc_exec) *exec_func; struct { + //TODO Why do these live in the "device" data structure, and not in the "per-thread" data structure? + //TODO Aren't they meant to be separate per thread? + //TODO That is, as far as I remember right now, OpenACC explicitly states that an asyncqueue doesn't entail any synchronization between different host threads. + //TODO Verify OpenACC. + //TODO With that moved into "goacc_thread", we could get rid of all the locking needed here? /* Once created and put into the "active" list, asyncqueues are then never destructed and removed from the "active" list, other than if the TODO device is shut down. */ At this point, I will state again (as in that other email) that, as I understand OpenACC, an async queue does not entail any inter-thread synchronization, so it would seem reasonable for all async queues to be separate per thread. Grüße Thomas