Hi, this patch reworks some of the way that asynchronous copyouts are
implemented for OpenACC in libgomp.
Before this patch, we had a somewhat confusing way of implementing this
by having two refcounts for each mapping: refcount and async_refcount,
which I never got working again after the last wave of async regressions
showed up.
So this patch implements what I believe to be a simplification: async_refcount
is removed, and instead of trying to queue the async copyouts during unmapping
we actually do that during the plugin event handling. This requires a addition
of the async stream integer as an argument to the register_async_cleanup
plugin hook, but overall I think this should be more elegant than before.
This patch fixes the libgomp.oacc-c-c++-common/asyncwait-1.c regression.
It also fixed data-[23].c regressions before, but some other recent check-in
happened to already fixed those.
Tested without regressions, is this okay for trunk?
Thanks,
Chung-Lin
2015-11-24 Chung-Lin Tang <[email protected]>
* oacc-plugin.h (GOMP_PLUGIN_async_unmap_vars): Add int parameter.
* oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Add 'int async'
parameter, use to set async stream around call to gomp_unmap_vars,
call gomp_unmap_vars() with 'do_copyfrom' set to true.
* plugin/plugin-nvptx.c (struct ptx_event): Add 'int val' field.
(event_gc): Adjust event handling loop, collect PTX_EVT_ASYNC_CLEANUP
events and call GOMP_PLUGIN_async_unmap_vars() for each of them.
(event_add): Add int parameter, initialize 'val' field when
adding new ptx_event struct.
(nvptx_evec): Adjust event_add() call arguments.
(nvptx_host2dev): Likewise.
(nvptx_dev2host): Likewise.
(nvptx_wait_async): Likewise.
(nvptx_wait_all_async): Likewise.
(GOMP_OFFLOAD_openacc_register_async_cleanup): Add async parameter,
pass to event_add() call.
* oacc-host.c (host_openacc_register_async_cleanup): Add 'int async'
parameter.
* oacc-mem.c (gomp_acc_remove_pointer): Adjust async case to
call openacc.register_async_cleanup_func() hook.
* oacc-parallel.c (GOACC_parallel_keyed): Likewise.
* target.c (gomp_copy_from_async): Delete function.
(gomp_map_vars): Remove async_refcount.
(gomp_unmap_vars): Likewise.
(gomp_load_image_to_device): Likewise.
(omp_target_associate_ptr): Likewise.
* libgomp.h (struct splay_tree_key_s): Remove async_refcount.
(acc_dispatch_t.register_async_cleanup_func): Add int parameter.
(gomp_copy_from_async): Remove.
Index: plugin/plugin-nvptx.c
===================================================================
--- plugin/plugin-nvptx.c (revision 230796)
+++ plugin/plugin-nvptx.c (working copy)
@@ -310,6 +310,7 @@ struct ptx_event
int type;
void *addr;
int ord;
+ int val;
struct ptx_event *next;
};
@@ -786,6 +787,7 @@ static void
event_gc (bool memmap_lockable)
{
struct ptx_event *ptx_event = ptx_events;
+ struct ptx_event *async_cleanups = NULL;
struct nvptx_thread *nvthd = nvptx_thread ();
pthread_mutex_lock (&ptx_event_lock);
@@ -803,6 +805,7 @@ event_gc (bool memmap_lockable)
r = cuEventQuery (*e->evt);
if (r == CUDA_SUCCESS)
{
+ bool append_async = false;
CUevent *te;
te = e->evt;
@@ -827,7 +830,7 @@ event_gc (bool memmap_lockable)
if (!memmap_lockable)
continue;
- GOMP_PLUGIN_async_unmap_vars (e->addr);
+ append_async = true;
}
break;
}
@@ -835,6 +838,7 @@ event_gc (bool memmap_lockable)
cuEventDestroy (*te);
free ((void *)te);
+ /* Unlink 'e' from ptx_events list. */
if (ptx_events == e)
ptx_events = ptx_events->next;
else
@@ -845,15 +849,31 @@ event_gc (bool memmap_lockable)
e_->next = e_->next->next;
}
- free (e);
+ if (append_async)
+ {
+ e->next = async_cleanups;
+ async_cleanups = e;
+ }
+ else
+ free (e);
}
}
pthread_mutex_unlock (&ptx_event_lock);
+
+ /* We have to do these here, after ptx_event_lock is released. */
+ while (async_cleanups)
+ {
+ struct ptx_event *e = async_cleanups;
+ async_cleanups = async_cleanups->next;
+
+ GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
+ free (e);
+ }
}
static void
-event_add (enum ptx_event_type type, CUevent *e, void *h)
+event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
{
struct ptx_event *ptx_event;
struct nvptx_thread *nvthd = nvptx_thread ();
@@ -866,6 +886,7 @@ static void
ptx_event->evt = e;
ptx_event->addr = h;
ptx_event->ord = nvthd->ptx_dev->ord;
+ ptx_event->val = val;
pthread_mutex_lock (&ptx_event_lock);
@@ -966,7 +987,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **host
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
- event_add (PTX_EVT_KNL, e, (void *)dev_str);
+ event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
}
#else
r = cuCtxSynchronize ();
@@ -1073,7 +1094,7 @@ nvptx_host2dev (void *d, const void *h, size_t s)
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
- event_add (PTX_EVT_MEM, e, (void *)h);
+ event_add (PTX_EVT_MEM, e, (void *)h, 0);
}
else
#endif
@@ -1138,7 +1159,7 @@ nvptx_dev2host (void *h, const void *d, size_t s)
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
- event_add (PTX_EVT_MEM, e, (void *)h);
+ event_add (PTX_EVT_MEM, e, (void *)h, 0);
}
else
#endif
@@ -1264,7 +1285,7 @@ nvptx_wait_async (int async1, int async2)
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
- event_add (PTX_EVT_SYNC, e, NULL);
+ event_add (PTX_EVT_SYNC, e, NULL, 0);
r = cuStreamWaitEvent (s2->stream, *e, 0);
if (r != CUDA_SUCCESS)
@@ -1346,7 +1367,7 @@ nvptx_wait_all_async (int async)
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
- event_add (PTX_EVT_SYNC, e, NULL);
+ event_add (PTX_EVT_SYNC, e, NULL, 0);
r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
if (r != CUDA_SUCCESS)
@@ -1658,7 +1679,7 @@ GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *)
}
void
-GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
+GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
{
CUevent *e;
CUresult r;
@@ -1674,7 +1695,7 @@ void
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));
- event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
+ event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
}
int
Index: oacc-mem.c
===================================================================
--- oacc-mem.c (revision 230796)
+++ oacc-mem.c (working copy)
@@ -659,10 +659,7 @@ gomp_acc_remove_pointer (void *h, bool force_copyf
if (async < acc_async_noval)
gomp_unmap_vars (t, true);
else
- {
- gomp_copy_from_async (t);
- acc_dev->openacc.register_async_cleanup_func (t);
- }
+ t->device_descr->openacc.register_async_cleanup_func (t, async);
gomp_debug (0, " %s: mappings restored\n", __FUNCTION__);
}
Index: libgomp.h
===================================================================
--- libgomp.h (revision 230796)
+++ libgomp.h (working copy)
@@ -829,8 +829,6 @@ struct splay_tree_key_s {
uintptr_t tgt_offset;
/* Reference count. */
uintptr_t refcount;
- /* Asynchronous reference count. */
- uintptr_t async_refcount;
};
/* The comparison function. */
@@ -864,7 +862,7 @@ typedef struct acc_dispatch_t
unsigned *, void *);
/* Async cleanup callback registration. */
- void (*register_async_cleanup_func) (void *);
+ void (*register_async_cleanup_func) (void *, int);
/* Asynchronous routines. */
int (*async_test_func) (int);
@@ -958,7 +956,6 @@ extern struct target_mem_desc *gomp_map_vars (stru
size_t, void **, void **,
size_t *, void *, bool,
enum gomp_map_vars_kind);
-extern void gomp_copy_from_async (struct target_mem_desc *);
extern void gomp_unmap_vars (struct target_mem_desc *, bool);
extern void gomp_init_device (struct gomp_device_descr *);
extern void gomp_free_memmap (struct splay_tree_s *);
Index: oacc-plugin.c
===================================================================
--- oacc-plugin.c (revision 230796)
+++ oacc-plugin.c (working copy)
@@ -31,11 +31,14 @@
#include "oacc-int.h"
void
-GOMP_PLUGIN_async_unmap_vars (void *ptr)
+GOMP_PLUGIN_async_unmap_vars (void *ptr, int async)
{
struct target_mem_desc *tgt = ptr;
+ struct gomp_device_descr *devicep = tgt->device_descr;
- gomp_unmap_vars (tgt, false);
+ devicep->openacc.async_set_async_func (async);
+ gomp_unmap_vars (tgt, true);
+ devicep->openacc.async_set_async_func (acc_async_sync);
}
/* Return the target-specific part of the TLS data for the current thread. */
Index: oacc-plugin.h
===================================================================
--- oacc-plugin.h (revision 230796)
+++ oacc-plugin.h (working copy)
@@ -27,7 +27,7 @@
#ifndef OACC_PLUGIN_H
#define OACC_PLUGIN_H 1
-extern void GOMP_PLUGIN_async_unmap_vars (void *);
+extern void GOMP_PLUGIN_async_unmap_vars (void *, int);
extern void *GOMP_PLUGIN_acc_thread (void);
#endif
Index: oacc-host.c
===================================================================
--- oacc-host.c (revision 230796)
+++ oacc-host.c (working copy)
@@ -143,7 +143,8 @@ host_openacc_exec (void (*fn) (void *),
}
static void
-host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__
((unused)))
+host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__
((unused)),
+ int async __attribute__ ((unused)))
{
}
Index: target.c
===================================================================
--- target.c (revision 230796)
+++ target.c (working copy)
@@ -644,7 +644,6 @@ gomp_map_vars (struct gomp_device_descr *devicep,
tgt->list[i].offset = 0;
tgt->list[i].length = k->host_end - k->host_start;
k->refcount = 1;
- k->async_refcount = 0;
tgt->refcount++;
array->left = NULL;
array->right = NULL;
@@ -784,40 +783,6 @@ gomp_unmap_tgt (struct target_mem_desc *tgt)
free (tgt);
}
-/* Decrease the refcount for a set of mapped variables, and queue asychronous
- copies from the device back to the host after any work that has been issued.
- Because the regions are still "live", increment an asynchronous reference
- count to indicate that they should not be unmapped from host-side data
- structures until the asynchronous copy has completed. */
-
-attribute_hidden void
-gomp_copy_from_async (struct target_mem_desc *tgt)
-{
- struct gomp_device_descr *devicep = tgt->device_descr;
- size_t i;
-
- gomp_mutex_lock (&devicep->lock);
-
- for (i = 0; i < tgt->list_count; i++)
- if (tgt->list[i].key == NULL)
- ;
- else if (tgt->list[i].key->refcount > 1)
- {
- tgt->list[i].key->refcount--;
- tgt->list[i].key->async_refcount++;
- }
- else
- {
- splay_tree_key k = tgt->list[i].key;
- if (tgt->list[i].copy_from)
- devicep->dev2host_func (devicep->target_id, (void *) k->host_start,
- (void *) (k->tgt->tgt_start + k->tgt_offset),
- k->host_end - k->host_start);
- }
-
- gomp_mutex_unlock (&devicep->lock);
-}
-
/* Unmap variables described by TGT. If DO_COPYFROM is true, copy relevant
variables back from device to host: if it is false, it is assumed that this
has been done already, i.e. by gomp_copy_from_async above. */
@@ -847,13 +812,8 @@ gomp_unmap_vars (struct target_mem_desc *tgt, bool
k->refcount--;
else if (k->refcount == 1)
{
- if (k->async_refcount > 0)
- k->async_refcount--;
- else
- {
- k->refcount--;
- do_unmap = true;
- }
+ k->refcount--;
+ do_unmap = true;
}
if ((do_unmap && do_copyfrom && tgt->list[i].copy_from)
@@ -995,7 +955,6 @@ gomp_load_image_to_device (struct gomp_device_desc
k->tgt = tgt;
k->tgt_offset = target_table[i].start;
k->refcount = REFCOUNT_INFINITY;
- k->async_refcount = 0;
array->left = NULL;
array->right = NULL;
splay_tree_insert (&devicep->mem_map, array);
@@ -1020,7 +979,6 @@ gomp_load_image_to_device (struct gomp_device_desc
k->tgt = tgt;
k->tgt_offset = target_var->start;
k->refcount = REFCOUNT_INFINITY;
- k->async_refcount = 0;
array->left = NULL;
array->right = NULL;
splay_tree_insert (&devicep->mem_map, array);
@@ -2120,7 +2078,6 @@ omp_target_associate_ptr (void *host_ptr, void *de
k->tgt = tgt;
k->tgt_offset = (uintptr_t) device_ptr + device_offset;
k->refcount = REFCOUNT_INFINITY;
- k->async_refcount = 0;
array->left = NULL;
array->right = NULL;
splay_tree_insert (&devicep->mem_map, array);
Index: oacc-parallel.c
===================================================================
--- oacc-parallel.c (revision 230796)
+++ oacc-parallel.c (working copy)
@@ -182,10 +182,7 @@ GOACC_parallel_keyed (int device, void (*fn) (void
if (async < acc_async_noval)
gomp_unmap_vars (tgt, true);
else
- {
- gomp_copy_from_async (tgt);
- acc_dev->openacc.register_async_cleanup_func (tgt);
- }
+ tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
acc_dev->openacc.async_set_async_func (acc_async_sync);
}