This patch reworks my previous one to deal only with the PTX-specific unloading
breakage. I don't change the API between libgomp and the plugins; instead I fix
up the PTX plugin to keep per-device lists of loaded programs. As with the
previous patch, we fix an ordering problem by unloading the target images before
destroying the memory maps containing said images. mkoffload now emits a
static destructor call.
Ok for gomp4?
nathan
2015-07-15 Nathan Sidwell <nat...@codesourcery.com>
libgomp/
* target.c (gomp_offload_image_to_device): Rename to ...
(gomp_load_image_to_device): ... here.
(GOMP_offload_register): Adjust call.
(gomp_init_device): Likewise.
(gomp_unload_image_from_device): New. Broken out of ...
(GOMP_offload_unregister): ... here. Call it.
(gomp_unload_device): New.
* libgomp.h (gomp_unload_device): Declare.
* plugin/plugin-nvptx.c (struct targ_fn_descriptor): Move later.
(struct ptx_image_data): Move earlier, add fns field.
(struct ptx_device): Add images and image_lock fields.
(ptx_images, ptx_image_lock): Delete.
(nvptx_open_device): Initialize images and image_lock fields.
(GOMP_OFFLOAD_load_image): Register image to device-specific fields.
(GOMP_OFFLOAD_unload_image): Unregister from device-specific fields.
* oacc-init.c (acc_shutdown_1): Unload from device before deleting
mem maps.
gcc/
* config/nvptx/mkoffload.c (process): Reformat printing.
Add destructor call.
Index: libgomp/target.c
===================================================================
--- libgomp/target.c (revision 225829)
+++ libgomp/target.c (working copy)
@@ -647,12 +647,13 @@ gomp_update (struct gomp_device_descr *d
/* Load image pointed by TARGET_DATA to the device, specified by DEVICEP.
And insert to splay tree the mapping between addresses from HOST_TABLE and
- from loaded target image. */
+ from loaded target image. We rely on the host and device compilers
+ emitting variables and functions in the same order. */
static void
-gomp_offload_image_to_device (struct gomp_device_descr *devicep,
- void *host_table, void *target_data,
- bool is_register_lock)
+gomp_load_image_to_device (struct gomp_device_descr *devicep,
+ void *host_table, void *target_data,
+ bool is_register_lock)
{
void **host_func_table = ((void ***) host_table)[0];
void **host_funcs_end = ((void ***) host_table)[1];
@@ -667,7 +668,8 @@ gomp_offload_image_to_device (struct gom
/* Load image to device and get target addresses for the image. */
struct addr_pair *target_table = NULL;
int i, num_target_entries
- = devicep->load_image_func (devicep->target_id, target_data, &target_table);
+ = devicep->load_image_func (devicep->target_id, target_data,
+ &target_table);
if (num_target_entries != num_funcs + num_vars)
{
@@ -736,6 +738,59 @@ gomp_offload_image_to_device (struct gom
free (target_table);
}
+/* Unload the mappings described by TARGET_DATA from device DEVICEP.
+ The device must be locked. */
+
+static void
+gomp_unload_image_from_device (struct gomp_device_descr *devicep,
+ void *host_table, void *target_data)
+{
+ void **host_func_table = ((void ***) host_table)[0];
+ void **host_funcs_end = ((void ***) host_table)[1];
+ void **host_var_table = ((void ***) host_table)[2];
+ void **host_vars_end = ((void ***) host_table)[3];
+
+ /* The func table contains only addresses, the var table contains addresses
+ and corresponding sizes. */
+ int num_funcs = host_funcs_end - host_func_table;
+ int num_vars = (host_vars_end - host_var_table) / 2;
+
+ unsigned j;
+ struct splay_tree_key_s k;
+ splay_tree_key node = NULL;
+
+ /* Find mapping at start of node array. */
+ if (num_funcs || num_vars)
+ {
+ k.host_start = num_funcs ? (uintptr_t) host_func_table[0] : (uintptr_t) host_var_table[0];
+ k.host_end = k.host_start + 1;
+ node = splay_tree_lookup (&devicep->mem_map, &k);
+ }
+
+ devicep->unload_image_func (devicep->target_id, target_data);
+
+ /* Remove mappings from splay tree. */
+ for (j = 0; j < num_funcs; j++)
+ {
+ k.host_start = (uintptr_t) host_func_table[j];
+ k.host_end = k.host_start + 1;
+ splay_tree_remove (&devicep->mem_map, &k);
+ }
+
+ for (j = 0; j < num_vars; j++)
+ {
+ k.host_start = (uintptr_t) host_var_table[j * 2];
+ k.host_end = k.host_start + (uintptr_t) host_var_table[j * 2 + 1];
+ splay_tree_remove (&devicep->mem_map, &k);
+ }
+
+ if (node)
+ {
+ free (node->tgt);
+ free (node);
+ }
+}
+
/* This function should be called from every offload image while loading.
It gets the descriptor of the host func and var tables HOST_TABLE, TYPE of
the target, and TARGET_DATA needed by target plugin. */
@@ -753,7 +808,7 @@ GOMP_offload_register (void *host_table,
struct gomp_device_descr *devicep = &devices[i];
gomp_mutex_lock (&devicep->lock);
if (devicep->type == target_type && devicep->is_initialized)
- gomp_offload_image_to_device (devicep, host_table, target_data, true);
+ gomp_load_image_to_device (devicep, host_table, target_data, true);
gomp_mutex_unlock (&devicep->lock);
}
@@ -775,72 +830,21 @@ GOMP_offload_register (void *host_table,
the target, and TARGET_DATA needed by target plugin. */
void
-GOMP_offload_unregister (void *host_table, enum offload_target_type target_type,
+GOMP_offload_unregister (void *host_table,
+ enum offload_target_type target_type,
void *target_data)
{
- void **host_func_table = ((void ***) host_table)[0];
- void **host_funcs_end = ((void ***) host_table)[1];
- void **host_var_table = ((void ***) host_table)[2];
- void **host_vars_end = ((void ***) host_table)[3];
int i;
- /* The func table contains only addresses, the var table contains addresses
- and corresponding sizes. */
- int num_funcs = host_funcs_end - host_func_table;
- int num_vars = (host_vars_end - host_var_table) / 2;
-
gomp_mutex_lock (&register_lock);
/* Unload image from all initialized devices. */
for (i = 0; i < num_devices; i++)
{
- int j;
struct gomp_device_descr *devicep = &devices[i];
gomp_mutex_lock (&devicep->lock);
- if (devicep->type != target_type || !devicep->is_initialized)
- {
- gomp_mutex_unlock (&devicep->lock);
- continue;
- }
-
- devicep->unload_image_func (devicep->target_id, target_data);
-
- /* Remove mapping from splay tree. */
- struct splay_tree_key_s k;
- splay_tree_key node = NULL;
- if (num_funcs > 0)
- {
- k.host_start = (uintptr_t) host_func_table[0];
- k.host_end = k.host_start + 1;
- node = splay_tree_lookup (&devicep->mem_map, &k);
- }
- else if (num_vars > 0)
- {
- k.host_start = (uintptr_t) host_var_table[0];
- k.host_end = k.host_start + (uintptr_t) host_var_table[1];
- node = splay_tree_lookup (&devicep->mem_map, &k);
- }
-
- for (j = 0; j < num_funcs; j++)
- {
- k.host_start = (uintptr_t) host_func_table[j];
- k.host_end = k.host_start + 1;
- splay_tree_remove (&devicep->mem_map, &k);
- }
-
- for (j = 0; j < num_vars; j++)
- {
- k.host_start = (uintptr_t) host_var_table[j * 2];
- k.host_end = k.host_start + (uintptr_t) host_var_table[j * 2 + 1];
- splay_tree_remove (&devicep->mem_map, &k);
- }
-
- if (node)
- {
- free (node->tgt);
- free (node);
- }
-
+ if (devicep->type == target_type && devicep->is_initialized)
+ gomp_unload_image_from_device(devicep, host_table, target_data);
gomp_mutex_unlock (&devicep->lock);
}
@@ -869,13 +873,31 @@ gomp_init_device (struct gomp_device_des
{
struct offload_image_descr *image = &offload_images[i];
if (image->type == devicep->type)
- gomp_offload_image_to_device (devicep, image->host_table,
- image->target_data, false);
+ gomp_load_image_to_device (devicep, image->host_table,
+ image->target_data, false);
}
devicep->is_initialized = true;
}
+attribute_hidden void
+gomp_unload_device (struct gomp_device_descr *devicep)
+{
+ if (devicep->is_initialized)
+ {
+ unsigned i;
+
+ /* Unload from device all images registered at the moment. */
+ for (i = 0; i < num_offload_images; i++)
+ {
+ struct offload_image_descr *image = &offload_images[i];
+ if (image->type == devicep->type)
+ gomp_unload_image_from_device (devicep, image->host_table,
+ image->target_data);
+ }
+ }
+}
+
/* Free address mapping tables. MM must be locked on entry, and remains locked
on return. */
Index: libgomp/libgomp.h
===================================================================
--- libgomp/libgomp.h (revision 225829)
+++ libgomp/libgomp.h (working copy)
@@ -784,6 +784,7 @@ extern void gomp_unmap_vars (struct targ
extern void gomp_init_device (struct gomp_device_descr *);
extern void gomp_free_memmap (struct splay_tree_s *);
extern void gomp_fini_device (struct gomp_device_descr *);
+extern void gomp_unload_device (struct gomp_device_descr *);
/* work.c */
Index: libgomp/plugin/plugin-nvptx.c
===================================================================
--- libgomp/plugin/plugin-nvptx.c (revision 225829)
+++ libgomp/plugin/plugin-nvptx.c (working copy)
@@ -126,12 +126,6 @@ cuda_error (CUresult r)
return &errmsg[0];
}
-struct targ_fn_descriptor
-{
- CUfunction fn;
- const char *name;
-};
-
static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
@@ -287,6 +281,25 @@ map_push (struct ptx_stream *s, int asyn
return;
}
+/* Descriptor of a loaded function. */
+
+struct targ_fn_descriptor
+{
+ CUfunction fn;
+ const char *name;
+};
+
+/* A loaded PTX image. */
+struct ptx_image_data
+{
+ void *target_data;
+ CUmodule module;
+
+ struct targ_fn_descriptor *fns; /* Array of functions. */
+
+ struct ptx_image_data *next;
+};
+
struct ptx_device
{
CUcontext ctx;
@@ -310,6 +323,9 @@ struct ptx_device
int mode;
bool mkern;
+ struct ptx_image_data *images; /* Images loaded on device. */
+ pthread_mutex_t image_lock; /* Lock for above list. */
+
struct ptx_device *next;
};
@@ -331,21 +347,11 @@ struct ptx_event
struct ptx_event *next;
};
-struct ptx_image_data
-{
- void *target_data;
- CUmodule module;
- struct ptx_image_data *next;
-};
-
static pthread_mutex_t ptx_event_lock;
static struct ptx_event *ptx_events;
static struct ptx_device **ptx_devices;
-static struct ptx_image_data *ptx_images = NULL;
-static pthread_mutex_t ptx_image_lock = PTHREAD_MUTEX_INITIALIZER;
-
#define _XSTR(s) _STR(s)
#define _STR(s) #s
@@ -589,6 +595,7 @@ select_stream_for_async (int async, pthr
/* Initialize the device. Return TRUE on success, else FALSE. PTX_DEV_LOCK
should be locked on entry and remains locked on exit. */
+
static bool
nvptx_init (void)
{
@@ -745,6 +752,9 @@ nvptx_open_device (int n)
if (r != CUDA_SUCCESS)
async_engines = 1;
+ ptx_dev->images = NULL;
+ pthread_mutex_init (&ptx_dev->image_lock, NULL);
+
init_streams_for_device (ptx_dev, async_engines);
return ptx_dev;
@@ -1599,6 +1609,9 @@ typedef struct nvptx_tdata
size_t fn_num;
} nvptx_tdata_t;
+/* Load the (partial) program described by TARGET_DATA to device
+ number ORD. Allocate and return TARGET_TABLE. */
+
int
GOMP_OFFLOAD_load_image (int ord, void *target_data,
struct addr_pair **target_table)
@@ -1608,24 +1621,19 @@ GOMP_OFFLOAD_load_image (int ord, void *
unsigned int fn_entries, var_entries, i, j;
CUresult r;
struct targ_fn_descriptor *targ_fns;
+ struct addr_pair *targ_tbl;
nvptx_tdata_t const *img_header = (nvptx_tdata_t const *) target_data;
struct ptx_image_data *new_image;
+ struct ptx_device *dev;
GOMP_OFFLOAD_init_device (ord);
+ dev = ptx_devices[ord];
+
nvptx_attach_host_thread_to_device (ord);
link_ptx (&module, img_header->ptx_src, img_header->ptx_len);
- pthread_mutex_lock (&ptx_image_lock);
- new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
- new_image->target_data = target_data;
- new_image->module = module;
- new_image->next = ptx_images;
- ptx_images = new_image;
- pthread_mutex_unlock (&ptx_image_lock);
-
-
/* The mkoffload utility emits a struct of pointers/integers at the
start of each offload image. The array of kernel names and the
functions addresses form a one-to-one correspondence. */
@@ -1635,12 +1643,24 @@ GOMP_OFFLOAD_load_image (int ord, void *
fn_entries = img_header->fn_num;
fn_names = img_header->fn_names;
- *target_table = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
- * (fn_entries + var_entries));
+ targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
+ * (fn_entries + var_entries));
targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
* fn_entries);
- for (i = 0; i < fn_entries; i++)
+ *target_table = targ_tbl;
+
+ new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
+ new_image->target_data = target_data;
+ new_image->module = module;
+ new_image->fns = targ_fns;
+
+ pthread_mutex_lock (&dev->image_lock);
+ new_image->next = dev->images;
+ dev->images = new_image;
+ pthread_mutex_unlock (&dev->image_lock);
+
+ for (i = 0; i < fn_entries; i++, targ_tbl++, targ_fns++)
{
CUfunction function;
@@ -1648,14 +1668,14 @@ GOMP_OFFLOAD_load_image (int ord, void *
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));
- targ_fns[i].fn = function;
- targ_fns[i].name = (const char *) fn_names[i];
+ targ_fns->fn = function;
+ targ_fns->name = (const char *) fn_names[i];
- (*target_table)[i].start = (uintptr_t) &targ_fns[i];
- (*target_table)[i].end = (*target_table)[i].start + 1;
+ targ_tbl->start = (uintptr_t) targ_fns;
+ targ_tbl->end = targ_tbl->start + 1;
}
- for (j = 0; j < var_entries; j++, i++)
+ for (j = 0; j < var_entries; j++, targ_tbl++)
{
CUdeviceptr var;
size_t bytes;
@@ -1664,46 +1684,33 @@ GOMP_OFFLOAD_load_image (int ord, void *
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
- (*target_table)[i].start = (uintptr_t) var;
- (*target_table)[i].end = (*target_table)[i].start + bytes;
+ targ_tbl->start = (uintptr_t) var;
+ targ_tbl->end = targ_tbl->start + bytes;
}
- return i;
+ return fn_entries + var_entries;
}
+/* Unload the program described by TARGET_DATA from device number ORD,
+ freeing the function descriptors allocated by GOMP_OFFLOAD_load_image. */
+
void
-GOMP_OFFLOAD_unload_image (int tid __attribute__((unused)), void *target_data)
+GOMP_OFFLOAD_unload_image (int ord, void *target_data)
{
- void **img_header = (void **) target_data;
- struct targ_fn_descriptor *targ_fns
- = (struct targ_fn_descriptor *) img_header[0];
- struct ptx_image_data *image, *prev = NULL, *newhd = NULL;
-
- free (targ_fns);
-
- pthread_mutex_lock (&ptx_image_lock);
- for (image = ptx_images; image != NULL;)
- {
- struct ptx_image_data *next = image->next;
+ struct ptx_image_data *image, **prev_p;
+ struct ptx_device *dev = ptx_devices[ord];
- if (image->target_data == target_data)
- {
- cuModuleUnload (image->module);
- free (image);
- if (prev)
- prev->next = next;
- }
- else
- {
- prev = image;
- if (!newhd)
- newhd = image;
- }
-
- image = next;
- }
- ptx_images = newhd;
- pthread_mutex_unlock (&ptx_image_lock);
+ pthread_mutex_lock (&dev->image_lock);
+ for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
+ if (image->target_data == target_data)
+ {
+ *prev_p = image->next;
+ cuModuleUnload (image->module);
+ free (image->fns);
+ free (image);
+ break;
+ }
+ pthread_mutex_unlock (&dev->image_lock);
}
void *
Index: libgomp/oacc-init.c
===================================================================
--- libgomp/oacc-init.c (revision 225829)
+++ libgomp/oacc-init.c (working copy)
@@ -254,6 +254,18 @@ acc_shutdown_1 (acc_device_t d)
goacc_deallocate_static (d);
+ ndevs = base_dev->get_num_devices_func ();
+
+ /* Unload all the devices of this type that have been opened. */
+ for (i = 0; i < ndevs; i++)
+ {
+ struct gomp_device_descr *acc_dev = &base_dev[i];
+
+ gomp_mutex_lock (&acc_dev->lock);
+ gomp_unload_device (acc_dev);
+ gomp_mutex_unlock (&acc_dev->lock);
+ }
+
gomp_mutex_lock (&goacc_thread_lock);
/* Free target-specific TLS data and close all devices. */
@@ -292,7 +304,6 @@ acc_shutdown_1 (acc_device_t d)
gomp_mutex_unlock (&goacc_thread_lock);
- ndevs = base_dev->get_num_devices_func ();
/* Close all the devices of this type that have been opened. */
for (i = 0; i < ndevs; i++)
Index: gcc/config/nvptx/mkoffload.c
===================================================================
--- gcc/config/nvptx/mkoffload.c (revision 225829)
+++ gcc/config/nvptx/mkoffload.c (working copy)
@@ -292,22 +292,30 @@ process (FILE *in, FILE *out)
" sizeof (func_mappings) / sizeof (func_mappings[0])\n"
"};\n\n");
- fprintf (out, "#ifdef __cplusplus\n");
- fprintf (out, "extern \"C\" {\n");
- fprintf (out, "#endif\n");
+ fprintf (out, "#ifdef __cplusplus\n"
+ "extern \"C\" {\n"
+ "#endif\n");
fprintf (out, "extern void GOMP_offload_register (const void *, int, void *);\n");
+ fprintf (out, "extern void GOMP_offload_unregister (const void *, int, void *);\n");
- fprintf (out, "#ifdef __cplusplus\n");
- fprintf (out, "}\n");
- fprintf (out, "#endif\n");
+ fprintf (out, "#ifdef __cplusplus\n"
+ "}\n"
+ "#endif\n");
fprintf (out, "extern void *__OFFLOAD_TABLE__[];\n\n");
- fprintf (out, "static __attribute__((constructor)) void init (void)\n{\n");
- fprintf (out, " GOMP_offload_register (__OFFLOAD_TABLE__, %d,\n",
- GOMP_DEVICE_NVIDIA_PTX);
- fprintf (out, " &target_data);\n");
- fprintf (out, "};\n");
+
+ fprintf (out, "static __attribute__((constructor)) void init (void)\n"
+ "{\n"
+ " GOMP_offload_register (__OFFLOAD_TABLE__, %d/*NVIDIA_PTX*/,\n"
+ " &target_data);\n"
+ "};\n", GOMP_DEVICE_NVIDIA_PTX);
+
+ fprintf (out, "static __attribute__((destructor)) void fini (void)\n"
+ "{\n"
+ " GOMP_offload_unregister (__OFFLOAD_TABLE__, %d/*NVIDIA_PTX*/,\n"
+ " &target_data);\n"
+ "};\n", GOMP_DEVICE_NVIDIA_PTX);
}
static void