Attached is the updated/rediffed version, which now uses the builtin
(__builtin_gcn_kernarg_ptr) instead of the 'asm("s8")'.
The code in principle works; that is: If no private stack variables are
copied, it works.
Or in other words: reverse-offload target regions that don't use
firstprivate or mapping work, the rest would crash. That's avoided by
not accepting reverse offload inside GOMP_OFFLOAD_get_num_devices for now.
To get it working, the manual stack allocation patch + the trivial
update to that get_num_devices func is needed, but no change to the
attached patch.
In order to reduce local patches, I would love to have it on mainline –
otherwise, I have at least the current version in gcc-patches@.
Tobias
PS: Previous patch email quoted below. Note: there were two follow up
emails, one by Andrew and one by me; cf. your own mail archive (of this
thread) or
https://gcc.gnu.org/pipermail/gcc-patches/2022-October/603383.html and the
next two messages in that thread.
On 12.10.22 16:29, Tobias Burnus wrote:
On 29.09.22 18:24, Andrew Stubbs wrote:
On 27/09/2022 14:16, Tobias Burnus wrote:
Andrew did suggest a while back to piggyback on the console_output
handling,
avoiding another atomic access. - If this is still wanted, I would like to
have some guidance regarding how to actually implement it.
[...]
The point is that you can use the "msg" and "text" fields for
whatever data you want, as long as you invent a new value for "type".
[....]
You can make "case 4" do whatever you want. There are enough bytes
for 4 pointers, and you could use multiple packets (although it's not
safe to assume they're contiguous or already arrived; maybe "case 4"
for part 1, "case 5" for part 2). It's possible to change this
structure, of course, but the target implementation is in newlib so
versioning becomes a problem.
I think – also looking at the Newlib write.c implementation - that
the data is contiguous: there is an atomic add, where instead of
passing '1' for a single slot, I could also add '2' for two slots.
Attached is one variant – for the decl of the GOMP_OFFLOAD_target_rev,
it needs the generic parts of the sister nvptx patch.*
2*128 bytes were not enough, I need 3*128 bytes. (Or rather 5*64 +
32.) As target_ext is blocking, I decided to use a stack local
variable for the remaining arguments and pass it along. Alternatively,
I could also use 2 slots - and process them together. This would avoid
one device->host memory copy but would make console_output less clear.
OK for mainline?
Tobias
* https://gcc.gnu.org/pipermail/gcc-patches/2022-October/603354.html
PS: Currently, device stack variables are private and cannot be
accessed from the host; this will change in a separate patch. It not
only affects the "rest" part as used in this patch but also the actual
arrays behind addr, kinds, and sizes. And quite likely a lot of the
map/firstprivate variables passed to addr.
As num_devices() will return 0 or -1, this is for now a non-issue.
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht
München, HRB 106955
libgomp/gcn: Prepare for reverse-offload callback handling
libgomp/ChangeLog:
* config/gcn/libgomp-gcn.h: New file; contains
struct output, declared previously in plugin-gcn.c.
* config/gcn/target.c: Include it.
(GOMP_ADDITIONAL_ICVS): Declare as extern var.
(GOMP_target_ext): Handle reverse offload.
* plugin/plugin-gcn.c: Include libgomp-gcn.h.
(struct kernargs): Replace struct def by the one
from libgomp-gcn.h for output_data.
(process_reverse_offload): New.
(console_output): Call it.
libgomp/config/gcn/libgomp-gcn.h | 61 ++++++++++++++++++++++++++++++++++++++++
libgomp/config/gcn/target.c | 44 ++++++++++++++++++++++++-----
libgomp/plugin/plugin-gcn.c | 34 ++++++++++++----------
3 files changed, 117 insertions(+), 22 deletions(-)
diff --git a/libgomp/config/gcn/libgomp-gcn.h b/libgomp/config/gcn/libgomp-gcn.h
new file mode 100644
index 00000000000..91560be787f
--- /dev/null
+++ b/libgomp/config/gcn/libgomp-gcn.h
@@ -0,0 +1,61 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+ Contributed by Tobias Burnus <tob...@codesourcery.com>.
+
+ This file is part of the GNU Offloading and Multi Processing Library
+ (libgomp).
+
+ Libgomp is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This file contains defines and type definitions shared between the
+ GCN target's libgomp.a and plugin-gcn.c, but that are only
+ needed for this target. */
+
+#ifndef LIBGOMP_GCN_H
+#define LIBGOMP_GCN_H 1
+
+/* This struct is also used in Newlib's libc/sys/amdgcn/write.c. */
+struct output
+{
+ int return_value;
+ unsigned int next_output; /* Bumped atomically by the device to reserve a slot. */
+ struct printf_data {
+ int written; /* Set to 1 once the slot is filled; the host resets it to 0. */
+ union {
+ char msg[128];
+ uint64_t msg_u64[2]; /* Type 4: [0] = fn, [1] = mapnum. */
+ };
+ int type; /* Selects the handling in console_output; 4 = reverse offload. */
+ union {
+ int64_t ivalue;
+ double dvalue;
+ char text[128];
+ uint64_t value_u64[2]; /* Type 4: [0] = device address of the {addrs, sizes, kinds} triple, [1] = device number. */
+ };
+ } queue[1024]; /* Used as a ring: slot = index % 1024. */
+ unsigned int consumed; /* Polled by the device before reusing a slot; presumably advanced by the host - confirm in console_output. */
+};
+
+#if (__SIZEOF_SHORT__ != 2 \
+ || __SIZEOF_SIZE_T__ != 8 \
+ || __SIZEOF_POINTER__ != 8)
+#error "Data-type conversion required for rev_offload"
+#endif
+
+#endif /* LIBGOMP_GCN_H */
diff --git a/libgomp/config/gcn/target.c b/libgomp/config/gcn/target.c
index c8484fa18d9..27854565d40 100644
--- a/libgomp/config/gcn/target.c
+++ b/libgomp/config/gcn/target.c
@@ -24,8 +24,11 @@
<http://www.gnu.org/licenses/>. */
#include "libgomp.h"
+#include "libgomp-gcn.h"
#include <limits.h>
+extern volatile struct gomp_offload_icvs GOMP_ADDITIONAL_ICVS;
+
bool
GOMP_teams4 (unsigned int num_teams_lower, unsigned int num_teams_upper,
unsigned int thread_limit, bool first)
@@ -75,16 +78,43 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
void **hostaddrs, size_t *sizes, unsigned short *kinds,
unsigned int flags, void **depend, void **args)
{
- (void) device;
- (void) fn;
- (void) mapnum;
- (void) hostaddrs;
- (void) sizes;
- (void) kinds;
(void) flags;
(void) depend;
(void) args;
- __builtin_unreachable ();
+
+ if (device != GOMP_DEVICE_HOST_FALLBACK || fn == NULL)
+ return;
+
+ /* The output data is at ((void*) kernargs)[2]. */
+ register void **kernargs = (void**) __builtin_gcn_kernarg_ptr ();
+ struct output *data = (struct output *) kernargs[2];
+ /* Atomically reserve one slot in the shared output queue. */
+ unsigned int index = __atomic_fetch_add (&data->next_output, 1,
+ __ATOMIC_ACQUIRE);
+
+ if ((unsigned int) (index + 1) < data->consumed)
+ abort (); /* Overflow. */
+
+ /* The queue is a 1024-entry ring: spin until 'consumed' passes our slot. */
+ if (index >= 1024)
+ while (__atomic_load_n (&data->consumed, __ATOMIC_ACQUIRE)
+ <= (index - 1024))
+ asm ("s_sleep 64");
+
+ unsigned int slot = index % 1024;
+ uint64_t addrs_sizes_kind[3] = {(uint64_t) hostaddrs, (uint64_t) sizes,
+ (uint64_t) kinds};
+ data->queue[slot].msg_u64[0] = (uint64_t) fn;
+ data->queue[slot].msg_u64[1] = (uint64_t) mapnum;
+ data->queue[slot].value_u64[0] = (uint64_t) &addrs_sizes_kind[0];
+ data->queue[slot].value_u64[1] = (uint64_t) GOMP_ADDITIONAL_ICVS.device_num;
+
+ data->queue[slot].type = 4; /* Reverse offload; "case 4" in the plugin. */
+ __atomic_store_n (&data->queue[slot].written, 1, __ATOMIC_RELEASE);
+
+ /* Wait until the host is done: it copies addrs_sizes_kind from this frame. */
+ while (__atomic_load_n (&data->queue[slot].written, __ATOMIC_ACQUIRE) != 0)
+ asm ("s_sleep 64");
}
void
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 04b122f2a09..ffe5cf5af2c 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -42,6 +42,7 @@
#include <dlfcn.h>
#include <signal.h>
#include "libgomp-plugin.h"
+#include "config/gcn/libgomp-gcn.h" /* For struct output. */
#include "gomp-constants.h"
#include <elf.h>
#include "oacc-plugin.h"
@@ -252,21 +253,7 @@ struct kernargs {
int64_t arena_ptr;
/* Output data. */
- struct output {
- int return_value;
- unsigned int next_output;
- struct printf_data {
- int written;
- char msg[128];
- int type;
- union {
- int64_t ivalue;
- double dvalue;
- char text[128];
- };
- } queue[1024];
- unsigned int consumed;
- } output_data;
+ struct output output_data;
};
/* A queue entry for a future asynchronous launch. */
@@ -1931,6 +1918,19 @@ create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
return shadow;
}
+/* Handle a reverse-offload packet: copy the {addrs, sizes, kinds} pointer
+ triple at device address REV_DATA to the host, then call the libgomp hook. */
+static void
+process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t rev_data,
+ uint64_t dev_num64)
+{
+ int dev_num = dev_num64;
+ uint64_t ptrs[3]; /* Host copy; the source lives in device memory. */
+ GOMP_OFFLOAD_dev2host (dev_num, ptrs, (void *) rev_data, sizeof (ptrs));
+ GOMP_PLUGIN_target_rev (fn, mapnum, ptrs[0], ptrs[1], ptrs[2], dev_num,
+ NULL, NULL, NULL);
+}
+
/* Output any data written to console output from the kernel. It is expected
that this function is polled during kernel execution.
@@ -1975,6 +1975,10 @@ console_output (struct kernel_info *kernel, struct kernargs *kernargs,
case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
case 3: printf ("%.128s%.128s", data->msg, data->text); break;
+ case 4:
+ process_reverse_offload (data->msg_u64[0], data->msg_u64[1],
+ data->value_u64[0],data->value_u64[1]);
+ break;
default: printf ("GCN print buffer error!\n"); break;
}
data->written = 0;