We somehow missed to implement these OpenACC 2.6 functions (the Fortran routines are newer: 3.3). It is actually used, at least, by two SPEC hpc/accel (soma + lbm) tests (and OpenACC_VV) - and it was trivial to implement, which was my workaround to make them compile.
Besides adding the same-device copy function, it also adds some shortcuts (size 0 → to nothing; same ptr (shared mem or same device) → do nothing, using memcpy not memmove per OpenACC semantics.) Unless there are comments, I intent to commit the attached patch on Friday. Tobias
libgomp: Add OpenACC's acc_memcpy_device{,_async} routines [PR93226] libgomp/ChangeLog: PR libgomp/93226 * libgomp-plugin.h (GOMP_OFFLOAD_openacc_async_dev2dev): New prototype. * libgomp.h (struct acc_dispatch_t): Add dev2dev_func. (gomp_copy_dev2dev): New prototype. * libgomp.map (OACC_2.6.1): New; add acc_memcpy_device{,_async}. * libgomp.texi (acc_memcpy_device): New. * oacc-mem.c (memcpy_tofrom_device): Change to take from/to device boolean; use memcpy not memmove; add early return if size == 0 or same device + same ptr. (acc_memcpy_to_device, acc_memcpy_to_device_async, acc_memcpy_from_device, acc_memcpy_from_device_async): Update. (acc_memcpy_device, acc_memcpy_device_async): New. * openacc.f90 (acc_memcpy_device, acc_memcpy_device_async): Add interface. * openacc_lib.h (acc_memcpy_device, acc_memcpy_device_async): Likewise. * openacc.h (acc_memcpy_device, acc_memcpy_device_async): Add prototype. * plugin/plugin-gcn.c (GOMP_OFFLOAD_openacc_async_host2dev): Update comment. (GOMP_OFFLOAD_openacc_async_dev2host): Update call. (GOMP_OFFLOAD_openacc_async_dev2dev): New. * plugin/plugin-nvptx.c (GOMP_OFFLOAD_openacc_async_dev2dev): New. * target.c (gomp_copy_dev2dev): New. (gomp_load_plugin_for_device): Load dev2dev and async_dev2dev. * testsuite/libgomp.oacc-c-c++-common/acc_memcpy_device-1.c: New test. * testsuite/libgomp.oacc-fortran/acc_memcpy_device-1.f90: New test. libgomp/libgomp-plugin.h | 2 + libgomp/libgomp.h | 4 + libgomp/libgomp.map | 6 ++ libgomp/libgomp.texi | 39 +++++++ libgomp/oacc-mem.c | 44 +++++--- libgomp/openacc.f90 | 22 ++++ libgomp/openacc.h | 4 +- libgomp/openacc_lib.h | 24 +++++ libgomp/plugin/plugin-gcn.c | 17 +++- libgomp/plugin/plugin-nvptx.c | 43 ++++++++ libgomp/target.c | 14 +++ .../acc_memcpy_device-1.c | 96 +++++++++++++++++ .../libgomp.oacc-fortran/acc_memcpy_device-1.f90 | 113 +++++++++++++++++++++ 13 files changed, 409 insertions(+), 19 deletions(-) diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h index 924fc1f44b1..50c89feaf73 100644 --- a/libgomp/libgomp-plugin.h +++ b/libgomp/libgomp-plugin.h @@ -200,6 +200,8 @@ extern bool GOMP_OFFLOAD_openacc_async_dev2host (int, void *, const void *, size struct goacc_asyncqueue *); extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *, size_t, struct goacc_asyncqueue *); +extern bool GOMP_OFFLOAD_openacc_async_dev2dev (int, void *, const void *, size_t, + struct goacc_asyncqueue *); extern void *GOMP_OFFLOAD_openacc_cuda_get_current_device (void); extern void *GOMP_OFFLOAD_openacc_cuda_get_current_context (void); extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *); diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index 6030f9d0a2c..ed4e23ae3e1 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -1360,6 +1360,7 @@ typedef struct acc_dispatch_t __typeof (GOMP_OFFLOAD_openacc_async_exec) *exec_func; __typeof (GOMP_OFFLOAD_openacc_async_dev2host) *dev2host_func; __typeof (GOMP_OFFLOAD_openacc_async_host2dev) *host2dev_func; + __typeof (GOMP_OFFLOAD_openacc_async_dev2dev) *dev2dev_func; } async; __typeof (GOMP_OFFLOAD_openacc_get_property) *get_property_func; @@ -1467,6 +1468,9 @@ extern void gomp_copy_host2dev (struct gomp_device_descr *, extern void gomp_copy_dev2host (struct gomp_device_descr *, struct goacc_asyncqueue *, void *, const void *, size_t); +extern void gomp_copy_dev2dev (struct gomp_device_descr *, + struct goacc_asyncqueue *, void *, const void *, + size_t); extern uintptr_t gomp_map_val (struct target_mem_desc *, void **, size_t); extern bool gomp_attach_pointer (struct gomp_device_descr *, struct goacc_asyncqueue *, splay_tree, diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map index eae2f53bab1..ad9787ca4c0 100644 --- a/libgomp/libgomp.map +++ b/libgomp/libgomp.map @@ -609,6 +609,12 @@ OACC_2.6 { acc_get_property_string_h_; } OACC_2.5.1; +OACC_2.6.1 { + global: + acc_memcpy_device; + acc_memcpy_device_async; +} OACC_2.6; + GOACC_2.0 { global: GOACC_data_end; diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi index 6909c2b16f8..5aec358c774 100644 --- a/libgomp/libgomp.texi +++ b/libgomp/libgomp.texi @@ -4763,6 +4763,7 @@ acceleration device. present on device. * acc_memcpy_to_device:: Copy host memory to device memory. * acc_memcpy_from_device:: Copy device memory to host memory. +* acc_memcpy_device:: Copy memory within a device. * acc_attach:: Let device pointer point to device-pointer target. * acc_detach:: Let device pointer point to host-pointer target. @@ -5837,6 +5838,44 @@ This function copies device memory specified by device address of +@node acc_memcpy_device +@section @code{acc_memcpy_device} -- Copy memory within a device. +@table @asis +@item @emph{Description} +This function copies device memory from one memory location to another +on the current device. It copies @var{bytes} bytes of data from the device +address, specified by @var{data_dev_src}, to the device address +@var{data_dev_dest}. The @code{_async} version performs the transfer +asnychronously using the queue associated with @var{async_arg}. + +@item @emph{C/C++}: +@multitable @columnfractions .20 .80 +@item @emph{Prototype}: @tab @code{void acc_memcpy_device(d_void* data_dev_dest,} +@item @tab @code{d_void* data_dev_src, size_t bytes);} +@item @emph{Prototype}: @tab @code{void acc_memcpy_device_async(d_void* data_dev_dest,} +@item @tab @code{d_void* data_dev_src, size_t bytes, int async_arg);} +@end multitable + +@item @emph{Fortran}: +@multitable @columnfractions .20 .80 +@item @emph{Interface}: @tab @code{subroutine acc_memcpy_device(data_dev_dest, &} +@item @tab @code{data_dev_src, bytes)} +@item @emph{Interface}: @tab @code{subroutine acc_memcpy_device_async(data_dev_dest, &} +@item @tab @code{data_dev_src, bytes, async_arg)} +@item @tab @code{type(c_ptr), value :: data_dev_dest} +@item @tab @code{type(c_ptr), value :: data_dev_src} +@item @tab @code{integer(c_size_t), value :: bytes} +@item @tab @code{integer(acc_handle_kind), value :: async_arg} +@end multitable + +@item @emph{Reference}: +@uref{https://www.openacc.org, OpenACC specification v2.6}, section +3.2.33. @uref{https://www.openacc.org, OpenACC specification v3.3}, section +3.2.28. +@end table + + + @node acc_attach @section @code{acc_attach} -- Let device pointer point to device-pointer target. @table @asis diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c index 0482ed37d95..5b8ba7e1072 100644 --- a/libgomp/oacc-mem.c +++ b/libgomp/oacc-mem.c @@ -171,21 +171,22 @@ acc_free (void *d) } static void -memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async, - const char *libfnname) +memcpy_tofrom_device (bool dev_to, bool dev_from, void *dst, void *src, + size_t s, int async, const char *libfnname) { /* No need to call lazy open here, as the device pointer must have been obtained from a routine that did that. */ struct goacc_thread *thr = goacc_thread (); assert (thr && thr->dev); + if (s == 0) + return; if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) { - if (from) - memmove (h, d, s); - else - memmove (d, h, s); + if (src == dst) + return; + memcpy (dst, src, s); return; } @@ -199,10 +200,15 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async, } goacc_aq aq = get_goacc_asyncqueue (async); - if (from) - gomp_copy_dev2host (thr->dev, aq, h, d, s); + if (dev_to && dev_from) + { + if (dst != src) + gomp_copy_dev2dev (thr->dev, aq, dst, src, s); + } + else if (dev_from) + gomp_copy_dev2host (thr->dev, aq, dst, src, s); else - gomp_copy_host2dev (thr->dev, aq, d, h, s, false, /* TODO: cbuf? */ NULL); + gomp_copy_host2dev (thr->dev, aq, dst, src, s, false, /* TODO: cbuf? */ NULL); if (profiling_p) { @@ -214,25 +220,37 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async, void acc_memcpy_to_device (void *d, void *h, size_t s) { - memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__); + memcpy_tofrom_device (true, false, d, h, s, acc_async_sync, __FUNCTION__); } void acc_memcpy_to_device_async (void *d, void *h, size_t s, int async) { - memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__); + memcpy_tofrom_device (true, false, d, h, s, async, __FUNCTION__); } void acc_memcpy_from_device (void *h, void *d, size_t s) { - memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__); + memcpy_tofrom_device (false, true, h, d, s, acc_async_sync, __FUNCTION__); } void acc_memcpy_from_device_async (void *h, void *d, size_t s, int async) { - memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__); + memcpy_tofrom_device (false, true, h, d, s, async, __FUNCTION__); +} + +void +acc_memcpy_device (void *dst, void *src, size_t s) +{ + memcpy_tofrom_device (true, true, dst, src, s, acc_async_sync, __FUNCTION__); +} + +void +acc_memcpy_device_async (void *dst, void *src, size_t s, int async) +{ + memcpy_tofrom_device (true, true, dst, src, s, async, __FUNCTION__); } /* Return the device pointer that corresponds to host data H. Or NULL diff --git a/libgomp/openacc.f90 b/libgomp/openacc.f90 index 8ef107e959d..9d51f017985 100644 --- a/libgomp/openacc.f90 +++ b/libgomp/openacc.f90 @@ -797,6 +797,7 @@ module openacc public :: acc_copyout_finalize, acc_delete_finalize public :: acc_memcpy_to_device, acc_memcpy_to_device_async public :: acc_memcpy_from_device, acc_memcpy_from_device_async + public :: acc_memcpy_device, acc_memcpy_device_async integer, parameter :: openacc_version = 201711 @@ -1046,6 +1047,27 @@ module openacc end subroutine end interface + interface + subroutine acc_memcpy_device (data_dev_dest, data_dev_src, bytes) bind(C) + use iso_c_binding, only: c_ptr, c_size_t + type(c_ptr), value :: data_dev_dest + type(c_ptr), value :: data_dev_src + integer(c_size_t), value :: bytes + end subroutine + end interface + + interface + subroutine acc_memcpy_device_async (data_dev_dest, data_dev_src, & + bytes, async_arg) bind(C) + use iso_c_binding, only: c_ptr, c_size_t + import :: acc_handle_kind + type(c_ptr), value :: data_dev_dest + type(c_ptr), value :: data_dev_src + integer(c_size_t), value :: bytes + integer(acc_handle_kind), value :: async_arg + end subroutine + end interface + interface acc_copyin_async procedure :: acc_copyin_async_32_h procedure :: acc_copyin_async_64_h diff --git a/libgomp/openacc.h b/libgomp/openacc.h index a520bbe00db..3085b007efa 100644 --- a/libgomp/openacc.h +++ b/libgomp/openacc.h @@ -123,6 +123,7 @@ void *acc_hostptr (void *) __GOACC_NOTHROW; int acc_is_present (void *, size_t) __GOACC_NOTHROW; void acc_memcpy_to_device (void *, void *, size_t) __GOACC_NOTHROW; void acc_memcpy_from_device (void *, void *, size_t) __GOACC_NOTHROW; +void acc_memcpy_device (void *, void *, size_t) __GOACC_NOTHROW; void acc_attach (void **) __GOACC_NOTHROW; void acc_attach_async (void **, int) __GOACC_NOTHROW; void acc_detach (void **) __GOACC_NOTHROW; @@ -136,7 +137,7 @@ void acc_delete_finalize_async (void *, size_t, int) __GOACC_NOTHROW; void acc_detach_finalize (void **) __GOACC_NOTHROW; void acc_detach_finalize_async (void **, int) __GOACC_NOTHROW; -/* Async functions, specified in OpenACC 2.5. */ +/* Async functions, specified in OpenACC 2.5, acc_memcpy_device in 2.6. */ void acc_copyin_async (void *, size_t, int) __GOACC_NOTHROW; void acc_create_async (void *, size_t, int) __GOACC_NOTHROW; void acc_copyout_async (void *, size_t, int) __GOACC_NOTHROW; @@ -145,6 +146,7 @@ void acc_update_device_async (void *, size_t, int) __GOACC_NOTHROW; void acc_update_self_async (void *, size_t, int) __GOACC_NOTHROW; void acc_memcpy_to_device_async (void *, void *, size_t, int) __GOACC_NOTHROW; void acc_memcpy_from_device_async (void *, void *, size_t, int) __GOACC_NOTHROW; +void acc_memcpy_device_async (void *, void *, size_t, int) __GOACC_NOTHROW; /* CUDA-specific routines. */ void *acc_get_current_cuda_device (void) __GOACC_NOTHROW; diff --git a/libgomp/openacc_lib.h b/libgomp/openacc_lib.h index b0d287ed5f8..192ff7d7771 100644 --- a/libgomp/openacc_lib.h +++ b/libgomp/openacc_lib.h @@ -528,6 +528,30 @@ end subroutine end interface + interface + subroutine acc_memcpy_device(data_dev_dest, data_dev_src, & + & bytes) bind(C) + use iso_c_binding, only: c_ptr, c_size_t + type(c_ptr), value :: data_dev_dest + type(c_ptr), value :: data_dev_src + integer(c_size_t), value :: bytes + end subroutine + end interface + + interface + subroutine acc_memcpy_device_async(data_dev_dest, & + & data_dev_src, bytes, & + & async_arg) bind(C) + use iso_c_binding, only: c_ptr, c_size_t + import :: acc_handle_kind + type(c_ptr), value :: data_dev_dest + type(c_ptr), value :: data_dev_src + integer(c_size_t), value :: bytes + integer(acc_handle_kind), value :: async_arg + end subroutine + end interface + + interface acc_copyin_async subroutine acc_copyin_async_32_h (a, len, async) use iso_c_binding, only: c_int32_t diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c index 4b42a597cbd..46203838e7c 100644 --- a/libgomp/plugin/plugin-gcn.c +++ b/libgomp/plugin/plugin-gcn.c @@ -5079,7 +5079,8 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq, queue_push_callback (aq, fn, data); } -/* Queue up an asynchronous data copy from host to DEVICE. */ +/* Queue up an asynchronous data copy from host to DEVICE. + (Also handles dev2host and dev2dev.) */ bool GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src, @@ -5097,10 +5098,16 @@ bool GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src, size_t n, struct goacc_asyncqueue *aq) { - struct agent_info *agent = get_agent_info (device); - assert (agent == aq->agent); - queue_push_copy (aq, dst, src, n); - return true; + return GOMP_OFFLOAD_openacc_async_host2dev (device, dst, src, n, aq); +} + +/* Queue up an asynchronous data copy from DEVICE to DEVICE. */ + +bool +GOMP_OFFLOAD_openacc_async_dev2dev (int device, void *dst, const void *src, + size_t n, struct goacc_asyncqueue *aq) +{ + return GOMP_OFFLOAD_openacc_async_host2dev (device, dst, src, n, aq); } union goacc_property_value diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index a5cf859db19..2f03c441bf3 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -2018,6 +2018,34 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq, cuda_callback_wrapper, (void *) b, 0); } +static bool +cuda_memcpy_dev_sanity_check (const void *d1, const void *d2, size_t s) +{ + CUdeviceptr pb1, pb2; + size_t ps1, ps2; + if (!s) + return true; + if (!d1 || !d2) + { + GOMP_PLUGIN_error ("invalid device address"); + return false; + } + CUDA_CALL (cuMemGetAddressRange, &pb1, &ps1, (CUdeviceptr) d1); + CUDA_CALL (cuMemGetAddressRange, &pb2, &ps2, (CUdeviceptr) d2); + if (!pb1 || !pb2) + { + GOMP_PLUGIN_error ("invalid device address"); + return false; + } + if ((void *)(d1 + s) > (void *)(pb1 + ps1) + || (void *)(d2 + s) > (void *)(pb2 + ps2)) + { + GOMP_PLUGIN_error ("invalid size"); + return false; + } + return true; +} + static bool cuda_memcpy_sanity_check (const void *h, const void *d, size_t s) { @@ -2077,6 +2105,9 @@ GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n) bool GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n) { + if (!nvptx_attach_host_thread_to_device (ord) + || !cuda_memcpy_dev_sanity_check (dst, src, n)) + return false; CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL); return true; } @@ -2288,6 +2319,18 @@ GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src, return true; } +bool +GOMP_OFFLOAD_openacc_async_dev2dev (int ord, void *dst, const void *src, + size_t n, struct goacc_asyncqueue *aq) +{ + if (!nvptx_attach_host_thread_to_device (ord) + || !cuda_memcpy_sanity_check (dst, src, n)) + return false; + CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, + aq->cuda_stream); + return true; +} + union goacc_property_value GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop) { diff --git a/libgomp/target.c b/libgomp/target.c index 9674ff4c9c0..fe94978309d 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -461,6 +461,19 @@ gomp_copy_dev2host (struct gomp_device_descr *devicep, gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz); } +attribute_hidden void +gomp_copy_dev2dev (struct gomp_device_descr *devicep, + struct goacc_asyncqueue *aq, + void *dst, const void *src, size_t sz) +{ + if (__builtin_expect (aq != NULL, 0)) + goacc_device_copy_async (devicep, devicep->openacc.async.dev2dev_func, + "dev", dst, "dev", src, NULL, sz, aq); + else + gomp_device_copy (devicep, devicep->dev2dev_func, "dev", dst, + "dev", src, sz); +} + static void gomp_free_device_memory (struct gomp_device_descr *devicep, void *devptr) { @@ -5573,6 +5586,7 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device, || !DLSYM_OPT (openacc.async.exec, openacc_async_exec) || !DLSYM_OPT (openacc.async.dev2host, openacc_async_dev2host) || !DLSYM_OPT (openacc.async.host2dev, openacc_async_host2dev) + || !DLSYM_OPT (openacc.async.dev2dev, openacc_async_dev2dev) || !DLSYM_OPT (openacc.get_property, openacc_get_property)) { /* Require all the OpenACC handlers if we have diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_memcpy_device-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_memcpy_device-1.c new file mode 100644 index 00000000000..e261e0c5fea --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_memcpy_device-1.c @@ -0,0 +1,96 @@ +/* { dg-prune-output "using .vector_length \\(32\\)" } */ + +/* PR libgomp/93226 */ + +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <openacc.h> + +enum { N = 1024 }; + +static int D[N]; +#pragma acc declare device_resident(D) + +#pragma acc routine +intptr_t init_d() +{ + for (int i = 0; i < N; i++) + D[i] = 27*i; + return (intptr_t) &D[0]; +} + +int +main () +{ + int *a, *b, *e; + void *d_a, *d_b, *d_c, *d_d, *d_e, *d_f; + intptr_t intptr; + bool fail = false; + + a = (int *) malloc (N*sizeof (int)); + b = (int *) malloc (N*sizeof (int)); + e = (int *) malloc (N*sizeof (int)); + d_c = acc_malloc (N*sizeof (int)); + d_f = acc_malloc (N*sizeof (int)); + + memset (e, 0xff, N*sizeof (int)); + d_e = acc_copyin (e, N*sizeof (int)); + + #pragma acc serial copyout(intptr) + intptr = init_d (); + d_d = (void*) intptr; + acc_memcpy_device (d_c, d_d, N*sizeof (int)); + + #pragma acc serial copy(fail) deviceptr(d_c) firstprivate(intptr) + { + int *cc = (int *) d_c; + int *dd = (int *) intptr; + for (int i = 0; i < N; i++) + if (dd[i] != 27*i || cc[i] != 27*i) + { + fail = true; + __builtin_abort (); + } + } + if (fail) __builtin_abort (); + + for (int i = 0; i < N; i++) + a[i] = 11*i; + for (int i = 0; i < N; i++) + b[i] = 31*i; + + d_a = acc_copyin (a, N*sizeof (int)); + acc_copyin_async (b, N*sizeof (int), acc_async_noval); + + #pragma acc parallel deviceptr(d_c) async + { + int *cc = (int *) d_c; + #pragma acc loop + for (int i = 0; i < N; i++) + cc[i] = -17*i; + } + + acc_memcpy_device_async (d_d, d_a, N*sizeof (int), acc_async_noval); + acc_memcpy_device_async (d_f, d_c, N*sizeof (int), acc_async_noval); + acc_wait (acc_async_noval); + d_b = acc_deviceptr (b); + acc_memcpy_device_async (d_e, d_b, N*sizeof (int), acc_async_noval); + acc_wait (acc_async_noval); + + #pragma acc serial deviceptr(d_d, d_e, d_f) copy(fail) + { + int *dd = (int *) d_d; + int *ee = (int *) d_e; + int *ff = (int *) d_f; + for (int i = 0; i < N; i++) + if (dd[i] != 11*i + || ee[i] != 31*i + || ff[i] != -17*i) + { + fail = true; + __builtin_abort (); + } + } + if (fail) __builtin_abort (); +} diff --git a/libgomp/testsuite/libgomp.oacc-fortran/acc_memcpy_device-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/acc_memcpy_device-1.f90 new file mode 100644 index 00000000000..28199a979fd --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-fortran/acc_memcpy_device-1.f90 @@ -0,0 +1,113 @@ +! { dg-prune-output "using .vector_length \\(32\\)" } + +! PR libgomp/93226 */ + +module m + use iso_c_binding + use openacc + implicit none (external, type) + + integer, parameter :: N = 1024 + + integer :: D(N) + !$acc declare device_resident(D) + +contains + + integer(c_intptr_t) function init_d() + !$acc routine + integer :: i + do i = 1, N + D(i) = 27*i + end do + init_d = loc(D) + end +end module + +program main + use m + implicit none (external, type) + + integer, allocatable, target :: a(:), b(:), e(:) + type(c_ptr) :: d_a, d_b, d_c, d_d, d_e, d_f + integer(c_intptr_t) intptr + integer :: i + logical fail + + fail = .false. + + allocate(a(N), b(N), e(N)) + d_c = acc_malloc (N*c_sizeof (i)) + d_f = acc_malloc (N*c_sizeof (i)) + + e = huge(e) + call acc_copyin (e, N*c_sizeof (i)); + d_e = acc_deviceptr (e); + + !$acc serial copyout(intptr) + intptr = init_d () + !$acc end serial + d_d = transfer(intptr, d_d) + call acc_memcpy_device (d_c, d_d, N*c_sizeof (i)) + + !$acc serial copy(fail) copy(a) deviceptr(d_c, d_d) firstprivate(intptr) + block + integer, pointer :: cc(:), dd(:) + call c_f_pointer (d_c, cc, [N]) + call c_f_pointer (d_d, dd, [N]) + a = cc + do i = 1, N + if (dd(i) /= 27*i .or. cc(i) /= 27*i) then + fail = .true. + stop 1 + end if + end do + end block + !$acc end serial + if (fail) error stop 1 + + do i = 1, N + a(i) = 11*i + b(i) = 31*i + end do + + call acc_copyin (a, N*c_sizeof (i)) + d_a = acc_deviceptr (a) + call acc_copyin_async (b, N*c_sizeof (i), acc_async_noval) + + !$acc parallel deviceptr(d_c) private(i) async + block + integer, pointer :: cc(:) + call c_f_pointer (d_c, cc, [N]) + !$acc loop + do i = 1, N + cc(i) = -17*i + end do + end block + !$acc end parallel + + call acc_memcpy_device_async (d_d, d_a, N*c_sizeof (i), acc_async_noval) + call acc_memcpy_device_async (d_f, d_c, N*c_sizeof (i), acc_async_noval) + call acc_wait (acc_async_noval) + d_b = acc_deviceptr (b) + call acc_memcpy_device_async (d_e, d_b, N*c_sizeof (i), acc_async_noval) + call acc_wait (acc_async_noval) + + !$acc serial deviceptr(d_d, d_e, d_f) private(i) copy(fail) + block + integer, pointer :: dd(:), ee(:), ff(:) + call c_f_pointer (d_d, dd, [N]) + call c_f_pointer (d_e, ee, [N]) + call c_f_pointer (d_f, ff, [N]) + do i = 1, N + if (dd(i) /= 11*i & + .or. ee(i) /= 31*i & + .or. ff(i) /= -17*i) then + fail = .true. + stop 2 + end if + end do + end block + !$acc end serial + if (fail) error stop 2 +end