We somehow missed to implement these OpenACC 2.6 functions (the
Fortran routines are newer: 3.3). It is actually used, at least,
by two SPEC hpc/accel (soma + lbm) tests (and OpenACC_VV) - and
it was trivial to implement, which was my workaround to make them
compile.

Besides adding the same-device copy function, it also adds some
shortcuts (size 0 → to nothing; same ptr (shared mem or same device)
→ do nothing, using memcpy not memmove per OpenACC semantics.)

Unless there are comments, I intent to commit the attached patch
on Friday.

Tobias
libgomp: Add OpenACC's acc_memcpy_device{,_async} routines [PR93226]

libgomp/ChangeLog:

	PR libgomp/93226
	* libgomp-plugin.h (GOMP_OFFLOAD_openacc_async_dev2dev): New
	prototype.
	* libgomp.h (struct acc_dispatch_t): Add dev2dev_func.
	(gomp_copy_dev2dev): New prototype.
	* libgomp.map (OACC_2.6.1): New; add acc_memcpy_device{,_async}.
	* libgomp.texi (acc_memcpy_device): New.
	* oacc-mem.c (memcpy_tofrom_device): Change to take from/to
	device boolean; use memcpy not memmove; add early return if
	size == 0 or same device + same ptr.
	(acc_memcpy_to_device, acc_memcpy_to_device_async,
	acc_memcpy_from_device, acc_memcpy_from_device_async): Update.
	(acc_memcpy_device, acc_memcpy_device_async): New.
	* openacc.f90 (acc_memcpy_device, acc_memcpy_device_async):
	Add interface.
	* openacc_lib.h (acc_memcpy_device, acc_memcpy_device_async):
	Likewise.
	* openacc.h (acc_memcpy_device, acc_memcpy_device_async): Add
	prototype.
	* plugin/plugin-gcn.c (GOMP_OFFLOAD_openacc_async_host2dev):
	Update comment.
	(GOMP_OFFLOAD_openacc_async_dev2host): Update call.
	(GOMP_OFFLOAD_openacc_async_dev2dev): New.
	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_openacc_async_dev2dev):
	New.
	* target.c (gomp_copy_dev2dev): New.
	(gomp_load_plugin_for_device): Load dev2dev and async_dev2dev.
	* testsuite/libgomp.oacc-c-c++-common/acc_memcpy_device-1.c: New test.
	* testsuite/libgomp.oacc-fortran/acc_memcpy_device-1.f90: New test.

 libgomp/libgomp-plugin.h                           |   2 +
 libgomp/libgomp.h                                  |   4 +
 libgomp/libgomp.map                                |   6 ++
 libgomp/libgomp.texi                               |  39 +++++++
 libgomp/oacc-mem.c                                 |  44 +++++---
 libgomp/openacc.f90                                |  22 ++++
 libgomp/openacc.h                                  |   4 +-
 libgomp/openacc_lib.h                              |  24 +++++
 libgomp/plugin/plugin-gcn.c                        |  17 +++-
 libgomp/plugin/plugin-nvptx.c                      |  43 ++++++++
 libgomp/target.c                                   |  14 +++
 .../acc_memcpy_device-1.c                          |  96 +++++++++++++++++
 .../libgomp.oacc-fortran/acc_memcpy_device-1.f90   | 113 +++++++++++++++++++++
 13 files changed, 409 insertions(+), 19 deletions(-)

diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index 924fc1f44b1..50c89feaf73 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -200,6 +200,8 @@ extern bool GOMP_OFFLOAD_openacc_async_dev2host (int, void *, const void *, size
 						 struct goacc_asyncqueue *);
 extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *, size_t,
 						 struct goacc_asyncqueue *);
+extern bool GOMP_OFFLOAD_openacc_async_dev2dev (int, void *, const void *, size_t,
+						struct goacc_asyncqueue *);
 extern void *GOMP_OFFLOAD_openacc_cuda_get_current_device (void);
 extern void *GOMP_OFFLOAD_openacc_cuda_get_current_context (void);
 extern void *GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 6030f9d0a2c..ed4e23ae3e1 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1360,6 +1360,7 @@ typedef struct acc_dispatch_t
     __typeof (GOMP_OFFLOAD_openacc_async_exec) *exec_func;
     __typeof (GOMP_OFFLOAD_openacc_async_dev2host) *dev2host_func;
     __typeof (GOMP_OFFLOAD_openacc_async_host2dev) *host2dev_func;
+    __typeof (GOMP_OFFLOAD_openacc_async_dev2dev) *dev2dev_func;
   } async;
 
   __typeof (GOMP_OFFLOAD_openacc_get_property) *get_property_func;
@@ -1467,6 +1468,9 @@ extern void gomp_copy_host2dev (struct gomp_device_descr *,
 extern void gomp_copy_dev2host (struct gomp_device_descr *,
 				struct goacc_asyncqueue *, void *, const void *,
 				size_t);
+extern void gomp_copy_dev2dev (struct gomp_device_descr *,
+			       struct goacc_asyncqueue *, void *, const void *,
+			       size_t);
 extern uintptr_t gomp_map_val (struct target_mem_desc *, void **, size_t);
 extern bool gomp_attach_pointer (struct gomp_device_descr *,
 				 struct goacc_asyncqueue *, splay_tree,
diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map
index eae2f53bab1..ad9787ca4c0 100644
--- a/libgomp/libgomp.map
+++ b/libgomp/libgomp.map
@@ -609,6 +609,12 @@ OACC_2.6 {
 	acc_get_property_string_h_;
 } OACC_2.5.1;
 
+OACC_2.6.1 {
+  global:
+	acc_memcpy_device;
+	acc_memcpy_device_async;
+} OACC_2.6;
+
 GOACC_2.0 {
   global:
 	GOACC_data_end;
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index 6909c2b16f8..5aec358c774 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -4763,6 +4763,7 @@ acceleration device.
                                 present on device.
 * acc_memcpy_to_device::        Copy host memory to device memory.
 * acc_memcpy_from_device::      Copy device memory to host memory.
+* acc_memcpy_device::           Copy memory within a device.
 * acc_attach::                  Let device pointer point to device-pointer target.
 * acc_detach::                  Let device pointer point to host-pointer target.
 
@@ -5837,6 +5838,44 @@ This function copies device memory specified by device address of
 
 
 
+@node acc_memcpy_device
+@section @code{acc_memcpy_device} -- Copy memory within a device.
+@table @asis
+@item @emph{Description}
+This function copies device memory from one memory location to another
+on the current device.  It copies @var{bytes} bytes of data from the device
+address, specified by @var{data_dev_src}, to the device address
+@var{data_dev_dest}.  The @code{_async} version performs the transfer
+asnychronously using the queue associated with @var{async_arg}.
+
+@item @emph{C/C++}:
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{void acc_memcpy_device(d_void* data_dev_dest,}
+@item                   @tab @code{d_void* data_dev_src, size_t bytes);}
+@item @emph{Prototype}: @tab @code{void acc_memcpy_device_async(d_void* data_dev_dest,}
+@item                   @tab @code{d_void* data_dev_src, size_t bytes, int async_arg);}
+@end multitable
+
+@item @emph{Fortran}:
+@multitable @columnfractions .20 .80
+@item @emph{Interface}: @tab @code{subroutine acc_memcpy_device(data_dev_dest, &}
+@item                   @tab @code{data_dev_src, bytes)}
+@item @emph{Interface}: @tab @code{subroutine acc_memcpy_device_async(data_dev_dest, &}
+@item                   @tab @code{data_dev_src, bytes, async_arg)}
+@item                   @tab @code{type(c_ptr), value :: data_dev_dest}
+@item                   @tab @code{type(c_ptr), value :: data_dev_src}
+@item                   @tab @code{integer(c_size_t), value :: bytes}
+@item                   @tab @code{integer(acc_handle_kind), value :: async_arg}
+@end multitable
+
+@item @emph{Reference}:
+@uref{https://www.openacc.org, OpenACC specification v2.6}, section
+3.2.33.  @uref{https://www.openacc.org, OpenACC specification v3.3}, section
+3.2.28.
+@end table
+
+
+
 @node acc_attach
 @section @code{acc_attach} -- Let device pointer point to device-pointer target.
 @table @asis
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index 0482ed37d95..5b8ba7e1072 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -171,21 +171,22 @@ acc_free (void *d)
 }
 
 static void
-memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
-		      const char *libfnname)
+memcpy_tofrom_device (bool dev_to, bool dev_from, void *dst, void *src,
+		      size_t s, int async, const char *libfnname)
 {
   /* No need to call lazy open here, as the device pointer must have
      been obtained from a routine that did that.  */
   struct goacc_thread *thr = goacc_thread ();
 
   assert (thr && thr->dev);
+  if (s == 0)
+    return;
 
   if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     {
-      if (from)
-	memmove (h, d, s);
-      else
-	memmove (d, h, s);
+      if (src == dst)
+	return;
+      memcpy (dst, src, s);
       return;
     }
 
@@ -199,10 +200,15 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
     }
 
   goacc_aq aq = get_goacc_asyncqueue (async);
-  if (from)
-    gomp_copy_dev2host (thr->dev, aq, h, d, s);
+  if (dev_to && dev_from)
+    {
+      if (dst != src)
+	gomp_copy_dev2dev (thr->dev, aq, dst, src, s);
+    }
+  else if (dev_from)
+    gomp_copy_dev2host (thr->dev, aq, dst, src, s);
   else
-    gomp_copy_host2dev (thr->dev, aq, d, h, s, false, /* TODO: cbuf? */ NULL);
+    gomp_copy_host2dev (thr->dev, aq, dst, src, s, false, /* TODO: cbuf? */ NULL);
 
   if (profiling_p)
     {
@@ -214,25 +220,37 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
 void
 acc_memcpy_to_device (void *d, void *h, size_t s)
 {
-  memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__);
+  memcpy_tofrom_device (true, false, d, h, s, acc_async_sync, __FUNCTION__);
 }
 
 void
 acc_memcpy_to_device_async (void *d, void *h, size_t s, int async)
 {
-  memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__);
+  memcpy_tofrom_device (true, false, d, h, s, async, __FUNCTION__);
 }
 
 void
 acc_memcpy_from_device (void *h, void *d, size_t s)
 {
-  memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__);
+  memcpy_tofrom_device (false, true, h, d, s, acc_async_sync, __FUNCTION__);
 }
 
 void
 acc_memcpy_from_device_async (void *h, void *d, size_t s, int async)
 {
-  memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__);
+  memcpy_tofrom_device (false, true, h, d, s, async, __FUNCTION__);
+}
+
+void
+acc_memcpy_device (void *dst, void *src, size_t s)
+{
+  memcpy_tofrom_device (true, true, dst, src, s, acc_async_sync, __FUNCTION__);
+}
+
+void
+acc_memcpy_device_async (void *dst, void *src, size_t s, int async)
+{
+  memcpy_tofrom_device (true, true, dst, src, s, async, __FUNCTION__);
 }
 
 /* Return the device pointer that corresponds to host data H.  Or NULL
diff --git a/libgomp/openacc.f90 b/libgomp/openacc.f90
index 8ef107e959d..9d51f017985 100644
--- a/libgomp/openacc.f90
+++ b/libgomp/openacc.f90
@@ -797,6 +797,7 @@ module openacc
   public :: acc_copyout_finalize, acc_delete_finalize
   public :: acc_memcpy_to_device, acc_memcpy_to_device_async
   public :: acc_memcpy_from_device, acc_memcpy_from_device_async
+  public :: acc_memcpy_device, acc_memcpy_device_async
 
   integer, parameter :: openacc_version = 201711
 
@@ -1046,6 +1047,27 @@ module openacc
     end subroutine
   end interface
 
+  interface
+    subroutine acc_memcpy_device (data_dev_dest, data_dev_src, bytes) bind(C)
+      use iso_c_binding, only: c_ptr, c_size_t
+      type(c_ptr), value :: data_dev_dest
+      type(c_ptr), value :: data_dev_src
+      integer(c_size_t), value :: bytes
+    end subroutine
+  end interface
+
+  interface
+    subroutine acc_memcpy_device_async (data_dev_dest, data_dev_src,  &
+                                        bytes, async_arg) bind(C)
+      use iso_c_binding, only: c_ptr, c_size_t
+      import :: acc_handle_kind
+      type(c_ptr), value :: data_dev_dest
+      type(c_ptr), value :: data_dev_src
+      integer(c_size_t), value :: bytes
+      integer(acc_handle_kind), value :: async_arg
+    end subroutine
+  end interface
+
   interface acc_copyin_async
     procedure :: acc_copyin_async_32_h
     procedure :: acc_copyin_async_64_h
diff --git a/libgomp/openacc.h b/libgomp/openacc.h
index a520bbe00db..3085b007efa 100644
--- a/libgomp/openacc.h
+++ b/libgomp/openacc.h
@@ -123,6 +123,7 @@ void *acc_hostptr (void *) __GOACC_NOTHROW;
 int acc_is_present (void *, size_t) __GOACC_NOTHROW;
 void acc_memcpy_to_device (void *, void *, size_t) __GOACC_NOTHROW;
 void acc_memcpy_from_device (void *, void *, size_t) __GOACC_NOTHROW;
+void acc_memcpy_device (void *, void *, size_t) __GOACC_NOTHROW;
 void acc_attach (void **) __GOACC_NOTHROW;
 void acc_attach_async (void **, int) __GOACC_NOTHROW;
 void acc_detach (void **) __GOACC_NOTHROW;
@@ -136,7 +137,7 @@ void acc_delete_finalize_async (void *, size_t, int) __GOACC_NOTHROW;
 void acc_detach_finalize (void **) __GOACC_NOTHROW;
 void acc_detach_finalize_async (void **, int) __GOACC_NOTHROW;
 
-/* Async functions, specified in OpenACC 2.5.  */
+/* Async functions, specified in OpenACC 2.5, acc_memcpy_device in 2.6.  */
 void acc_copyin_async (void *, size_t, int) __GOACC_NOTHROW;
 void acc_create_async (void *, size_t, int) __GOACC_NOTHROW;
 void acc_copyout_async (void *, size_t, int) __GOACC_NOTHROW;
@@ -145,6 +146,7 @@ void acc_update_device_async (void *, size_t, int) __GOACC_NOTHROW;
 void acc_update_self_async (void *, size_t, int) __GOACC_NOTHROW;
 void acc_memcpy_to_device_async (void *, void *, size_t, int) __GOACC_NOTHROW;
 void acc_memcpy_from_device_async (void *, void *, size_t, int) __GOACC_NOTHROW;
+void acc_memcpy_device_async (void *, void *, size_t, int) __GOACC_NOTHROW;
 
 /* CUDA-specific routines.  */
 void *acc_get_current_cuda_device (void) __GOACC_NOTHROW;
diff --git a/libgomp/openacc_lib.h b/libgomp/openacc_lib.h
index b0d287ed5f8..192ff7d7771 100644
--- a/libgomp/openacc_lib.h
+++ b/libgomp/openacc_lib.h
@@ -528,6 +528,30 @@
         end subroutine
       end interface
 
+      interface
+        subroutine acc_memcpy_device(data_dev_dest, data_dev_src,       &
+     &                                  bytes) bind(C)
+          use iso_c_binding, only: c_ptr, c_size_t
+          type(c_ptr), value :: data_dev_dest
+          type(c_ptr), value :: data_dev_src
+          integer(c_size_t), value :: bytes
+        end subroutine
+      end interface
+
+      interface
+        subroutine acc_memcpy_device_async(data_dev_dest,               &
+     &                                          data_dev_src, bytes,    &
+     &                                          async_arg) bind(C)
+          use iso_c_binding, only: c_ptr, c_size_t
+          import :: acc_handle_kind
+          type(c_ptr), value :: data_dev_dest
+          type(c_ptr), value :: data_dev_src
+          integer(c_size_t), value :: bytes
+          integer(acc_handle_kind), value :: async_arg
+        end subroutine
+      end interface
+
+
       interface acc_copyin_async
         subroutine acc_copyin_async_32_h (a, len, async)
           use iso_c_binding, only: c_int32_t
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 4b42a597cbd..46203838e7c 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -5079,7 +5079,8 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
   queue_push_callback (aq, fn, data);
 }
 
-/* Queue up an asynchronous data copy from host to DEVICE.  */
+/* Queue up an asynchronous data copy from host to DEVICE.
+   (Also handles dev2host and dev2dev.)  */
 
 bool
 GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
@@ -5097,10 +5098,16 @@ bool
 GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
 				     size_t n, struct goacc_asyncqueue *aq)
 {
-  struct agent_info *agent = get_agent_info (device);
-  assert (agent == aq->agent);
-  queue_push_copy (aq, dst, src, n);
-  return true;
+  return GOMP_OFFLOAD_openacc_async_host2dev (device, dst, src, n, aq);
+}
+
+/* Queue up an asynchronous data copy from DEVICE to DEVICE.  */
+
+bool
+GOMP_OFFLOAD_openacc_async_dev2dev (int device, void *dst, const void *src,
+				    size_t n, struct goacc_asyncqueue *aq)
+{
+  return GOMP_OFFLOAD_openacc_async_host2dev (device, dst, src, n, aq);
 }
 
 union goacc_property_value
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index a5cf859db19..2f03c441bf3 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -2018,6 +2018,34 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
 		    cuda_callback_wrapper, (void *) b, 0);
 }
 
+static bool
+cuda_memcpy_dev_sanity_check (const void *d1, const void *d2, size_t s)
+{
+  CUdeviceptr pb1, pb2;
+  size_t ps1, ps2;
+  if (!s)
+    return true;
+  if (!d1 || !d2)
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
+  CUDA_CALL (cuMemGetAddressRange, &pb1, &ps1, (CUdeviceptr) d1);
+  CUDA_CALL (cuMemGetAddressRange, &pb2, &ps2, (CUdeviceptr) d2);
+  if (!pb1 || !pb2)
+    {
+      GOMP_PLUGIN_error ("invalid device address");
+      return false;
+    }
+  if ((void *)(d1 + s) > (void *)(pb1 + ps1)
+      || (void *)(d2 + s) > (void *)(pb2 + ps2))
+    {
+      GOMP_PLUGIN_error ("invalid size");
+      return false;
+    }
+  return true;
+}
+
 static bool
 cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
 {
@@ -2077,6 +2105,9 @@ GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
 bool
 GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
 {
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_dev_sanity_check (dst, src, n))
+    return false;
   CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
   return true;
 }
@@ -2288,6 +2319,18 @@ GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
   return true;
 }
 
+bool
+GOMP_OFFLOAD_openacc_async_dev2dev (int ord, void *dst, const void *src,
+				    size_t n, struct goacc_asyncqueue *aq)
+{
+  if (!nvptx_attach_host_thread_to_device (ord)
+      || !cuda_memcpy_sanity_check (dst, src, n))
+    return false;
+  CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
+	     aq->cuda_stream);
+  return true;
+}
+
 union goacc_property_value
 GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
 {
diff --git a/libgomp/target.c b/libgomp/target.c
index 9674ff4c9c0..fe94978309d 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -461,6 +461,19 @@ gomp_copy_dev2host (struct gomp_device_descr *devicep,
     gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz);
 }
 
+attribute_hidden void
+gomp_copy_dev2dev (struct gomp_device_descr *devicep,
+		   struct goacc_asyncqueue *aq,
+		   void *dst, const void *src, size_t sz)
+{
+  if (__builtin_expect (aq != NULL, 0))
+    goacc_device_copy_async (devicep, devicep->openacc.async.dev2dev_func,
+			     "dev", dst, "dev", src, NULL, sz, aq);
+  else
+    gomp_device_copy (devicep, devicep->dev2dev_func, "dev", dst,
+		      "dev", src, sz);
+}
+
 static void
 gomp_free_device_memory (struct gomp_device_descr *devicep, void *devptr)
 {
@@ -5573,6 +5586,7 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
 	  || !DLSYM_OPT (openacc.async.exec, openacc_async_exec)
 	  || !DLSYM_OPT (openacc.async.dev2host, openacc_async_dev2host)
 	  || !DLSYM_OPT (openacc.async.host2dev, openacc_async_host2dev)
+	  || !DLSYM_OPT (openacc.async.dev2dev, openacc_async_dev2dev)
 	  || !DLSYM_OPT (openacc.get_property, openacc_get_property))
 	{
 	  /* Require all the OpenACC handlers if we have
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_memcpy_device-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_memcpy_device-1.c
new file mode 100644
index 00000000000..e261e0c5fea
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_memcpy_device-1.c
@@ -0,0 +1,96 @@
+/* { dg-prune-output "using .vector_length \\(32\\)" } */
+
+/* PR libgomp/93226  */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <openacc.h>
+
+enum { N = 1024 };
+
+static int D[N];
+#pragma acc declare device_resident(D)
+
+#pragma acc routine
+intptr_t init_d()
+{
+  for (int i = 0; i < N; i++)
+    D[i] = 27*i;
+  return (intptr_t) &D[0];
+}
+
+int
+main ()
+{
+  int *a, *b, *e;
+  void *d_a, *d_b, *d_c, *d_d, *d_e, *d_f;
+  intptr_t intptr;
+  bool fail = false;
+
+  a = (int *) malloc (N*sizeof (int));
+  b = (int *) malloc (N*sizeof (int));
+  e = (int *) malloc (N*sizeof (int));
+  d_c = acc_malloc (N*sizeof (int));
+  d_f = acc_malloc (N*sizeof (int));
+
+  memset (e, 0xff, N*sizeof (int));
+  d_e = acc_copyin (e, N*sizeof (int));
+
+  #pragma acc serial copyout(intptr)
+    intptr = init_d ();
+  d_d = (void*) intptr;
+  acc_memcpy_device (d_c, d_d, N*sizeof (int));
+
+  #pragma acc serial copy(fail) deviceptr(d_c) firstprivate(intptr)
+  {
+    int *cc = (int *) d_c;
+    int *dd = (int *) intptr;
+    for (int i = 0; i < N; i++)
+      if (dd[i] != 27*i || cc[i] != 27*i)
+	{
+	  fail = true;
+	  __builtin_abort ();
+	}
+  }
+  if (fail) __builtin_abort ();
+
+  for (int i = 0; i < N; i++)
+    a[i] = 11*i;
+  for (int i = 0; i < N; i++)
+    b[i] = 31*i;
+
+  d_a = acc_copyin (a, N*sizeof (int));
+  acc_copyin_async (b, N*sizeof (int), acc_async_noval);
+  
+  #pragma acc parallel deviceptr(d_c) async
+  {
+    int *cc = (int *) d_c;
+    #pragma acc loop
+    for (int i = 0; i < N; i++)
+      cc[i] = -17*i;
+  }
+
+  acc_memcpy_device_async (d_d, d_a, N*sizeof (int), acc_async_noval);
+  acc_memcpy_device_async (d_f, d_c, N*sizeof (int), acc_async_noval);
+  acc_wait (acc_async_noval);
+  d_b = acc_deviceptr (b);
+  acc_memcpy_device_async (d_e, d_b, N*sizeof (int), acc_async_noval);
+  acc_wait (acc_async_noval);
+
+  #pragma acc serial deviceptr(d_d, d_e, d_f) copy(fail)
+  {
+    int *dd = (int *) d_d;
+    int *ee = (int *) d_e;
+    int *ff = (int *) d_f;
+    for (int i = 0; i < N; i++)
+      if (dd[i] != 11*i
+	  || ee[i] != 31*i
+	  || ff[i] != -17*i)
+	{
+	  fail = true;
+	  __builtin_abort ();
+	}
+  }
+  if (fail) __builtin_abort ();
+}
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/acc_memcpy_device-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/acc_memcpy_device-1.f90
new file mode 100644
index 00000000000..28199a979fd
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/acc_memcpy_device-1.f90
@@ -0,0 +1,113 @@
+! { dg-prune-output "using .vector_length \\(32\\)" }
+
+! PR libgomp/93226  */
+
+module m
+  use iso_c_binding
+  use openacc
+  implicit none (external, type)
+
+  integer, parameter :: N = 1024
+
+  integer :: D(N)
+  !$acc declare device_resident(D)
+
+contains
+
+  integer(c_intptr_t) function init_d()
+    !$acc routine
+    integer :: i
+    do i = 1, N
+      D(i) = 27*i
+    end do
+    init_d = loc(D)
+  end
+end module
+
+program main
+  use m
+  implicit none (external, type)
+
+  integer, allocatable, target :: a(:), b(:), e(:)
+  type(c_ptr) :: d_a, d_b, d_c, d_d, d_e, d_f
+  integer(c_intptr_t) intptr
+  integer :: i
+  logical fail
+
+  fail = .false.
+
+  allocate(a(N), b(N), e(N))
+  d_c = acc_malloc (N*c_sizeof (i))
+  d_f = acc_malloc (N*c_sizeof (i))
+
+  e = huge(e)
+  call acc_copyin (e, N*c_sizeof (i));
+  d_e = acc_deviceptr (e);
+
+  !$acc serial copyout(intptr)
+    intptr = init_d ()
+  !$acc end serial
+  d_d = transfer(intptr, d_d)
+  call acc_memcpy_device (d_c, d_d, N*c_sizeof (i))
+
+  !$acc serial copy(fail) copy(a) deviceptr(d_c, d_d) firstprivate(intptr)
+    block
+      integer, pointer :: cc(:), dd(:)
+      call c_f_pointer (d_c, cc, [N])
+      call c_f_pointer (d_d, dd, [N])
+      a = cc
+      do i = 1, N
+        if (dd(i) /= 27*i .or. cc(i) /= 27*i) then
+          fail = .true.
+          stop 1
+        end if
+      end do
+    end block
+  !$acc end serial
+  if (fail) error stop 1
+
+  do i = 1, N
+    a(i) = 11*i
+    b(i) = 31*i
+  end do
+
+  call acc_copyin (a, N*c_sizeof (i))
+  d_a = acc_deviceptr (a)
+  call acc_copyin_async (b, N*c_sizeof (i), acc_async_noval)
+  
+  !$acc parallel deviceptr(d_c) private(i) async
+    block
+      integer, pointer :: cc(:)
+      call c_f_pointer (d_c, cc, [N])
+      !$acc loop
+      do i = 1, N 
+        cc(i) = -17*i
+      end do
+    end block
+  !$acc end parallel
+
+  call acc_memcpy_device_async (d_d, d_a, N*c_sizeof (i), acc_async_noval)
+  call acc_memcpy_device_async (d_f, d_c, N*c_sizeof (i), acc_async_noval)
+  call acc_wait (acc_async_noval)
+  d_b = acc_deviceptr (b)
+  call acc_memcpy_device_async (d_e, d_b, N*c_sizeof (i), acc_async_noval)
+  call acc_wait (acc_async_noval)
+
+  !$acc serial deviceptr(d_d, d_e, d_f) private(i) copy(fail)
+    block
+    integer, pointer :: dd(:), ee(:), ff(:)
+    call c_f_pointer (d_d, dd, [N])
+    call c_f_pointer (d_e, ee, [N])
+    call c_f_pointer (d_f, ff, [N])
+    do i = 1, N
+      if (dd(i) /= 11*i        &
+          .or. ee(i) /= 31*i   &
+          .or. ff(i) /= -17*i) then
+        fail = .true.
+        stop 2
+      end if
+    end do
+    end block
+  !$acc end serial
+  if (fail) error stop 2
+end

Reply via email to