Attached patch adds omp_target_memset and omp_target_memset_async
permitting to set (potentially large) data on the device to a
certain value - in particular to '\0'.

It uses 'memset' on the host (and for shared memory, e.g. via
requires unified_shared_memory/self_maps). For nvptx, cuMemsetD8
is used and for AMD GPUs hsa_amd_memory_fill. However, the latter
only supports 4byte aligned data, working in multiples of 4byte.

@Sandra: Any .texi comments? (Or generic comments.)
@Thomas, Jakub, anyone: Any comment?

@Andrew, anyone: Any suggestion regarding the GCN implementation?
At the moment, the code is fine for 4-byte aligned data that has
a size of multiples of 4 bytes, count being large, or count < 4.
Worst case is size 1+4+1 with 1 byte required to get aligned data.
The question is when the turnover from calloc + host2dev + free
to using: misalign dev2host + fill + tailing dev2host.
Thoughts?
Tobias

PS: As some implementation is better than no and as it works,
I intent to commit the patch early next week, but it feels like
something that should eventually be revisited for the AMD issue.
libgomp: Add OpenMP's omp_target_memset/omp_target_memset_async

	PR libgomp/120444

include/ChangeLog:

	* cuda/cuda.h (cuMemsetD8, cuMemsetD8Async): Declare.

libgomp/ChangeLog:

	* libgomp-plugin.h (GOMP_OFFLOAD_memset): Declare.
	* libgomp.h (struct gomp_device_descr): Add memset_func.
	* libgomp.map (GOMP_6.0.1): Add omp_target_memset{,_async}.
	* libgomp.texi (Device Memory Routines): Document them.
	* omp.h.in (omp_target_memset, omp_target_memset_async): Declare.
	* omp_lib.f90.in (omp_target_memset, omp_target_memset_async):
	Add interfaces.
	* omp_lib.h.in (omp_target_memset, omp_target_memset_async): Likewise.
	* plugin/cuda-lib.def: Add cuMemsetD8.
	* plugin/plugin-gcn.c (struct hsa_runtime_fn_info): Add
	hsa_amd_memory_fill_fn.
	(init_hsa_runtime_functions): DLSYM_OPT_FN load it.
	(GOMP_OFFLOAD_memset): New.
	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_memset): New.
	* target.c (omp_target_memset_int, omp_target_memset,
	omp_target_memset_async_helper, omp_target_memset_async): New.
	(gomp_load_plugin_for_device): Add DLSYM (memset).
	* testsuite/libgomp.c-c++-common/omp_target_memset.c: New test.
	* testsuite/libgomp.c-c++-common/omp_target_memset-2.c: New test.
	* testsuite/libgomp.fortran/omp_target_memset.f90: New test.
	* testsuite/libgomp.fortran/omp_target_memset-2.f90: New test.

 include/cuda/cuda.h                                |  3 +
 libgomp/libgomp-plugin.h                           |  1 +
 libgomp/libgomp.h                                  |  3 +-
 libgomp/libgomp.map                                |  6 ++
 libgomp/libgomp.texi                               | 98 +++++++++++++++++++++-
 libgomp/omp.h.in                                   |  4 +
 libgomp/omp_lib.f90.in                             | 23 +++++
 libgomp/omp_lib.h.in                               | 24 ++++++
 libgomp/plugin/cuda-lib.def                        |  1 +
 libgomp/plugin/plugin-gcn.c                        | 50 +++++++++++
 libgomp/plugin/plugin-nvptx.c                      |  9 ++
 libgomp/target.c                                   | 83 ++++++++++++++++++
 .../libgomp.c-c++-common/omp_target_memset-2.c     | 62 ++++++++++++++
 .../libgomp.c-c++-common/omp_target_memset.c       | 62 ++++++++++++++
 .../libgomp.fortran/omp_target_memset-2.f90        | 67 +++++++++++++++
 .../libgomp.fortran/omp_target_memset.f90          | 39 +++++++++
 16 files changed, 531 insertions(+), 4 deletions(-)

diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h
index 5e4b7f190eb..6be1ac0ab43 100644
--- a/include/cuda/cuda.h
+++ b/include/cuda/cuda.h
@@ -279,6 +279,9 @@ CUresult cuMemcpy3D (const CUDA_MEMCPY3D *);
 CUresult cuMemcpy3DAsync (const CUDA_MEMCPY3D *, CUstream);
 CUresult cuMemcpy3DPeer (const CUDA_MEMCPY3D_PEER *);
 CUresult cuMemcpy3DPeerAsync (const CUDA_MEMCPY3D_PEER *, CUstream);
+#define cuMemsetD8 cuMemsetD8_v2
+CUresult cuMemsetD8 (CUdeviceptr, unsigned char, size_t);
+CUresult cuMemsetD8Async (CUdeviceptr, unsigned char, size_t, CUstream);
 #define cuMemFree cuMemFree_v2
 CUresult cuMemFree (CUdeviceptr);
 CUresult cuMemFreeHost (void *);
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index 50c89feaf73..191106b5275 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -177,6 +177,7 @@ extern int GOMP_OFFLOAD_memcpy3d (int, int, size_t, size_t, size_t, void *,
 				  size_t, size_t, size_t, size_t, size_t,
 				  const void *, size_t, size_t, size_t, size_t,
 				  size_t);
+extern bool GOMP_OFFLOAD_memset (int, void *, int, size_t);
 extern bool GOMP_OFFLOAD_can_run (void *);
 extern void GOMP_OFFLOAD_run (int, void *, void *, void **);
 extern void GOMP_OFFLOAD_async_run (int, void *, void *, void **, void *);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index ed4e23ae3e1..a43398300a5 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1421,9 +1421,10 @@ struct gomp_device_descr
   __typeof (GOMP_OFFLOAD_free) *free_func;
   __typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
   __typeof (GOMP_OFFLOAD_host2dev) *host2dev_func;
+  __typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func;
   __typeof (GOMP_OFFLOAD_memcpy2d) *memcpy2d_func;
   __typeof (GOMP_OFFLOAD_memcpy3d) *memcpy3d_func;
-  __typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func;
+  __typeof (GOMP_OFFLOAD_memset) *memset_func;
   __typeof (GOMP_OFFLOAD_can_run) *can_run_func;
   __typeof (GOMP_OFFLOAD_run) *run_func;
   __typeof (GOMP_OFFLOAD_async_run) *async_run_func;
diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map
index ad9787ca4c0..f6aee7c0394 100644
--- a/libgomp/libgomp.map
+++ b/libgomp/libgomp.map
@@ -453,6 +453,12 @@ GOMP_6.0 {
 	omp_get_uid_from_device_8_;
 } GOMP_5.1.3;
 
+GOMP_6.0.1 {
+  global:
+	omp_target_memset;
+	omp_target_memset_async;
+} GOMP_6.0;
+
 OACC_2.0 {
   global:
 	acc_get_num_devices;
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index e6ebe226975..72cf45a57a2 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -603,7 +603,7 @@ to address of matching mapped list item per 5.1, Sect. 2.21.7.2 @tab N @tab
       @code{omp_get_device_teams_thread_limit}, and
       @code{omp_set_device_teams_thread_limit} routines @tab N @tab
 @item @code{omp_target_memset} and @code{omp_target_memset_async} routines
-      @tab N @tab
+      @tab Y @tab
 @item Fortran version of the interop runtime routines @tab Y @tab
 @item Routines for obtaining memory spaces/allocators for shared/device memory
       @tab N @tab
@@ -1984,8 +1984,8 @@ pointers on devices. They have C linkage and do not throw exceptions.
 * omp_target_memcpy_async:: Copy data between devices asynchronously
 * omp_target_memcpy_rect:: Copy a subvolume of data between devices
 * omp_target_memcpy_rect_async:: Copy a subvolume of data between devices asynchronously
-@c * omp_target_memset:: <fixme>/TR12
-@c * omp_target_memset_async:: <fixme>/TR12
+* omp_target_memset:: Set bytes in device memory
+* omp_target_memset_async:: Set bytes in device memory asynchronously
 * omp_target_associate_ptr:: Associate a device pointer with a host pointer
 * omp_target_disassociate_ptr:: Remove device--host pointer association
 * omp_get_mapped_ptr:: Return device pointer to a host pointer
@@ -2398,6 +2398,98 @@ the initial device.
 @end table
 
 
+@node omp_target_memset
+@subsection @code{omp_target_memset} -- Set bytes in device memory
+@table @asis
+@item @emph{Description}:
+This routine fills memory on the device identified by device number
+@var{device_num}.  Starting from the device address @var{ptr}, the first
+@var{count} bytes are set to the value @var{val}, converted to
+@code{unsigned char}. If @var{count} is zero, the routine has no effect;
+if @var{ptr} is @code{NULL}, the behavior is unspecified.  The function
+returns @var{ptr}.
+
+The @var{device_num} must be conforming device number and @var{ptr} must be
+a valid device pointer for that device.  Running this routine in a
+@code{target} region except on the initial device is not supported.
+
+@item @emph{C/C++}
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{void *omp_target_memcpy(void *ptr,}
+@item                   @tab @code{                        int val,}
+@item                   @tab @code{                        size_t count,}
+@item                   @tab @code{                        int device_num)}
+@end multitable
+
+@item @emph{Fortran}:
+@multitable @columnfractions .20 .80
+@item @emph{Interface}: @tab @code{type(c_ptr) function omp_target_memset( &}
+@item                   @tab @code{    ptr, val, count, device_num) bind(C)}
+@item                   @tab @code{use, intrinsic :: iso_c_binding, only: c_ptr, c_size_t, c_int}
+@item                   @tab @code{type(c_ptr), value :: ptr}
+@item                   @tab @code{integer(c_size_t), value :: count}
+@item                   @tab @code{integer(c_int), value :: val, device_num}
+@end multitable
+
+@item @emph{See also}:
+@ref{omp_target_memset_async}
+
+@item @emph{Reference}:
+@uref{https://www.openmp.org, OpenMP specification v6.0}, Section 25.8.1
+@end table
+
+
+
+@node omp_target_memset_async
+@subsection @code{omp_target_memset} -- Set bytes in device memory asynchronously
+@table @asis
+@item @emph{Description}:
+This routine fills memory on the device identified by device number
+@var{device_num}.  Starting from the device address @var{ptr}, the first
+@var{count} bytes are set to the value @var{val}, converted to
+@code{unsigned char}. If @var{count} is zero, the routine has no effect;
+if @var{ptr} is @code{NULL}, the behavior is unspecified.  Task dependence
+is expressed by passing an array of depend objects to @var{depobj_list}, where
+the number of array elements is passed as @var{depobj_count}; if the count is
+zero, the @var{depobj_list} argument is ignored.  In C++ and Fortran, the
+@var{depobj_list} argument can also be omitted in that case.  The function
+returns @var{ptr}.
+
+The @var{device_num} must be conforming device number and @var{ptr} must be
+a valid device pointer for that device.  Running this routine in a
+@code{target} region except on the initial device is not supported.
+
+@item @emph{C/C++}
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{void *omp_target_memcpy_async(void *ptr,}
+@item                   @tab @code{                        int val,}
+@item                   @tab @code{                        size_t count,}
+@item                   @tab @code{                        int device_num,}
+@item                   @tab @code{                        int depobj_count,}
+@item                   @tab @code{                        omp_depend_t *depobj_list)}
+@end multitable
+
+@item @emph{Fortran}:
+@multitable @columnfractions .20 .80
+@item @emph{Interface}: @tab @code{type(c_ptr) function omp_target_memset_async( &}
+@item                   @tab @code{    ptr, val, count, device_num, &}
+@item                   @tab @code{    depobj_count, depobj_list) bind(C)}
+@item                   @tab @code{use, intrinsic :: iso_c_binding, only: c_ptr, c_size_t, c_int}
+@item                   @tab @code{type(c_ptr), value :: ptr}
+@item                   @tab @code{integer(c_size_t), value :: count}
+@item                   @tab @code{integer(c_int), value :: val, device_num, depobj_count}
+@item                   @tab @code{integer(omp_depend_kind), optional :: depobj_list(*)}
+@end multitable
+
+
+@item @emph{See also}:
+@ref{omp_target_memset}
+
+@item @emph{Reference}:
+@uref{https://www.openmp.org, OpenMP specification v6.0}, Section 25.8.2
+@end table
+
+
 
 @node omp_target_associate_ptr
 @subsection @code{omp_target_associate_ptr} -- Associate a device pointer with a host pointer
diff --git a/libgomp/omp.h.in b/libgomp/omp.h.in
index 8d17db1da9a..4f2bc46a76a 100644
--- a/libgomp/omp.h.in
+++ b/libgomp/omp.h.in
@@ -347,6 +347,10 @@ extern int omp_target_memcpy_rect_async (void *, const void *, __SIZE_TYPE__,
 					 const __SIZE_TYPE__ *, int, int, int,
 					 omp_depend_t * __GOMP_DEFAULT_NULL)
   __GOMP_NOTHROW;
+extern void *omp_target_memset (void *, int, __SIZE_TYPE__, int) __GOMP_NOTHROW;
+extern void *omp_target_memset_async (void *, int, __SIZE_TYPE__, int,
+				      int, omp_depend_t * __GOMP_DEFAULT_NULL)
+  __GOMP_NOTHROW;
 extern int omp_target_associate_ptr (const void *, const void *, __SIZE_TYPE__,
 				     __SIZE_TYPE__, int) __GOMP_NOTHROW;
 extern int omp_target_disassociate_ptr (const void *, int) __GOMP_NOTHROW;
diff --git a/libgomp/omp_lib.f90.in b/libgomp/omp_lib.f90.in
index cb6b95f5af6..ce866c00121 100644
--- a/libgomp/omp_lib.f90.in
+++ b/libgomp/omp_lib.f90.in
@@ -903,6 +903,29 @@
           end function omp_target_memcpy_rect_async
         end interface
 
+        interface
+          function omp_target_memset (ptr, val, count, device_num) bind(c)
+            use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+            type(c_ptr) :: omp_target_memset
+            type(c_ptr), value :: ptr
+            integer(c_size_t), value :: count
+            integer(c_int), value :: val, device_num
+          end function omp_target_memset
+        end interface
+
+        interface
+          function omp_target_memset_async (ptr, val, count, device_num, &
+                                            depobj_count, depobj_list) bind(c)
+            use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+            import :: omp_depend_kind
+            type(c_ptr) :: omp_target_memset_async
+            type(c_ptr), value :: ptr
+            integer(c_size_t), value :: count
+            integer(c_int), value :: val, device_num, depobj_count
+            integer(omp_depend_kind), optional :: depobj_list(*)
+          end function omp_target_memset_async
+        end interface
+
         interface
           function omp_target_associate_ptr (host_ptr, device_ptr, size, &
                                              device_offset, device_num) bind(c)
diff --git a/libgomp/omp_lib.h.in b/libgomp/omp_lib.h.in
index f7af5ff4698..b3a8c639a4d 100644
--- a/libgomp/omp_lib.h.in
+++ b/libgomp/omp_lib.h.in
@@ -504,6 +504,30 @@
         end function omp_target_memcpy_rect_async
       end interface
 
+      interface
+        function omp_target_memset (ptr, val, count, device_num) bind(c)
+          use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+          type(c_ptr) omp_target_memset
+          type(c_ptr), value :: ptr
+          integer(c_size_t), value :: count
+          integer(c_int), value :: val, device_num
+        end function omp_target_memset
+      end interface
+
+      interface
+        function omp_target_memset_async (ptr, val, count, device_num,          &
+     &                                    depobj_count, depobj_list) bind(c)
+          use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+          import :: omp_depend_kind
+          type(c_ptr) :: omp_target_memset_async
+          type(c_ptr), value :: ptr
+          integer(c_size_t), value :: count
+          integer(c_int), value :: val, device_num, depobj_count
+          integer(omp_depend_kind), optional :: depobj_list(*)
+        end function omp_target_memset_async
+      end interface
+
+
       interface
         function omp_target_associate_ptr (host_ptr, device_ptr, size,          &
      &                                     device_offset, device_num)           &
diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index eb562ace95e..7f4ddcc6bd1 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -42,6 +42,7 @@ CUDA_ONE_CALL (cuMemcpyHtoDAsync)
 CUDA_ONE_CALL (cuMemcpy2D)
 CUDA_ONE_CALL (cuMemcpy2DUnaligned)
 CUDA_ONE_CALL (cuMemcpy3D)
+CUDA_ONE_CALL (cuMemsetD8)
 CUDA_ONE_CALL (cuMemFree)
 CUDA_ONE_CALL (cuMemFreeHost)
 CUDA_ONE_CALL (cuMemGetAddressRange)
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 46203838e7c..16519cb12f4 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -208,6 +208,8 @@ struct hsa_runtime_fn_info
   hsa_status_t (*hsa_code_object_deserialize_fn)
     (void *serialized_code_object, size_t serialized_code_object_size,
      const char *options, hsa_code_object_t *code_object);
+  hsa_status_t (*hsa_amd_memory_fill_fn)(void *ptr, uint32_t value,
+					 size_t count);
   hsa_status_t (*hsa_amd_memory_lock_fn)
     (void *host_ptr, size_t size, hsa_agent_t *agents, int num_agent,
      void **agent_ptr);
@@ -1456,6 +1458,7 @@ init_hsa_runtime_functions (void)
   DLSYM_FN (hsa_signal_load_acquire)
   DLSYM_FN (hsa_queue_destroy)
   DLSYM_FN (hsa_code_object_deserialize)
+  DLSYM_OPT_FN (hsa_amd_memory_fill)
   DLSYM_OPT_FN (hsa_amd_memory_lock)
   DLSYM_OPT_FN (hsa_amd_memory_unlock)
   DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect)
@@ -4435,6 +4438,53 @@ init_hip_runtime_functions (void)
   return true;
 }
 
+bool
+GOMP_OFFLOAD_memset (int ord, void *ptr, int val, size_t count)
+{
+  hsa_status_t status = HSA_STATUS_SUCCESS;
+
+  if (__builtin_expect (!hsa_fns.hsa_amd_memory_fill_fn, 0))
+    {
+      void *data = calloc (count, sizeof (uint8_t));
+      if (data && count)
+	status = hsa_fns.hsa_memory_copy_fn (ptr, data, count);
+      free (data);
+      if (status == HSA_STATUS_SUCCESS && (!count || data))
+	return true;
+      GOMP_PLUGIN_error ("memory set failed");
+      return false;
+    }
+
+  uint8_t v8 = (uint8_t) val;
+  uint32_t values = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24);
+
+  size_t before = (4 - (uintptr_t) ptr % 4) % 4;
+  if (count < 4 + before)
+    before = count;
+  if (before)
+    {
+      uint64_t values2 = values | ((uint64_t) values << 32);
+      status = hsa_fns.hsa_memory_copy_fn (ptr, &values2, before);
+      ptr += before;
+      count -= before;
+    }
+
+  if (count >= 4 && status == HSA_STATUS_SUCCESS)
+    {
+      size_t after = count % 4;
+      status = hsa_fns.hsa_amd_memory_fill_fn (ptr, values, count / 4);
+      if (after > 0 && status == HSA_STATUS_SUCCESS)
+	status = hsa_fns.hsa_memory_copy_fn (ptr + count - after, &values,
+					     after);
+    }
+
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("memory set failed");
+      return false;
+    }
+  return true;
+}
 
 void
 GOMP_OFFLOAD_interop (struct interop_obj_t *obj, int ord,
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index a52d1ad84c8..0ba445eab9b 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -2297,6 +2297,15 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size,
   return true;
 }
 
+bool
+GOMP_OFFLOAD_memset (int ord, void *ptr, int val, size_t count)
+{
+  if (!nvptx_attach_host_thread_to_device (ord))
+    return false;
+  CUDA_CALL (cuMemsetD8, (CUdeviceptr) ptr, (unsigned char) val, count);
+  return true;
+}
+
 bool
 GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
 				     size_t n, struct goacc_asyncqueue *aq)
diff --git a/libgomp/target.c b/libgomp/target.c
index fe94978309d..a2a4a7299e5 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -5003,6 +5003,88 @@ omp_target_memcpy_rect_async (void *dst, const void *src, size_t element_size,
   return 0;
 }
 
+static void
+omp_target_memset_int (void *ptr, int val, size_t count,
+		       struct gomp_device_descr *devicep)
+{
+  if (__builtin_expect (count == 0, 0))
+    return;
+  if (devicep == NULL)
+    {
+      memset (ptr, val, count);
+      return;
+    }
+
+  gomp_mutex_lock (&devicep->lock);
+  int ret = devicep->memset_func (devicep->target_id, ptr, val, count);
+  gomp_mutex_unlock (&devicep->lock);
+  if (!ret)
+    gomp_fatal ("omp_target_memset failed");
+}
+
+void*
+omp_target_memset (void *ptr, int val, size_t count, int device_num)
+{
+  struct gomp_device_descr *devicep;
+  if (device_num == omp_initial_device
+      || device_num == gomp_get_num_devices ()
+      || (devicep = resolve_device (device_num, false)) == NULL
+      || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+      || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+    devicep = NULL;
+
+  omp_target_memset_int (ptr, val, count, devicep);
+  return ptr;
+}
+
+typedef struct
+{
+  void *ptr;
+  size_t count;
+  struct gomp_device_descr *devicep;
+  int val;
+} omp_target_memset_data;
+
+static void
+omp_target_memset_async_helper (void *args)
+{
+  omp_target_memset_data *a = args;
+  omp_target_memset_int (a->ptr, a->val, a->count, a->devicep);
+}
+
+void*
+omp_target_memset_async (void *ptr, int val, size_t count, int device_num,
+			 int depobj_count, omp_depend_t *depobj_list)
+{
+  void *depend[depobj_count + 5];
+  struct gomp_device_descr *devicep;
+  unsigned flags = 0;
+  int i;
+
+  if (device_num == omp_initial_device
+      || device_num == gomp_get_num_devices ()
+      || (devicep = resolve_device (device_num, false)) == NULL
+      || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+      || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+    devicep = NULL;
+
+  omp_target_memset_data s = {.ptr = ptr, .val = val, .count = count,
+			      .devicep = devicep};
+  if (depobj_count > 0 && depobj_list != NULL)
+    {
+      flags |= GOMP_TASK_FLAG_DEPEND;
+      depend[0] = 0;
+      depend[1] = (void *) (uintptr_t) depobj_count;
+      depend[2] = depend[3] = depend[4] = 0;
+      for (i = 0; i < depobj_count; ++i)
+	depend[i + 5] = &depobj_list[i];
+    }
+
+  GOMP_task (omp_target_memset_async_helper, &s, NULL, sizeof (s),
+	     __alignof__ (s), true, flags, depend, 0, NULL);
+  return ptr;
+}
+
 int
 omp_target_associate_ptr (const void *host_ptr, const void *device_ptr,
 			  size_t size, size_t device_offset, int device_num)
@@ -5568,6 +5650,7 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
       DLSYM_OPT (async_run, async_run);
       DLSYM_OPT (can_run, can_run);
       DLSYM (dev2dev);
+      DLSYM (memset);
     }
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
     {
diff --git a/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-2.c b/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-2.c
new file mode 100644
index 00000000000..cd6d07ceeb4
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset-2.c
@@ -0,0 +1,62 @@
+// PR libgomp/120444
+// Async version
+
+#include <omp.h>
+
+int main()
+{
+  #pragma omp parallel for
+  for (int dev = omp_initial_device; dev <= omp_get_num_devices (); dev++)
+    {
+      char *ptr = (char *) omp_target_alloc (sizeof(int) * 1024, dev);
+
+      omp_depend_t dep;
+      #pragma omp depobj(dep) depend(inout: ptr)
+
+      /* Play also around with the alignment - as hsa_amd_memory_fill operates
+	 on multiples of 4 bytes (uint32_t).  */
+
+      for (int start = 0; start < 32; start++)
+        for (int tail = 0; tail < 32; tail++)
+	  {
+	    unsigned char val = '0' + start + tail;
+#if __cplusplus
+	    void *ptr2 = omp_target_memset_async (ptr + start, val,
+					    1024 - start - tail, dev, 0);
+#else
+	    void *ptr2 = omp_target_memset_async (ptr + start, val,
+					    1024 - start - tail, dev, 0, nullptr);
+#endif
+	    if (ptr + start != ptr2)
+	      __builtin_abort ();
+
+	    #pragma omp taskwait
+	    
+	    #pragma omp target device(dev) is_device_ptr(ptr) depend(depobj: dep) nowait
+	      for (int i = start; i < 1024 - start - tail; i++)
+		{
+		  if (ptr[i] != val)
+		    __builtin_abort ();
+		  ptr[i] += 2;
+		}
+
+	    omp_target_memset_async (ptr + start, val + 3,
+				     1024 - start - tail, dev, 1, &dep);
+
+	    #pragma omp target device(dev) is_device_ptr(ptr) depend(depobj: dep) nowait
+	      for (int i = start; i < 1024 - start - tail; i++)
+		{
+		  if (ptr[i] != val + 3)
+		    __builtin_abort ();
+		  ptr[i] += 1;
+		}
+
+	    omp_target_memset_async (ptr + start, val - 3,
+				     1024 - start - tail, dev, 1, &dep);
+
+	    #pragma omp taskwait depend (depobj: dep)
+	  }
+      #pragma omp depobj(dep) destroy
+      omp_target_free (ptr, dev);
+    }
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset.c b/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset.c
new file mode 100644
index 00000000000..01909f854b6
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/omp_target_memset.c
@@ -0,0 +1,62 @@
+// PR libgomp/120444
+
+#include <omp.h>
+
+int main()
+{
+  for (int dev = omp_initial_device; dev < omp_get_num_devices (); dev++)
+    {
+      char *ptr = (char *) omp_target_alloc (sizeof(int) * 1024, dev);
+
+      /* Play also around with the alignment - as hsa_amd_memory_fill operates
+	 on multiples of 4 bytes (uint32_t).  */
+
+      for (int start = 0; start < 32; start++)
+	for (int tail = 0; tail < 32; tail++)
+	  {
+	    unsigned char val = '0' + start + tail;
+	    void *ptr2 = omp_target_memset (ptr + start, val,
+					    1024 - start - tail, dev);
+	    if (ptr + start != ptr2)
+	      __builtin_abort ();
+
+	    #pragma omp target device(dev) is_device_ptr(ptr)
+	      for (int i = start; i < 1024 - start - tail; i++)
+		if (ptr[i] != val)
+		  __builtin_abort ();
+
+	  }
+
+      /* Check 'small' values for correctness.  */
+
+      for (int start = 0; start < 32; start++)
+	for (int size = 0; size <= 64 + 32; size++)
+	  {
+	    omp_target_memset (ptr, 'a' - 2, 1024, dev);
+
+	    unsigned char val = '0' + start + size % 32;
+	    void *ptr2 = omp_target_memset (ptr + start, val, size, dev);
+
+	    if (ptr + start != ptr2)
+	      __builtin_abort ();
+
+	    if (size == 0)
+	      continue;
+
+	    #pragma omp target device(dev) is_device_ptr(ptr)
+	    {
+	      for (int i = 0; i < start; i++)
+		if (ptr[i] != 'a' - 2)
+		  __builtin_abort ();
+	      for (int i = start; i < start + size; i++)
+		if (ptr[i] != val)
+		  __builtin_abort ();
+	      for (int i = start + size + 1; i < 1024; i++)
+		if (ptr[i] != 'a' - 2)
+		  __builtin_abort ();
+	    }
+	  }
+
+      omp_target_free (ptr, dev);
+    }
+}
diff --git a/libgomp/testsuite/libgomp.fortran/omp_target_memset-2.f90 b/libgomp/testsuite/libgomp.fortran/omp_target_memset-2.f90
new file mode 100644
index 00000000000..2641086f60d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/omp_target_memset-2.f90
@@ -0,0 +1,67 @@
+! PR libgomp/120444
+! Async version
+
+use omp_lib
+use iso_c_binding
+implicit none (type, external)
+integer(c_int) :: dev
+
+!$omp parallel do
+do dev = omp_initial_device, omp_get_num_devices ()
+block
+  integer(c_int) :: i, val, start, tail
+  type(c_ptr) :: ptr, ptr2, tmpptr
+  integer(c_int8_t), pointer, contiguous :: fptr(:)
+  integer(c_intptr_t) :: intptr
+  integer(c_size_t), parameter :: count = 1024
+  integer(omp_depend_kind) :: dep(1)
+
+  ptr = omp_target_alloc (count, dev)
+
+  !$omp depobj(dep(1)) depend(inout: ptr)
+
+  ! Play also around with the alignment - as hsa_amd_memory_fill operates
+  ! on multiples of 4 bytes (c_int32_t)
+
+  do start = 0, 31
+    do tail = 0, 31
+      val = iachar('0') + start + tail
+
+      tmpptr = transfer (transfer (ptr, intptr) + start, tmpptr)
+      ptr2 = omp_target_memset_async (tmpptr, val, count - start - tail, dev, 0)
+
+      if (.not. c_associated (tmpptr, ptr2)) stop 1
+
+      !$omp taskwait
+
+      !$omp target device(dev) is_device_ptr(ptr) depend(depobj: dep(1)) nowait
+        do i = 1 + start, int(count, c_int) - start - tail
+          call c_f_pointer (ptr, fptr, [count])
+          if (fptr(i) /= int (val, c_int8_t)) stop 2
+          fptr(i) = fptr(i) + 2_c_int8_t
+        end do
+      !$omp end target
+
+      ptr2 = omp_target_memset_async (tmpptr, val + 3, &
+                                      count - start - tail, dev, 1, dep)
+
+      !$omp target device(dev) is_device_ptr(ptr) depend(depobj: dep(1)) nowait
+        do i = 1 + start, int(count, c_int) - start - tail
+          call c_f_pointer (ptr, fptr, [count])
+          if (fptr(i) /= int (val + 3, c_int8_t)) stop 3
+          fptr(i) = fptr(i) - 1_c_int8_t
+        end do
+      !$omp end target
+
+      ptr2 = omp_target_memset_async (tmpptr, val - 3, &
+                                      count - start - tail, dev, 1, dep)
+
+      !$omp taskwait depend (depobj: dep(1))
+    end do
+  end do
+
+  !$omp depobj(dep(1)) destroy
+  call omp_target_free (ptr, dev);
+end block
+end do
+end
diff --git a/libgomp/testsuite/libgomp.fortran/omp_target_memset.f90 b/libgomp/testsuite/libgomp.fortran/omp_target_memset.f90
new file mode 100644
index 00000000000..1ee184ac47c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/omp_target_memset.f90
@@ -0,0 +1,39 @@
+! PR libgomp/120444
+
+use omp_lib
+use iso_c_binding
+implicit none (type, external)
+
+integer(c_int) :: dev, i, val, start, tail
+type(c_ptr) :: ptr, ptr2, tmpptr
+integer(c_int8_t), pointer, contiguous :: fptr(:)
+integer(c_intptr_t) :: intptr
+integer(c_size_t), parameter :: count = 1024
+
+do dev = omp_initial_device, omp_get_num_devices ()
+  ptr = omp_target_alloc (count, dev)
+
+  ! Play also around with the alignment - as hsa_amd_memory_fill operates
+  ! on multiples of 4 bytes (c_int32_t)
+
+  do start = 0, 31
+    do tail = 0, 31
+      val = iachar('0') + start + tail
+
+      tmpptr = transfer (transfer (ptr, intptr) + start, tmpptr)
+      ptr2 = omp_target_memset (tmpptr, val, count - start - tail, dev)
+
+      if (.not. c_associated (tmpptr, ptr2)) stop 1
+
+      !$omp target device(dev) is_device_ptr(ptr)
+        do i = 1 + start, int(count, c_int) - start - tail
+          call c_f_pointer (ptr, fptr, [count])
+          if (fptr(i) /= int (val, c_int8_t)) stop 2
+        end do
+      !$omp end target
+    end do
+  end do
+
+  call omp_target_free (ptr, dev);
+end do
+end

Reply via email to