jz10 updated this revision to Diff 470339.
jz10 added a comment.

Thanks Johannes and Shilei

I added few test cases for asynchronous routine at test/api folder


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

Files:
  openmp/libomptarget/include/interop.h
  openmp/libomptarget/src/api.cpp
  openmp/libomptarget/src/exports
  openmp/libomptarget/src/private.h
  openmp/libomptarget/test/api/omp_target_memcpy_async1.c
  openmp/libomptarget/test/api/omp_target_memcpy_async2.c
  openmp/libomptarget/test/api/omp_target_memcpy_async3.c
  openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c
  openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c
  openmp/libomptarget/test/api/omp_target_memcpy_rect_async3.c

Index: openmp/libomptarget/test/api/omp_target_memcpy_rect_async3.c
===================================================================
--- /dev/null
+++ openmp/libomptarget/test/api/omp_target_memcpy_rect_async3.c
@@ -0,0 +1,250 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+#define NUM_DIMS 3
+#define RECT_WIDTH 4
+
+#define VERBOSE_TIME
+
+int num_timers;
+double *times;
+void timer_set(const char *name, int n) {
+  printf("%s:\n", name);
+  num_timers = n;
+  times = (double *) calloc(n, sizeof(double));
+}
+
+void timer_start(int i) {
+  if (i < 0 || i > num_timers)
+    abort();
+  times[i] = omp_get_wtime();
+}
+
+void timer_end(int i) {
+  if (i < 0 || i > num_timers)
+    abort();
+  double t = omp_get_wtime() - times[i];
+  times[i] = t;
+  printf(" - Round %d: %f[sec]\n", i, t);
+}
+
+void timer_summarize() {
+  double min = -1;
+  double max = 0;
+  double total = 0;
+
+  for (int i = 0; i < num_timers; i++) {
+    double t = times[i];
+    if (min < 0 || t < min)
+      min = t;
+    if (t > max)
+      max = t;
+    total += t;
+  }
+  double avg = total / num_timers;
+  printf(" - Summary: min = %f[sec], max = %f[sec], avg = %f[sec]\n", min, max, avg);
+}
+
+void init(int *p, size_t size) {
+  for (int i = 0; i < size; i++)
+    p[i] = i;
+}
+
+void local(int *q, size_t size) {
+  for (int i = 0; i < size; i++) {
+    int i2 = ((int) (size * 1.5)) % size;
+    int tmp = q[i];
+    q[i] = q[i2];
+    q[i2] = tmp;
+  }
+}
+
+void kernel_sync(int d, int id, size_t size1, size_t size2, int round,
+		 int *p1, int *p2, void *p_dev, int *q,
+		 size_t* volumes, size_t* dst_offsets, size_t* src_offsets,
+		 size_t* dst_dimensions, size_t* src_dimensions) {
+  init(p1, size1 * size1 * size1);
+  init(q, size2);
+  timer_set("kernel_sync", round);
+
+  for (int r = 0; r < round; r++) {
+    timer_start(r);
+#ifdef VERBOSE_TIME
+    double t1, t2, t3;
+    t1 = omp_get_wtime();
+#endif
+
+    // Ping-poing memcpy (assume offloading GPU task and retrieving result)
+    omp_target_memcpy_rect(p_dev, p1, size1 * sizeof(int), NUM_DIMS, volumes, dst_offsets, src_offsets,
+			   dst_dimensions, src_dimensions, d, id);
+    omp_target_memcpy_rect(p2, p_dev, size1 * sizeof(int), NUM_DIMS, volumes, src_offsets, dst_offsets, 
+                           src_dimensions, dst_dimensions, id, d);
+    
+#ifdef VERBOSE_TIME
+    t2 = omp_get_wtime();
+#endif
+
+    // Local task
+    local(q, size2);
+#ifdef VERBOSE_TIME
+    t3 = omp_get_wtime();
+#endif
+
+    timer_end(r);
+#ifdef VERBOSE_TIME
+    printf("  -- Invoking memcpy: %f[sec]\n", t2-t1);
+    printf("  -- Local work: %f[sec]\n", t3-t2);
+#endif
+  }
+  timer_summarize();
+}
+
+void kernel_async(int d, int id, size_t size1, size_t size2, int round,
+		  int *p1, int *p2, void *p_dev, int *q,
+		  size_t* volumes, size_t* dst_offsets, size_t* src_offsets,
+		  size_t* dst_dimensions, size_t* src_dimensions) {
+  init(p1, size1 * size1 * size1);
+  init(q, size2);
+  timer_set("kernel_async", round);
+
+  for (int r = 0; r < round; r++) {
+    timer_start(r);
+#ifdef VERBOSE_TIME
+    double t1, t2, t3, t4, t5;
+    t1 = omp_get_wtime();
+#endif
+
+    {
+#ifdef VERBOSE_TIME
+      t2 = omp_get_wtime();
+#endif
+
+      // Ping-poing memcpy (assume offloading GPU task and retrieving result)
+      omp_depend_t obj1[1], obj2[1];
+#pragma omp depobj(obj1[0]) depend(out: p_dev)
+      omp_target_memcpy_rect_async(p_dev, p1, sizeof(int), NUM_DIMS, volumes,
+				   dst_offsets, src_offsets, dst_dimensions, src_dimensions,
+				   d, id, 1, obj1);
+#pragma omp depobj(obj2[0]) depend(in: p_dev)
+      omp_target_memcpy_rect_async(p2, p_dev, sizeof(int), NUM_DIMS, volumes,
+				   src_offsets, dst_offsets, src_dimensions, dst_dimensions,
+				   id, d, 1, obj2);
+#ifdef VERBOSE_TIME
+      t3 = omp_get_wtime();
+#endif
+
+      // Local task
+      local(q, size2);
+#ifdef VERBOSE_TIME
+      t4 = omp_get_wtime();
+#endif
+
+#pragma omp taskwait
+    }
+#ifdef VERBOSE_TIME
+    t5 = omp_get_wtime();
+#endif
+
+    timer_end(r);
+#ifdef VERBOSE_TIME
+    printf("  -- Starting parallel region: %f[sec]\n", t2-t1);
+    printf("  -- Invoking memcpy async: %f[sec]\n", t3-t2);
+    printf("  -- Local work: %f[sec]\n", t4-t3);
+    printf("  -- Task wait & ending parallel region: %f[sec]\n", t5-t4);
+#endif
+  }
+  timer_summarize();
+}
+
+void kernel_task(int d, int id, size_t size1, size_t size2, int round,
+		 int *p1, int *p2, void *p_dev, int *q,
+                 size_t* volumes, size_t* dst_offsets, size_t* src_offsets,
+                 size_t* dst_dimensions, size_t* src_dimensions) {
+  init(p1, size1 * size1 * size1);
+  init(q, size2);
+  timer_set("kernel_task", round);
+
+  for (int r = 0; r < round; r++) {
+    timer_start(r);
+#ifdef VERBOSE_TIME
+    double t1, t2, t3, t4, t5;
+    t1 = omp_get_wtime();
+#endif
+
+#pragma omp parallel
+#pragma omp single
+    {
+#ifdef VERBOSE_TIME
+      t2 = omp_get_wtime();
+#endif
+
+      // Ping-poing memcpy (assume offloading GPU task and retrieving result)
+#pragma omp task depend(out: p_dev)
+      omp_target_memcpy_rect(p_dev, p1, size1 * sizeof(int), NUM_DIMS, volumes,
+			     dst_offsets, src_offsets, dst_dimensions, src_dimensions, d, id);
+#pragma omp task depend(in: p_dev)
+      omp_target_memcpy_rect(p2, p_dev, size1 * sizeof(int), NUM_DIMS, volumes,
+			     src_offsets, dst_offsets, src_dimensions, dst_dimensions, id, d);
+#ifdef VERBOSE_TIME
+      t3 = omp_get_wtime();
+#endif
+
+      // Local task
+      local(q, size2);
+#ifdef VERBOSE_TIME
+      t4 = omp_get_wtime();
+#endif
+
+#pragma omp taskwait
+    }
+#ifdef VERBOSE_TIME
+    t5 = omp_get_wtime();
+#endif
+
+    timer_end(r);
+#ifdef VERBOSE_TIME
+    printf("  -- Starting parallel region: %f[sec]\n", t2-t1);
+    printf("  -- Invoking memcpy as task: %f[sec]\n", t3-t2);
+    printf("  -- Local work: %f[sec]\n", t4-t3);
+    printf("  -- Task wait & ending parallel region: %f[sec]\n", t5-t4);
+#endif
+  }
+  timer_summarize();
+}
+
+int main(int argc, char* argv[]) {
+  size_t size1 = (argc > 1) ? atoi(argv[1]) * 20 : RECT_WIDTH * 20;
+
+  size_t size2 = size1;
+  int round = (argc > 2) ? atoi(argv[2]) : 5;
+  
+  printf("memory copy size = %lu, local work size = %lu, total rounds = %d\n", size1, size2, round);
+
+  size_t volume[NUM_DIMS] = { size1 * size1, size1, size1 };
+  size_t dst_offsets[NUM_DIMS] = { 0, 0, 0 };
+  size_t src_offsets[NUM_DIMS] = { 0, 0, 0 };
+  size_t dst_dimensions[NUM_DIMS] = { 0, 0, 0 };
+  size_t src_dimensions[NUM_DIMS] = { 0, 0, 0 };
+  
+  int d = omp_get_default_device();
+  int id = omp_get_initial_device();
+  if (d < 0 || d >= omp_get_num_devices())
+    d = id;
+
+  // Arrays for target memcpy
+  int *p1 = (int *) malloc(size1 * size1 * size1 * sizeof(int));
+  int *p2 = (int *) malloc(size1 * size1 * size1 * sizeof(int));
+  void *p_dev = omp_target_alloc(size1 * size1 * size1 * sizeof(int), d);
+
+  // Array for local work
+  int *q = (int *) malloc(size2 *  sizeof(int));
+
+  kernel_sync(d, id, size1, size2, round, p1, p2, p_dev, q,
+	      volume, dst_offsets, src_offsets, dst_dimensions, src_dimensions);
+  kernel_async(d, id, size1, size2, round, p1, p2, p_dev, q,
+              volume, dst_offsets, src_offsets, dst_dimensions, src_dimensions);
+  kernel_task(d, id, size1, size2, round, p1, p2, p_dev, q,
+              volume, dst_offsets, src_offsets, dst_dimensions, src_dimensions);
+  return 0;
+}
Index: openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c
===================================================================
--- /dev/null
+++ openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c
@@ -0,0 +1,87 @@
+#include <omp.h>
+#include <stdlib.h>
+
+#define NUM_DIMS 3
+
+int main () {
+  int d = omp_get_default_device ();
+  int id = omp_get_initial_device ();
+  int a[128], b[64], c[128], e[16], q[128], i;
+  void *p;
+  
+  if (d < 0 || d >= omp_get_num_devices ())
+    d = id;
+
+  p = omp_target_alloc (130 * sizeof (int), d);
+  if (p == NULL)
+    return 0;
+  
+  for (i = 0; i < 128; i++)
+    q[i] = 0;
+  if (omp_target_memcpy (p, q, 128 * sizeof (int), 0, 0, d, id) != 0)
+    abort ();
+  
+  size_t volume[NUM_DIMS] = { 2, 2, 3 };
+  size_t dst_offsets[NUM_DIMS] = { 0, 0, 0 };
+  size_t src_offsets[NUM_DIMS] = { 0, 0, 0 };
+  size_t dst_dimensions[NUM_DIMS] = { 3, 4, 5 };
+  size_t src_dimensions[NUM_DIMS] = { 2, 3, 4 };
+  
+  for (i = 0; i < 128; i++)
+    a[i] = 42;
+  for (i = 0; i < 64; i++)
+    b[i] = 24;
+  for (i = 0; i < 128; i++)
+    c[i] = 0;
+  for (i = 0; i < 16; i++)
+    e[i] = 77;
+
+  omp_depend_t obj[2];
+  
+#pragma omp parallel num_threads(5)
+#pragma omp single
+  {
+#pragma omp task depend (out: p)
+    omp_target_memcpy (p, a, 128 * sizeof (int), 0, 0, d, id);
+    
+#pragma omp task depend(inout: p)
+    omp_target_memcpy (p, b, 64 * sizeof (int), 0, 0, d, id);
+    
+#pragma omp task depend(out: c)
+    for (i = 0; i < 128; i++)
+      c[i] = i + 1;
+    
+#pragma omp depobj(obj[0]) depend(inout: p)
+#pragma omp depobj(obj[1]) depend(in: c)
+    
+    /*  This produces: 1 2 3 - - 5 6 7 - - at positions 0..9 and
+	13 14 15 - - 17 18 19 - - at positions 20..29.  */
+    omp_target_memcpy_rect_async (p, c, sizeof (int), NUM_DIMS, volume,
+				  dst_offsets, src_offsets, dst_dimensions,
+				  src_dimensions, d, id, 2, obj);
+    
+#pragma omp task depend(in: p)
+    omp_target_memcpy (p, e, 16 * sizeof (int), 0, 0, d, id);
+  }
+  
+#pragma omp taskwait
+  
+  if (omp_target_memcpy (q, p, 128 * sizeof(int), 0, 0, id, d) != 0)
+    abort ();
+  
+  for (i = 0; i < 16; ++i)
+    if (q[i] != 77)
+      abort ();
+  if (q[20] != 13 || q[21] != 14 || q[22] != 15 || q[25] != 17 || q[26] != 18
+      || q[27] != 19)
+    abort ();
+  for (i = 28; i < 64; ++i)
+    if (q[i] != 24)
+      abort ();
+  for (i = 64; i < 128; ++i)
+   if (q[i] != 42)
+     abort ();
+  
+  omp_target_free (p, d);
+  return 0;
+}
Index: openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c
===================================================================
--- /dev/null
+++ openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c
@@ -0,0 +1,64 @@
+#include <omp.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define NUM_DIMS 3
+
+int main() {
+  int d = omp_get_default_device ();
+  int id = omp_get_initial_device ();
+  int q[128], q2[128], i;
+  void *p;
+  
+  if (d < 0 || d >= omp_get_num_devices ())
+    d = id;
+  
+  p = omp_target_alloc (130 * sizeof (int), d);
+  if (p == NULL)
+    return 0;
+
+  if (omp_target_memcpy_rect_async (NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
+				    NULL, d, id, 0, NULL) < 3
+      || omp_target_memcpy_rect_async (NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
+				       NULL, id, d, 0, NULL) < 3
+      || omp_target_memcpy_rect_async (NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
+				       NULL, id, id, 0, NULL) < 3)
+    abort ();
+ 
+  for (i = 0; i < 128; i++)
+    q[i] = 0;
+  if (omp_target_memcpy (p, q, 128 * sizeof (int), 0, 0, d, id) != 0)
+    abort ();
+  
+  for (i = 0; i < 128; i++)
+    q[i] = i + 1;
+  
+  size_t volume[NUM_DIMS] = { 1, 2, 3 };
+  size_t dst_offsets[NUM_DIMS] = { 0, 0, 0 };
+  size_t src_offsets[NUM_DIMS] = { 0, 0, 0 };
+  size_t dst_dimensions[NUM_DIMS] = { 3, 4, 5 };
+  size_t src_dimensions[NUM_DIMS] = { 2, 3, 4 };
+  
+  if (omp_target_memcpy_rect_async (p, q, sizeof (int), NUM_DIMS, volume,
+				    dst_offsets, src_offsets, dst_dimensions,
+				    src_dimensions, d, id, 0, NULL) != 0)
+    abort ();
+  
+#pragma omp taskwait
+  
+  for (i = 0; i < 128; i++)
+    q2[i] = 0;
+  if (omp_target_memcpy (q2, p, 128 * sizeof (int), 0, 0, id, d) != 0)
+    abort ();
+  
+  /* q2 is expected to contain: 1 2 3 0 0 5 6 7 0 0 .. 0  */
+  if (q2[0] != 1 || q2[1] != 2 || q2[2] !=3 || q2[3] != 0 || q2[4] != 0
+      || q2[5] != 5 || q2[6] != 6 || q2[7] != 7)
+    abort ();
+  for (i = 8; i < 128; ++i)
+    if (q2[i] != 0)
+      abort ();
+  
+  omp_target_free (p, d);
+  return 0;
+}
Index: openmp/libomptarget/test/api/omp_target_memcpy_async3.c
===================================================================
--- /dev/null
+++ openmp/libomptarget/test/api/omp_target_memcpy_async3.c
@@ -0,0 +1,221 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+#define VERBOSE_TIME
+
+int num_timers;
+double *times;
+void timer_set(const char *name, int n) {
+  printf("%s:\n", name);
+  num_timers = n;
+  times = (double *) calloc(n, sizeof(double));
+}
+
+void timer_start(int i) {
+  if (i < 0 || i > num_timers)
+    abort();
+  times[i] = omp_get_wtime();
+}
+
+void timer_end(int i) {
+  if (i < 0 || i > num_timers)
+    abort();
+  double t = omp_get_wtime() - times[i];
+  times[i] = t;
+  printf(" - Round %d: %f[sec]\n", i, t);
+}
+
+void timer_summarize() {
+  double min = -1;
+  double max = 0;
+  double total = 0;
+
+  for (int i = 0; i < num_timers; i++) {
+    double t = times[i];
+    if (min < 0 || t < min)
+      min = t;
+    if (t > max)
+      max = t;
+    total += t;
+  }
+  double avg = total / num_timers;
+  printf(" - Summary: min = %f[sec], max = %f[sec], avg = %f[sec]\n", min, max, avg);
+}
+
+void init(int *p, size_t size) {
+  for (int i = 0; i < size; i++)
+    p[i] = i;
+}
+
+void local(int *q, size_t size) {
+  for (int i = 0; i < size; i++) {
+    int i2 = ((int) (size * 1.5)) % size;
+    int tmp = q[i];
+    q[i] = q[i2];
+    q[i2] = tmp;
+  }
+}
+
+void kernel_sync(int d, int id, size_t size1, size_t size2, int round,
+		 int *p1, int *p2, void *p_dev, int *q) {
+  init(p1, size1);
+  init(q, size2);
+  timer_set("kernel_sync", round);
+
+  for (int r = 0; r < round; r++) {
+    timer_start(r);
+#ifdef VERBOSE_TIME
+    double t1, t2, t3;
+    t1 = omp_get_wtime();
+#endif
+
+    // Ping-poing memcpy (assume offloading GPU task and retrieving result)
+    omp_target_memcpy(p_dev, p1, size1 * sizeof(int), 0, 0, d, id);
+    omp_target_memcpy(p2, p_dev, size1 * sizeof(int), 0, 0, id, d);
+#ifdef VERBOSE_TIME
+    t2 = omp_get_wtime();
+#endif
+
+    // Local task
+    local(q, size2);
+#ifdef VERBOSE_TIME
+    t3 = omp_get_wtime();
+#endif
+
+    timer_end(r);
+#ifdef VERBOSE_TIME
+    printf("  -- Invoking memcpy: %f[sec]\n", t2-t1);
+    printf("  -- Local work: %f[sec]\n", t3-t2);
+#endif
+  }
+  timer_summarize();
+}
+
+void kernel_async(int d, int id, size_t size1, size_t size2, int round,
+		  int *p1, int *p2, void *p_dev, int *q) {
+  init(p1, size1);
+  init(q, size2);
+  timer_set("kernel_async", round);
+
+  for (int r = 0; r < round; r++) {
+    timer_start(r);
+#ifdef VERBOSE_TIME
+    double t1, t2, t3, t4, t5;
+    t1 = omp_get_wtime();
+#endif
+
+    {
+#ifdef VERBOSE_TIME
+      t2 = omp_get_wtime();
+#endif
+
+      // Ping-poing memcpy (assume offloading GPU task and retrieving result)
+      omp_depend_t obj1[1], obj2[1];
+#pragma omp depobj(obj1[0]) depend(out: p_dev)
+      omp_target_memcpy_async(p_dev, p1, size1 * sizeof(int), 0, 0, d, id, 1, obj1);
+#pragma omp depobj(obj2[0]) depend(in: p_dev)
+      omp_target_memcpy_async(p2, p_dev, size1 * sizeof(int), 0, 0, id, d, 1, obj2);
+#ifdef VERBOSE_TIME
+      t3 = omp_get_wtime();
+#endif
+
+      // Local task
+      local(q, size2);
+#ifdef VERBOSE_TIME
+      t4 = omp_get_wtime();
+#endif
+
+#pragma omp taskwait
+    }
+#ifdef VERBOSE_TIME
+    t5 = omp_get_wtime();
+#endif
+
+    timer_end(r);
+#ifdef VERBOSE_TIME
+    printf("  -- Starting parallel region: %f[sec]\n", t2-t1);
+    printf("  -- Invoking memcpy async: %f[sec]\n", t3-t2);
+    printf("  -- Local work: %f[sec]\n", t4-t3);
+    printf("  -- Task wait & ending parallel region: %f[sec]\n", t5-t4);
+#endif
+  }
+  timer_summarize();
+}
+
+void kernel_task(int d, int id, size_t size1, size_t size2, int round,
+		 int *p1, int *p2, void *p_dev, int *q) {
+  init(p1, size1);
+  init(q, size2);
+  timer_set("kernel_task", round);
+
+  for (int r = 0; r < round; r++) {
+    timer_start(r);
+#ifdef VERBOSE_TIME
+    double t1, t2, t3, t4, t5;
+    t1 = omp_get_wtime();
+#endif
+
+#pragma omp parallel
+#pragma omp single
+    {
+#ifdef VERBOSE_TIME
+      t2 = omp_get_wtime();
+#endif
+
+      // Ping-poing memcpy (assume offloading GPU task and retrieving result)
+#pragma omp task depend(out: p_dev)
+      omp_target_memcpy(p_dev, p1, size1 * sizeof(int), 0, 0, d, id);
+#pragma omp task depend(in: p_dev)
+      omp_target_memcpy(p2, p_dev, size1 * sizeof(int), 0, 0, id, d);
+#ifdef VERBOSE_TIME
+      t3 = omp_get_wtime();
+#endif
+
+      // Local task
+      local(q, size2);
+#ifdef VERBOSE_TIME
+      t4 = omp_get_wtime();
+#endif
+
+#pragma omp taskwait
+    }
+#ifdef VERBOSE_TIME
+    t5 = omp_get_wtime();
+#endif
+
+    timer_end(r);
+#ifdef VERBOSE_TIME
+    printf("  -- Starting parallel region: %f[sec]\n", t2-t1);
+    printf("  -- Invoking memcpy as task: %f[sec]\n", t3-t2);
+    printf("  -- Local work: %f[sec]\n", t4-t3);
+    printf("  -- Task wait & ending parallel region: %f[sec]\n", t5-t4);
+#endif
+  }
+  timer_summarize();
+}
+
+int main(int argc, char* argv[]) {
+  size_t size1 = (argc > 1) ? atoi(argv[1]) : 600 * 1000 * 1000;
+  size_t size2 = (argc > 2) ? atoi(argv[2]) : size1;
+  int round = (argc > 3) ? atoi(argv[3]) : 5;
+  printf("memory copy size = %lu, local work size = %lu, total rounds = %d\n", size1, size2, round);
+
+  int d = omp_get_default_device();
+  int id = omp_get_initial_device();
+  if (d < 0 || d >= omp_get_num_devices())
+    d = id;
+
+  // Arrays for target memcpy
+  int *p1 = (int *) malloc(size1 * sizeof(int));
+  int *p2 = (int *) malloc(size1 * sizeof(int));
+  void *p_dev = omp_target_alloc(size1 * sizeof(int), d);
+
+  // Array for local work
+  int *q = (int *) malloc(size2 * sizeof(int));
+
+  kernel_sync(d, id, size1, size2, round, p1, p2, p_dev, q);
+  kernel_async(d, id, size1, size2, round, p1, p2, p_dev, q);
+  kernel_task(d, id, size1, size2, round, p1, p2, p_dev, q);
+  return 0;
+}
Index: openmp/libomptarget/test/api/omp_target_memcpy_async2.c
===================================================================
--- /dev/null
+++ openmp/libomptarget/test/api/omp_target_memcpy_async2.c
@@ -0,0 +1,71 @@
+#include <omp.h>
+#include "stdio.h"
+#include <stdlib.h>
+
+int main() {
+  int d = omp_get_default_device ();
+  int id = omp_get_initial_device ();
+  int a[128], b[64], c[32], e[16], q[128], i;
+  void *p;
+  
+  if (d < 0 || d >= omp_get_num_devices ())
+    d = id;
+  
+  p = omp_target_alloc (130 * sizeof (int), d);
+  if (p == NULL)
+    return 0;
+  
+  for (i = 0; i < 128; ++i)
+    a[i] = i + 1;
+  for (i = 0; i < 64; ++i)
+    b[i] = i + 2;
+  for (i = 0; i < 32; i++)
+    c[i] = 0;
+  for (i = 0; i < 16; i++)
+    e[i] = i + 4;
+  
+  omp_depend_t obj[2];
+  
+#pragma omp parallel num_threads(5)
+#pragma omp single
+  {
+#pragma omp task depend(out: p)
+    omp_target_memcpy (p, a, 128 * sizeof (int), 0, 0, d, id);
+    
+#pragma omp task depend(inout: p)
+    omp_target_memcpy (p, b, 64 * sizeof (int), 0, 0, d, id);
+    
+#pragma omp task depend(out: c)
+    for (i = 0; i < 32; i++)
+      c[i] = i + 3;
+    
+#pragma omp depobj(obj[0]) depend(inout: p)
+#pragma omp depobj(obj[1]) depend(in: c)
+    omp_target_memcpy_async (p, c, 32 * sizeof (int), 0, 0, d, id, 2, obj);
+    
+#pragma omp task depend(in: p)
+    omp_target_memcpy (p, e, 16 * sizeof (int), 0, 0, d, id);
+  }
+  
+#pragma omp taskwait
+  
+  for (i = 0; i < 128; ++i)
+    q[i] = 0;
+  omp_target_memcpy(q, p, 128 * sizeof(int), 0, 0, id, d);
+  for (i = 0; i < 16; ++i)
+    if (q[i] != i + 4)
+      abort();
+ for (i = 16; i < 32; ++i)
+   if (q[i] != i + 3)
+     abort();
+ for (i = 32; i < 64; ++i)
+   if (q[i] != i + 2)
+     abort();
+ for (i = 64; i < 128; ++i)
+   if (q[i] != i + 1)
+     abort();
+ 
+ omp_target_free (p, d);
+ 
+ return 0;
+}
Index: openmp/libomptarget/test/api/omp_target_memcpy_async1.c
===================================================================
--- /dev/null
+++ openmp/libomptarget/test/api/omp_target_memcpy_async1.c
@@ -0,0 +1,47 @@
+// Test case for omp_target_memcpy_async, oringally from GCC 
+
+#include <omp.h>
+#include "stdio.h"
+#include <stdlib.h>
+// #include <omptarget.h>
+
+int main () {
+  // __tgt_init_all_rtls();
+    
+  int d = omp_get_default_device ();
+  int id = omp_get_initial_device ();
+  int q[128], i;
+  void *p;
+
+  if (d < 0 || d >= omp_get_num_devices ())
+    d = id;
+  
+  p = omp_target_alloc (130 * sizeof (int), d);
+  if (p == NULL)
+    return 0;
+
+  for (i = 0; i < 128; i++)
+    q[i] = i;
+
+  if (omp_target_memcpy_async(p, q, 128 * sizeof (int), sizeof (int), 0, d, id, 0, NULL)) {
+    abort();
+  }
+  
+#pragma omp taskwait
+
+  int q2[128];
+  for (i = 0; i < 128; ++i)
+    q2[i] = 0;
+  if (omp_target_memcpy_async (q2, p, 128 * sizeof(int), 0, sizeof (int), id, d, 0, NULL))
+    abort ();
+  
+#pragma omp taskwait
+  
+  for (i = 0; i < 128; ++i)
+    if (q2[i] != q[i])
+      abort ();
+  
+  omp_target_free (p, d);
+  
+  return 0;
+}
Index: openmp/libomptarget/src/private.h
===================================================================
--- openmp/libomptarget/src/private.h
+++ openmp/libomptarget/src/private.h
@@ -98,7 +98,51 @@
  * We maintain the same data structure for compatibility.
  */
 typedef int kmp_int32;
+typedef int64_t kmp_int64;
 typedef intptr_t kmp_intptr_t;
+
+typedef void *omp_depend_t;
+struct kmp_task;
+typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, struct kmp_task *);
+typedef struct kmp_task {
+  void *shareds;
+  kmp_routine_entry_t routine;
+  kmp_int32 part_id;
+} kmp_task_t;
+
+typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
+  /* Compiler flags */             /* Total compiler flags must be 16 bits */
+  unsigned tiedness : 1;           /* task is either tied (1) or untied (0) */
+  unsigned final : 1;              /* task is final(1) so execute immediately */
+  unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0
+                              code path */
+  unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to
+                                     invoke destructors from the runtime */
+  unsigned proxy : 1; /* task is a proxy task (it will be executed outside the
+                         context of the RTL) */
+  unsigned priority_specified : 1; /* set if the compiler provides priority
+                                      setting for the task */
+  unsigned detachable : 1;         /* 1 == can detach */
+  unsigned hidden_helper : 1;      /* 1 == hidden helper task */
+  unsigned reserved : 8;           /* reserved for compiler use */
+
+  /* Library flags */       /* Total library flags must be 16 bits */
+  unsigned tasktype : 1;    /* task is either explicit(1) or implicit (0) */
+  unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
+  unsigned tasking_ser : 1; // all tasks in team are either executed immediately
+  // (1) or may be deferred (0)
+  unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
+  // (0) [>= 2 threads]
+  /* If either team_serial or tasking_ser is set, task team may be NULL */
+  /* Task State Flags: */
+  unsigned started : 1;    /* 1==started, 0==not started     */
+  unsigned executing : 1;  /* 1==executing, 0==not executing */
+  unsigned complete : 1;   /* 1==complete, 0==not complete   */
+  unsigned freed : 1;      /* 1==freed, 0==allocated        */
+  unsigned native : 1;     /* 1==gcc-compiled task, 0==intel */
+  unsigned reserved31 : 7; /* reserved for library use */
+} kmp_tasking_flags_t;
+
 // Compiler sends us this info:
 typedef struct kmp_depend_info {
   kmp_intptr_t base_addr;
@@ -117,6 +161,88 @@
                           kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
                           kmp_depend_info_t *noalias_dep_list)
     __attribute__((weak));
+
+kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+                                  kmp_int32 flags, size_t sizeof_kmp_task_t,
+                                  size_t sizeof_shareds,
+                                  kmp_routine_entry_t task_entry)
+    __attribute__((weak));
+
+kmp_task_t *
+__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+                             kmp_routine_entry_t task_entry,
+                             kmp_int64 device_id) __attribute__((weak));
+
+kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
+                                    kmp_task_t *new_task, kmp_int32 ndeps,
+                                    kmp_depend_info_t *dep_list,
+                                    kmp_int32 ndeps_noalias,
+                                    kmp_depend_info_t *noalias_dep_list)
+    __attribute__((weak));
+
+/**
+ * The argument set that is passed from asynchronous memory copy to block
+ * version of memory copy invoked in helper task
+ */
+struct TargetMemcpyArgsTy {
+  /**
+   * Common attribuutes
+   */
+  void *Dst;
+  const void *Src;
+  int DstDevice;
+  int SrcDevice;
+
+  /**
+   * The flag that denotes single dimensional or rectangle dimensional copy
+   */
+  bool IsRectMemcpy;
+
+  /**
+   * Arguments for single dimensional copy
+   */
+  size_t Length;
+  size_t DstOffset;
+  size_t SrcOffset;
+
+  /**
+   * Arguments for rectangle dimensional copy
+   */
+  size_t ElementSize;
+  int NumDims;
+  const size_t *Volume;
+  const size_t *DstOffsets;
+  const size_t *SrcOffsets;
+  const size_t *DstDimensions;
+  const size_t *SrcDimensions;
+
+  /**
+   * Constructor for single dimensional copy
+   */
+  TargetMemcpyArgsTy(void *Dst, const void *Src, size_t Length,
+                     size_t DstOffset, size_t SrcOffset, int DstDevice,
+                     int SrcDevice)
+      : Dst(Dst), Src(Src), DstDevice(DstDevice), SrcDevice(SrcDevice),
+        IsRectMemcpy(false), Length(Length), DstOffset(DstOffset),
+        SrcOffset(SrcOffset), ElementSize(0), NumDims(0), Volume(0),
+        DstOffsets(0), SrcOffsets(0), DstDimensions(0), SrcDimensions(0){};
+
+  /**
+   * Constructor for rectangle dimensional copy
+   */
+  TargetMemcpyArgsTy(void *Dst, const void *Src, size_t ElementSize,
+                     int NumDims, const size_t *Volume,
+                     const size_t *DstOffsets, const size_t *SrcOffsets,
+                     const size_t *DstDimensions, const size_t *SrcDimensions,
+                     int DstDevice, int SrcDevice)
+      : Dst(Dst), Src(Src), DstDevice(DstDevice), SrcDevice(SrcDevice),
+        IsRectMemcpy(true), Length(0), DstOffset(0), SrcOffset(0),
+        ElementSize(ElementSize), NumDims(NumDims), Volume(Volume),
+        DstOffsets(DstOffsets), SrcOffsets(SrcOffsets),
+        DstDimensions(DstDimensions), SrcDimensions(SrcDimensions){};
+};
+
 #ifdef __cplusplus
 }
 #endif
Index: openmp/libomptarget/src/exports
===================================================================
--- openmp/libomptarget/src/exports
+++ openmp/libomptarget/src/exports
@@ -38,6 +38,8 @@
     omp_target_is_present;
     omp_target_memcpy;
     omp_target_memcpy_rect;
+    omp_target_memcpy_async;
+    omp_target_memcpy_rect_async;
     omp_target_associate_ptr;
     omp_target_disassociate_ptr;
     llvm_omp_target_alloc_host;
Index: openmp/libomptarget/src/api.cpp
===================================================================
--- openmp/libomptarget/src/api.cpp
+++ openmp/libomptarget/src/api.cpp
@@ -15,6 +15,8 @@
 #include "private.h"
 #include "rtl.h"
 
+#include "llvm/ADT/SmallVector.h"
+
 #include <climits>
 #include <cstdlib>
 #include <cstring>
@@ -200,6 +202,105 @@
   return Rc;
 }
 
+// The helper function that calls omp_target_memcpy or omp_target_memcpy_rect
+static int __kmpc_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
+  if (Task == nullptr)
+    return OFFLOAD_FAIL;
+
+  TargetMemcpyArgsTy *Args = (TargetMemcpyArgsTy *)Task->shareds;
+
+  if (Args == nullptr)
+    return OFFLOAD_FAIL;
+
+  // Call blocked version
+  int Rc = OFFLOAD_SUCCESS;
+  if (Args->IsRectMemcpy) {
+    Rc = omp_target_memcpy_rect(
+        Args->Dst, Args->Src, Args->ElementSize, Args->NumDims, Args->Volume,
+        Args->DstOffsets, Args->SrcOffsets, Args->DstDimensions,
+        Args->SrcDimensions, Args->DstDevice, Args->SrcDevice);
+
+    DP("omp_target_memcpy_rect returns %d\n", Rc);
+  } else {
+    Rc = omp_target_memcpy(Args->Dst, Args->Src, Args->Length, Args->DstOffset,
+                           Args->SrcOffset, Args->DstDevice, Args->SrcDevice);
+
+    DP("omp_target_memcpy returns %d\n", Rc);
+  }
+
+  // Release the arguments object
+  delete Args;
+
+  return Rc;
+}
+
+// Allocate and launch helper task
+static int __kmpc_helper_task_creation(TargetMemcpyArgsTy *Args,
+                                       int DepObjCount,
+                                       omp_depend_t *DepObjList) {
+  // Create global thread ID
+  int Gtid = __kmpc_global_thread_num(nullptr);
+  int (*Fn)(kmp_int32, kmp_task_t *) = &__kmpc_target_memcpy_async_helper;
+
+  // Setup the hidden helper flags;
+  kmp_int32 Flags = 0;
+  kmp_tasking_flags_t *InputFlags = (kmp_tasking_flags_t *)&Flags;
+  InputFlags->hidden_helper = 1;
+
+  // Alloc helper task
+  kmp_task_t *Ptr = __kmpc_omp_target_task_alloc(nullptr, Gtid, Flags,
+                                                 sizeof(kmp_task_t), 0, Fn, -1);
+
+  if (Ptr == nullptr) {
+    // Task allocation failed, delete the argument object
+    delete Args;
+
+    return OFFLOAD_FAIL;
+  }
+
+  // Setup the arguments passed to helper task
+  Ptr->shareds = Args;
+
+  // Convert the type of depend objects
+  llvm::SmallVector<kmp_depend_info_t> DepObjs;
+  for (int i = 0; i < DepObjCount; i++) {
+    omp_depend_t DepObj = DepObjList[i];
+    DepObjs.push_back(*((kmp_depend_info_t *)DepObj));
+  }
+
+  // Launch the helper task
+  int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Ptr, DepObjCount,
+                                     DepObjs.data(), 0, nullptr);
+
+  return Rc;
+}
+
+EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
+                                   size_t DstOffset, size_t SrcOffset,
+                                   int DstDevice, int SrcDevice,
+                                   int DepObjCount, omp_depend_t *DepObjList) {
+  TIMESCOPE();
+  DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
+     "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
+     "src offset %zu, length %zu\n",
+     DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DstOffset, SrcOffset,
+     Length);
+
+  // Check the source and dest address
+  if (Dst == nullptr || Src == nullptr)
+    return OFFLOAD_FAIL;
+
+  // Create task object
+  TargetMemcpyArgsTy *Args = new TargetMemcpyArgsTy(
+      Dst, Src, Length, DstOffset, SrcOffset, DstDevice, SrcDevice);
+
+  // Create and launch helper task
+  int Rc = __kmpc_helper_task_creation(Args, DepObjCount, DepObjList);
+
+  DP("omp_target_memcpy_async returns %d\n", Rc);
+  return Rc;
+}
+
 EXTERN int
 omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
                        int NumDims, const size_t *Volume,
@@ -260,6 +361,36 @@
   return Rc;
 }
 
+EXTERN int omp_target_memcpy_rect_async(
+    void *Dst, const void *Src, size_t ElementSize, int NumDims,
+    const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets,
+    const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice,
+    int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) {
+  TIMESCOPE();
+  DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
+     "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
+     "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
+     "volume " DPxMOD ", element size %zu, num_dims %d\n",
+     DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DPxPTR(DstOffsets),
+     DPxPTR(SrcOffsets), DPxPTR(DstDimensions), DPxPTR(SrcDimensions),
+     DPxPTR(Volume), ElementSize, NumDims);
+
+  // Check the source and dest address
+  if (Dst == nullptr || Src == nullptr)
+    return OFFLOAD_FAIL;
+
+  // Create task object
+  TargetMemcpyArgsTy *Args = new TargetMemcpyArgsTy(
+      Dst, Src, ElementSize, NumDims, Volume, DstOffsets, SrcOffsets,
+      DstDimensions, SrcDimensions, DstDevice, SrcDevice);
+
+  // Create and launch helper task
+  int Rc = __kmpc_helper_task_creation(Args, DepObjCount, DepObjList);
+
+  DP("omp_target_memcpy_rect_async returns %d\n", Rc);
+  return Rc;
+}
+
 EXTERN int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
                                     size_t Size, size_t DeviceOffset,
                                     int DeviceNum) {
Index: openmp/libomptarget/include/interop.h
===================================================================
--- openmp/libomptarget/include/interop.h
+++ openmp/libomptarget/include/interop.h
@@ -116,30 +116,6 @@
 extern const char *__KAI_KMPC_CONVENTION
 omp_get_interop_rc_desc(const omp_interop_t, omp_interop_rc_t);
 
-typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
-  /* Compiler flags */             /* Total compiler flags must be 16 bits */
-  unsigned tiedness : 1;           /* task is either tied (1) or untied (0) */
-  unsigned final : 1;              /* task is final(1) so execute immediately */
-  unsigned merged_if0 : 1; // no __kmpc_task_{begin/complete}_if0 calls in if0
-  unsigned destructors_thunk : 1; // set if the compiler creates a thunk to
-  unsigned proxy : 1; // task is a proxy task (it will be executed outside the
-  unsigned priority_specified : 1; // set if the compiler provides priority
-  unsigned detachable : 1;         // 1 == can detach */
-  unsigned unshackled : 1;         /* 1 == unshackled task */
-  unsigned target : 1;             /* 1 == target task */
-  unsigned reserved : 7;           /* reserved for compiler use */
-  unsigned tasktype : 1;    /* task is either explicit(1) or implicit (0) */
-  unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
-  unsigned tasking_ser : 1; // all tasks in team are either executed immediately
-  unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
-  unsigned started : 1;     /* 1==started, 0==not started     */
-  unsigned executing : 1;   /* 1==executing, 0==not executing */
-  unsigned complete : 1;    /* 1==complete, 0==not complete   */
-  unsigned freed : 1;       /* 1==freed, 0==allocated        */
-  unsigned native : 1;      /* 1==gcc-compiled task, 0==intel */
-  unsigned reserved31 : 7;  /* reserved for library use */
-} kmp_tasking_flags_t;
-
 typedef enum omp_interop_backend_type_t {
   // reserve 0
   omp_interop_backend_type_cuda_1 = 1,
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to