[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-18 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 updated this revision to Diff 468695.
jz10 added a comment.

Thanks, Johannes, for your comments; I have replied to them below.

1. format issues

I ran clang-format to reformat the code; please check whether anything was missed.

2. replace '0' with 'nullptr'

fixed

3. proper return value for helper functions and async functions

fixed

4. Why can't we have a `kmp_tasking_flags_t` object?

I followed the same access approach used in openmp/runtime, so I didn't change
this part, but I can revise it if needed.

5. make helper function as 'static'

fixed

6. Why do you access args_ for some parts and not for others? That said, where 
does the hidden helper need access to the dependences anyway?

There is a type cast for the depend objects from 'omp_depend_t' to
'kmp_depend_info_t*', and the array of cast depend objects is consumed by
'__kmpc_omp_task_with_deps'. To make this safe, I made the array of cast
depend objects live longer by attaching it to the Args object.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

Files:
  clang/docs/ReleaseNotes.rst
  openmp/libomptarget/include/interop.h
  openmp/libomptarget/src/api.cpp
  openmp/libomptarget/src/exports
  openmp/libomptarget/src/private.h

Index: openmp/libomptarget/src/private.h
===
--- openmp/libomptarget/src/private.h
+++ openmp/libomptarget/src/private.h
@@ -98,7 +98,47 @@
  * We maintain the same data structure for compatibility.
  */
 typedef int kmp_int32;
+typedef int64_t kmp_int64;
 typedef intptr_t kmp_intptr_t;
+
+typedef void * omp_depend_t;
+struct kmp_task;
+typedef kmp_int32 (* kmp_routine_entry_t)( kmp_int32, struct kmp_task * );
+typedef struct kmp_task {
+  void *  shareds;
+  kmp_routine_entry_t routine;
+  kmp_int32   part_id;
+} kmp_task_t;
+
+typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
+  /* Compiler flags */ /* Total compiler flags must be 16 bits */
+  unsigned tiedness : 1; /* task is either tied (1) or untied (0) */
+  unsigned final : 1; /* task is final(1) so execute immediately */
+  unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0 code path */
+  unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to invoke destructors from the runtime */
+  unsigned proxy : 1; /* task is a proxy task (it will be executed outside the context of the RTL) */
+  unsigned priority_specified : 1; /* set if the compiler provides priority setting for the task */
+  unsigned detachable : 1; /* 1 == can detach */
+  unsigned hidden_helper : 1; /* 1 == hidden helper task */
+  unsigned reserved : 8; /* reserved for compiler use */
+
+  /* Library flags */ /* Total library flags must be 16 bits */
+  unsigned tasktype : 1; /* task is either explicit(1) or implicit (0) */
+  unsigned task_serial : 1; // task is executed immediately (1) or deferred (0) 
+  unsigned tasking_ser : 1; // all tasks in team are either executed immediately 
+  // (1) or may be deferred (0) 
+  unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
+  // (0) [>= 2 threads]
+  /* If either team_serial or tasking_ser is set, task team may be NULL */
+  /* Task State Flags: */
+  unsigned started : 1; /* 1==started, 0==not started */
+  unsigned executing : 1; /* 1==executing, 0==not executing */
+  unsigned complete : 1; /* 1==complete, 0==not complete   */
+  unsigned freed : 1; /* 1==freed, 0==allocated*/
+  unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
+  unsigned reserved31 : 7; /* reserved for library use */
+} kmp_tasking_flags_t;
+  
 // Compiler sends us this info:
 typedef struct kmp_depend_info {
   kmp_intptr_t base_addr;
@@ -117,6 +157,96 @@
   kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
   kmp_depend_info_t *noalias_dep_list)
 __attribute__((weak));
+
+kmp_task_t* __kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+  kmp_routine_entry_t task_entry)
+  __attribute__((weak));
+
+kmp_task_t* __kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+ size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+ kmp_routine_entry_t task_entry, kmp_int64 device_id)
+  __attribute__((weak));
+
+void __kmpc_proxy_task_completed_ooo (kmp_task_t *ptask) __attribute__((weak));
+kmp_int32 __kmpc_omp_task_with_deps (ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task,
+ kmp_int32 ndeps, kmp_depend_info_t *dep_list,
+ kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list)
+  __attribute__((weak));
+
+class TargetMemcpyArgsTy {
+public:
+  TargetMemcpyArgsTy

[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-18 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 added a comment.

I'm not sure whether it copies; I will check to confirm.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-19 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 updated this revision to Diff 469051.
jz10 added a comment.

Thanks Johannes and Shilei

1. "So, you are saying the task_with_deps function does *not* copy the 
dependences and therefore the array has to outlive the function?"

I checked omp_task_with_deps; it performs some copy operations, so I have moved
the depobj_list-related code out of the TargetMemcpyArgsTy and
TargetMemcpyRectArgsTy objects. Those two classes are also now defined as
'struct'.

2. "> This would be the third location where this struct is duplicated: 
interop.h, kmp.h and this file. Would it make sense to try to add it to another 
common header file?

IIRC, there are some (non-technical) issues on using `kmp.h` in `libomptarget`."
So should I keep the data structure definitions in private.h, or create a new
header file?

3. no release of Args

fixed

4. "I'm not sure if it is a failure if source and destination are same"

We thought about this, and we should allow the user to do this.

5. "This is really not LLVM code style."

I changed the 'class' definitions to 'struct' and reformatted them. Please check
whether that works.

6. few formatting issues

fixed


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

Files:
  clang/docs/ReleaseNotes.rst
  openmp/libomptarget/include/interop.h
  openmp/libomptarget/src/api.cpp
  openmp/libomptarget/src/exports
  openmp/libomptarget/src/private.h

Index: openmp/libomptarget/src/private.h
===
--- openmp/libomptarget/src/private.h
+++ openmp/libomptarget/src/private.h
@@ -98,7 +98,51 @@
  * We maintain the same data structure for compatibility.
  */
 typedef int kmp_int32;
+typedef int64_t kmp_int64;
 typedef intptr_t kmp_intptr_t;
+
+typedef void *omp_depend_t;
+struct kmp_task;
+typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, struct kmp_task *);
+typedef struct kmp_task {
+  void *shareds;
+  kmp_routine_entry_t routine;
+  kmp_int32 part_id;
+} kmp_task_t;
+
+typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
+  /* Compiler flags */ /* Total compiler flags must be 16 bits */
+  unsigned tiedness : 1;   /* task is either tied (1) or untied (0) */
+  unsigned final : 1;  /* task is final(1) so execute immediately */
+  unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0
+  code path */
+  unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to
+ invoke destructors from the runtime */
+  unsigned proxy : 1; /* task is a proxy task (it will be executed outside the
+ context of the RTL) */
+  unsigned priority_specified : 1; /* set if the compiler provides priority
+  setting for the task */
+  unsigned detachable : 1; /* 1 == can detach */
+  unsigned hidden_helper : 1;  /* 1 == hidden helper task */
+  unsigned reserved : 8;   /* reserved for compiler use */
+
+  /* Library flags */   /* Total library flags must be 16 bits */
+  unsigned tasktype : 1;/* task is either explicit(1) or implicit (0) */
+  unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
+  unsigned tasking_ser : 1; // all tasks in team are either executed immediately
+  // (1) or may be deferred (0)
+  unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
+  // (0) [>= 2 threads]
+  /* If either team_serial or tasking_ser is set, task team may be NULL */
+  /* Task State Flags: */
+  unsigned started : 1;/* 1==started, 0==not started */
+  unsigned executing : 1;  /* 1==executing, 0==not executing */
+  unsigned complete : 1;   /* 1==complete, 0==not complete   */
+  unsigned freed : 1;  /* 1==freed, 0==allocated*/
+  unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
+  unsigned reserved31 : 7; /* reserved for library use */
+} kmp_tasking_flags_t;
+
 // Compiler sends us this info:
 typedef struct kmp_depend_info {
   kmp_intptr_t base_addr;
@@ -117,6 +161,70 @@
   kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
   kmp_depend_info_t *noalias_dep_list)
 __attribute__((weak));
+
+kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+  kmp_int32 flags, size_t sizeof_kmp_task_t,
+  size_t sizeof_shareds,
+  kmp_routine_entry_t task_entry)
+__attribute__((weak));
+
+kmp_task_t *
+__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+ size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+ kmp_routine_entry_t task_entry,
+ kmp_int64 device_id) __attribute__((weak));
+
+kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_

[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-19 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 updated this revision to Diff 469060.
jz10 added a comment.

One minor fix: deleted the depobj_list from the TargetMemcpyArgsTy struct.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

Files:
  clang/docs/ReleaseNotes.rst
  openmp/libomptarget/include/interop.h
  openmp/libomptarget/src/api.cpp
  openmp/libomptarget/src/exports
  openmp/libomptarget/src/private.h

Index: openmp/libomptarget/src/private.h
===
--- openmp/libomptarget/src/private.h
+++ openmp/libomptarget/src/private.h
@@ -98,7 +98,51 @@
  * We maintain the same data structure for compatibility.
  */
 typedef int kmp_int32;
+typedef int64_t kmp_int64;
 typedef intptr_t kmp_intptr_t;
+
+typedef void *omp_depend_t;
+struct kmp_task;
+typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, struct kmp_task *);
+typedef struct kmp_task {
+  void *shareds;
+  kmp_routine_entry_t routine;
+  kmp_int32 part_id;
+} kmp_task_t;
+
+typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
+  /* Compiler flags */ /* Total compiler flags must be 16 bits */
+  unsigned tiedness : 1;   /* task is either tied (1) or untied (0) */
+  unsigned final : 1;  /* task is final(1) so execute immediately */
+  unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0
+  code path */
+  unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to
+ invoke destructors from the runtime */
+  unsigned proxy : 1; /* task is a proxy task (it will be executed outside the
+ context of the RTL) */
+  unsigned priority_specified : 1; /* set if the compiler provides priority
+  setting for the task */
+  unsigned detachable : 1; /* 1 == can detach */
+  unsigned hidden_helper : 1;  /* 1 == hidden helper task */
+  unsigned reserved : 8;   /* reserved for compiler use */
+
+  /* Library flags */   /* Total library flags must be 16 bits */
+  unsigned tasktype : 1;/* task is either explicit(1) or implicit (0) */
+  unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
+  unsigned tasking_ser : 1; // all tasks in team are either executed immediately
+  // (1) or may be deferred (0)
+  unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
+  // (0) [>= 2 threads]
+  /* If either team_serial or tasking_ser is set, task team may be NULL */
+  /* Task State Flags: */
+  unsigned started : 1;/* 1==started, 0==not started */
+  unsigned executing : 1;  /* 1==executing, 0==not executing */
+  unsigned complete : 1;   /* 1==complete, 0==not complete   */
+  unsigned freed : 1;  /* 1==freed, 0==allocated*/
+  unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
+  unsigned reserved31 : 7; /* reserved for library use */
+} kmp_tasking_flags_t;
+
 // Compiler sends us this info:
 typedef struct kmp_depend_info {
   kmp_intptr_t base_addr;
@@ -117,6 +161,70 @@
   kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
   kmp_depend_info_t *noalias_dep_list)
 __attribute__((weak));
+
+kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+  kmp_int32 flags, size_t sizeof_kmp_task_t,
+  size_t sizeof_shareds,
+  kmp_routine_entry_t task_entry)
+__attribute__((weak));
+
+kmp_task_t *
+__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+ size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+ kmp_routine_entry_t task_entry,
+ kmp_int64 device_id) __attribute__((weak));
+
+kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
+kmp_task_t *new_task, kmp_int32 ndeps,
+kmp_depend_info_t *dep_list,
+kmp_int32 ndeps_noalias,
+kmp_depend_info_t *noalias_dep_list)
+__attribute__((weak));
+
+struct TargetMemcpyArgsTy {
+  void *Dst;
+  const void *Src;
+  size_t Length;
+  size_t DstOffset;
+  size_t SrcOffset;
+  int DstDevice;
+  int SrcDevice;
+
+  TargetMemcpyArgsTy(void *Dst_, const void *Src_, size_t Length_,
+ size_t DstOffset_, size_t SrcOffset_, int DstDevice_,
+ int SrcDevice_)
+  : Dst(Dst_), Src(Src_), Length(Length_), DstOffset(DstOffset_),
+SrcOffset(SrcOffset_), DstDevice(DstDevice_), SrcDevice(SrcDevice_){};
+};
+
+struct TargetMemcpyRectArgsTy {
+  void *Dst;
+  const void *Src;
+  size_t ElementSize;
+  int NumDims;
+  const size_t *Volume;
+  const size_t *DstOffsets;
+  const size_t 

[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-19 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 updated this revision to Diff 469110.
jz10 added a comment.

Thanks Johannes

1. use SmallVector

fixed

2. "module 5 characters"

Ran clang-format; please check whether that works.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

Files:
  clang/docs/ReleaseNotes.rst
  openmp/libomptarget/include/interop.h
  openmp/libomptarget/src/api.cpp
  openmp/libomptarget/src/exports
  openmp/libomptarget/src/private.h

Index: openmp/libomptarget/src/private.h
===
--- openmp/libomptarget/src/private.h
+++ openmp/libomptarget/src/private.h
@@ -98,7 +98,51 @@
  * We maintain the same data structure for compatibility.
  */
 typedef int kmp_int32;
+typedef int64_t kmp_int64;
 typedef intptr_t kmp_intptr_t;
+
+typedef void *omp_depend_t;
+struct kmp_task;
+typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, struct kmp_task *);
+typedef struct kmp_task {
+  void *shareds;
+  kmp_routine_entry_t routine;
+  kmp_int32 part_id;
+} kmp_task_t;
+
+typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
+  /* Compiler flags */ /* Total compiler flags must be 16 bits */
+  unsigned tiedness : 1;   /* task is either tied (1) or untied (0) */
+  unsigned final : 1;  /* task is final(1) so execute immediately */
+  unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0
+  code path */
+  unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to
+ invoke destructors from the runtime */
+  unsigned proxy : 1; /* task is a proxy task (it will be executed outside the
+ context of the RTL) */
+  unsigned priority_specified : 1; /* set if the compiler provides priority
+  setting for the task */
+  unsigned detachable : 1; /* 1 == can detach */
+  unsigned hidden_helper : 1;  /* 1 == hidden helper task */
+  unsigned reserved : 8;   /* reserved for compiler use */
+
+  /* Library flags */   /* Total library flags must be 16 bits */
+  unsigned tasktype : 1;/* task is either explicit(1) or implicit (0) */
+  unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
+  unsigned tasking_ser : 1; // all tasks in team are either executed immediately
+  // (1) or may be deferred (0)
+  unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
+  // (0) [>= 2 threads]
+  /* If either team_serial or tasking_ser is set, task team may be NULL */
+  /* Task State Flags: */
+  unsigned started : 1;/* 1==started, 0==not started */
+  unsigned executing : 1;  /* 1==executing, 0==not executing */
+  unsigned complete : 1;   /* 1==complete, 0==not complete   */
+  unsigned freed : 1;  /* 1==freed, 0==allocated*/
+  unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
+  unsigned reserved31 : 7; /* reserved for library use */
+} kmp_tasking_flags_t;
+
 // Compiler sends us this info:
 typedef struct kmp_depend_info {
   kmp_intptr_t base_addr;
@@ -117,6 +161,70 @@
   kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
   kmp_depend_info_t *noalias_dep_list)
 __attribute__((weak));
+
+kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+  kmp_int32 flags, size_t sizeof_kmp_task_t,
+  size_t sizeof_shareds,
+  kmp_routine_entry_t task_entry)
+__attribute__((weak));
+
+kmp_task_t *
+__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+ size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+ kmp_routine_entry_t task_entry,
+ kmp_int64 device_id) __attribute__((weak));
+
+kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
+kmp_task_t *new_task, kmp_int32 ndeps,
+kmp_depend_info_t *dep_list,
+kmp_int32 ndeps_noalias,
+kmp_depend_info_t *noalias_dep_list)
+__attribute__((weak));
+
+struct TargetMemcpyArgsTy {
+  void *Dst;
+  const void *Src;
+  size_t Length;
+  size_t DstOffset;
+  size_t SrcOffset;
+  int DstDevice;
+  int SrcDevice;
+
+  TargetMemcpyArgsTy(void *Dst_, const void *Src_, size_t Length_,
+ size_t DstOffset_, size_t SrcOffset_, int DstDevice_,
+ int SrcDevice_)
+  : Dst(Dst_), Src(Src_), Length(Length_), DstOffset(DstOffset_),
+SrcOffset(SrcOffset_), DstDevice(DstDevice_), SrcDevice(SrcDevice_){};
+};
+
+struct TargetMemcpyRectArgsTy {
+  void *Dst;
+  const void *Src;
+  size_t ElementSize;
+  int NumDims;
+  const size_t *Volume;
+  c

[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-19 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 updated this revision to Diff 469112.
jz10 added a comment.

1. use SmallVector

fixed

2. "5  characters"

ran clang-format, please check if that works


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

Files:
  clang/docs/ReleaseNotes.rst
  openmp/libomptarget/include/interop.h
  openmp/libomptarget/src/api.cpp
  openmp/libomptarget/src/exports
  openmp/libomptarget/src/private.h

Index: openmp/libomptarget/src/private.h
===
--- openmp/libomptarget/src/private.h
+++ openmp/libomptarget/src/private.h
@@ -98,7 +98,51 @@
  * We maintain the same data structure for compatibility.
  */
 typedef int kmp_int32;
+typedef int64_t kmp_int64;
 typedef intptr_t kmp_intptr_t;
+
+typedef void *omp_depend_t;
+struct kmp_task;
+typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, struct kmp_task *);
+typedef struct kmp_task {
+  void *shareds;
+  kmp_routine_entry_t routine;
+  kmp_int32 part_id;
+} kmp_task_t;
+
+typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
+  /* Compiler flags */ /* Total compiler flags must be 16 bits */
+  unsigned tiedness : 1;   /* task is either tied (1) or untied (0) */
+  unsigned final : 1;  /* task is final(1) so execute immediately */
+  unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0
+  code path */
+  unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to
+ invoke destructors from the runtime */
+  unsigned proxy : 1; /* task is a proxy task (it will be executed outside the
+ context of the RTL) */
+  unsigned priority_specified : 1; /* set if the compiler provides priority
+  setting for the task */
+  unsigned detachable : 1; /* 1 == can detach */
+  unsigned hidden_helper : 1;  /* 1 == hidden helper task */
+  unsigned reserved : 8;   /* reserved for compiler use */
+
+  /* Library flags */   /* Total library flags must be 16 bits */
+  unsigned tasktype : 1;/* task is either explicit(1) or implicit (0) */
+  unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
+  unsigned tasking_ser : 1; // all tasks in team are either executed immediately
+  // (1) or may be deferred (0)
+  unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
+  // (0) [>= 2 threads]
+  /* If either team_serial or tasking_ser is set, task team may be NULL */
+  /* Task State Flags: */
+  unsigned started : 1;/* 1==started, 0==not started */
+  unsigned executing : 1;  /* 1==executing, 0==not executing */
+  unsigned complete : 1;   /* 1==complete, 0==not complete   */
+  unsigned freed : 1;  /* 1==freed, 0==allocated*/
+  unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
+  unsigned reserved31 : 7; /* reserved for library use */
+} kmp_tasking_flags_t;
+
 // Compiler sends us this info:
 typedef struct kmp_depend_info {
   kmp_intptr_t base_addr;
@@ -117,6 +161,67 @@
   kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
   kmp_depend_info_t *noalias_dep_list)
 __attribute__((weak));
+
+kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+  kmp_int32 flags, size_t sizeof_kmp_task_t,
+  size_t sizeof_shareds,
+  kmp_routine_entry_t task_entry)
+__attribute__((weak));
+
+kmp_task_t *
+__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+ size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+ kmp_routine_entry_t task_entry,
+ kmp_int64 device_id) __attribute__((weak));
+
+kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
+kmp_task_t *new_task, kmp_int32 ndeps,
+kmp_depend_info_t *dep_list,
+kmp_int32 ndeps_noalias,
+kmp_depend_info_t *noalias_dep_list)
+__attribute__((weak));
+
+struct TargetMemcpyArgsTy {
+  void *Dst;
+  const void *Src;
+  size_t Length;
+  size_t DstOffset;
+  size_t SrcOffset;
+  int DstDevice;
+  int SrcDevice;
+
+  TargetMemcpyArgsTy(void *Dst_, const void *Src_, size_t Length_,
+ size_t DstOffset_, size_t SrcOffset_, int DstDevice_,
+ int SrcDevice_)
+  : Dst(Dst_), Src(Src_), Length(Length_), DstOffset(DstOffset_),
+SrcOffset(SrcOffset_), DstDevice(DstDevice_), SrcDevice(SrcDevice_){};
+};
+
+struct TargetMemcpyRectArgsTy {
+  void *Dst;
+  const void *Src;
+  size_t ElementSize;
+  int NumDims;
+  const size_t *Volume;
+  const size_t *DstOffsets;

[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-20 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 updated this revision to Diff 469328.
jz10 added a comment.

Redid the patch properly, i.e. got rid of the content related to clang/docs.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

Files:
  openmp/libomptarget/include/interop.h
  openmp/libomptarget/src/api.cpp
  openmp/libomptarget/src/exports
  openmp/libomptarget/src/private.h

Index: openmp/libomptarget/src/private.h
===
--- openmp/libomptarget/src/private.h
+++ openmp/libomptarget/src/private.h
@@ -98,7 +98,51 @@
  * We maintain the same data structure for compatibility.
  */
 typedef int kmp_int32;
+typedef int64_t kmp_int64;
 typedef intptr_t kmp_intptr_t;
+
+typedef void *omp_depend_t;
+struct kmp_task;
+typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, struct kmp_task *);
+typedef struct kmp_task {
+  void *shareds;
+  kmp_routine_entry_t routine;
+  kmp_int32 part_id;
+} kmp_task_t;
+
+typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
+  /* Compiler flags */ /* Total compiler flags must be 16 bits */
+  unsigned tiedness : 1;   /* task is either tied (1) or untied (0) */
+  unsigned final : 1;  /* task is final(1) so execute immediately */
+  unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0
+  code path */
+  unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to
+ invoke destructors from the runtime */
+  unsigned proxy : 1; /* task is a proxy task (it will be executed outside the
+ context of the RTL) */
+  unsigned priority_specified : 1; /* set if the compiler provides priority
+  setting for the task */
+  unsigned detachable : 1; /* 1 == can detach */
+  unsigned hidden_helper : 1;  /* 1 == hidden helper task */
+  unsigned reserved : 8;   /* reserved for compiler use */
+
+  /* Library flags */   /* Total library flags must be 16 bits */
+  unsigned tasktype : 1;/* task is either explicit(1) or implicit (0) */
+  unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
+  unsigned tasking_ser : 1; // all tasks in team are either executed immediately
+  // (1) or may be deferred (0)
+  unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
+  // (0) [>= 2 threads]
+  /* If either team_serial or tasking_ser is set, task team may be NULL */
+  /* Task State Flags: */
+  unsigned started : 1;/* 1==started, 0==not started */
+  unsigned executing : 1;  /* 1==executing, 0==not executing */
+  unsigned complete : 1;   /* 1==complete, 0==not complete   */
+  unsigned freed : 1;  /* 1==freed, 0==allocated*/
+  unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
+  unsigned reserved31 : 7; /* reserved for library use */
+} kmp_tasking_flags_t;
+
 // Compiler sends us this info:
 typedef struct kmp_depend_info {
   kmp_intptr_t base_addr;
@@ -117,6 +161,67 @@
   kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
   kmp_depend_info_t *noalias_dep_list)
 __attribute__((weak));
+
+kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+  kmp_int32 flags, size_t sizeof_kmp_task_t,
+  size_t sizeof_shareds,
+  kmp_routine_entry_t task_entry)
+__attribute__((weak));
+
+kmp_task_t *
+__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+ size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+ kmp_routine_entry_t task_entry,
+ kmp_int64 device_id) __attribute__((weak));
+
+kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
+kmp_task_t *new_task, kmp_int32 ndeps,
+kmp_depend_info_t *dep_list,
+kmp_int32 ndeps_noalias,
+kmp_depend_info_t *noalias_dep_list)
+__attribute__((weak));
+
+struct TargetMemcpyArgsTy {
+  void *Dst;
+  const void *Src;
+  size_t Length;
+  size_t DstOffset;
+  size_t SrcOffset;
+  int DstDevice;
+  int SrcDevice;
+
+  TargetMemcpyArgsTy(void *Dst_, const void *Src_, size_t Length_,
+ size_t DstOffset_, size_t SrcOffset_, int DstDevice_,
+ int SrcDevice_)
+  : Dst(Dst_), Src(Src_), Length(Length_), DstOffset(DstOffset_),
+SrcOffset(SrcOffset_), DstDevice(DstDevice_), SrcDevice(SrcDevice_){};
+};
+
+struct TargetMemcpyRectArgsTy {
+  void *Dst;
+  const void *Src;
+  size_t ElementSize;
+  int NumDims;
+  const size_t *Volume;
+  const size_t *DstOffsets;
+  const size_t *SrcOffsets;
+  const size_t *DstDime

[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-21 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 updated this revision to Diff 469719.
jz10 added a comment.

Thanks Johannes and Shilei

1. using a common helper function

fixed

2. using push_back for SmallVector

fixed

3. add doxygen comments for struct

added, please check if that works


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

Files:
  openmp/libomptarget/include/interop.h
  openmp/libomptarget/src/api.cpp
  openmp/libomptarget/src/exports
  openmp/libomptarget/src/private.h

Index: openmp/libomptarget/src/private.h
===
--- openmp/libomptarget/src/private.h
+++ openmp/libomptarget/src/private.h
@@ -98,7 +98,51 @@
  * We maintain the same data structure for compatibility.
  */
 typedef int kmp_int32;
+typedef int64_t kmp_int64;
 typedef intptr_t kmp_intptr_t;
+
+typedef void *omp_depend_t;
+struct kmp_task;
+typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, struct kmp_task *);
+typedef struct kmp_task {
+  void *shareds;
+  kmp_routine_entry_t routine;
+  kmp_int32 part_id;
+} kmp_task_t;
+
+typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
+  /* Compiler flags */ /* Total compiler flags must be 16 bits */
+  unsigned tiedness : 1;   /* task is either tied (1) or untied (0) */
+  unsigned final : 1;  /* task is final(1) so execute immediately */
+  unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0
+  code path */
+  unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to
+ invoke destructors from the runtime */
+  unsigned proxy : 1; /* task is a proxy task (it will be executed outside the
+ context of the RTL) */
+  unsigned priority_specified : 1; /* set if the compiler provides priority
+  setting for the task */
+  unsigned detachable : 1; /* 1 == can detach */
+  unsigned hidden_helper : 1;  /* 1 == hidden helper task */
+  unsigned reserved : 8;   /* reserved for compiler use */
+
+  /* Library flags */   /* Total library flags must be 16 bits */
+  unsigned tasktype : 1;/* task is either explicit(1) or implicit (0) */
+  unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
+  unsigned tasking_ser : 1; // all tasks in team are either executed immediately
+  // (1) or may be deferred (0)
+  unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
+  // (0) [>= 2 threads]
+  /* If either team_serial or tasking_ser is set, task team may be NULL */
+  /* Task State Flags: */
+  unsigned started : 1;/* 1==started, 0==not started */
+  unsigned executing : 1;  /* 1==executing, 0==not executing */
+  unsigned complete : 1;   /* 1==complete, 0==not complete   */
+  unsigned freed : 1;  /* 1==freed, 0==allocated*/
+  unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
+  unsigned reserved31 : 7; /* reserved for library use */
+} kmp_tasking_flags_t;
+
 // Compiler sends us this info:
 typedef struct kmp_depend_info {
   kmp_intptr_t base_addr;
@@ -117,6 +161,88 @@
   kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
   kmp_depend_info_t *noalias_dep_list)
 __attribute__((weak));
+
+kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+  kmp_int32 flags, size_t sizeof_kmp_task_t,
+  size_t sizeof_shareds,
+  kmp_routine_entry_t task_entry)
+__attribute__((weak));
+
+kmp_task_t *
+__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+ size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+ kmp_routine_entry_t task_entry,
+ kmp_int64 device_id) __attribute__((weak));
+
+kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
+kmp_task_t *new_task, kmp_int32 ndeps,
+kmp_depend_info_t *dep_list,
+kmp_int32 ndeps_noalias,
+kmp_depend_info_t *noalias_dep_list)
+__attribute__((weak));
+
+/**
+ * The argument set that is passed from asynchronous memory copy to block
+ * version of memory copy invoked in helper task
+ */
+struct TargetMemcpyArgsTy {
+  /**
+   * Common attributes
+   */
+  void *Dst;
+  const void *Src;
+  int DstDevice;
+  int SrcDevice;
+
+  /**
+   * The flag that denotes single dimensional or rectangle dimensional copy
+   */
+  bool IsRectMemcpy;
+
+  /**
+   * Arguments for single dimensional copy
+   */
+  size_t Length;
+  size_t DstOffset;
+  size_t SrcOffset;
+
+  /**
+   * Arguments for rectangle dimensional copy
+   */
+  size_t ElementSize;
+  i

[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-21 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 updated this revision to Diff 469772.
jz10 added a comment.

Thanks Johannes and Shilei

1. '385-387 are the same as in omp_target_memcpy_async. Can we also not 
duplicate those lines?'

I put the common code (i.e. helper task creation) into another static function.

2. 'In this code there are also various places with variables not named 
according to the style guide'

Fixed; please check whether there are any remaining issues.

3. 'The problem with the int32 Flags I mentioned already. '

The flag variable was defined as 'kmp_int32', since its consumer
'__kmpc_omp_target_task_alloc' needs a 'kmp_int32' as input. The type cast
to 'kmp_tasking_flags_t' is there to set the 'hidden_helper' bit. So it seems
that there's no better option for us. Please let me know your suggestion.

4. "Can we put all KMP related code into a separate header"

We use both KMP-related data structures/types and APIs, so should I wrap all
of that code into several utility functions and put them into a separate
header file?

5. 'Having a variable suffix with _ is generally not a good coding style'

fixed


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

Files:
  openmp/libomptarget/include/interop.h
  openmp/libomptarget/src/api.cpp
  openmp/libomptarget/src/exports
  openmp/libomptarget/src/private.h

Index: openmp/libomptarget/src/private.h
===
--- openmp/libomptarget/src/private.h
+++ openmp/libomptarget/src/private.h
@@ -98,7 +98,51 @@
  * We maintain the same data structure for compatibility.
  */
 typedef int kmp_int32;
+typedef int64_t kmp_int64;
 typedef intptr_t kmp_intptr_t;
+
+typedef void *omp_depend_t;
+struct kmp_task;
+typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, struct kmp_task *);
+typedef struct kmp_task {
+  void *shareds;
+  kmp_routine_entry_t routine;
+  kmp_int32 part_id;
+} kmp_task_t;
+
+typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
+  /* Compiler flags */ /* Total compiler flags must be 16 bits */
+  unsigned tiedness : 1;   /* task is either tied (1) or untied (0) */
+  unsigned final : 1;  /* task is final(1) so execute immediately */
+  unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0
+  code path */
+  unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to
+ invoke destructors from the runtime */
+  unsigned proxy : 1; /* task is a proxy task (it will be executed outside the
+ context of the RTL) */
+  unsigned priority_specified : 1; /* set if the compiler provides priority
+  setting for the task */
+  unsigned detachable : 1; /* 1 == can detach */
+  unsigned hidden_helper : 1;  /* 1 == hidden helper task */
+  unsigned reserved : 8;   /* reserved for compiler use */
+
+  /* Library flags */   /* Total library flags must be 16 bits */
+  unsigned tasktype : 1;/* task is either explicit(1) or implicit (0) */
+  unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
+  unsigned tasking_ser : 1; // all tasks in team are either executed immediately
+  // (1) or may be deferred (0)
+  unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
+  // (0) [>= 2 threads]
+  /* If either team_serial or tasking_ser is set, task team may be NULL */
+  /* Task State Flags: */
+  unsigned started : 1;/* 1==started, 0==not started */
+  unsigned executing : 1;  /* 1==executing, 0==not executing */
+  unsigned complete : 1;   /* 1==complete, 0==not complete   */
+  unsigned freed : 1;  /* 1==freed, 0==allocated*/
+  unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
+  unsigned reserved31 : 7; /* reserved for library use */
+} kmp_tasking_flags_t;
+
 // Compiler sends us this info:
 typedef struct kmp_depend_info {
   kmp_intptr_t base_addr;
@@ -117,6 +161,88 @@
   kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
   kmp_depend_info_t *noalias_dep_list)
 __attribute__((weak));
+
+kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+  kmp_int32 flags, size_t sizeof_kmp_task_t,
+  size_t sizeof_shareds,
+  kmp_routine_entry_t task_entry)
+__attribute__((weak));
+
+kmp_task_t *
+__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+ size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+ kmp_routine_entry_t task_entry,
+ kmp_int64 device_id) __attribute__((weak));
+
+kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
+kmp_task_t *new_task, kmp_int

[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-21 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 updated this revision to Diff 469810.
jz10 added a comment.

Thanks Johannes

1. 'I feel like I'm missing something. As said before, all but 3 lines are 
identical in these two functions.

Now you created a helper for 1/3 of those identical lines but left the other 
2/3 being duplicated. Could
you elaborate why?'
I put the common part into the task_creation function.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

Files:
  openmp/libomptarget/include/interop.h
  openmp/libomptarget/src/api.cpp
  openmp/libomptarget/src/exports
  openmp/libomptarget/src/private.h

Index: openmp/libomptarget/src/private.h
===
--- openmp/libomptarget/src/private.h
+++ openmp/libomptarget/src/private.h
@@ -98,7 +98,51 @@
  * We maintain the same data structure for compatibility.
  */
 typedef int kmp_int32;
+typedef int64_t kmp_int64;
 typedef intptr_t kmp_intptr_t;
+
+typedef void *omp_depend_t;
+struct kmp_task;
+typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, struct kmp_task *);
+typedef struct kmp_task {
+  void *shareds;
+  kmp_routine_entry_t routine;
+  kmp_int32 part_id;
+} kmp_task_t;
+
+typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
+  /* Compiler flags */ /* Total compiler flags must be 16 bits */
+  unsigned tiedness : 1;   /* task is either tied (1) or untied (0) */
+  unsigned final : 1;  /* task is final(1) so execute immediately */
+  unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0
+  code path */
+  unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to
+ invoke destructors from the runtime */
+  unsigned proxy : 1; /* task is a proxy task (it will be executed outside the
+ context of the RTL) */
+  unsigned priority_specified : 1; /* set if the compiler provides priority
+  setting for the task */
+  unsigned detachable : 1; /* 1 == can detach */
+  unsigned hidden_helper : 1;  /* 1 == hidden helper task */
+  unsigned reserved : 8;   /* reserved for compiler use */
+
+  /* Library flags */   /* Total library flags must be 16 bits */
+  unsigned tasktype : 1;/* task is either explicit(1) or implicit (0) */
+  unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
+  unsigned tasking_ser : 1; // all tasks in team are either executed immediately
+  // (1) or may be deferred (0)
+  unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
+  // (0) [>= 2 threads]
+  /* If either team_serial or tasking_ser is set, task team may be NULL */
+  /* Task State Flags: */
+  unsigned started : 1;/* 1==started, 0==not started */
+  unsigned executing : 1;  /* 1==executing, 0==not executing */
+  unsigned complete : 1;   /* 1==complete, 0==not complete   */
+  unsigned freed : 1;  /* 1==freed, 0==allocated*/
+  unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
+  unsigned reserved31 : 7; /* reserved for library use */
+} kmp_tasking_flags_t;
+
 // Compiler sends us this info:
 typedef struct kmp_depend_info {
   kmp_intptr_t base_addr;
@@ -117,6 +161,88 @@
   kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
   kmp_depend_info_t *noalias_dep_list)
 __attribute__((weak));
+
+kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+  kmp_int32 flags, size_t sizeof_kmp_task_t,
+  size_t sizeof_shareds,
+  kmp_routine_entry_t task_entry)
+__attribute__((weak));
+
+kmp_task_t *
+__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+ size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+ kmp_routine_entry_t task_entry,
+ kmp_int64 device_id) __attribute__((weak));
+
+kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
+kmp_task_t *new_task, kmp_int32 ndeps,
+kmp_depend_info_t *dep_list,
+kmp_int32 ndeps_noalias,
+kmp_depend_info_t *noalias_dep_list)
+__attribute__((weak));
+
+/**
+ * The argument set that is passed from asynchronous memory copy to block
+ * version of memory copy invoked in helper task
+ */
+struct TargetMemcpyArgsTy {
+  /**
+   * Common attributes
+   */
+  void *Dst;
+  const void *Src;
+  int DstDevice;
+  int SrcDevice;
+
+  /**
+   * The flag that denotes single dimensional or rectangle dimensional copy
+   */
+  bool IsRectMemcpy;
+
+  /**
+   * Arguments for single dimensional copy
+   */
+  size_t Length;
+  size_t D

[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-21 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 updated this revision to Diff 469845.
jz10 added a comment.

Thanks Johannes 
I addressed those issues; please check whether the changes work.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

Files:
  openmp/libomptarget/include/interop.h
  openmp/libomptarget/src/api.cpp
  openmp/libomptarget/src/exports
  openmp/libomptarget/src/private.h

Index: openmp/libomptarget/src/private.h
===
--- openmp/libomptarget/src/private.h
+++ openmp/libomptarget/src/private.h
@@ -98,7 +98,51 @@
  * We maintain the same data structure for compatibility.
  */
 typedef int kmp_int32;
+typedef int64_t kmp_int64;
 typedef intptr_t kmp_intptr_t;
+
+typedef void *omp_depend_t;
+struct kmp_task;
+typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, struct kmp_task *);
+typedef struct kmp_task {
+  void *shareds;
+  kmp_routine_entry_t routine;
+  kmp_int32 part_id;
+} kmp_task_t;
+
+typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
+  /* Compiler flags */ /* Total compiler flags must be 16 bits */
+  unsigned tiedness : 1;   /* task is either tied (1) or untied (0) */
+  unsigned final : 1;  /* task is final(1) so execute immediately */
+  unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0
+  code path */
+  unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to
+ invoke destructors from the runtime */
+  unsigned proxy : 1; /* task is a proxy task (it will be executed outside the
+ context of the RTL) */
+  unsigned priority_specified : 1; /* set if the compiler provides priority
+  setting for the task */
+  unsigned detachable : 1; /* 1 == can detach */
+  unsigned hidden_helper : 1;  /* 1 == hidden helper task */
+  unsigned reserved : 8;   /* reserved for compiler use */
+
+  /* Library flags */   /* Total library flags must be 16 bits */
+  unsigned tasktype : 1;/* task is either explicit(1) or implicit (0) */
+  unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
+  unsigned tasking_ser : 1; // all tasks in team are either executed immediately
+  // (1) or may be deferred (0)
+  unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
+  // (0) [>= 2 threads]
+  /* If either team_serial or tasking_ser is set, task team may be NULL */
+  /* Task State Flags: */
+  unsigned started : 1;/* 1==started, 0==not started */
+  unsigned executing : 1;  /* 1==executing, 0==not executing */
+  unsigned complete : 1;   /* 1==complete, 0==not complete   */
+  unsigned freed : 1;  /* 1==freed, 0==allocated*/
+  unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
+  unsigned reserved31 : 7; /* reserved for library use */
+} kmp_tasking_flags_t;
+
 // Compiler sends us this info:
 typedef struct kmp_depend_info {
   kmp_intptr_t base_addr;
@@ -117,6 +161,88 @@
   kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
   kmp_depend_info_t *noalias_dep_list)
 __attribute__((weak));
+
+kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+  kmp_int32 flags, size_t sizeof_kmp_task_t,
+  size_t sizeof_shareds,
+  kmp_routine_entry_t task_entry)
+__attribute__((weak));
+
+kmp_task_t *
+__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+ size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+ kmp_routine_entry_t task_entry,
+ kmp_int64 device_id) __attribute__((weak));
+
+kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
+kmp_task_t *new_task, kmp_int32 ndeps,
+kmp_depend_info_t *dep_list,
+kmp_int32 ndeps_noalias,
+kmp_depend_info_t *noalias_dep_list)
+__attribute__((weak));
+
+/**
+ * The argument set that is passed from asynchronous memory copy to block
+ * version of memory copy invoked in helper task
+ */
+struct TargetMemcpyArgsTy {
+  /**
+   * Common attributes
+   */
+  void *Dst;
+  const void *Src;
+  int DstDevice;
+  int SrcDevice;
+
+  /**
+   * The flag that denotes single dimensional or rectangle dimensional copy
+   */
+  bool IsRectMemcpy;
+
+  /**
+   * Arguments for single dimensional copy
+   */
+  size_t Length;
+  size_t DstOffset;
+  size_t SrcOffset;
+
+  /**
+   * Arguments for rectangle dimensional copy
+   */
+  size_t ElementSize;
+  int NumDims;
+  const size_t *Volume;
+  const size_t *DstOffsets;
+  const size_t *SrcOffsets;
+  const size_t *Dst

[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-22 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 added a comment.

Regarding this, can we just move those two helper functions to private.h?


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-24 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 added a comment.

I checked through private.h; this header already serves as the place that
contains all kmp- and kmpc-related data structures and APIs, so do we still
need to split out a separate header file?


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-24 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 added a comment.

Sure, where should I add those tests?


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-24 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 updated this revision to Diff 470339.
jz10 added a comment.

Thanks Johannes and Shilei

I added a few test cases for the asynchronous routines in the test/api folder.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

Files:
  openmp/libomptarget/include/interop.h
  openmp/libomptarget/src/api.cpp
  openmp/libomptarget/src/exports
  openmp/libomptarget/src/private.h
  openmp/libomptarget/test/api/omp_target_memcpy_async1.c
  openmp/libomptarget/test/api/omp_target_memcpy_async2.c
  openmp/libomptarget/test/api/omp_target_memcpy_async3.c
  openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c
  openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c
  openmp/libomptarget/test/api/omp_target_memcpy_rect_async3.c

Index: openmp/libomptarget/test/api/omp_target_memcpy_rect_async3.c
===
--- /dev/null
+++ openmp/libomptarget/test/api/omp_target_memcpy_rect_async3.c
@@ -0,0 +1,250 @@
+#include 
+#include 
+#include 
+
+#define NUM_DIMS 3
+#define RECT_WIDTH 4
+
+#define VERBOSE_TIME
+
+int num_timers;
+double *times;
+void timer_set(const char *name, int n) {
+  printf("%s:\n", name);
+  num_timers = n;
+  times = (double *) calloc(n, sizeof(double));
+}
+
+void timer_start(int i) {
+  if (i < 0 || i > num_timers)
+abort();
+  times[i] = omp_get_wtime();
+}
+
+void timer_end(int i) {
+  if (i < 0 || i > num_timers)
+abort();
+  double t = omp_get_wtime() - times[i];
+  times[i] = t;
+  printf(" - Round %d: %f[sec]\n", i, t);
+}
+
+void timer_summarize() {
+  double min = -1;
+  double max = 0;
+  double total = 0;
+
+  for (int i = 0; i < num_timers; i++) {
+double t = times[i];
+if (min < 0 || t < min)
+  min = t;
+if (t > max)
+  max = t;
+total += t;
+  }
+  double avg = total / num_timers;
+  printf(" - Summary: min = %f[sec], max = %f[sec], avg = %f[sec]\n", min, max, avg);
+}
+
+void init(int *p, size_t size) {
+  for (int i = 0; i < size; i++)
+p[i] = i;
+}
+
+void local(int *q, size_t size) {
+  for (int i = 0; i < size; i++) {
+int i2 = ((int) (size * 1.5)) % size;
+int tmp = q[i];
+q[i] = q[i2];
+q[i2] = tmp;
+  }
+}
+
+void kernel_sync(int d, int id, size_t size1, size_t size2, int round,
+		 int *p1, int *p2, void *p_dev, int *q,
+		 size_t* volumes, size_t* dst_offsets, size_t* src_offsets,
+		 size_t* dst_dimensions, size_t* src_dimensions) {
+  init(p1, size1 * size1 * size1);
+  init(q, size2);
+  timer_set("kernel_sync", round);
+
+  for (int r = 0; r < round; r++) {
+timer_start(r);
+#ifdef VERBOSE_TIME
+double t1, t2, t3;
+t1 = omp_get_wtime();
+#endif
+
+// Ping-pong memcpy (assume offloading GPU task and retrieving result)
+omp_target_memcpy_rect(p_dev, p1, size1 * sizeof(int), NUM_DIMS, volumes, dst_offsets, src_offsets,
+			   dst_dimensions, src_dimensions, d, id);
+omp_target_memcpy_rect(p2, p_dev, size1 * sizeof(int), NUM_DIMS, volumes, src_offsets, dst_offsets, 
+   src_dimensions, dst_dimensions, id, d);
+
+#ifdef VERBOSE_TIME
+t2 = omp_get_wtime();
+#endif
+
+// Local task
+local(q, size2);
+#ifdef VERBOSE_TIME
+t3 = omp_get_wtime();
+#endif
+
+timer_end(r);
+#ifdef VERBOSE_TIME
+printf("  -- Invoking memcpy: %f[sec]\n", t2-t1);
+printf("  -- Local work: %f[sec]\n", t3-t2);
+#endif
+  }
+  timer_summarize();
+}
+
+void kernel_async(int d, int id, size_t size1, size_t size2, int round,
+		  int *p1, int *p2, void *p_dev, int *q,
+		  size_t* volumes, size_t* dst_offsets, size_t* src_offsets,
+		  size_t* dst_dimensions, size_t* src_dimensions) {
+  init(p1, size1 * size1 * size1);
+  init(q, size2);
+  timer_set("kernel_async", round);
+
+  for (int r = 0; r < round; r++) {
+timer_start(r);
+#ifdef VERBOSE_TIME
+double t1, t2, t3, t4, t5;
+t1 = omp_get_wtime();
+#endif
+
+{
+#ifdef VERBOSE_TIME
+  t2 = omp_get_wtime();
+#endif
+
+  // Ping-pong memcpy (assume offloading GPU task and retrieving result)
+  omp_depend_t obj1[1], obj2[1];
+#pragma omp depobj(obj1[0]) depend(out: p_dev)
+  omp_target_memcpy_rect_async(p_dev, p1, sizeof(int), NUM_DIMS, volumes,
+   dst_offsets, src_offsets, dst_dimensions, src_dimensions,
+   d, id, 1, obj1);
+#pragma omp depobj(obj2[0]) depend(in: p_dev)
+  omp_target_memcpy_rect_async(p2, p_dev, sizeof(int), NUM_DIMS, volumes,
+   src_offsets, dst_offsets, src_dimensions, dst_dimensions,
+   id, d, 1, obj2);
+#ifdef VERBOSE_TIME
+  t3 = omp_get_wtime();
+#endif
+
+  // Local task
+  local(q, size2);
+#ifdef VERBOSE_TIME
+  t4 = omp_get_wtime();
+#endif
+
+#pragma omp taskwait
+}
+#ifdef VERBOSE_TIME
+t5 = omp_get_wtime();
+#endif
+
+timer_end(r);
+#ifdef VERBOSE_TIME
+printf("  -- Starting parallel region: %f[sec]\n", t2-t1);
+printf("  -- Invoking memcpy async: %f[sec]\n", t3-t2);
+printf("  -- Local work: %f[sec]

[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-24 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 updated this revision to Diff 470354.
jz10 added a comment.

Thanks Shilei

Added a RUN line for each test case.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

Files:
  openmp/libomptarget/include/interop.h
  openmp/libomptarget/src/api.cpp
  openmp/libomptarget/src/exports
  openmp/libomptarget/src/private.h
  openmp/libomptarget/test/api/omp_target_memcpy_async1.c
  openmp/libomptarget/test/api/omp_target_memcpy_async2.c
  openmp/libomptarget/test/api/omp_target_memcpy_async3.c
  openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c
  openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c
  openmp/libomptarget/test/api/omp_target_memcpy_rect_async3.c

Index: openmp/libomptarget/test/api/omp_target_memcpy_rect_async3.c
===
--- /dev/null
+++ openmp/libomptarget/test/api/omp_target_memcpy_rect_async3.c
@@ -0,0 +1,253 @@
+// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda
+// REQUIRES: nvptx64-nvidia-cuda
+
+#include 
+#include 
+#include 
+
+#define NUM_DIMS 3
+#define RECT_WIDTH 4
+
+#define VERBOSE_TIME
+
+int num_timers;
+double *times;
+void timer_set(const char *name, int n) {
+  printf("%s:\n", name);
+  num_timers = n;
+  times = (double *) calloc(n, sizeof(double));
+}
+
+void timer_start(int i) {
+  if (i < 0 || i > num_timers)
+abort();
+  times[i] = omp_get_wtime();
+}
+
+void timer_end(int i) {
+  if (i < 0 || i > num_timers)
+abort();
+  double t = omp_get_wtime() - times[i];
+  times[i] = t;
+  printf(" - Round %d: %f[sec]\n", i, t);
+}
+
+void timer_summarize() {
+  double min = -1;
+  double max = 0;
+  double total = 0;
+
+  for (int i = 0; i < num_timers; i++) {
+double t = times[i];
+if (min < 0 || t < min)
+  min = t;
+if (t > max)
+  max = t;
+total += t;
+  }
+  double avg = total / num_timers;
+  printf(" - Summary: min = %f[sec], max = %f[sec], avg = %f[sec]\n", min, max, avg);
+}
+
+void init(int *p, size_t size) {
+  for (int i = 0; i < size; i++)
+p[i] = i;
+}
+
+void local(int *q, size_t size) {
+  for (int i = 0; i < size; i++) {
+int i2 = ((int) (size * 1.5)) % size;
+int tmp = q[i];
+q[i] = q[i2];
+q[i2] = tmp;
+  }
+}
+
+void kernel_sync(int d, int id, size_t size1, size_t size2, int round,
+		 int *p1, int *p2, void *p_dev, int *q,
+		 size_t* volumes, size_t* dst_offsets, size_t* src_offsets,
+		 size_t* dst_dimensions, size_t* src_dimensions) {
+  init(p1, size1 * size1 * size1);
+  init(q, size2);
+  timer_set("kernel_sync", round);
+
+  for (int r = 0; r < round; r++) {
+timer_start(r);
+#ifdef VERBOSE_TIME
+double t1, t2, t3;
+t1 = omp_get_wtime();
+#endif
+
+// Ping-pong memcpy (assume offloading GPU task and retrieving result)
+omp_target_memcpy_rect(p_dev, p1, size1 * sizeof(int), NUM_DIMS, volumes, dst_offsets, src_offsets,
+			   dst_dimensions, src_dimensions, d, id);
+omp_target_memcpy_rect(p2, p_dev, size1 * sizeof(int), NUM_DIMS, volumes, src_offsets, dst_offsets, 
+   src_dimensions, dst_dimensions, id, d);
+
+#ifdef VERBOSE_TIME
+t2 = omp_get_wtime();
+#endif
+
+// Local task
+local(q, size2);
+#ifdef VERBOSE_TIME
+t3 = omp_get_wtime();
+#endif
+
+timer_end(r);
+#ifdef VERBOSE_TIME
+printf("  -- Invoking memcpy: %f[sec]\n", t2-t1);
+printf("  -- Local work: %f[sec]\n", t3-t2);
+#endif
+  }
+  timer_summarize();
+}
+
+void kernel_async(int d, int id, size_t size1, size_t size2, int round,
+		  int *p1, int *p2, void *p_dev, int *q,
+		  size_t* volumes, size_t* dst_offsets, size_t* src_offsets,
+		  size_t* dst_dimensions, size_t* src_dimensions) {
+  init(p1, size1 * size1 * size1);
+  init(q, size2);
+  timer_set("kernel_async", round);
+
+  for (int r = 0; r < round; r++) {
+timer_start(r);
+#ifdef VERBOSE_TIME
+double t1, t2, t3, t4, t5;
+t1 = omp_get_wtime();
+#endif
+
+{
+#ifdef VERBOSE_TIME
+  t2 = omp_get_wtime();
+#endif
+
+  // Ping-pong memcpy (assume offloading GPU task and retrieving result)
+  omp_depend_t obj1[1], obj2[1];
+#pragma omp depobj(obj1[0]) depend(out: p_dev)
+  omp_target_memcpy_rect_async(p_dev, p1, sizeof(int), NUM_DIMS, volumes,
+   dst_offsets, src_offsets, dst_dimensions, src_dimensions,
+   d, id, 1, obj1);
+#pragma omp depobj(obj2[0]) depend(in: p_dev)
+  omp_target_memcpy_rect_async(p2, p_dev, sizeof(int), NUM_DIMS, volumes,
+   src_offsets, dst_offsets, src_dimensions, dst_dimensions,
+   id, d, 1, obj2);
+#ifdef VERBOSE_TIME
+  t3 = omp_get_wtime();
+#endif
+
+  // Local task
+  local(q, size2);
+#ifdef VERBOSE_TIME
+  t4 = omp_get_wtime();
+#endif
+
+#pragma omp taskwait
+}
+#ifdef VERBOSE_TIME
+t5 = omp_get_wtime();
+#endif
+
+timer_end(r);
+#ifdef VERBOSE_TIME
+printf("  -- Starting parallel region: %f[sec]\n", t2-t1);
+printf("  -- Invoking memcpy async: %f[

[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-24 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 updated this revision to Diff 470357.
jz10 added a comment.

Thanks Shilei

1. "Does it work on AMDGPU and other targets? Why does it require Nvidia here?"

No, I have just removed this line.

2. "We don't have any performance test cases yet, and I'm not sure we need them 
right now."

Yes, I removed those two performance test cases.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

Files:
  openmp/libomptarget/include/interop.h
  openmp/libomptarget/src/api.cpp
  openmp/libomptarget/src/exports
  openmp/libomptarget/src/private.h
  openmp/libomptarget/test/api/omp_target_memcpy_async1.c
  openmp/libomptarget/test/api/omp_target_memcpy_async2.c
  openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c
  openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c

Index: openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c
===
--- /dev/null
+++ openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c
@@ -0,0 +1,89 @@
+// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda
+
+#include 
+#include 
+
+#define NUM_DIMS 3
+
+int main () {
+  int d = omp_get_default_device ();
+  int id = omp_get_initial_device ();
+  int a[128], b[64], c[128], e[16], q[128], i;
+  void *p;
+  
+  if (d < 0 || d >= omp_get_num_devices ())
+d = id;
+
+  p = omp_target_alloc (130 * sizeof (int), d);
+  if (p == NULL)
+return 0;
+  
+  for (i = 0; i < 128; i++)
+q[i] = 0;
+  if (omp_target_memcpy (p, q, 128 * sizeof (int), 0, 0, d, id) != 0)
+abort ();
+  
+  size_t volume[NUM_DIMS] = { 2, 2, 3 };
+  size_t dst_offsets[NUM_DIMS] = { 0, 0, 0 };
+  size_t src_offsets[NUM_DIMS] = { 0, 0, 0 };
+  size_t dst_dimensions[NUM_DIMS] = { 3, 4, 5 };
+  size_t src_dimensions[NUM_DIMS] = { 2, 3, 4 };
+  
+  for (i = 0; i < 128; i++)
+a[i] = 42;
+  for (i = 0; i < 64; i++)
+b[i] = 24;
+  for (i = 0; i < 128; i++)
+c[i] = 0;
+  for (i = 0; i < 16; i++)
+e[i] = 77;
+
+  omp_depend_t obj[2];
+  
+#pragma omp parallel num_threads(5)
+#pragma omp single
+  {
+#pragma omp task depend (out: p)
+omp_target_memcpy (p, a, 128 * sizeof (int), 0, 0, d, id);
+
+#pragma omp task depend(inout: p)
+omp_target_memcpy (p, b, 64 * sizeof (int), 0, 0, d, id);
+
+#pragma omp task depend(out: c)
+for (i = 0; i < 128; i++)
+  c[i] = i + 1;
+
+#pragma omp depobj(obj[0]) depend(inout: p)
+#pragma omp depobj(obj[1]) depend(in: c)
+
+/*  This produces: 1 2 3 - - 5 6 7 - - at positions 0..9 and
+	13 14 15 - - 17 18 19 - - at positions 20..29.  */
+omp_target_memcpy_rect_async (p, c, sizeof (int), NUM_DIMS, volume,
+  dst_offsets, src_offsets, dst_dimensions,
+  src_dimensions, d, id, 2, obj);
+
+#pragma omp task depend(in: p)
+omp_target_memcpy (p, e, 16 * sizeof (int), 0, 0, d, id);
+  }
+  
+#pragma omp taskwait
+  
+  if (omp_target_memcpy (q, p, 128 * sizeof(int), 0, 0, id, d) != 0)
+abort ();
+  
+  for (i = 0; i < 16; ++i)
+if (q[i] != 77)
+  abort ();
+  if (q[20] != 13 || q[21] != 14 || q[22] != 15 || q[25] != 17 || q[26] != 18
+  || q[27] != 19)
+abort ();
+  for (i = 28; i < 64; ++i)
+if (q[i] != 24)
+  abort ();
+  for (i = 64; i < 128; ++i)
+   if (q[i] != 42)
+ abort ();
+  
+  omp_target_free (p, d);
+  return 0;
+}
Index: openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c
===
--- /dev/null
+++ openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c
@@ -0,0 +1,66 @@
+// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda
+
+#include 
+#include 
+#include 
+
+#define NUM_DIMS 3
+
+int main() {
+  int d = omp_get_default_device ();
+  int id = omp_get_initial_device ();
+  int q[128], q2[128], i;
+  void *p;
+  
+  if (d < 0 || d >= omp_get_num_devices ())
+d = id;
+  
+  p = omp_target_alloc (130 * sizeof (int), d);
+  if (p == NULL)
+return 0;
+
+  if (omp_target_memcpy_rect_async (NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
+NULL, d, id, 0, NULL) < 3
+  || omp_target_memcpy_rect_async (NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
+   NULL, id, d, 0, NULL) < 3
+  || omp_target_memcpy_rect_async (NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
+   NULL, id, id, 0, NULL) < 3)
+abort ();
+ 
+  for (i = 0; i < 128; i++)
+q[i] = 0;
+  if (omp_target_memcpy (p, q, 128 * sizeof (int), 0, 0, d, id) != 0)
+abort ();
+  
+  for (i = 0; i < 128; i++)
+q[i] = i + 1;
+  
+  size_t volume[NUM_DIMS] = { 1, 2, 3 };
+  size_t dst_offsets[NUM_DIMS] = { 0, 0, 0 };
+  size_t src_offsets[NUM_DIMS] = { 0, 0, 0 };
+  size_t dst_dimensions[NUM_DIMS] = { 3, 4, 5 };
+  size_t src_dimensions[NUM_DIMS] = { 2, 3, 4 };
+  
+  if (omp_target_memcpy_rect_async (p, q, sizeof (int), NUM_DIMS, volume,
+dst_offsets, src_offsets, dst_dimensions,
+src_dimensions, d, id, 0, NULL) != 0)
+abort 

[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-25 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 added a subscriber: jdoerfert.
jz10 added a comment.

Hi Johannes and Shilei, is there any further revision that needs to be done on
this patch? Please let me know.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-25 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 updated this revision to Diff 470684.
jz10 added a comment.

Thanks Shilei

1. "Can you check all resolved comments to make sure there is no open comments?"

I checked through the comments you and Johannes made; there are no more open issues.

2. "Since this function is not part of `libomp` and it's not gonna be an 
interface function, no need to name it as `__kmpc`."

Revised the helper functions' names.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D136103/new/

https://reviews.llvm.org/D136103

Files:
  openmp/libomptarget/include/interop.h
  openmp/libomptarget/src/api.cpp
  openmp/libomptarget/src/exports
  openmp/libomptarget/src/private.h
  openmp/libomptarget/test/api/omp_target_memcpy_async1.c
  openmp/libomptarget/test/api/omp_target_memcpy_async2.c
  openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c
  openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c

Index: openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c
===
--- /dev/null
+++ openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c
@@ -0,0 +1,89 @@
+// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda
+
+#include 
+#include 
+
+#define NUM_DIMS 3
+
+int main () {
+  int d = omp_get_default_device ();
+  int id = omp_get_initial_device ();
+  int a[128], b[64], c[128], e[16], q[128], i;
+  void *p;
+  
+  if (d < 0 || d >= omp_get_num_devices ())
+d = id;
+
+  p = omp_target_alloc (130 * sizeof (int), d);
+  if (p == NULL)
+return 0;
+  
+  for (i = 0; i < 128; i++)
+q[i] = 0;
+  if (omp_target_memcpy (p, q, 128 * sizeof (int), 0, 0, d, id) != 0)
+abort ();
+  
+  size_t volume[NUM_DIMS] = { 2, 2, 3 };
+  size_t dst_offsets[NUM_DIMS] = { 0, 0, 0 };
+  size_t src_offsets[NUM_DIMS] = { 0, 0, 0 };
+  size_t dst_dimensions[NUM_DIMS] = { 3, 4, 5 };
+  size_t src_dimensions[NUM_DIMS] = { 2, 3, 4 };
+  
+  for (i = 0; i < 128; i++)
+a[i] = 42;
+  for (i = 0; i < 64; i++)
+b[i] = 24;
+  for (i = 0; i < 128; i++)
+c[i] = 0;
+  for (i = 0; i < 16; i++)
+e[i] = 77;
+
+  omp_depend_t obj[2];
+  
+#pragma omp parallel num_threads(5)
+#pragma omp single
+  {
+#pragma omp task depend (out: p)
+omp_target_memcpy (p, a, 128 * sizeof (int), 0, 0, d, id);
+
+#pragma omp task depend(inout: p)
+omp_target_memcpy (p, b, 64 * sizeof (int), 0, 0, d, id);
+
+#pragma omp task depend(out: c)
+for (i = 0; i < 128; i++)
+  c[i] = i + 1;
+
+#pragma omp depobj(obj[0]) depend(inout: p)
+#pragma omp depobj(obj[1]) depend(in: c)
+
+/*  This produces: 1 2 3 - - 5 6 7 - - at positions 0..9 and
+	13 14 15 - - 17 18 19 - - at positions 20..29.  */
+omp_target_memcpy_rect_async (p, c, sizeof (int), NUM_DIMS, volume,
+  dst_offsets, src_offsets, dst_dimensions,
+  src_dimensions, d, id, 2, obj);
+
+#pragma omp task depend(in: p)
+omp_target_memcpy (p, e, 16 * sizeof (int), 0, 0, d, id);
+  }
+  
+#pragma omp taskwait
+  
+  if (omp_target_memcpy (q, p, 128 * sizeof(int), 0, 0, id, d) != 0)
+abort ();
+  
+  for (i = 0; i < 16; ++i)
+if (q[i] != 77)
+  abort ();
+  if (q[20] != 13 || q[21] != 14 || q[22] != 15 || q[25] != 17 || q[26] != 18
+  || q[27] != 19)
+abort ();
+  for (i = 28; i < 64; ++i)
+if (q[i] != 24)
+  abort ();
+  for (i = 64; i < 128; ++i)
+   if (q[i] != 42)
+ abort ();
+  
+  omp_target_free (p, d);
+  return 0;
+}
Index: openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c
===
--- /dev/null
+++ openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c
@@ -0,0 +1,66 @@
+// RUN: %libomptarget-compile-run-and-check-nvptx64-nvidia-cuda
+
+#include 
+#include 
+#include 
+
+#define NUM_DIMS 3
+
+int main() {
+  int d = omp_get_default_device ();
+  int id = omp_get_initial_device ();
+  int q[128], q2[128], i;
+  void *p;
+  
+  if (d < 0 || d >= omp_get_num_devices ())
+d = id;
+  
+  p = omp_target_alloc (130 * sizeof (int), d);
+  if (p == NULL)
+return 0;
+
+  if (omp_target_memcpy_rect_async (NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
+NULL, d, id, 0, NULL) < 3
+  || omp_target_memcpy_rect_async (NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
+   NULL, id, d, 0, NULL) < 3
+  || omp_target_memcpy_rect_async (NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
+   NULL, id, id, 0, NULL) < 3)
+abort ();
+ 
+  for (i = 0; i < 128; i++)
+q[i] = 0;
+  if (omp_target_memcpy (p, q, 128 * sizeof (int), 0, 0, d, id) != 0)
+abort ();
+  
+  for (i = 0; i < 128; i++)
+q[i] = i + 1;
+  
+  size_t volume[NUM_DIMS] = { 1, 2, 3 };
+  size_t dst_offsets[NUM_DIMS] = { 0, 0, 0 };
+  size_t src_offsets[NUM_DIMS] = { 0, 0, 0 };
+  size_t dst_dimensions[NUM_DIMS] = { 3, 4, 5 };
+  size_t src_dimensions[NUM_DIMS] = { 2, 3, 4 };
+  
+  if (omp_target_memcpy_rect_async (p, q, sizeof (int), NUM_DIMS, volume,
+dst_offsets, src_offsets, dst_dimensions,

[PATCH] D136103: OpenMP asynchronous memory copy support

2022-10-17 Thread Jisheng Zhao via Phabricator via cfe-commits
jz10 created this revision.
jz10 added a reviewer: jdoerfert.
Herald added subscribers: guansong, yaxunl.
Herald added a project: All.
jz10 requested review of this revision.
Herald added subscribers: openmp-commits, cfe-commits, sstefan1.
Herald added projects: clang, OpenMP.

This patch implements the asynchronous memory-copy routines with depend 
objects that are specified in Version 5.1 of the OpenMP Application Programming 
Interface. In brief, the routines omp_target_memcpy_async and 
omp_target_memcpy_rect_async perform asynchronous (nonblocking) memory copies 
between any combination of host and device pointers. The basic idea is to 
create implicit tasks that carry the memory copy calls and handle the 
dependencies specified by the depend objects. The implicit tasks are executed 
via a hidden helper thread in the OpenMP runtime.



Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D136103

Files:
  clang/docs/ReleaseNotes.rst
  openmp/libomptarget/src/api.cpp
  openmp/libomptarget/src/exports
  openmp/libomptarget/src/private.h

Index: openmp/libomptarget/src/private.h
===
--- openmp/libomptarget/src/private.h
+++ openmp/libomptarget/src/private.h
@@ -98,7 +98,47 @@
  * We maintain the same data structure for compatibility.
  */
 typedef int kmp_int32;
+typedef int64_t kmp_int64;
 typedef intptr_t kmp_intptr_t;
+
+typedef void * omp_depend_t;
+struct kmp_task;
+typedef kmp_int32 (* kmp_routine_entry_t)( kmp_int32, struct kmp_task * );
+typedef struct kmp_task { /* task descriptor; field meanings below are inferred from names - TODO confirm against the OpenMP runtime */
+  void *  shareds; /* presumably a pointer to the data shared with the task - verify */
+  kmp_routine_entry_t routine; /* task entry point; takes (kmp_int32, struct kmp_task *) and returns kmp_int32 */
+  kmp_int32   part_id; /* NOTE(review): looks like a task-part identifier - confirm */
+} kmp_task_t;
+
+typedef struct kmp_tasking_flags { /* Task flag bit-fields. Total struct must be exactly 32 bits */
+  /* Compiler flags */ /* Total compiler flags must be 16 bits (8 one-bit flags + 8 reserved) */
+  unsigned tiedness : 1; /* task is either tied (1) or untied (0) */
+  unsigned final : 1; /* task is final (1) so execute immediately */
+  unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0 code path */
+  unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to invoke destructors from the runtime */
+  unsigned proxy : 1; /* task is a proxy task (it will be executed outside the context of the RTL) */
+  unsigned priority_specified : 1; /* set if the compiler provides priority setting for the task */
+  unsigned detachable : 1; /* 1 == can detach */
+  unsigned hidden_helper : 1; /* 1 == hidden helper task */
+  unsigned reserved : 8; /* reserved for compiler use */
+
+  /* Library flags */ /* Total library flags must be 16 bits (9 one-bit flags + 7 reserved) */
+  unsigned tasktype : 1; /* task is either explicit (1) or implicit (0) */
+  unsigned task_serial : 1; /* task is executed immediately (1) or deferred (0) */
+  unsigned tasking_ser : 1; /* all tasks in team are either executed immediately */
+  /* (1) or may be deferred (0) */
+  unsigned team_serial : 1; /* entire team is serial (1) [1 thread] or parallel */
+  /* (0) [>= 2 threads] */
+  /* If either team_serial or tasking_ser is set, task team may be NULL */
+  /* Task State Flags: */
+  unsigned started : 1; /* 1==started, 0==not started */
+  unsigned executing : 1; /* 1==executing, 0==not executing */
+  unsigned complete : 1; /* 1==complete, 0==not complete */
+  unsigned freed : 1; /* 1==freed, 0==allocated */
+  unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
+  unsigned reserved31 : 7; /* reserved for library use */
+} kmp_tasking_flags_t;
+  
 // Compiler sends us this info:
 typedef struct kmp_depend_info {
   kmp_intptr_t base_addr;
@@ -117,6 +157,96 @@
   kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias,
   kmp_depend_info_t *noalias_dep_list)
 __attribute__((weak));
+
+kmp_task_t* __kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+  kmp_routine_entry_t task_entry)
+  __attribute__((weak));
+
+kmp_task_t* __kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+ size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+ kmp_routine_entry_t task_entry, kmp_int64 device_id)
+  __attribute__((weak));
+
+void __kmpc_proxy_task_completed_ooo (kmp_task_t *ptask) __attribute__((weak));
+kmp_int32 __kmpc_omp_task_with_deps (ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task,
+ kmp_int32 ndeps, kmp_depend_info_t *dep_list,
+ kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list)
+  __attribute__((weak));
+
+class TargetMemcpyArgsTy {
+public:
+  TargetMemcpyArgsTy(void *Dst_, const void *Src_, size_t Length_,
+ size_t DstOffset_, size_t SrcOffset_, int DstDevice_, int SrcDevice_,
+ int Depobj_count, omp_depend_t* Depobj