jdoerfert created this revision.
jdoerfert added reviewers: ABataev, arpith-jacob, guraypp, gtbercea, hfinkel.
Herald added a project: OpenMP.

This patch introduces an alternative OpenMP GPU kernel offloading
interface called target kernel region (or TRegion).

The commit includes the device runtime library (deviceRTL) implementation for
the NVPTX target, implemented mostly in terms of existing functionality.

The interface is deliberately simple so that it is easily analyzable in the
middle end. Design decisions include:

- Hide all (complex) implementation choices in the runtime library, but allow
  complete removal of the abstraction once the runtime is inlined.
- Provide all runtime calls with sufficient, easily encoded information.
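
To illustrate the second point, the sketch below shows how a parallel region
with one shared and one firstprivate variable could be lowered onto this
interface. The structure layouts, names, and flag values are made up for
illustration and are not part of this patch:

  // Hypothetical compiler-generated device code (illustration only).
  struct example_shared_ty { int *A; };   // shared(A)
  struct example_private_ty { int X; };   // firstprivate(X)

  static void example_parallel_work_fn(char *SharedVars, char *PrivateVars) {
    example_shared_ty *S = (example_shared_ty *)SharedVars;
    example_private_ty *P = (example_private_ty *)PrivateVars;
    // ... body of the parallel region, using S->A and P->X ...
  }

  // At the point of the parallel construct (non-SPMD mode, runtime required):
  example_shared_ty Shared = {A};
  example_private_ty Private = {X};
  __kmpc_target_region_kernel_parallel(
      /* UseSPMDMode */ false, /* RequiredOMPRuntime */ true,
      example_parallel_work_fn, (char *)&Shared, sizeof(Shared),
      (char *)&Private, sizeof(Private), /* SharedMemPointers */ false);

Because all implementation choices are hidden behind the call, the middle end
only needs to reason about this single call site and the work function; once
the runtime is inlined, the abstraction can be removed entirely.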


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D59319

Files:
  openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
  openmp/libomptarget/deviceRTLs/common/target_region.h
  openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
  openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
  openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
  openmp/libomptarget/deviceRTLs/nvptx/src/target_region.cu

Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_region.cu
===================================================================
--- /dev/null
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_region.cu
@@ -0,0 +1,197 @@
+//===-- target_region.cu ---- CUDA impl. of the target region interface -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the common target region interface.
+//
+//===----------------------------------------------------------------------===//
+
+// Include the native definitions first as certain defines might be needed in
+// the common interface definition below.
+#include "omptarget-nvptx.h"
+#include "interface.h"
+
+#include "../../common/target_region.h"
+
+/// The buffer used to share memory among the threads in a team.
+extern __device__ __shared__ target_region_shared_buffer
+    _target_region_shared_memory;
+
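+/// Return the beginning of the memory shared among the team threads.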
+EXTERN char *__kmpc_target_region_kernel_get_shared_memory() {
+  return _target_region_shared_memory.begin();
+}
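+
+/// Return the beginning of the private (firstprivate) part of the shared
+/// buffer, which is placed directly after the shared variables.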
+EXTERN char *__kmpc_target_region_kernel_get_private_memory() {
+  return _target_region_shared_memory.begin() +
+         _target_region_shared_memory.get_offset();
+}
+
+/// Simple generic state machine for worker threads.
+INLINE static void
+__kmpc_target_region_state_machine(bool IsOMPRuntimeInitialized) {
+
+  do {
+    void *WorkFn = 0;
+
+    // Wait for the signal that we have a new work function.
+    __kmpc_barrier_simple_spmd(NULL, 0);
+
+    // Retrieve the work function from the runtime.
+    bool IsActive = __kmpc_kernel_parallel(&WorkFn, IsOMPRuntimeInitialized);
+
+    // If there is nothing more to do, break out of the state machine by
+    // returning to the caller.
+    if (!WorkFn)
+      return;
+
+    if (IsActive) {
+      char *SharedVars = __kmpc_target_region_kernel_get_shared_memory();
+      char *PrivateVars = __kmpc_target_region_kernel_get_private_memory();
+
+      ((ParallelWorkFnTy)WorkFn)(SharedVars, PrivateVars);
+
+      __kmpc_kernel_end_parallel();
+    }
+
+    __kmpc_barrier_simple_spmd(NULL, 0);
+
+  } while (true);
+}
+
+/// Filter threads into masters and workers. If \p UseStateMachine is true,
+/// required workers will enter a state machine and be trapped there.
+/// Master and surplus worker threads will return from this function immediately
+/// while required workers will only return once there is no more work. The
+/// return value indicates if the thread is a master (1), a surplus worker (0),
+/// or a finished required worker released from the state machine (-1).
+INLINE static int8_t
+__kmpc_target_region_thread_filter(unsigned ThreadLimit, bool UseStateMachine,
+                                   bool IsOMPRuntimeInitialized) {
+
+  unsigned TId = GetThreadIdInBlock();
+  bool IsWorker = TId < ThreadLimit;
+
+  if (IsWorker) {
+    if (UseStateMachine)
+      __kmpc_target_region_state_machine(IsOMPRuntimeInitialized);
+    return -1;
+  }
+
+  return TId == GetMasterThreadID();
+}
+
+EXTERN int8_t __kmpc_target_region_kernel_init(bool UseSPMDMode,
+                                               bool RequiresOMPRuntime,
+                                               bool UseStateMachine,
+                                               bool RequiresDataSharing) {
+  unsigned NumThreads = GetNumberOfThreadsInBlock();
+
+  // Handle the SPMD case first.
+  if (UseSPMDMode) {
+
+    __kmpc_spmd_kernel_init(NumThreads, RequiresOMPRuntime,
+                            RequiresDataSharing);
+
+    if (RequiresDataSharing)
+      __kmpc_data_sharing_init_stack_spmd();
+
+    return 1;
+  }
+
+  // Reserve one WARP in non-SPMD mode for the masters.
+  unsigned ThreadLimit = NumThreads - WARPSIZE;
+  int8_t FilterVal = __kmpc_target_region_thread_filter(
+      ThreadLimit, UseStateMachine, RequiresOMPRuntime);
+
+  // If the filter returns 1 the executing thread is a team master which will
+  // initialize the kernel in the following.
+  if (FilterVal == 1) {
+    __kmpc_kernel_init(ThreadLimit, RequiresOMPRuntime);
+    __kmpc_data_sharing_init_stack();
+    _target_region_shared_memory.init();
+  }
+
+  return FilterVal;
+}
+
+EXTERN void __kmpc_target_region_kernel_deinit(bool UseSPMDMode,
+                                               bool RequiredOMPRuntime) {
+  // Handle the SPMD case first.
+  if (UseSPMDMode) {
+    __kmpc_spmd_kernel_deinit_v2(RequiredOMPRuntime);
+    return;
+  }
+
+  __kmpc_kernel_deinit(RequiredOMPRuntime);
+
+  // Barrier to terminate worker threads.
+  __kmpc_barrier_simple_spmd(NULL, 0);
+
+  // Release any dynamically allocated memory used for sharing.
+  _target_region_shared_memory.release();
+}
+
+EXTERN void __kmpc_target_region_kernel_parallel(
+    bool UseSPMDMode, bool RequiredOMPRuntime, ParallelWorkFnTy ParallelWorkFn,
+    char *SharedVars, uint16_t SharedVarsBytes, char *PrivateVars,
+    uint16_t PrivateVarsBytes, bool SharedMemPointers) {
+
+  if (UseSPMDMode) {
+    ParallelWorkFn(SharedVars, PrivateVars);
+    return;
+  }
+
+  if (SharedMemPointers) {
+    // If shared memory pointers are used, the user guarantees that private
+    // variables, if any, are stored directly after the shared ones in memory
+    // and that this memory can be accessed by all threads. In that case, we
+    // do not need to copy memory around but can simply use the provided
+    // locations.
+
+    _target_region_shared_memory.set(SharedVars, SharedVarsBytes);
+
+  } else {
+
+    size_t BytesToCopy = SharedVarsBytes + PrivateVarsBytes;
+    if (BytesToCopy) {
+      // Resize the shared memory to be able to hold the data which is required
+      // to be in shared memory. Also set the offset to the beginning of the
+      // private variables.
+      _target_region_shared_memory.resize(BytesToCopy, SharedVarsBytes);
+
+      // Copy the shared and private variables into shared memory.
+      char *SVMemory = __kmpc_target_region_kernel_get_shared_memory();
+      char *PVMemory = __kmpc_target_region_kernel_get_private_memory();
+      memcpy(SVMemory, SharedVars, SharedVarsBytes);
+      memcpy(PVMemory, PrivateVars, PrivateVarsBytes);
+    }
+  }
+
+  // TODO: It seems we could store the work function in the same shared space
+  // as the rest of the variables above.
+  //
+  // Initialize the parallel work, e.g., make sure the work function is known.
+  __kmpc_kernel_prepare_parallel((void *)ParallelWorkFn, RequiredOMPRuntime);
+
+  // TODO: It is odd that we call the *_spmd version in non-SPMD mode here.
+  //
+  // Activate workers. This barrier is used by the master to signal
+  // work for the workers.
+  __kmpc_barrier_simple_spmd(NULL, 0);
+
+  // OpenMP [2.5, Parallel Construct, p.49]
+  // There is an implied barrier at the end of a parallel region. After the
+  // end of a parallel region, only the master thread of the team resumes
+  // execution of the enclosing task region.
+  //
+  // The master waits at this barrier until all workers are done.
+  __kmpc_barrier_simple_spmd(NULL, 0);
+
+  // Copy the (possibly modified) shared variables back to their original
+  // location if they were copied into the team-shared buffer above.
+  if (!SharedMemPointers && SharedVarsBytes)
+    memcpy(SharedVars, __kmpc_target_region_kernel_get_shared_memory(),
+           SharedVarsBytes);
+}
Index: openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -101,6 +101,66 @@
   uint32_t nArgs;
 };
 
+/// Helper structure to manage the memory shared by the threads in a team.
+///
+/// Note: Only the team master is allowed to call non-const functions!
+struct target_region_shared_buffer {
+#define PRE_SHARED_BYTES 128
+
+  INLINE void init() {
+    _ptr = &_data[0];
+    _size = PRE_SHARED_BYTES;
+    _offset = 0;
+  }
+
+  /// Release any dynamically allocated memory.
+  INLINE void release() {
+    if (_size == PRE_SHARED_BYTES)
+      return;
+    SafeFree(_ptr, (char *)"free shared dynamic buffer");
+    init();
+  }
+
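+  /// Use the externally provided memory at \p ptr as the shared buffer. The
+  /// private part starts at byte \p offset; no data is copied by this call.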
+  INLINE void set(char *ptr, size_t offset) {
+    release();
+    _ptr = ptr;
+    _offset = offset;
+  }
+
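+  /// Ensure the buffer can hold at least \p size bytes and record that the
+  /// private part starts at byte \p offset. Falls back to dynamic allocation
+  /// if the pre-allocated storage is too small; old contents are not kept.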
+  INLINE void resize(size_t size, size_t offset) {
+    _offset = offset;
+
+    if (size <= _size)
+      return;
+
+    if (_size != PRE_SHARED_BYTES)
+      SafeFree(_ptr, (char *)"free shared dynamic buffer");
+
+    _size = size;
+    _ptr = (char *)SafeMalloc(_size, (char *)"new shared buffer");
+  }
+
+  // Called by all threads.
+  INLINE char *begin() const { return _ptr; }
+  INLINE size_t size() const { return _size; }
+  INLINE size_t get_offset() const { return _offset; }
+
+private:
+  // Pre-allocated space that holds PRE_SHARED_BYTES many bytes.
+  char _data[PRE_SHARED_BYTES];
+
+  // Pointer to the currently used buffer.
+  char *_ptr;
+
+  // Size of the currently used buffer.
+  uint32_t _size;
+
+  // Offset into the currently used buffer.
+  uint32_t _offset;
+
+#undef PRE_SHARED_BYTES
+};
+
 extern __device__ __shared__ omptarget_nvptx_SharedArgs
     omptarget_nvptx_globalArgs;
 
Index: openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -63,3 +63,9 @@
 // Data sharing related variables.
 ////////////////////////////////////////////////////////////////////////////////
 __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+
+////////////////////////////////////////////////////////////////////////////////
+/// Buffer used to share memory among the threads in a team.
+////////////////////////////////////////////////////////////////////////////////
+__device__ __shared__ target_region_shared_buffer _target_region_shared_memory;
+
Index: openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -53,6 +53,7 @@
       src/reduction.cu
       src/sync.cu
       src/task.cu
+      src/target_region.cu
   )
 
   set(omp_data_objects src/omp_data.cu)
Index: openmp/libomptarget/deviceRTLs/common/target_region.h
===================================================================
--- /dev/null
+++ openmp/libomptarget/deviceRTLs/common/target_region.h
@@ -0,0 +1,167 @@
+//===-- target_region.h --- Target region OpenMP device runtime interface -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The target region interface is a deliberately simple interface that allows
+// middle-end (=LLVM) passes to analyze and transform the code. To achieve good
+// performance it may be necessary to run the associated optimization passes.
+// However, implementations of this interface shall always be correct and as
+// close to the code the user expects as possible.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _DEVICERTL_COMMON_INTERFACES_H_
+#define _DEVICERTL_COMMON_INTERFACES_H_
+
+#ifndef EXTERN
+#define EXTERN
+#endif
+#ifndef CALLBACK
+#define CALLBACK(Callee, Payload0, Payload1)
+#endif
+
+/// The target region _kernel_ interface for GPUs
+///
+/// This deliberately simple interface provides the middle-end (=LLVM) with
+/// easier means to reason about the semantics of the code and to transform it.
+/// The runtime calls are therefore also designed to carry sufficient
+/// information for optimization.
+///
+///
+/// Intended usage:
+///
+/// \code
+/// void kernel(...) {
+///
+///   char ThreadKind = __kmpc_target_region_kernel_init(...);
+///
+///   if (ThreadKind == -1) {               //  actual worker thread
+///     if (!UsedLibraryStateMachine)
+///       user_state_machine();
+///     goto exit;
+///   } else if (ThreadKind == 0) {         // surplus worker thread
+///     goto exit;
+///   } else {                              //    team master thread
+///     goto user_code;
+///   }
+///
+/// user_code:
+///
+///   // User-defined kernel code; parallel regions are replaced by
+///   // __kmpc_target_region_kernel_parallel(...) calls.
+///
+///   // Fallthrough to de-initialization
+///
+/// deinit:
+///   __kmpc_target_region_kernel_deinit(...);
+///
+/// exit:
+///   /* exit the kernel */
+/// }
+/// \endcode
+///
+///
+///{
+
+/// Initialization
+///
+///
+/// In SPMD mode, all threads will execute their respective initialization
+/// routines.
+///
+/// In non-SPMD mode, the team master will invoke the initialization routines
+/// while the remaining threads are considered worker threads. Worker threads
+/// required for this target region will be trapped inside this function if
+/// \p UseStateMachine is true. Otherwise, they will escape with a return value
+/// of -1.
+///
+/// \param UseSPMDMode         Flag to indicate if execution is performed in
+///                            SPMD mode.
+/// \param RequiresOMPRuntime  Flag to indicate if the runtime is required and
+///                            needs to be initialized.
+/// \param UseStateMachine     Flag to indicate if the runtime state machine
+///                            should be used in non-SPMD mode.
+/// \param RequiresDataSharing Flag to indicate if there might be inter-thread
+///                            sharing which needs runtime support.
+///
+/// \return 1, always in SPMD mode, and in non-SPMD mode if the thread is the
+///            team master.
+///         0, in non-SPMD mode and the thread is a surplus worker that should
+///            not execute anything in the target region.
+///        -1, in non-SPMD mode and the thread is a required worker which:
+///             - has finished its work and should be terminated, if
+///               \p UseStateMachine is true.
+///             - has not performed work and should be put in a user-provided
+///               state machine (as defined above), if \p UseStateMachine is
+///               false.
+///
+EXTERN int8_t __kmpc_target_region_kernel_init(bool UseSPMDMode,
+                                               bool RequiresOMPRuntime,
+                                               bool UseStateMachine,
+                                               bool RequiresDataSharing);
+
+/// De-Initialization
+///
+///
+/// In non-SPMD mode, this function releases the workers trapped in a state
+/// machine and also releases any memory dynamically allocated by the runtime.
+///
+/// \param UseSPMDMode        Flag to indicate if execution is performed in
+///                           SPMD mode.
+/// \param RequiredOMPRuntime Flag to indicate if the runtime was required and
+///                           is therefore initialized.
+///
+EXTERN void __kmpc_target_region_kernel_deinit(bool UseSPMDMode,
+                                               bool RequiredOMPRuntime);
+
+/// Generic type of a work function in the target region kernel interface. The
+/// two arguments are pointers to structures that contain the shared and
+/// firstprivate variables, respectively. Since the layout and size are known
+/// at compile time, the front-end is expected to generate appropriate packing
+/// and unpacking code.
+typedef void (*ParallelWorkFnTy)(char * /* SharedValues */,
+                                 char * /* PrivateValues */);
+
+/// Enter a parallel region
+///
+///
+/// The parallel region is defined by \p ParallelWorkFn. The shared variables,
+/// \p SharedValuesBytes bytes in total, start at \p SharedValues. The
+/// firstprivate variables, \p PrivateValuesBytes bytes in total, start at
+/// \p PrivateValues.
+///
+/// In SPMD mode, this function calls \p ParallelWorkFn with \p SharedValues and
+/// \p PrivateValues as arguments before it returns.
+///
+/// In non-SPMD mode, \p ParallelWorkFn, \p SharedValues, and \p PrivateValues
+/// are communicated to the workers before they are released from the state
+/// machine to run the code defined by \p ParallelWorkFn in parallel. This
+/// function will only return after all workers are finished.
+///
+/// \param UseSPMDMode        Flag to indicate if execution is performed in
+///                           SPMD mode.
+/// \param RequiredOMPRuntime Flag to indicate if the runtime was required and
+///                           is therefore initialized.
+/// \param ParallelWorkFn     The outlined code that is executed in parallel by
+///                           the threads in the team.
+/// \param SharedValues       A pointer to the location of all shared values.
+/// \param SharedValuesBytes  The total size of the shared values in bytes.
+/// \param PrivateValues      A pointer to the location of all private values.
+/// \param PrivateValuesBytes The total size of the private values in bytes.
+/// \param SharedMemPointers  Flag to indicate that the pointers
+///                           \p SharedValues and \p PrivateValues point into
+///                           shared memory.
+///                           If this flag is true, it also requires that all
+///                           private values, if any, are stored directly after
+///                           the shared values.
+///
+CALLBACK(ParallelWorkFnTy, SharedValues, PrivateValues)
+EXTERN void __kmpc_target_region_kernel_parallel(
+    bool UseSPMDMode, bool RequiredOMPRuntime, ParallelWorkFnTy ParallelWorkFn,
+    char *SharedValues, uint16_t SharedValuesBytes, char *PrivateValues,
+    uint16_t PrivateValuesBytes, bool SharedMemPointers);
+
+///}
+
+#endif
Index: openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
===================================================================
--- openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
+++ openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
@@ -78,13 +78,14 @@
 
 # These flags are required to emit LLVM Bitcode. We check them together because
 # if any of them are not supported, there is no point in finding out which are.
-set(compiler_flags_required -emit-llvm -O1 --cuda-device-only --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
+set(compiler_flags_required -emit-llvm -std=c++11 -O1 --cuda-device-only --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
 set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }")
 check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required})
 
 # It makes no sense to continue given that the compiler doesn't support
 # emitting basic LLVM Bitcode
 if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED)
+  message(STATUS "Required flags to emit LLVM bitcode are not supported.")
   return()
 endif()
 
@@ -101,6 +102,7 @@
   check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC "${extern_device_shared_src}" ${compiler_flag_fcuda_rdc_full})
 
   if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC)
+    message(STATUS "The -fcuda-rdc flag is not supported.")
     return()
   endif()
 