tianshilei1992 created this revision.
Herald added subscribers: kosarev, ormris, kerbowa, guansong, hiraditya, 
yaxunl, jvesely.
Herald added a project: All.
tianshilei1992 requested review of this revision.
Herald added a reviewer: jdoerfert.
Herald added a reviewer: jdoerfert.
Herald added a reviewer: sstefan1.
Herald added subscribers: llvm-commits, openmp-commits, cfe-commits, sstefan1.
Herald added projects: clang, OpenMP, LLVM.

Still WIP. Not for review now.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D142569

Files:
  clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
  llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
  llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
  llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
  llvm/lib/Transforms/IPO/OpenMPOpt.cpp
  openmp/libomptarget/DeviceRTL/CMakeLists.txt
  openmp/libomptarget/DeviceRTL/include/Configuration.h
  openmp/libomptarget/DeviceRTL/include/Debug.h
  openmp/libomptarget/DeviceRTL/include/Interface.h
  openmp/libomptarget/DeviceRTL/include/State.h
  openmp/libomptarget/DeviceRTL/src/Configuration.cpp
  openmp/libomptarget/DeviceRTL/src/Debug.cpp
  openmp/libomptarget/DeviceRTL/src/Kernel.cpp
  openmp/libomptarget/DeviceRTL/src/Mapping.cpp
  openmp/libomptarget/DeviceRTL/src/State.cpp
  openmp/libomptarget/include/DeviceEnvironment.h
  openmp/libomptarget/include/Environment.h
  openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
  openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
  openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
  openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
  openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
  openmp/libomptarget/plugins/cuda/src/rtl.cpp

Index: openmp/libomptarget/plugins/cuda/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -23,7 +23,7 @@
 #include <vector>
 
 #include "Debug.h"
-#include "DeviceEnvironment.h"
+#include "Environment.h"
 #include "omptarget.h"
 #include "omptargetplugin.h"
 
Index: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -37,7 +37,7 @@
 #include "internal.h"
 #include "rt.h"
 
-#include "DeviceEnvironment.h"
+#include "Environment.h"
 #include "get_elf_mach_gfx_name.h"
 #include "omptargetplugin.h"
 #include "print_tracing.h"
Index: openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -17,7 +17,7 @@
 #include <unordered_map>
 
 #include "Debug.h"
-#include "DeviceEnvironment.h"
+#include "Environment.h"
 #include "GlobalHandler.h"
 #include "PluginInterface.h"
 #include "omptarget.h"
Index: openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -17,7 +17,7 @@
 #include <unordered_map>
 
 #include "Debug.h"
-#include "DeviceEnvironment.h"
+#include "Environment.h"
 #include "GlobalHandler.h"
 #include "PluginInterface.h"
 
Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
===================================================================
--- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -19,7 +19,7 @@
 #include <vector>
 
 #include "Debug.h"
-#include "DeviceEnvironment.h"
+#include "Environment.h"
 #include "GlobalHandler.h"
 #include "JIT.h"
 #include "MemoryManager.h"
Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -21,7 +21,7 @@
 #include <unordered_map>
 
 #include "Debug.h"
-#include "DeviceEnvironment.h"
+#include "Environment.h"
 #include "GlobalHandler.h"
 #include "PluginInterface.h"
 #include "Utilities.h"
@@ -431,7 +431,7 @@
 
   /// Launch the AMDGPU kernel function.
   Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
-                   uint64_t NumBlocks, 
+                   uint64_t NumBlocks,
                    KernelArgsTy &KernelArgs, void *Args,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
 
Index: openmp/libomptarget/include/Environment.h
===================================================================
--- /dev/null
+++ openmp/libomptarget/include/Environment.h
@@ -0,0 +1,53 @@
+//===------------ Environment.h - OpenMP GPU environments --------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Environments shared between host and device.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_ENVIRONMENT_H_
+#define _OMPTARGET_ENVIRONMENT_H_
+
+#ifdef OMPTARGET_DEVICE_RUNTIME
+#include "Types.h"
+#else
+#include "SourceInfo.h"
+
+#include <cstdint>
+
+using IdentTy = ident_t;
+#endif
+
+#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
+
+struct DeviceEnvironmentTy {
+  uint32_t DebugKind;
+  uint32_t NumDevices;
+  uint32_t DeviceNum;
+  uint32_t DynamicMemSize;
+};
+
+// NOTE: Please don't change the order of those members as they are used in the
+// middle end. Always add the new data member at the end.
+struct ConfigurationEnvironmentTy {
+  uint8_t UseGenericStateMachine;
+  uint8_t MayUseNestedParallelism;
+  llvm::omp::OMPTgtExecModeFlags ExecMode;
+};
+
+// NOTE: Please don't change the order of those members as they are used in the
+// middle end. Always add the new data member at the end.
+struct KernelEnvironmentTy {
+  ConfigurationEnvironmentTy Configuration;
+  IdentTy *Ident;
+  /// Current indentation level for the function trace. Only accessed by thread
+  /// 0.
+  uint16_t DebugIndentionLevel;
+};
+
+#endif // _OMPTARGET_ENVIRONMENT_H_
Index: openmp/libomptarget/include/DeviceEnvironment.h
===================================================================
--- openmp/libomptarget/include/DeviceEnvironment.h
+++ /dev/null
@@ -1,25 +0,0 @@
-//===---- device_environment.h - OpenMP GPU device environment ---- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Global device environment
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _OMPTARGET_DEVICE_ENVIRONMENT_H_
-#define _OMPTARGET_DEVICE_ENVIRONMENT_H_
-
-// deviceRTL uses <stdint> and DeviceRTL uses explicit definitions
-
-struct DeviceEnvironmentTy {
-  uint32_t DebugKind;
-  uint32_t NumDevices;
-  uint32_t DeviceNum;
-  uint32_t DynamicMemSize;
-};
-
-#endif
Index: openmp/libomptarget/DeviceRTL/src/State.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/State.cpp
+++ openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -9,8 +9,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "State.h"
-#include "Configuration.h"
 #include "Debug.h"
+#include "Environment.h"
 #include "Interface.h"
 #include "Mapping.h"
 #include "Synchronization.h"
@@ -34,6 +34,9 @@
 extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
 #pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
 
+/// The kernel environment passed to the init method by the compiler.
+static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
+
 namespace {
 
 /// Fallback implementations are missing to trigger a link time error.
@@ -241,15 +244,19 @@
 
 } // namespace
 
-void state::init(bool IsSPMD) {
+void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
   SharedMemorySmartStack.init(IsSPMD);
   if (mapping::isInitialThreadInLevel0(IsSPMD)) {
     TeamState.init(IsSPMD);
-    DebugEntryRAII::init();
     ThreadStates = nullptr;
+    KernelEnvironmentPtr = &KernelEnvironment;
   }
 }
 
+KernelEnvironmentTy &state::getKernelEnvironment() {
+  return *KernelEnvironmentPtr;
+}
+
 void state::enterDataEnvironment(IdentTy *Ident) {
   ASSERT(config::mayUseThreadStates() &&
          "Thread state modified while explicitly disabled!");
Index: openmp/libomptarget/DeviceRTL/src/Mapping.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -10,6 +10,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "Mapping.h"
+#include "Configuration.h"
 #include "Interface.h"
 #include "State.h"
 #include "Types.h"
Index: openmp/libomptarget/DeviceRTL/src/Kernel.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "Debug.h"
+#include "Environment.h"
 #include "Interface.h"
 #include "Mapping.h"
 #include "State.h"
@@ -23,11 +24,12 @@
 
 #pragma omp begin declare target device_type(nohost)
 
-static void inititializeRuntime(bool IsSPMD) {
+static void inititializeRuntime(bool IsSPMD,
+                                KernelEnvironmentTy &KernelEnvironment) {
   // Order is important here.
   synchronize::init(IsSPMD);
   mapping::init(IsSPMD);
-  state::init(IsSPMD);
+  state::init(IsSPMD, KernelEnvironment);
 }
 
 /// Simple generic state machine for worker threads.
@@ -67,16 +69,17 @@
 ///
 /// \param Ident               Source location identification, can be NULL.
 ///
-int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
-                           bool UseGenericStateMachine) {
+int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment) {
   FunctionTracingRAII();
-  const bool IsSPMD =
-      Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
+  ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
+  bool IsSPMD = Configuration.ExecMode &
+                llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
+  bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
   if (IsSPMD) {
-    inititializeRuntime(/* IsSPMD */ true);
+    inititializeRuntime(/* IsSPMD */ true, KernelEnvironment);
     synchronize::threadsAligned();
   } else {
-    inititializeRuntime(/* IsSPMD */ false);
+    inititializeRuntime(/* IsSPMD */ false, KernelEnvironment);
     // No need to wait since only the main threads will execute user
     // code and workers will run into a barrier right away.
   }
@@ -104,7 +107,7 @@
   // thread's warp, so none of its threads can ever be active worker threads.
   if (UseGenericStateMachine &&
       mapping::getThreadIdInBlock() < mapping::getBlockSize(IsSPMD)) {
-    genericStateMachine(Ident);
+    genericStateMachine(KernelEnvironment.Ident);
   } else {
     // Retrieve the work function just to ensure we always call
     // __kmpc_kernel_parallel even if a custom state machine is used.
@@ -128,10 +131,9 @@
 ///
 /// \param Ident Source location identification, can be NULL.
 ///
-void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode) {
+void __kmpc_target_deinit() {
   FunctionTracingRAII();
-  const bool IsSPMD =
-      Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
+  bool IsSPMD = mapping::isSPMDMode();
   state::assumeInitialState(IsSPMD);
   if (IsSPMD)
     return;
Index: openmp/libomptarget/DeviceRTL/src/Debug.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Debug.cpp
+++ openmp/libomptarget/DeviceRTL/src/Debug.cpp
@@ -14,6 +14,7 @@
 #include "Configuration.h"
 #include "Interface.h"
 #include "Mapping.h"
+#include "State.h"
 #include "Types.h"
 
 using namespace ompx;
@@ -31,15 +32,13 @@
 }
 }
 
-/// Current indentation level for the function trace. Only accessed by thread 0.
-__attribute__((loader_uninitialized)) static uint32_t Level;
-#pragma omp allocate(Level) allocator(omp_pteam_mem_alloc)
-
 DebugEntryRAII::DebugEntryRAII(const char *File, const unsigned Line,
                                const char *Function) {
   if (config::isDebugMode(config::DebugKind::FunctionTracing) &&
       mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0) {
 
+    uint16_t &Level = state::getKernelEnvironment().DebugIndentionLevel;
+
     for (int I = 0; I < Level; ++I)
       PRINTF("%s", "  ");
 
@@ -51,10 +50,10 @@
 
 DebugEntryRAII::~DebugEntryRAII() {
   if (config::isDebugMode(config::DebugKind::FunctionTracing) &&
-      mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0)
+      mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0) {
+    uint16_t &Level = state::getKernelEnvironment().DebugIndentionLevel;
     Level--;
+  }
 }
 
-void DebugEntryRAII::init() { Level = 0; }
-
 #pragma omp end declare target
Index: openmp/libomptarget/DeviceRTL/src/Configuration.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Configuration.cpp
+++ openmp/libomptarget/DeviceRTL/src/Configuration.cpp
@@ -12,7 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "Configuration.h"
-#include "DeviceEnvironment.h"
+#include "Environment.h"
 #include "State.h"
 #include "Types.h"
 
@@ -23,7 +23,6 @@
 // defined by CGOpenMPRuntimeGPU
 extern uint32_t __omp_rtl_debug_kind;
 extern uint32_t __omp_rtl_assume_no_thread_state;
-extern uint32_t __omp_rtl_assume_no_nested_parallelism;
 
 // This variable should be visibile to the plugin so we override the default
 // hidden visibility.
@@ -53,7 +52,7 @@
 bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; }
 
 bool config::mayUseNestedParallelism() {
-  return !__omp_rtl_assume_no_nested_parallelism;
+  return state::getKernelEnvironment().Configuration.MayUseNestedParallelism;
 }
 
 #pragma omp end declare target
Index: openmp/libomptarget/DeviceRTL/include/State.h
===================================================================
--- openmp/libomptarget/DeviceRTL/include/State.h
+++ openmp/libomptarget/DeviceRTL/include/State.h
@@ -17,6 +17,9 @@
 #include "Types.h"
 #include "Utils.h"
 
+// Forward declaration.
+struct KernelEnvironmentTy;
+
 #pragma omp begin declare target device_type(nohost)
 
 namespace ompx {
@@ -113,7 +116,10 @@
 #pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
 
 /// Initialize the state machinery. Must be called by all threads.
-void init(bool IsSPMD);
+void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);
+
+/// Return the kernel environment associated with the current kernel.
+KernelEnvironmentTy &getKernelEnvironment();
 
 /// TODO
 enum ValueKind {
Index: openmp/libomptarget/DeviceRTL/include/Interface.h
===================================================================
--- openmp/libomptarget/DeviceRTL/include/Interface.h
+++ openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -214,12 +214,14 @@
 /// Kernel
 ///
 ///{
+// Forward declaration
+struct KernelEnvironmentTy;
+
 int8_t __kmpc_is_spmd_exec_mode();
 
-int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
-                           bool UseGenericStateMachine);
+int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment);
 
-void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode);
+void __kmpc_target_deinit();
 
 ///}
 
Index: openmp/libomptarget/DeviceRTL/include/Debug.h
===================================================================
--- openmp/libomptarget/DeviceRTL/include/Debug.h
+++ openmp/libomptarget/DeviceRTL/include/Debug.h
@@ -50,8 +50,6 @@
 struct DebugEntryRAII {
   DebugEntryRAII(const char *File, const unsigned Line, const char *Function);
   ~DebugEntryRAII();
-
-  static void init();
 };
 
 #endif
Index: openmp/libomptarget/DeviceRTL/include/Configuration.h
===================================================================
--- openmp/libomptarget/DeviceRTL/include/Configuration.h
+++ openmp/libomptarget/DeviceRTL/include/Configuration.h
@@ -13,11 +13,16 @@
 #ifndef OMPTARGET_CONFIGURATION_H
 #define OMPTARGET_CONFIGURATION_H
 
+#include "Environment.h"
 #include "Types.h"
 
+#pragma omp begin declare target device_type(nohost)
+
 namespace ompx {
 namespace config {
 
+extern ConfigurationEnvironmentTy *ConfigurationEnvironment;
+
 enum DebugKind : uint32_t {
   Assertion = 1U << 0,
   FunctionTracing = 1U << 1,
@@ -51,4 +56,6 @@
 } // namespace config
 } // namespace ompx
 
+#pragma omp end declare target
+
 #endif
Index: openmp/libomptarget/DeviceRTL/CMakeLists.txt
===================================================================
--- openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -128,6 +128,7 @@
              -nocudalib -nogpulib -nostdinc
              -fopenmp -fopenmp-cuda-mode
              -Wno-unknown-cuda-version
+             -DOMPTARGET_DEVICE_RUNTIME
              -I${include_directory}
              -I${devicertl_base_directory}/../include
              ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp
===================================================================
--- llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -44,6 +44,7 @@
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Value.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -179,6 +180,71 @@
 
 namespace {
 
+/// Return the constant element of \p StructC at \p Offset.
+template <int Offset>
+unsigned getConstantStructOffsetIdx(const DataLayout &DL,
+                                    ConstantStruct *StructC) {
+  StructType *ST = cast<StructType>(StructC->getType());
+  return DL.getStructLayout(ST)->getElementContainingOffset(Offset);
+}
+
+/// Return the constant element of \p StructC at \p Offset.
+template <int Offset>
+Constant *getConstantStructOffsetElementAddr(const DataLayout &DL,
+                                             ConstantStruct *StructC,
+                                             Constant *Ptr) {
+  unsigned Idx = getConstantStructOffsetIdx<Offset>(DL, StructC);
+  StructType *ST = cast<StructType>(StructC->getType());
+  return ConstantExpr::getInBoundsGetElementPtr(
+      ST, Ptr,
+      ArrayRef<Constant *>{
+          ConstantInt::getNullValue(IntegerType::getInt32Ty(ST->getContext())),
+          ConstantInt::get(IntegerType::getInt32Ty(ST->getContext()), Idx)});
+}
+
+/// Return the constant element of \p StructC at \p Offset.
+template <int Offset>
+Constant *getConstantStructOffsetElement(const DataLayout &DL,
+                                         ConstantStruct *StructC) {
+  return StructC->getAggregateElement(
+      getConstantStructOffsetIdx<Offset>(DL, StructC));
+}
+
+/// Set the constant element of \p StructC at \p Offset to be \p V.
+template <int Offset>
+static ConstantStruct *setConstantStructOffsetElement(const DataLayout &DL,
+                                                      ConstantStruct *StructC,
+                                                      Constant *V) {
+  unsigned Idxs[1] = {getConstantStructOffsetIdx<Offset>(DL, StructC)};
+  Constant *NewStructC = ConstantFoldInsertValueInstruction(StructC, V, Idxs);
+  assert(NewStructC && "Failed to create constant kernel environment!");
+  return cast<ConstantStruct>(NewStructC);
+}
+
+#define GET_KERNEL_ENVIRONMENT_MEMBER(Member, DL, StructC)                     \
+  getConstantStructOffsetElement<offsetof(struct KernelEnvironmentTy,          \
+                                          Member)>(DL, StructC)
+#define GET_KERNEL_ENVIRONMENT_MEMBER_ADDR(Member, DL, StructC, Ptr)           \
+  getConstantStructOffsetElementAddr<offsetof(struct KernelEnvironmentTy,      \
+                                              Member)>(DL, StructC, Ptr)
+#define SET_KERNEL_ENVIRONMENT_MEMBER(Member, DL, StructC, V)                  \
+  setConstantStructOffsetElement<offsetof(struct KernelEnvironmentTy,          \
+                                          Member)>(DL, StructC, V)
+#define GET_KERNEL_CONFIGURATION_ENVIRONMENT_MEMBER(Member, DL, StructC)       \
+  getConstantStructOffsetElement<offsetof(struct ConfigurationEnvironmentTy,   \
+                                          Member)>(                            \
+      DL, cast<ConstantStruct>(                                                \
+              GET_KERNEL_ENVIRONMENT_MEMBER(Configuration, DL, StructC)))
+#define SET_KERNEL_CONFIGURATION_ENVIRONMENT_MEMBER(Member, DL, StructC, V)    \
+  SET_KERNEL_ENVIRONMENT_MEMBER(                                               \
+      Configuration, DL, StructC,                                              \
+      setConstantStructOffsetElement<offsetof(                                 \
+          struct ConfigurationEnvironmentTy, Member)>(                         \
+          DL,                                                                  \
+          cast<ConstantStruct>(                                                \
+              GET_KERNEL_ENVIRONMENT_MEMBER(Configuration, DL, StructC)),      \
+          V))
+
 struct AAHeapToShared;
 
 struct AAICVTracker;
@@ -588,6 +654,10 @@
   /// one we abort as the kernel is malformed.
   CallBase *KernelInitCB = nullptr;
 
+  /// The constant kernel environement as taken from and passed to
+  /// __kmpc_target_init.
+  ConstantStruct *KernelEnvC = nullptr;
+
   /// The __kmpc_target_deinit call in this kernel, if any. If we find more than
   /// one we abort as the kernel is malformed.
   CallBase *KernelDeinitCB = nullptr;
@@ -692,6 +762,12 @@
                          "assumptions.");
       KernelDeinitCB = KIS.KernelDeinitCB;
     }
+    if (KIS.KernelEnvC) {
+      if (KernelEnvC && KernelEnvC != KIS.KernelEnvC)
+        llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
+                         "assumptions.");
+      KernelEnvC = KIS.KernelEnvC;
+    }
     SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
     ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
     ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
@@ -3400,6 +3476,50 @@
     return GuardedInstructions;
   }
 
+  /// Return the IdentTy (ident_ty) corresponding to the associated kernel.
+  Constant *getKernelIdent(ConstantStruct *StructC) {
+    GlobalVariable *KernelEnvGV = getKernelEnvironementGlobalVariable();
+    auto *KernelEnvC = cast<ConstantStruct>(KernelEnvGV->getInitializer());
+    constexpr const unsigned IdentIdx = 1;
+    return KernelEnvC->getAggregateElement(IdentIdx);
+  }
+
+  ConstantStruct *getConfigurationFromKernelEnvC(ConstantStruct *KernelEnvC) {
+    constexpr const unsigned ConfigurationIdx = 0;
+    auto *ConfigC =
+        cast<ConstantStruct>(KernelEnvC->getAggregateElement(ConfigurationIdx));
+    return ConfigC;
+  }
+
+  ConstantInt *
+  getUseGenericStateMachineFromKernelConfiguration(ConstantStruct *KernelEnvC) {
+    ConstantStruct *ConfigC = getConfigurationFromKernelEnvC(KernelEnvC);
+    constexpr const unsigned UseGenericStateMachineIdx = 0;
+    return cast<ConstantInt>(
+        ConfigC->getAggregateElement(UseGenericStateMachineIdx));
+  }
+
+  ConstantInt *getExecModeFromKernelConfiguration(ConstantStruct *KernelEnvC) {
+    ConstantStruct *ConfigC = getConfigurationFromKernelEnvC(KernelEnvC);
+    constexpr const unsigned ExecModeIdx = 2;
+    return cast<ConstantInt>(ConfigC->getAggregateElement(ExecModeIdx));
+  }
+
+  ConstantInt *getMayUseNestedParallelismFromKernelConfiguration(
+      ConstantStruct *KernelEnvC) {
+    ConstantStruct *ConfigC = getConfigurationFromKernelEnvC(KernelEnvC);
+    constexpr const unsigned MayUseNestedParallelismIdx = 1;
+    return cast<ConstantInt>(
+        ConfigC->getAggregateElement(MayUseNestedParallelismIdx));
+  }
+
+  GlobalVariable *getKernelEnvironementGlobalVariable() {
+    constexpr const int InitKernelEnvironmentArgNo = 0;
+    return cast<GlobalVariable>(
+        KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo)
+            ->stripPointerCasts());
+  }
+
   /// See AbstractAttribute::initialize(...).
   void initialize(Attributor &A) override {
     // This is a high-level transform that might change the constant arguments
@@ -3448,6 +3568,18 @@
     ReachingKernelEntries.insert(Fn);
     IsKernelEntry = true;
 
+    GlobalVariable *KernelEnvGV = getKernelEnvironementGlobalVariable();
+    KernelEnvC = cast<ConstantStruct>(KernelEnvGV->getInitializer());
+    auto *ExecModeC = getExecModeFromKernelConfiguration(KernelEnvC);
+    auto *AssumedExecModeC = ConstantInt::get(
+        ExecModeC->getType(),
+        ExecModeC->getZExtValue() | OMP_TGT_EXEC_MODE_GENERIC_SPMD);
+
+    // Next rewrite the kernel configuration to indicate we use SPMD-mode now.
+    auto &Ctx = getAnchorValue().getContext();
+    if (!DisableOpenMPOptSPMDization)
+      setKernelConfigurationExecMode(KernelEnvC, AssumedExecModeC);
+
     // For kernels we might need to initialize/finalize the IsSPMD state and
     // we need to register a simplification callback so that the Attributor
     // knows the constant arguments to __kmpc_target_init and
@@ -3497,23 +3629,18 @@
       return Val;
     };
 
-    constexpr const int InitModeArgNo = 1;
-    constexpr const int DeinitModeArgNo = 1;
-    constexpr const int InitUseStateMachineArgNo = 2;
-    A.registerSimplificationCallback(
-        IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo),
-        StateMachineSimplifyCB);
-    A.registerSimplificationCallback(
-        IRPosition::callsite_argument(*KernelInitCB, InitModeArgNo),
-        ModeSimplifyCB);
+    Attributor::SimplifictionCallbackTy KernelConfigurationSimplifyCB =
+        [&](const IRPosition &IRP, const AbstractAttribute *AA,
+            bool &UsedAssumedInformation) -> std::optional<Value *> {
+      return KernelEnvC;
+    };
+
     A.registerSimplificationCallback(
-        IRPosition::callsite_argument(*KernelDeinitCB, DeinitModeArgNo),
-        ModeSimplifyCB);
+        IRPosition::value(*KernelEnvGV->getInitializer()),
+        KernelConfigurationSimplifyCB);
 
     // Check if we know we are in SPMD-mode already.
-    ConstantInt *ModeArg =
-        dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo));
-    if (ModeArg && (ModeArg->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD))
+    if (ExecModeC->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD)
       SPMDCompatibilityTracker.indicateOptimisticFixpoint();
     // This is a generic region but SPMDization is disabled so stop tracking.
     else if (DisableOpenMPOptSPMDization)
Index: llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
===================================================================
--- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3878,14 +3878,42 @@
   ConstantInt *IsSPMDVal = ConstantInt::getSigned(
       IntegerType::getInt8Ty(Int8->getContext()),
       IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
-  ConstantInt *UseGenericStateMachine =
-      ConstantInt::getBool(Int32->getContext(), !IsSPMD);
-
+  ConstantInt *UseGenericStateMachineVal = ConstantInt::getSigned(
+      IntegerType::getInt8Ty(Int8->getContext()), !IsSPMD);
+  ConstantInt *MayUseNestedParallelismVal =
+      ConstantInt::getSigned(IntegerType::getInt8Ty(Int8->getContext()), true);
+  ConstantInt *DebugIndentionLevelVal =
+      ConstantInt::getSigned(IntegerType::getInt16Ty(Int8->getContext()), true);
+
+  Function *Kernel = Builder.GetInsertBlock()->getParent();
   Function *Fn = getOrCreateRuntimeFunctionPtr(
       omp::RuntimeFunction::OMPRTL___kmpc_target_init);
-
-  CallInst *ThreadKind = Builder.CreateCall(
-      Fn, {Ident, IsSPMDVal, UseGenericStateMachine});
+  const DataLayout &DL = Fn->getParent()->getDataLayout();
+
+  Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
+      ConfigurationEnvironmentTy, {
+                                      UseGenericStateMachineVal,
+                                      MayUseNestedParallelismVal,
+                                      IsSPMDVal,
+                                  });
+  Constant *KernelEnvironmentInitializer = ConstantStruct::get(
+      KernelEnvironmentTy, {
+                               Ident,
+                               ConfigurationEnvironmentInitializer,
+                               DebugIndentionLevelVal,
+                           });
+  Twine KernelEnvironmentName = Kernel->getName() + "_kernel_info";
+  GlobalVariable *KernelEnvironment = new GlobalVariable(
+      M, KernelEnvironmentTy, /* IsConstant */ true,
+      GlobalValue::ExternalLinkage, KernelEnvironmentInitializer,
+      KernelEnvironmentName,
+      /* InsertBefore */ nullptr, llvm::GlobalValue::NotThreadLocal,
+      DL.getDefaultGlobalsAddressSpace());
+  Constant *KernelEnvironmentCasted =
+      ConstantExpr::getPointerBitCastOrAddrSpaceCast(KernelEnvironment,
+                                                     KernelEnvironmentTyPtr);
+
+  CallInst *ThreadKind = Builder.CreateCall(Fn, {KernelEnvironmentCasted});
 
   Value *ExecUserCode = Builder.CreateICmpEQ(
       ThreadKind, ConstantInt::get(ThreadKind->getType(), -1),
@@ -3918,22 +3946,14 @@
   return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
 }
 
-void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
-                                         bool IsSPMD) {
+void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc) {
   if (!updateToLocation(Loc))
     return;
 
-  uint32_t SrcLocStrSize;
-  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
-  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
-  ConstantInt *IsSPMDVal = ConstantInt::getSigned(
-      IntegerType::getInt8Ty(Int8->getContext()),
-      IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
-
   Function *Fn = getOrCreateRuntimeFunctionPtr(
       omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
 
-  Builder.CreateCall(Fn, {Ident, IsSPMDVal});
+  Builder.CreateCall(Fn, {});
 }
 
 void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
Index: llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -96,6 +96,10 @@
 		  Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32)
 __OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, false, Int8Ptr)
 __OMP_STRUCT_TYPE(DependInfo, kmp_dep_info, false, SizeTy, SizeTy, Int8)
+__OMP_STRUCT_TYPE(ConfigurationEnvironmentTy, ConfigurationEnvironmentTy, false,
+                  Int8, Int8, Int8)
+__OMP_STRUCT_TYPE(KernelEnvironmentTy, KernelEnvironmentTy, false, IdentPtr,
+                  ConfigurationEnvironmentTy, Int16)
 
 #undef __OMP_STRUCT_TYPE
 #undef OMP_STRUCT_TYPE
@@ -452,8 +456,8 @@
           /* Int */ Int32, /* kmp_task_t */ VoidPtr)
 
 /// OpenMP Device runtime functions
-__OMP_RTL(__kmpc_target_init, false, Int32, IdentPtr, Int8, Int1)
-__OMP_RTL(__kmpc_target_deinit, false, Void, IdentPtr, Int8)
+__OMP_RTL(__kmpc_target_init, false, Int32, KernelEnvironmentTyPtr)
+__OMP_RTL(__kmpc_target_deinit, false, Void,)
 __OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
 __OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
           VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
@@ -1012,9 +1016,9 @@
                 ReturnPtrAttrs, ParamAttrs(ReadOnlyPtrAttrs, SExt))
 
 __OMP_RTL_ATTRS(__kmpc_target_init, AttributeSet(), SExt,
-                ParamAttrs(AttributeSet(), SExt, SExt))
+                ParamAttrs(AttributeSet()))
 __OMP_RTL_ATTRS(__kmpc_target_deinit, AttributeSet(), AttributeSet(),
-                ParamAttrs(AttributeSet(), SExt))
+                ParamAttrs())
 __OMP_RTL_ATTRS(__kmpc_parallel_51, AlwaysInlineAttrs, AttributeSet(),
                 ParamAttrs(AttributeSet(), SExt, SExt, SExt, SExt,
                            AttributeSet(), AttributeSet(), AttributeSet(),
Index: llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1461,8 +1461,7 @@
   /// Create a runtime call for kmpc_target_deinit
   ///
   /// \param Loc The insert and source location description.
-  /// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not.
-  void createTargetDeinit(const LocationDescription &Loc, bool IsSPMD);
+  void createTargetDeinit(const LocationDescription &Loc);
 
   ///}
 
Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -781,7 +781,7 @@
     emitGenericVarsEpilog(CGF);
 
   CGBuilderTy &Bld = CGF.Builder;
-  OMPBuilder.createTargetDeinit(Bld, IsSPMD);
+  OMPBuilder.createTargetDeinit(Bld);
 }
 
 void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to