tianshilei1992 updated this revision to Diff 492322.
tianshilei1992 added a comment.
fix bugs and add plugin support
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D142569/new/
https://reviews.llvm.org/D142569
Files:
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
llvm/lib/Transforms/IPO/OpenMPOpt.cpp
openmp/libomptarget/DeviceRTL/CMakeLists.txt
openmp/libomptarget/DeviceRTL/include/Debug.h
openmp/libomptarget/DeviceRTL/include/Interface.h
openmp/libomptarget/DeviceRTL/include/State.h
openmp/libomptarget/DeviceRTL/src/Configuration.cpp
openmp/libomptarget/DeviceRTL/src/Debug.cpp
openmp/libomptarget/DeviceRTL/src/Kernel.cpp
openmp/libomptarget/DeviceRTL/src/State.cpp
openmp/libomptarget/include/DeviceEnvironment.h
openmp/libomptarget/include/Environment.h
openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
openmp/libomptarget/plugins/cuda/src/rtl.cpp
Index: openmp/libomptarget/plugins/cuda/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -23,7 +23,7 @@
#include <vector>
#include "Debug.h"
-#include "DeviceEnvironment.h"
+#include "Environment.h"
#include "omptarget.h"
#include "omptargetplugin.h"
Index: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -37,7 +37,7 @@
#include "internal.h"
#include "rt.h"
-#include "DeviceEnvironment.h"
+#include "Environment.h"
#include "get_elf_mach_gfx_name.h"
#include "omptargetplugin.h"
#include "print_tracing.h"
Index: openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -17,7 +17,7 @@
#include <unordered_map>
#include "Debug.h"
-#include "DeviceEnvironment.h"
+#include "Environment.h"
#include "GlobalHandler.h"
#include "PluginInterface.h"
#include "omptarget.h"
Index: openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -17,7 +17,7 @@
#include <unordered_map>
#include "Debug.h"
-#include "DeviceEnvironment.h"
+#include "Environment.h"
#include "GlobalHandler.h"
#include "PluginInterface.h"
Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
===================================================================
--- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -19,7 +19,7 @@
#include <vector>
#include "Debug.h"
-#include "DeviceEnvironment.h"
+#include "Environment.h"
#include "GlobalHandler.h"
#include "JIT.h"
#include "MemoryManager.h"
@@ -627,6 +627,11 @@
/// Map of host pinned allocations used for optimize device transfers.
PinnedAllocationMapTy PinnedAllocs;
+
+private:
+ /// Return the kernel environment object for kernel \p Name.
+ Expected<KernelEnvironmentTy>
+ getKernelEnvironmentForKernel(StringRef Name, DeviceImageTy &Image);
};
/// Class implementing common functionalities of offload plugins. Each plugin
Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
===================================================================
--- openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -554,32 +554,43 @@
return Plugin::success();
}
-Expected<OMPTgtExecModeFlags>
-GenericDeviceTy::getExecutionModeForKernel(StringRef Name,
- DeviceImageTy &Image) {
- // Create a metadata object for the exec mode global (auto-generated).
- StaticGlobalTy<llvm::omp::OMPTgtExecModeFlags> ExecModeGlobal(Name.data(),
- "_exec_mode");
+Expected<KernelEnvironmentTy>
+GenericDeviceTy::getKernelEnvironmentForKernel(StringRef Name,
+ DeviceImageTy &Image) {
+ // Create a metadata object for the kernel environment object.
+ StaticGlobalTy<KernelEnvironmentTy> KernelEnv(Name.data(), "_kernel_info");
- // Retrieve execution mode for the kernel. This may fail since some kernels
- // may not have an execution mode.
+ // Retrieve kernel environment object for the kernel.
GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler();
- if (auto Err = GHandler.readGlobalFromImage(*this, Image, ExecModeGlobal)) {
+ if (auto Err = GHandler.readGlobalFromImage(*this, Image, KernelEnv)) {
// Consume the error since it is acceptable to fail.
[[maybe_unused]] std::string ErrStr = toString(std::move(Err));
- DP("Failed to read execution mode for '%s': %s\n"
- "Using default SPMD (2) execution mode\n",
- Name.data(), ErrStr.data());
+ DP("Failed to read kernel environment object for '%s': %s\n", Name.data(),
+ ErrStr.data());
- return OMP_TGT_EXEC_MODE_SPMD;
+ return createStringError(inconvertibleErrorCode(), ErrStr);
}
+ return KernelEnv.getValue();
+}
+
+Expected<OMPTgtExecModeFlags>
+GenericDeviceTy::getExecutionModeForKernel(StringRef Name,
+ DeviceImageTy &Image) {
+ auto KernelEnvOrError = getKernelEnvironmentForKernel(Name, Image);
+ // We error out directly if we can't read the kernel environment object.
+ if (!KernelEnvOrError)
+ return KernelEnvOrError.takeError();
+
+ auto &KernelEnv = *KernelEnvOrError;
+ auto ExecMode = KernelEnv.Configuration.ExecMode;
+
// Check that the retrieved execution mode is valid.
- if (!GenericKernelTy::isValidExecutionMode(ExecModeGlobal.getValue()))
- return Plugin::error("Invalid execution mode %d for '%s'",
- ExecModeGlobal.getValue(), Name.data());
+ if (!GenericKernelTy::isValidExecutionMode(ExecMode))
+ return Plugin::error("Invalid execution mode %d for '%s'", ExecMode,
+ Name.data());
- return ExecModeGlobal.getValue();
+ return ExecMode;
}
Error PinnedAllocationMapTy::registerHostBuffer(void *HstPtr,
Index: openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
===================================================================
--- openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -21,7 +21,7 @@
#include <unordered_map>
#include "Debug.h"
-#include "DeviceEnvironment.h"
+#include "Environment.h"
#include "GlobalHandler.h"
#include "PluginInterface.h"
#include "Utilities.h"
@@ -431,7 +431,7 @@
/// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
- uint64_t NumBlocks,
+ uint64_t NumBlocks,
KernelArgsTy &KernelArgs, void *Args,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
Index: openmp/libomptarget/include/Environment.h
===================================================================
--- /dev/null
+++ openmp/libomptarget/include/Environment.h
@@ -0,0 +1,53 @@
+//===------------ Environment.h - OpenMP GPU environments --------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Environments shared between host and device.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_ENVIRONMENT_H_
+#define _OMPTARGET_ENVIRONMENT_H_
+
+#ifdef OMPTARGET_DEVICE_RUNTIME
+#include "Types.h"
+#else
+#include "SourceInfo.h"
+
+#include <cstdint>
+
+using IdentTy = ident_t;
+#endif
+
+#include "llvm/Frontend/OpenMP/OMPDeviceConstants.h"
+
+struct DeviceEnvironmentTy {
+ uint32_t DebugKind;
+ uint32_t NumDevices;
+ uint32_t DeviceNum;
+ uint32_t DynamicMemSize;
+};
+
+// NOTE: Please don't change the order of those members as they are used in the
+// middle end. Always add the new data member at the end.
+struct ConfigurationEnvironmentTy {
+ uint8_t UseGenericStateMachine;
+ uint8_t MayUseNestedParallelism;
+ llvm::omp::OMPTgtExecModeFlags ExecMode;
+};
+
+// NOTE: Please don't change the order of those members as they are used in the
+// middle end. Always add the new data member at the end.
+struct KernelEnvironmentTy {
+ ConfigurationEnvironmentTy Configuration;
+ IdentTy *Ident;
+ /// Current indentation level for the function trace. Only accessed by thread
+ /// 0.
+ uint16_t DebugIndentionLevel;
+};
+
+#endif // _OMPTARGET_ENVIRONMENT_H_
Index: openmp/libomptarget/include/DeviceEnvironment.h
===================================================================
--- openmp/libomptarget/include/DeviceEnvironment.h
+++ /dev/null
@@ -1,25 +0,0 @@
-//===---- device_environment.h - OpenMP GPU device environment ---- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Global device environment
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _OMPTARGET_DEVICE_ENVIRONMENT_H_
-#define _OMPTARGET_DEVICE_ENVIRONMENT_H_
-
-// deviceRTL uses <stdint> and DeviceRTL uses explicit definitions
-
-struct DeviceEnvironmentTy {
- uint32_t DebugKind;
- uint32_t NumDevices;
- uint32_t DeviceNum;
- uint32_t DynamicMemSize;
-};
-
-#endif
Index: openmp/libomptarget/DeviceRTL/src/State.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/State.cpp
+++ openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -9,8 +9,8 @@
//===----------------------------------------------------------------------===//
#include "State.h"
-#include "Configuration.h"
#include "Debug.h"
+#include "Environment.h"
#include "Interface.h"
#include "Mapping.h"
#include "Synchronization.h"
@@ -34,6 +34,9 @@
extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
+/// The kernel environment passed to the init method by the compiler.
+static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
+
namespace {
/// Fallback implementations are missing to trigger a link time error.
@@ -241,15 +244,19 @@
} // namespace
-void state::init(bool IsSPMD) {
+void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
SharedMemorySmartStack.init(IsSPMD);
if (mapping::isInitialThreadInLevel0(IsSPMD)) {
TeamState.init(IsSPMD);
- DebugEntryRAII::init();
ThreadStates = nullptr;
+ KernelEnvironmentPtr = &KernelEnvironment;
}
}
+KernelEnvironmentTy &state::getKernelEnvironment() {
+ return *KernelEnvironmentPtr;
+}
+
void state::enterDataEnvironment(IdentTy *Ident) {
ASSERT(config::mayUseThreadStates() &&
"Thread state modified while explicitly disabled!");
Index: openmp/libomptarget/DeviceRTL/src/Kernel.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "Debug.h"
+#include "Environment.h"
#include "Interface.h"
#include "Mapping.h"
#include "State.h"
@@ -23,11 +24,12 @@
#pragma omp begin declare target device_type(nohost)
-static void inititializeRuntime(bool IsSPMD) {
+static void inititializeRuntime(bool IsSPMD,
+ KernelEnvironmentTy &KernelEnvironment) {
// Order is important here.
synchronize::init(IsSPMD);
mapping::init(IsSPMD);
- state::init(IsSPMD);
+ state::init(IsSPMD, KernelEnvironment);
}
/// Simple generic state machine for worker threads.
@@ -67,16 +69,17 @@
///
/// \param Ident Source location identification, can be NULL.
///
-int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
- bool UseGenericStateMachine) {
+int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment) {
FunctionTracingRAII();
- const bool IsSPMD =
- Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
+ ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
+ bool IsSPMD = Configuration.ExecMode &
+ llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
+ bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
if (IsSPMD) {
- inititializeRuntime(/* IsSPMD */ true);
+ inititializeRuntime(/* IsSPMD */ true, KernelEnvironment);
synchronize::threadsAligned();
} else {
- inititializeRuntime(/* IsSPMD */ false);
+ inititializeRuntime(/* IsSPMD */ false, KernelEnvironment);
// No need to wait since only the main threads will execute user
// code and workers will run into a barrier right away.
}
@@ -104,7 +107,7 @@
// thread's warp, so none of its threads can ever be active worker threads.
if (UseGenericStateMachine &&
mapping::getThreadIdInBlock() < mapping::getBlockSize(IsSPMD)) {
- genericStateMachine(Ident);
+ genericStateMachine(KernelEnvironment.Ident);
} else {
// Retrieve the work function just to ensure we always call
// __kmpc_kernel_parallel even if a custom state machine is used.
@@ -128,10 +131,9 @@
///
/// \param Ident Source location identification, can be NULL.
///
-void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode) {
+void __kmpc_target_deinit() {
FunctionTracingRAII();
- const bool IsSPMD =
- Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
+ bool IsSPMD = mapping::isSPMDMode();
state::assumeInitialState(IsSPMD);
if (IsSPMD)
return;
Index: openmp/libomptarget/DeviceRTL/src/Debug.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Debug.cpp
+++ openmp/libomptarget/DeviceRTL/src/Debug.cpp
@@ -12,8 +12,10 @@
#include "Debug.h"
#include "Configuration.h"
+#include "Environment.h"
#include "Interface.h"
#include "Mapping.h"
+#include "State.h"
#include "Types.h"
using namespace ompx;
@@ -31,15 +33,13 @@
}
}
-/// Current indentation level for the function trace. Only accessed by thread 0.
-__attribute__((loader_uninitialized)) static uint32_t Level;
-#pragma omp allocate(Level) allocator(omp_pteam_mem_alloc)
-
DebugEntryRAII::DebugEntryRAII(const char *File, const unsigned Line,
const char *Function) {
if (config::isDebugMode(config::DebugKind::FunctionTracing) &&
mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0) {
+ uint16_t &Level = state::getKernelEnvironment().DebugIndentionLevel;
+
for (int I = 0; I < Level; ++I)
PRINTF("%s", " ");
@@ -51,10 +51,10 @@
DebugEntryRAII::~DebugEntryRAII() {
if (config::isDebugMode(config::DebugKind::FunctionTracing) &&
- mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0)
+ mapping::getThreadIdInBlock() == 0 && mapping::getBlockId() == 0) {
+ uint16_t &Level = state::getKernelEnvironment().DebugIndentionLevel;
Level--;
+ }
}
-void DebugEntryRAII::init() { Level = 0; }
-
#pragma omp end declare target
Index: openmp/libomptarget/DeviceRTL/src/Configuration.cpp
===================================================================
--- openmp/libomptarget/DeviceRTL/src/Configuration.cpp
+++ openmp/libomptarget/DeviceRTL/src/Configuration.cpp
@@ -12,7 +12,7 @@
//===----------------------------------------------------------------------===//
#include "Configuration.h"
-#include "DeviceEnvironment.h"
+#include "Environment.h"
#include "State.h"
#include "Types.h"
@@ -23,7 +23,6 @@
// defined by CGOpenMPRuntimeGPU
extern uint32_t __omp_rtl_debug_kind;
extern uint32_t __omp_rtl_assume_no_thread_state;
-extern uint32_t __omp_rtl_assume_no_nested_parallelism;
// This variable should be visibile to the plugin so we override the default
// hidden visibility.
@@ -53,7 +52,7 @@
bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; }
bool config::mayUseNestedParallelism() {
- return !__omp_rtl_assume_no_nested_parallelism;
+ return state::getKernelEnvironment().Configuration.MayUseNestedParallelism;
}
#pragma omp end declare target
Index: openmp/libomptarget/DeviceRTL/include/State.h
===================================================================
--- openmp/libomptarget/DeviceRTL/include/State.h
+++ openmp/libomptarget/DeviceRTL/include/State.h
@@ -17,6 +17,9 @@
#include "Types.h"
#include "Utils.h"
+// Forward declaration.
+struct KernelEnvironmentTy;
+
#pragma omp begin declare target device_type(nohost)
namespace ompx {
@@ -113,7 +116,10 @@
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
/// Initialize the state machinery. Must be called by all threads.
-void init(bool IsSPMD);
+void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);
+
+/// Return the kernel environment associated with the current kernel.
+KernelEnvironmentTy &getKernelEnvironment();
/// TODO
enum ValueKind {
Index: openmp/libomptarget/DeviceRTL/include/Interface.h
===================================================================
--- openmp/libomptarget/DeviceRTL/include/Interface.h
+++ openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -214,12 +214,14 @@
/// Kernel
///
///{
+// Forward declaration
+struct KernelEnvironmentTy;
+
int8_t __kmpc_is_spmd_exec_mode();
-int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
- bool UseGenericStateMachine);
+int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment);
-void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode);
+void __kmpc_target_deinit();
///}
Index: openmp/libomptarget/DeviceRTL/include/Debug.h
===================================================================
--- openmp/libomptarget/DeviceRTL/include/Debug.h
+++ openmp/libomptarget/DeviceRTL/include/Debug.h
@@ -50,8 +50,6 @@
struct DebugEntryRAII {
DebugEntryRAII(const char *File, const unsigned Line, const char *Function);
~DebugEntryRAII();
-
- static void init();
};
#endif
Index: openmp/libomptarget/DeviceRTL/CMakeLists.txt
===================================================================
--- openmp/libomptarget/DeviceRTL/CMakeLists.txt
+++ openmp/libomptarget/DeviceRTL/CMakeLists.txt
@@ -128,6 +128,7 @@
-nocudalib -nogpulib -nostdinc
-fopenmp -fopenmp-cuda-mode
-Wno-unknown-cuda-version
+ -DOMPTARGET_DEVICE_RUNTIME
-I${include_directory}
-I${devicertl_base_directory}/../include
${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp
===================================================================
--- llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -44,6 +44,7 @@
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -588,6 +589,10 @@
/// one we abort as the kernel is malformed.
CallBase *KernelInitCB = nullptr;
+ /// The constant kernel environement as taken from and passed to
+ /// __kmpc_target_init.
+ ConstantStruct *KernelEnvC = nullptr;
+
/// The __kmpc_target_deinit call in this kernel, if any. If we find more than
/// one we abort as the kernel is malformed.
CallBase *KernelDeinitCB = nullptr;
@@ -692,6 +697,12 @@
"assumptions.");
KernelDeinitCB = KIS.KernelDeinitCB;
}
+ if (KIS.KernelEnvC) {
+ if (KernelEnvC && KernelEnvC != KIS.KernelEnvC)
+ llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
+ "assumptions.");
+ KernelEnvC = KIS.KernelEnvC;
+ }
SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
@@ -3386,6 +3397,50 @@
static const char ID;
};
+/// Return the \p IdentTy pointer from the constant kernel environment \p
+/// KernelEnvC.
+Constant *getIdentFromKernelEnvironment(ConstantStruct *KernelEnvC) {
+ constexpr const unsigned IdentIdx = 1;
+ return KernelEnvC->getAggregateElement(IdentIdx);
+}
+
+/// Return the \p Configurationnvironment struct from the constant kernel
+/// environment \p KernelEnvC.
+ConstantStruct *
+getConfigurationFromKernelEnvironment(ConstantStruct *KernelEnvC) {
+ constexpr const unsigned ConfigurationIdx = 0;
+ return dyn_cast<ConstantStruct>(
+ KernelEnvC->getAggregateElement(ConfigurationIdx));
+}
+
+/// Return the \p UseGenericStateMachine flag from the constant kernel
+/// environment \p KernelEnvC.
+ConstantInt *
+getUseGenericStateMachineFromKernelEnvironment(ConstantStruct *KernelEnvC) {
+ ConstantStruct *ConfigC = getConfigurationFromKernelEnvironment(KernelEnvC);
+ constexpr const unsigned UseGenericStateMachineIdx = 0;
+ return dyn_cast<ConstantInt>(
+ ConfigC->getAggregateElement(UseGenericStateMachineIdx));
+}
+
+/// Return the \p MayUseNestedParallelism flag from the constant kernel
+/// environment \p KernelEnvC.
+ConstantInt *
+getMayUseNestedParallelismFromKernelEnvironment(ConstantStruct *KernelEnvC) {
+ ConstantStruct *ConfigC = getConfigurationFromKernelEnvironment(KernelEnvC);
+ constexpr const unsigned MayUseNestedParallelismIdx = 1;
+ return cast<ConstantInt>(
+ ConfigC->getAggregateElement(MayUseNestedParallelismIdx));
+}
+
+/// Return the \p ExecMode flag from the constant kernel environment \p
+/// KernelEnvC.
+ConstantInt *getExecModeFromKernelEnvironment(ConstantStruct *KernelEnvC) {
+ ConstantStruct *ConfigC = getConfigurationFromKernelEnvironment(KernelEnvC);
+ constexpr const unsigned ExecModeIdx = 2;
+ return dyn_cast<ConstantInt>(ConfigC->getAggregateElement(ExecModeIdx));
+}
+
/// The function kernel info abstract attribute, basically, what can we say
/// about a function with regards to the KernelInfoState.
struct AAKernelInfoFunction : AAKernelInfo {
@@ -3398,6 +3453,48 @@
return GuardedInstructions;
}
+ GlobalVariable *getKernelEnvironementGlobalVariable() {
+ constexpr const int InitKernelEnvironmentArgNo = 0;
+ return cast<GlobalVariable>(
+ KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo)
+ ->stripPointerCasts());
+ }
+
+ void setConfigurationOfKernelEnvironment(ConstantStruct *ConfigC) {
+ constexpr const unsigned ConfigurationIdx = 0;
+ Constant *NewKernelEnvC = ConstantFoldInsertValueInstruction(
+ KernelEnvC, ConfigC, {ConfigurationIdx});
+ assert(NewKernelEnvC && "Failed to create new kernel environment");
+ KernelEnvC = cast<ConstantStruct>(NewKernelEnvC);
+ }
+
+ void setUseGenericStateMachineOfKernelEnvironment(ConstantInt *NewVal) {
+ ConstantStruct *ConfigC = getConfigurationFromKernelEnvironment(KernelEnvC);
+ constexpr const unsigned UseGenericStateMachineIdx = 2;
+ Constant *NewConfigC = ConstantFoldInsertValueInstruction(
+ ConfigC, NewVal, {UseGenericStateMachineIdx});
+ assert(NewConfigC && "Failed to create new configuration environment");
+ setConfigurationOfKernelEnvironment(cast<ConstantStruct>(NewConfigC));
+ }
+
+ void setMayUseNestedParallelismOfKernelEnvironment(ConstantInt *NewVal) {
+ ConstantStruct *ConfigC = getConfigurationFromKernelEnvironment(KernelEnvC);
+ constexpr const unsigned ExecModeIdx = 1;
+ Constant *NewConfigC =
+ ConstantFoldInsertValueInstruction(ConfigC, NewVal, {ExecModeIdx});
+ assert(NewConfigC && "Failed to create new configuration environment");
+ setConfigurationOfKernelEnvironment(cast<ConstantStruct>(NewConfigC));
+ }
+
+ void setExecModeOfKernelEnvironment(ConstantInt *NewVal) {
+ ConstantStruct *ConfigC = getConfigurationFromKernelEnvironment(KernelEnvC);
+ constexpr const unsigned ExecModeIdx = 2;
+ Constant *NewConfigC =
+ ConstantFoldInsertValueInstruction(ConfigC, NewVal, {ExecModeIdx});
+ assert(NewConfigC && "Failed to create new configuration environment");
+ setConfigurationOfKernelEnvironment(cast<ConstantStruct>(NewConfigC));
+ }
+
/// See AbstractAttribute::initialize(...).
void initialize(Attributor &A) override {
// This is a high-level transform that might change the constant arguments
@@ -3446,6 +3543,9 @@
ReachingKernelEntries.insert(Fn);
IsKernelEntry = true;
+ GlobalVariable *KernelEnvGV = getKernelEnvironementGlobalVariable();
+ KernelEnvC = cast<ConstantStruct>(KernelEnvGV->getInitializer());
+
// For kernels we might need to initialize/finalize the IsSPMD state and
// we need to register a simplification callback so that the Attributor
// knows the constant arguments to __kmpc_target_init and
@@ -3495,27 +3595,30 @@
return Val;
};
- constexpr const int InitModeArgNo = 1;
- constexpr const int DeinitModeArgNo = 1;
- constexpr const int InitUseStateMachineArgNo = 2;
- A.registerSimplificationCallback(
- IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo),
- StateMachineSimplifyCB);
- A.registerSimplificationCallback(
- IRPosition::callsite_argument(*KernelInitCB, InitModeArgNo),
- ModeSimplifyCB);
+ Attributor::SimplifictionCallbackTy KernelConfigurationSimplifyCB =
+ [&](const IRPosition &IRP, const AbstractAttribute *AA,
+ bool &UsedAssumedInformation) -> std::optional<Value *> {
+ return KernelEnvC;
+ };
+
A.registerSimplificationCallback(
- IRPosition::callsite_argument(*KernelDeinitCB, DeinitModeArgNo),
- ModeSimplifyCB);
+ IRPosition::value(*KernelEnvGV->getInitializer()),
+ KernelConfigurationSimplifyCB);
+
+ ConstantInt *ExecModeC = getExecModeFromKernelEnvironment(KernelEnvC);
+ auto *AssumedExecModeC = ConstantInt::get(
+ ExecModeC->getType(),
+ ExecModeC->getZExtValue() | OMP_TGT_EXEC_MODE_GENERIC_SPMD);
// Check if we know we are in SPMD-mode already.
- ConstantInt *ModeArg =
- dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo));
- if (ModeArg && (ModeArg->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD))
+ if (ExecModeC->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD)
SPMDCompatibilityTracker.indicateOptimisticFixpoint();
- // This is a generic region but SPMDization is disabled so stop tracking.
else if (DisableOpenMPOptSPMDization)
+ // This is a generic region but SPMDization is disabled so stop
+ // tracking.
SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+ else
+ setExecModeOfKernelEnvironment(AssumedExecModeC);
// Register virtual uses of functions we might need to preserve.
auto RegisterVirtualUse = [&](RuntimeFunction RFKind,
@@ -3616,21 +3719,29 @@
if (!KernelInitCB || !KernelDeinitCB)
return ChangeStatus::UNCHANGED;
- /// Insert nested Parallelism global variable
- Function *Kernel = getAnchorScope();
- Module &M = *Kernel->getParent();
- Type *Int8Ty = Type::getInt8Ty(M.getContext());
- new GlobalVariable(M, Int8Ty, /* isConstant */ true,
- GlobalValue::WeakAnyLinkage,
- ConstantInt::get(Int8Ty, NestedParallelism ? 1 : 0),
- Kernel->getName() + "_nested_parallelism");
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
// If we can we change the execution mode to SPMD-mode otherwise we build a
// custom state machine.
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
if (!changeToSPMDMode(A, Changed)) {
if (!KernelInitCB->getCalledFunction()->isDeclaration())
- return buildCustomStateMachine(A);
+ Changed |= buildCustomStateMachine(A);
+ }
+
+ if (KernelEnvC) {
+ ConstantInt *MayUseNestedParallelismC =
+ getMayUseNestedParallelismFromKernelEnvironment(KernelEnvC);
+ ConstantInt *NewMayUseNestedParallelismC = ConstantInt::get(
+ MayUseNestedParallelismC->getType(), NestedParallelism);
+ setMayUseNestedParallelismOfKernelEnvironment(
+ NewMayUseNestedParallelismC);
+
+ // At last, update the KernelEnvc
+ GlobalVariable *KernelEnvGV = getKernelEnvironementGlobalVariable();
+ if (KernelEnvGV->getInitializer() != KernelEnvC) {
+ KernelEnvGV->setInitializer(KernelEnvC);
+ Changed = ChangeStatus::CHANGED;
+ }
}
return Changed;
@@ -3952,16 +4063,8 @@
assert(OMPInfoCache.Kernels.count(Kernel) && "Expected kernel function!");
// Check if the kernel is already in SPMD mode, if so, return success.
- GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
- (Kernel->getName() + "_exec_mode").str());
- assert(ExecMode && "Kernel without exec mode?");
- assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!");
-
- // Set the global exec mode flag to indicate SPMD-Generic mode.
- assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
- "ExecMode is not an integer!");
- const int8_t ExecModeVal =
- cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
+ auto *ExecModeC = getExecModeFromKernelEnvironment(KernelEnvC);
+ const int8_t ExecModeVal = ExecModeC->getSExtValue();
if (ExecModeVal != OMP_TGT_EXEC_MODE_GENERIC)
return true;
@@ -3979,27 +4082,8 @@
// kernel is executed in.
assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC &&
"Initially non-SPMD kernel has SPMD exec mode!");
- ExecMode->setInitializer(
- ConstantInt::get(ExecMode->getInitializer()->getType(),
- ExecModeVal | OMP_TGT_EXEC_MODE_GENERIC_SPMD));
-
- // Next rewrite the init and deinit calls to indicate we use SPMD-mode now.
- const int InitModeArgNo = 1;
- const int DeinitModeArgNo = 1;
- const int InitUseStateMachineArgNo = 2;
-
- auto &Ctx = getAnchorValue().getContext();
- A.changeUseAfterManifest(
- KernelInitCB->getArgOperandUse(InitModeArgNo),
- *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
- OMP_TGT_EXEC_MODE_SPMD));
- A.changeUseAfterManifest(
- KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo),
- *ConstantInt::getBool(Ctx, false));
- A.changeUseAfterManifest(
- KernelDeinitCB->getArgOperandUse(DeinitModeArgNo),
- *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
- OMP_TGT_EXEC_MODE_SPMD));
+ setExecModeOfKernelEnvironment(ConstantInt::get(
+ ExecModeC->getType(), ExecModeVal | OMP_TGT_EXEC_MODE_GENERIC_SPMD));
++NumOpenMPTargetRegionKernelsSPMD;
@@ -4019,30 +4103,29 @@
if (!ReachedKnownParallelRegions.isValidState())
return ChangeStatus::UNCHANGED;
- const int InitModeArgNo = 1;
- const int InitUseStateMachineArgNo = 2;
+ GlobalVariable *KernelEnvGV = getKernelEnvironementGlobalVariable();
+ assert(KernelEnvGV && "missing kernel environment global variable");
+ auto *ExistingKernelEnvC =
+ cast<ConstantStruct>(KernelEnvGV->getInitializer());
// Check if the current configuration is non-SPMD and generic state machine.
// If we already have SPMD mode or a custom state machine we do not need to
// go any further. If it is anything but a constant something is weird and
// we give up.
- ConstantInt *UseStateMachine = dyn_cast<ConstantInt>(
- KernelInitCB->getArgOperand(InitUseStateMachineArgNo));
- ConstantInt *Mode =
- dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitModeArgNo));
+ ConstantInt *UseStateMachineC =
+ getUseGenericStateMachineFromKernelEnvironment(ExistingKernelEnvC);
+ ConstantInt *ModeC = getExecModeFromKernelEnvironment(ExistingKernelEnvC);
// If we are stuck with generic mode, try to create a custom device (=GPU)
// state machine which is specialized for the parallel regions that are
// reachable by the kernel.
- if (!UseStateMachine || UseStateMachine->isZero() || !Mode ||
- (Mode->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD))
+ if (!UseStateMachineC->isZero() ||
+ (ModeC->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD))
return ChangeStatus::UNCHANGED;
// If not SPMD mode, indicate we use a custom state machine now.
- auto &Ctx = getAnchorValue().getContext();
- auto *FalseVal = ConstantInt::getBool(Ctx, false);
- A.changeUseAfterManifest(
- KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal);
+ setUseGenericStateMachineOfKernelEnvironment(
+ ConstantInt::get(UseStateMachineC->getType(), false));
// If we don't actually need a state machine we are done here. This can
// happen if there simply are no parallel regions. In the resulting kernel
@@ -4121,6 +4204,7 @@
// UserCodeEntryBB: // user code
// __kmpc_target_deinit(...)
//
+ auto &Ctx = getAnchorValue().getContext();
Function *Kernel = getAssociatedFunction();
assert(Kernel && "Expected an associated function!");
@@ -4204,7 +4288,7 @@
StateMachineBeginBB->end()),
DLoc));
- Value *Ident = KernelInitCB->getArgOperand(0);
+ Value *Ident = getIdentFromKernelEnvironment(KernelEnvC);
Value *GTid = KernelInitCB;
FunctionCallee BarrierFn =
@@ -4334,6 +4418,39 @@
ChangeStatus updateImpl(Attributor &A) override {
KernelInfoState StateBefore = getState();
+ // When we leave this function this RAII will make sure the member
+ // KernelEnvC is updated properly depending on the state. That member is
+ // used for simplification of values and needs to be up to date at all
+ // times.
+ struct UpdateKernelEnvCRAII {
+ AAKernelInfoFunction &AA;
+
+ UpdateKernelEnvCRAII(AAKernelInfoFunction &AA) : AA(AA) {}
+
+ ~UpdateKernelEnvCRAII() {
+ if (!AA.KernelEnvC)
+ return;
+
+ GlobalVariable *KernelEnvGV = AA.getKernelEnvironementGlobalVariable();
+ ConstantStruct *ExistingKernelEnvC =
+ cast<ConstantStruct>(KernelEnvGV->getInitializer());
+
+ if (!AA.isValidState()) {
+ AA.KernelEnvC = ExistingKernelEnvC;
+ return;
+ }
+
+ if (!AA.ReachedKnownParallelRegions.isValidState())
+ AA.setUseGenericStateMachineOfKernelEnvironment(
+ getUseGenericStateMachineFromKernelEnvironment(
+ ExistingKernelEnvC));
+
+ if (!AA.SPMDCompatibilityTracker.isValidState())
+ AA.setExecModeOfKernelEnvironment(
+ getExecModeFromKernelEnvironment(ExistingKernelEnvC));
+ }
+ } RAII(*this);
+
// Callback to check a read/write instruction.
auto CheckRWInst = [&](Instruction &I) {
// We handle calls later.
Index: llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
===================================================================
--- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3878,14 +3878,39 @@
ConstantInt *IsSPMDVal = ConstantInt::getSigned(
IntegerType::getInt8Ty(Int8->getContext()),
IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
- ConstantInt *UseGenericStateMachine =
- ConstantInt::getBool(Int32->getContext(), !IsSPMD);
-
+ ConstantInt *UseGenericStateMachineVal = ConstantInt::getSigned(
+ IntegerType::getInt8Ty(Int8->getContext()), !IsSPMD);
+ ConstantInt *MayUseNestedParallelismVal =
+ ConstantInt::getSigned(IntegerType::getInt8Ty(Int8->getContext()), true);
+ ConstantInt *DebugIndentionLevelVal =
+ ConstantInt::getSigned(IntegerType::getInt16Ty(Int8->getContext()), 0);
+
+ Function *Kernel = Builder.GetInsertBlock()->getParent();
Function *Fn = getOrCreateRuntimeFunctionPtr(
omp::RuntimeFunction::OMPRTL___kmpc_target_init);
-
- CallInst *ThreadKind = Builder.CreateCall(
- Fn, {Ident, IsSPMDVal, UseGenericStateMachine});
+ const DataLayout &DL = Fn->getParent()->getDataLayout();
+
+ Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
+ ConfigurationEnvironmentTy, {
+ UseGenericStateMachineVal,
+ MayUseNestedParallelismVal,
+ IsSPMDVal,
+ });
+ Constant *KernelEnvironmentInitializer = ConstantStruct::get(
+ KernelEnvironmentTy, {
+ ConfigurationEnvironmentInitializer,
+ Ident,
+ DebugIndentionLevelVal,
+ });
+ Twine KernelEnvironmentName = Kernel->getName() + "_kernel_info";
+ GlobalVariable *KernelEnvironment = new GlobalVariable(
+ M, KernelEnvironmentTy, /* IsConstant */ true,
+ GlobalValue::ExternalLinkage, KernelEnvironmentInitializer,
+ KernelEnvironmentName,
+ /* InsertBefore */ nullptr, llvm::GlobalValue::NotThreadLocal,
+ DL.getDefaultGlobalsAddressSpace());
+
+ CallInst *ThreadKind = Builder.CreateCall(Fn, {KernelEnvironment});
Value *ExecUserCode = Builder.CreateICmpEQ(
ThreadKind, ConstantInt::get(ThreadKind->getType(), -1),
@@ -3918,22 +3943,14 @@
return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
}
-void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
- bool IsSPMD) {
+void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc) {
if (!updateToLocation(Loc))
return;
- uint32_t SrcLocStrSize;
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
- Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
- ConstantInt *IsSPMDVal = ConstantInt::getSigned(
- IntegerType::getInt8Ty(Int8->getContext()),
- IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
-
Function *Fn = getOrCreateRuntimeFunctionPtr(
omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
- Builder.CreateCall(Fn, {Ident, IsSPMDVal});
+ Builder.CreateCall(Fn, {});
}
void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
Index: llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -96,6 +96,10 @@
Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32)
__OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, false, Int8Ptr)
__OMP_STRUCT_TYPE(DependInfo, kmp_dep_info, false, SizeTy, SizeTy, Int8)
+__OMP_STRUCT_TYPE(ConfigurationEnvironmentTy, ConfigurationEnvironmentTy, false,
+ Int8, Int8, Int8)
+__OMP_STRUCT_TYPE(KernelEnvironmentTy, KernelEnvironmentTy, false,
+ ConfigurationEnvironmentTy, IdentPtr, Int16)
#undef __OMP_STRUCT_TYPE
#undef OMP_STRUCT_TYPE
@@ -452,8 +456,8 @@
/* Int */ Int32, /* kmp_task_t */ VoidPtr)
/// OpenMP Device runtime functions
-__OMP_RTL(__kmpc_target_init, false, Int32, IdentPtr, Int8, Int1)
-__OMP_RTL(__kmpc_target_deinit, false, Void, IdentPtr, Int8)
+__OMP_RTL(__kmpc_target_init, false, Int32, KernelEnvironmentTyPtr)
+__OMP_RTL(__kmpc_target_deinit, false, Void,)
__OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
@@ -1012,9 +1016,9 @@
ReturnPtrAttrs, ParamAttrs(ReadOnlyPtrAttrs, SExt))
__OMP_RTL_ATTRS(__kmpc_target_init, AttributeSet(), SExt,
- ParamAttrs(AttributeSet(), SExt, SExt))
+ ParamAttrs(AttributeSet()))
__OMP_RTL_ATTRS(__kmpc_target_deinit, AttributeSet(), AttributeSet(),
- ParamAttrs(AttributeSet(), SExt))
+ ParamAttrs())
__OMP_RTL_ATTRS(__kmpc_parallel_51, AlwaysInlineAttrs, AttributeSet(),
ParamAttrs(AttributeSet(), SExt, SExt, SExt, SExt,
AttributeSet(), AttributeSet(), AttributeSet(),
Index: llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1461,8 +1461,7 @@
/// Create a runtime call for kmpc_target_deinit
///
/// \param Loc The insert and source location description.
- /// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not.
- void createTargetDeinit(const LocationDescription &Loc, bool IsSPMD);
+ void createTargetDeinit(const LocationDescription &Loc);
///}
Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -781,7 +781,7 @@
emitGenericVarsEpilog(CGF);
CGBuilderTy &Bld = CGF.Builder;
- OMPBuilder.createTargetDeinit(Bld, IsSPMD);
+ OMPBuilder.createTargetDeinit(Bld);
}
void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits