saiislam created this revision.
saiislam added reviewers: ABataev, jdoerfert, JonChesterfield.
Herald added subscribers: cfe-commits, guansong, yaxunl, jvesely, jholewinski.
Herald added a project: clang.
saiislam requested review of this revision.
Herald added a subscriber: sstefan1.
Provide support for amdgcn specific global variables and attributes.
Generalize allocation of various common global variables and provide
their specialized implementations for nvptx and amdgcn.
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D86097
Files:
clang/lib/CodeGen/CGOpenMPRuntime.cpp
clang/lib/CodeGen/CGOpenMPRuntime.h
clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
clang/test/OpenMP/amdgcn_target_codegen.cpp
Index: clang/test/OpenMP/amdgcn_target_codegen.cpp
===================================================================
--- clang/test/OpenMP/amdgcn_target_codegen.cpp
+++ clang/test/OpenMP/amdgcn_target_codegen.cpp
@@ -8,6 +8,22 @@
#define N 1000
+// CHECK: @"_openmp_kernel_static_glob_rd$ptr" = weak addrspace(3) externally_initialized global i8* undef
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threadsv_l36_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 0, i8 1, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threadsv_l36_exec_mode = weak constant i8 1
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threads_simdv_l52_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 0, i8 0, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_tid_threads_simdv_l52_exec_mode = weak constant i8 0
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l63_wg_size = weak addrspace(1) constant i16 10
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l63_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 10, i8 0, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_spmdv_l63_exec_mode = weak constant i8 0
+
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l75_wg_size = weak addrspace(1) constant i16 74
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l75_kern_desc = weak constant %struct.__tgt_attribute_struct { i16 2, i16 9, i16 74, i8 1, i8 1, i8 0 }, align 2
+// CHECK: @__omp_offloading_{{.*}}test_amdgcn_target_attributes_non_spmdv_l75_exec_mode = weak constant i8 1
+
int test_amdgcn_target_tid_threads() {
// CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads
@@ -40,4 +56,40 @@
return arr[0];
}
+int test_amdgcn_target_attributes_spmd() {
+ int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="10,10"
+#pragma omp target parallel num_threads(10)
+ for (int i = 0; i < N; i++) {
+ arr[i] = 1;
+ }
+
+ return arr[0];
+}
+
+int test_amdgcn_target_attributes_non_spmd() {
+ int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="74,74"
+#pragma omp target teams thread_limit(10)
+ for (int i = 0; i < N; i++) {
+ arr[i] = 1;
+ }
+
+ return arr[0];
+}
+
+int test_amdgcn_target_attributes_max_work_group_size() {
+ int arr[N];
+
+// CHECK: {{.*}}"amdgpu-flat-work-group-size"="1024,1024"
+#pragma omp target teams thread_limit(1500)
+ for (int i = 0; i < N; i++) {
+ arr[i] = 1;
+ }
+
+ return arr[0];
+}
+
#endif
Index: clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -35,6 +35,34 @@
/// Get the maximum number of threads in a block of the GPU.
llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
+
+ /// Allocate global variable for TransferMedium
+ virtual llvm::GlobalVariable *
+ allocateTransferMediumGlobal(CodeGenModule &CGM, llvm::ArrayType *Ty,
+ StringRef Name) override;
+
+ /// Allocate global variable for SharedStaticRD
+ virtual llvm::GlobalVariable *
+ allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+ llvm::Type *LLVMStaticTy) override;
+
+ /// Allocate global variable for KernelStaticGlobalized
+ virtual llvm::GlobalVariable *
+ allocateKernelStaticGlobalized(CodeGenModule &CGM) override;
+
+ /// Emit target specific SPMD kernel
+ virtual void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+ StringRef ParentName,
+ llvm::Function *&OutlinedFn,
+ llvm::Constant *&OutlinedFnID,
+ bool IsOffloadEntry,
+ const RegionCodeGenTy &CodeGen) override;
+ /// Emit target specific Non-SPMD kernel
+ virtual void
+ emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+ StringRef ParentName, llvm::Function *&OutlinedFn,
+ llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+ const RegionCodeGenTy &CodeGen) override;
};
} // CodeGen namespace.
Index: clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -30,6 +30,11 @@
: CGOpenMPRuntimeGPU(CGM) {
if (!CGM.getLangOpts().OpenMPIsDevice)
llvm_unreachable("OpenMP NVPTX can only handle device code.");
+
+ // FIXME: nvlink does not handle weak linkage correctly (object with the
+ // different size are reported as erroneous).
+ // Restore CommonLinkage as soon as nvlink is fixed.
+ StaticRDLinkage = llvm::GlobalValue::InternalLinkage;
}
llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {
@@ -54,3 +59,52 @@
&CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x);
return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
}
+
+llvm::GlobalVariable *CGOpenMPRuntimeNVPTX::allocateTransferMediumGlobal(
+ CodeGenModule &CGM, llvm::ArrayType *Ty, StringRef TransferMediumName) {
+ return new llvm::GlobalVariable(
+ CGM.getModule(), Ty, /*isConstant=*/false,
+ llvm::GlobalVariable::CommonLinkage, llvm::Constant::getNullValue(Ty),
+ TransferMediumName,
+ /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+ CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeNVPTX::allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+ llvm::Type *LLVMStaticTy) {
+ return new llvm::GlobalVariable(
+ CGM.getModule(), LLVMStaticTy,
+ /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
+ llvm::Constant::getNullValue(LLVMStaticTy),
+ "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
+ llvm::GlobalValue::NotThreadLocal,
+ CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeNVPTX::allocateKernelStaticGlobalized(CodeGenModule &CGM) {
+ return new llvm::GlobalVariable(
+ CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
+ llvm::GlobalValue::InternalLinkage,
+ llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
+ "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
+ llvm::GlobalValue::NotThreadLocal,
+ CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+}
+
+void CGOpenMPRuntimeNVPTX::emitSPMDKernelWrapper(
+ const OMPExecutableDirective &D, StringRef ParentName,
+ llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+ bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+ emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+ CodeGen);
+}
+
+void CGOpenMPRuntimeNVPTX::emitNonSPMDKernelWrapper(
+ const OMPExecutableDirective &D, StringRef ParentName,
+ llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+ bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+ emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+ CodeGen);
+}
Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -33,6 +33,9 @@
/// Unknown execution mode (orphaned directive).
EM_Unknown,
};
+ /// Linkage type of StaticRD Global variable
+ llvm::GlobalValue::LinkageTypes StaticRDLinkage;
+
private:
/// Parallel outlined function work for workers to execute.
llvm::SmallVector<llvm::Function *, 16> Work;
@@ -99,36 +102,6 @@
uint64_t Size, int32_t Flags,
llvm::GlobalValue::LinkageTypes Linkage) override;
- /// Emit outlined function specialized for the Fork-Join
- /// programming model for applicable target directives on the NVPTX device.
- /// \param D Directive to emit.
- /// \param ParentName Name of the function that encloses the target region.
- /// \param OutlinedFn Outlined function value to be defined by this call.
- /// \param OutlinedFnID Outlined function ID value to be defined by this call.
- /// \param IsOffloadEntry True if the outlined function is an offload entry.
- /// An outlined function may not be an entry if, e.g. the if clause always
- /// evaluates to false.
- void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
- llvm::Function *&OutlinedFn,
- llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
- const RegionCodeGenTy &CodeGen);
-
- /// Emit outlined function specialized for the Single Program
- /// Multiple Data programming model for applicable target directives on the
- /// NVPTX device.
- /// \param D Directive to emit.
- /// \param ParentName Name of the function that encloses the target region.
- /// \param OutlinedFn Outlined function value to be defined by this call.
- /// \param OutlinedFnID Outlined function ID value to be defined by this call.
- /// \param IsOffloadEntry True if the outlined function is an offload entry.
- /// \param CodeGen Object containing the target statements.
- /// An outlined function may not be an entry if, e.g. the if clause always
- /// evaluates to false.
- void emitSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
- llvm::Function *&OutlinedFn,
- llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
- const RegionCodeGenTy &CodeGen);
-
/// Emit outlined function for 'target' directive on the NVPTX
/// device.
/// \param D Directive to emit.
@@ -211,6 +184,64 @@
/// Get the maximum number of threads in a block of the GPU.
virtual llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) = 0;
+ /// Allocate global variable for TransferMedium
+ virtual llvm::GlobalVariable *
+ allocateTransferMediumGlobal(CodeGenModule &CGM, llvm::ArrayType *Ty,
+ StringRef TransferMediumName) = 0;
+
+ /// Allocate global variable for SharedStaticRD
+ virtual llvm::GlobalVariable *
+ allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+ llvm::Type *LLVMStaticTy) = 0;
+
+ /// Allocate global variable for KernelStaticGlobalized
+ virtual llvm::GlobalVariable *
+ allocateKernelStaticGlobalized(CodeGenModule &CGM) = 0;
+
+ virtual void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+ StringRef ParentName,
+ llvm::Function *&OutlinedFn,
+ llvm::Constant *&OutlinedFnID,
+ bool IsOffloadEntry,
+ const RegionCodeGenTy &CodeGen) = 0;
+
+ virtual void emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+ StringRef ParentName,
+ llvm::Function *&OutlinedFn,
+ llvm::Constant *&OutlinedFnID,
+ bool IsOffloadEntry,
+ const RegionCodeGenTy &CodeGen) = 0;
+
+ /// Emit outlined function specialized for the Single Program
+ /// Multiple Data programming model for applicable target directives on the
+ /// NVPTX device.
+ /// \param D Directive to emit.
+ /// \param ParentName Name of the function that encloses the target region.
+ /// \param OutlinedFn Outlined function value to be defined by this call.
+ /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+ /// \param IsOffloadEntry True if the outlined function is an offload entry.
+ /// \param CodeGen Object containing the target statements.
+ /// An outlined function may not be an entry if, e.g. the if clause always
+ /// evaluates to false.
+ void emitSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
+ llvm::Function *&OutlinedFn,
+ llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+ const RegionCodeGenTy &CodeGen);
+
+ /// Emit outlined function specialized for the Fork-Join
+ /// programming model for applicable target directives on the NVPTX device.
+ /// \param D Directive to emit.
+ /// \param ParentName Name of the function that encloses the target region.
+ /// \param OutlinedFn Outlined function value to be defined by this call.
+ /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+ /// \param IsOffloadEntry True if the outlined function is an offload entry.
+ /// An outlined function may not be an entry if, e.g. the if clause always
+ /// evaluates to false.
+ void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
+ llvm::Function *&OutlinedFn,
+ llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+ const RegionCodeGenTy &CodeGen);
+
/// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
/// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
virtual void emitProcBindClause(CodeGenFunction &CGF,
Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "CGOpenMPRuntimeGPU.h"
+#include "CGOpenMPRuntimeAMDGCN.h"
#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
@@ -1190,13 +1191,8 @@
// Reserve place for the globalized memory.
GlobalizedRecords.emplace_back();
if (!KernelStaticGlobalized) {
- KernelStaticGlobalized = new llvm::GlobalVariable(
- CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
- llvm::GlobalValue::InternalLinkage,
- llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
- "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
- llvm::GlobalValue::NotThreadLocal,
- CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+ auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
+ KernelStaticGlobalized = RT.allocateKernelStaticGlobalized(CGM);
}
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
IsOffloadEntry, CodeGen);
@@ -1321,13 +1317,8 @@
// Reserve place for the globalized memory.
GlobalizedRecords.emplace_back();
if (!KernelStaticGlobalized) {
- KernelStaticGlobalized = new llvm::GlobalVariable(
- CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
- llvm::GlobalValue::InternalLinkage,
- llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
- "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
- llvm::GlobalValue::NotThreadLocal,
- CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
+ auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
+ KernelStaticGlobalized = RT.allocateKernelStaticGlobalized(CGM);
}
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
IsOffloadEntry, CodeGen);
@@ -1396,11 +1387,11 @@
// warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
bool Mode) {
- auto *GVMode =
- new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
- llvm::GlobalValue::WeakAnyLinkage,
- llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
- Twine(Name, "_exec_mode"));
+ auto *GVMode = new llvm::GlobalVariable(
+ CGM.getModule(), CGM.Int8Ty,
+ /*isConstant=*/true, llvm::GlobalValue::WeakAnyLinkage,
+ llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
+ Twine(Name, "_exec_mode"));
CGM.addCompilerUsedGlobal(GVMode);
}
@@ -1888,12 +1879,11 @@
bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
if (Mode)
- emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
- CodeGen);
+ emitSPMDKernelWrapper(D, ParentName, OutlinedFn, OutlinedFnID,
+ IsOffloadEntry, CodeGen);
else
- emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
- CodeGen);
-
+ emitNonSPMDKernelWrapper(D, ParentName, OutlinedFn, OutlinedFnID,
+ IsOffloadEntry, CodeGen);
setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}
@@ -3231,6 +3221,7 @@
CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
CGBuilderTy &Bld = CGF.Builder;
+ auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
// This array is used as a medium to transfer, one reduce element at a time,
// the data from the first lane of every warp to lanes in the first warp
@@ -3246,16 +3237,11 @@
unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size);
if (!TransferMedium) {
auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
- unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
- TransferMedium = new llvm::GlobalVariable(
- M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
- llvm::Constant::getNullValue(Ty), TransferMediumName,
- /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
- SharedAddressSpace);
+ TransferMedium =
+ RT.allocateTransferMediumGlobal(CGM, Ty, TransferMediumName);
CGM.addCompilerUsedGlobal(TransferMedium);
}
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
// Get the CUDA thread id of the current OpenMP thread on the GPU.
llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
// nvptx_lane_id = nvptx_id % warpsize
@@ -5104,6 +5090,7 @@
}
void CGOpenMPRuntimeGPU::clear() {
+ auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGM.getOpenMPRuntime());
if (!GlobalizedRecords.empty() &&
!CGM.getLangOpts().OpenMPCUDATargetParallel) {
ASTContext &C = CGM.getContext();
@@ -5152,9 +5139,6 @@
llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
}
// Allocate SharedMemorySize buffer for the shared memory.
- // FIXME: nvlink does not handle weak linkage correctly (object with the
- // different size are reported as erroneous).
- // Restore this code as sson as nvlink is fixed.
if (!SharedStaticRD->field_empty()) {
llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
QualType SubTy = C.getConstantArrayType(
@@ -5171,13 +5155,7 @@
if (!SharedStaticRD->field_empty()) {
QualType StaticTy = C.getRecordType(SharedStaticRD);
llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
- auto *GV = new llvm::GlobalVariable(
- CGM.getModule(), LLVMStaticTy,
- /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
- llvm::Constant::getNullValue(LLVMStaticTy),
- "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
- llvm::GlobalValue::NotThreadLocal,
- C.getTargetAddressSpace(LangAS::cuda_shared));
+ auto *GV = RT.allocateSharedStaticRDGlobal(CGM, LLVMStaticTy);
auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
GV, CGM.VoidPtrTy);
for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
@@ -5198,14 +5176,11 @@
C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
/*IndexTypeQuals=*/0);
llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
- // FIXME: nvlink does not handle weak linkage correctly (object with the
- // different size are reported as erroneous).
- // Restore CommonLinkage as soon as nvlink is fixed.
- auto *GV = new llvm::GlobalVariable(
- CGM.getModule(), LLVMArr2Ty,
- /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
- llvm::Constant::getNullValue(LLVMArr2Ty),
- "_openmp_static_glob_rd_$_");
+ auto *GV =
+ new llvm::GlobalVariable(CGM.getModule(), LLVMArr2Ty,
+ /*isConstant=*/false, RT.StaticRDLinkage,
+ llvm::Constant::getNullValue(LLVMArr2Ty),
+ "_openmp_static_glob_rd_$_");
auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
GV, CGM.VoidPtrTy);
for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
@@ -5238,7 +5213,7 @@
// Restore CommonLinkage as soon as nvlink is fixed.
auto *GV = new llvm::GlobalVariable(
CGM.getModule(), LLVMReductionsBufferTy,
- /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
+ /*isConstant=*/false, RT.StaticRDLinkage,
llvm::Constant::getNullValue(LLVMReductionsBufferTy),
"_openmp_teams_reductions_buffer_$_");
KernelTeamsReductionPtr->setInitializer(
Index: clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
+++ clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h
@@ -25,6 +25,10 @@
class CGOpenMPRuntimeAMDGCN final : public CGOpenMPRuntimeGPU {
public:
+ /// Nesting level of parallel region.
+ int ParallelLevel = 0;
+ int MaxParallelLevel = 0;
+
explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM);
/// Get the GPU warp size.
@@ -35,6 +39,46 @@
/// Get the maximum number of threads in a block of the GPU.
llvm::Value *getGPUNumThreads(CodeGenFunction &CGF) override;
+
+ /// Allocate global variable for TransferMedium
+ virtual llvm::GlobalVariable *
+ allocateTransferMediumGlobal(CodeGenModule &CGM, llvm::ArrayType *Ty,
+ StringRef TransferMediumName) override;
+
+ /// Allocate global variable for SharedStaticRD
+ virtual llvm::GlobalVariable *
+ allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+ llvm::Type *LLVMStaticTy) override;
+
+ /// Get global variable KernelStaticGlobalized which is a shared pointer for
+ /// the global memory in the global memory buffer used for the given kernel
+ virtual llvm::GlobalVariable *
+ allocateKernelStaticGlobalized(CodeGenModule &CGM) override;
+
+  /// Emit target specific SPMD kernel
+ virtual void emitSPMDKernelWrapper(const OMPExecutableDirective &D,
+ StringRef ParentName,
+ llvm::Function *&OutlinedFn,
+ llvm::Constant *&OutlinedFnID,
+ bool IsOffloadEntry,
+ const RegionCodeGenTy &CodeGen) override;
+
+ /// Emit target specific Non-SPMD kernel
+ virtual void
+ emitNonSPMDKernelWrapper(const OMPExecutableDirective &D,
+ StringRef ParentName, llvm::Function *&OutlinedFn,
+ llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+ const RegionCodeGenTy &CodeGen) override;
+
+ /// Create a unique global variable to indicate the flat-work-group-size
+ /// for this region. Values are [256..1024].
+ static void setPropertyWorkGroupSize(CodeGenModule &CGM, StringRef Name,
+ unsigned WGSize);
+
+ /// Generate global variables _wg_size, kern_desc, __tgt_attribute_struct.
+ /// Also generate appropriate value of attribute amdgpu-flat-work-group-size
+ void generateMetaData(CodeGenModule &CGM, const OMPExecutableDirective &D,
+ llvm::Function *&OutlinedFn, bool IsGeneric);
};
} // namespace CodeGen
Index: clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "CGOpenMPRuntimeAMDGCN.h"
+#include "CGOpenMPRuntime.h"
#include "CGOpenMPRuntimeGPU.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
@@ -30,6 +31,7 @@
: CGOpenMPRuntimeGPU(CGM) {
if (!CGM.getLangOpts().OpenMPIsDevice)
llvm_unreachable("OpenMP AMDGCN can only handle device code.");
+ StaticRDLinkage = llvm::GlobalValue::PrivateLinkage;
}
llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
@@ -59,3 +61,133 @@
return Bld.CreateTrunc(
Bld.CreateCall(F, {Bld.getInt32(0)}, "nvptx_num_threads"), CGF.Int32Ty);
}
+
+llvm::GlobalVariable *CGOpenMPRuntimeAMDGCN::allocateTransferMediumGlobal(
+ CodeGenModule &CGM, llvm::ArrayType *Ty, StringRef TransferMediumName) {
+ return new llvm::GlobalVariable(
+ CGM.getModule(), Ty, /*isConstant=*/false,
+ llvm::GlobalVariable::WeakAnyLinkage, llvm::UndefValue::get(Ty),
+ TransferMediumName,
+ /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+ CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+ /*isExternallyInitialized*/ true);
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeAMDGCN::allocateSharedStaticRDGlobal(CodeGenModule &CGM,
+ llvm::Type *LLVMStaticTy) {
+ return new llvm::GlobalVariable(
+ CGM.getModule(), LLVMStaticTy,
+ /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
+ llvm::UndefValue::get(LLVMStaticTy), "_openmp_shared_static_glob_rd_$_",
+ /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+ CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+ /*isExternallyInitialized*/ true);
+}
+
+llvm::GlobalVariable *
+CGOpenMPRuntimeAMDGCN::allocateKernelStaticGlobalized(CodeGenModule &CGM) {
+ return new llvm::GlobalVariable(
+ CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
+ llvm::GlobalValue::WeakAnyLinkage, llvm::UndefValue::get(CGM.VoidPtrTy),
+ "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
+ llvm::GlobalValue::NotThreadLocal,
+ CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared),
+ /*isExternallyInitialized*/ true);
+}
+
+void CGOpenMPRuntimeAMDGCN::setPropertyWorkGroupSize(CodeGenModule &CGM,
+ StringRef Name,
+ unsigned WGSize) {
+ auto *GVMode = new llvm::GlobalVariable(
+ CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
+ llvm::GlobalValue::WeakAnyLinkage,
+ llvm::ConstantInt::get(CGM.Int16Ty, WGSize), Name + Twine("_wg_size"),
+ /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+ CGM.getContext().getTargetAddressSpace(LangAS::cuda_device),
+ /*isExternallyInitialized*/ false);
+ CGM.addCompilerUsedGlobal(GVMode);
+}
+
+void CGOpenMPRuntimeAMDGCN::generateMetaData(CodeGenModule &CGM,
+ const OMPExecutableDirective &D,
+ llvm::Function *&OutlinedFn,
+ bool IsGeneric) {
+ if (!CGM.getTriple().isAMDGCN())
+ return;
+ int FlatAttr = 0;
+ bool flatAttrEmitted = false;
+ unsigned DefaultWorkGroupSz =
+ CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Default_WG_Size);
+
+ if (isOpenMPTeamsDirective(D.getDirectiveKind()) ||
+ isOpenMPParallelDirective(D.getDirectiveKind())) {
+ const auto *ThreadLimitClause = D.getSingleClause<OMPThreadLimitClause>();
+ const auto *NumThreadsClause = D.getSingleClause<OMPNumThreadsClause>();
+ unsigned MaxWorkGroupSz =
+ CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Max_WG_Size);
+ unsigned compileTimeThreadLimit = 0;
+    // Only one of thread_limit or num_threads is used; can't do it for both
+ if (ThreadLimitClause && !NumThreadsClause) {
+ Expr *ThreadLimitExpr = ThreadLimitClause->getThreadLimit();
+ clang::Expr::EvalResult Result;
+ if (ThreadLimitExpr->EvaluateAsInt(Result, CGM.getContext()))
+ compileTimeThreadLimit = Result.Val.getInt().getExtValue();
+ } else if (!ThreadLimitClause && NumThreadsClause) {
+ Expr *NumThreadsExpr = NumThreadsClause->getNumThreads();
+ clang::Expr::EvalResult Result;
+ if (NumThreadsExpr->EvaluateAsInt(Result, CGM.getContext()))
+ compileTimeThreadLimit = Result.Val.getInt().getExtValue();
+ }
+
+ // Add kernel metadata if ThreadLimit Clause is compile time constant > 0
+ if (compileTimeThreadLimit > 0) {
+ // Add the WarpSize to generic, to reflect what runtime dispatch does.
+ if (IsGeneric)
+ compileTimeThreadLimit +=
+ CGM.getTarget().getGridValue(llvm::omp::GVIDX::GV_Warp_Size);
+ if (compileTimeThreadLimit > MaxWorkGroupSz)
+ compileTimeThreadLimit = MaxWorkGroupSz;
+ std::string AttrVal = llvm::utostr(compileTimeThreadLimit);
+ FlatAttr = compileTimeThreadLimit;
+ OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
+ AttrVal + "," + AttrVal);
+ setPropertyWorkGroupSize(CGM, OutlinedFn->getName(),
+ compileTimeThreadLimit);
+ }
+ flatAttrEmitted = true;
+ } // end of amdgcn teams or parallel directive
+
+ // emit amdgpu-flat-work-group-size if not emitted already.
+ if (!flatAttrEmitted) {
+ std::string FlatAttrVal = llvm::utostr(DefaultWorkGroupSz);
+ OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
+ FlatAttrVal + "," + FlatAttrVal);
+ }
+ // Emit a kernel descriptor for runtime.
+ StringRef KernDescName = OutlinedFn->getName();
+ CGOpenMPRuntime::emitStructureKernelDesc(CGM, KernDescName, FlatAttr,
+ IsGeneric,
+ 1, // Uses HostServices
+ MaxParallelLevel);
+ // Reset it to zero for any subsequent kernel
+ MaxParallelLevel = 0;
+}
+
+void CGOpenMPRuntimeAMDGCN::emitSPMDKernelWrapper(
+ const OMPExecutableDirective &D, StringRef ParentName,
+ llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+ bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+ emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+ CodeGen);
+  generateMetaData(CGM, D, OutlinedFn, /*IsGeneric=*/false);
+}
+
+void CGOpenMPRuntimeAMDGCN::emitNonSPMDKernelWrapper(
+ const OMPExecutableDirective &D, StringRef ParentName,
+ llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+ bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+ emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+ CodeGen);
+  generateMetaData(CGM, D, OutlinedFn, /*IsGeneric=*/true);
+}
Index: clang/lib/CodeGen/CGOpenMPRuntime.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntime.h
+++ clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -495,6 +495,8 @@
QualType TgtOffloadEntryQTy;
/// Entity that registers the offloading constants that were emitted so
/// far.
+
+ QualType TgtAttributeStructQTy;
class OffloadEntriesInfoManagerTy {
CodeGenModule &CGM;
@@ -1755,6 +1757,14 @@
llvm::FunctionCallee OutlinedFn,
ArrayRef<llvm::Value *> Args = llvm::None) const;
+ /// Returns __tgt_attribute_struct type.
+ QualType getTgtAttributeStructQTy();
+
+ /// Emit structure descriptor for a kernel
+ void emitStructureKernelDesc(CodeGenModule &CGM, StringRef Name,
+ int16_t WG_Size, int8_t Mode,
+ int8_t HostServices, int8_t MaxParallelLevel);
+
/// Emits OpenMP-specific function prolog.
/// Required for device constructs.
virtual void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D);
Index: clang/lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -3335,6 +3335,47 @@
}
}
+/// Emit structure descriptor for a kernel
+void CGOpenMPRuntime::emitStructureKernelDesc(CodeGenModule &CGM,
+ StringRef Name, int16_t WG_Size,
+ int8_t Mode, int8_t HostServices,
+ int8_t MaxParallelLevel) {
+
+ // Create all device images
+ llvm::Constant *AttrData[] = {
+ llvm::ConstantInt::get(CGM.Int16Ty, 2), // Version
+ llvm::ConstantInt::get(CGM.Int16Ty, 9), // Size in bytes
+ llvm::ConstantInt::get(CGM.Int16Ty, WG_Size),
+ llvm::ConstantInt::get(CGM.Int8Ty, Mode), // 0 => SPMD, 1 => GENERIC
+ llvm::ConstantInt::get(CGM.Int8Ty, HostServices), // 1 => use HostServices
+ llvm::ConstantInt::get(CGM.Int8Ty, MaxParallelLevel)}; // number of nests
+
+ llvm::GlobalVariable *AttrImages = createGlobalStruct(
+ CGM, getTgtAttributeStructQTy(), isDefaultLocationConstant(), AttrData,
+ Name + Twine("_kern_desc"), llvm::GlobalValue::WeakAnyLinkage);
+ CGM.addCompilerUsedGlobal(AttrImages);
+}
+
+// Create Tgt Attribute Struct type.
+QualType CGOpenMPRuntime::getTgtAttributeStructQTy() {
+ ASTContext &C = CGM.getContext();
+ QualType KmpInt8Ty = C.getIntTypeForBitwidth(/*Width=*/8, /*Signed=*/1);
+ QualType KmpInt16Ty = C.getIntTypeForBitwidth(/*Width=*/16, /*Signed=*/1);
+ if (TgtAttributeStructQTy.isNull()) {
+ RecordDecl *RD = C.buildImplicitRecord("__tgt_attribute_struct");
+ RD->startDefinition();
+ addFieldToRecordDecl(C, RD, KmpInt16Ty); // Version
+ addFieldToRecordDecl(C, RD, KmpInt16Ty); // Struct Size in bytes.
+ addFieldToRecordDecl(C, RD, KmpInt16Ty); // WG_size
+ addFieldToRecordDecl(C, RD, KmpInt8Ty); // Mode
+ addFieldToRecordDecl(C, RD, KmpInt8Ty); // HostServices
+ addFieldToRecordDecl(C, RD, KmpInt8Ty); // MaxParallelLevel
+ RD->completeDefinition();
+ TgtAttributeStructQTy = C.getRecordType(RD);
+ }
+ return TgtAttributeStructQTy;
+}
+
QualType CGOpenMPRuntime::getTgtOffloadEntryQTy() {
// Make sure the type of the entry is already created. This is the type we
// have to create:
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits