jhuber6 created this revision. jhuber6 added a reviewer: jdoerfert. jhuber6 added projects: LLVM, clang, OpenMP. Herald added subscribers: jfb, guansong, hiraditya, yaxunl. jhuber6 requested review of this revision. Herald added subscribers: llvm-commits, openmp-commits, cfe-commits, sstefan1.
Memory globalization is required to maintain OpenMP standard semantics for data sharing between worker and master threads. The GPU cannot share data between its threads directly, so global or shared memory must be allocated to store the data in. Currently this is implemented fully in the frontend using the `__kmpc_data_sharing_push_stack` and `__kmpc_data_sharing_pop_stack` functions to emulate standard CPU stack sharing. The frontend scans the target region for variables that escape the region and must be shared between the threads. Each variable then has a field created for it in a global record type. This patch replaces this functionality with a single allocation call per variable, effectively mimicking an `alloca` instruction for the variables that must be shared between the threads. This will be much slower than the current solution, but makes it much easier to optimize, as we can analyze each variable independently and determine whether it is not captured. In the future, we can replace these calls with an `alloca`, and small allocations can be pushed to shared memory. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D97680 Files: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp clang/lib/CodeGen/CGOpenMPRuntimeGPU.h llvm/include/llvm/Frontend/OpenMP/OMPKinds.def llvm/lib/Transforms/IPO/OpenMPOpt.cpp openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu openmp/libomptarget/deviceRTLs/interface.h
Index: openmp/libomptarget/deviceRTLs/interface.h =================================================================== --- openmp/libomptarget/deviceRTLs/interface.h +++ openmp/libomptarget/deviceRTLs/interface.h @@ -424,13 +424,8 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn); EXTERN void __kmpc_kernel_end_parallel(); -EXTERN void __kmpc_data_sharing_init_stack(); -EXTERN void __kmpc_data_sharing_init_stack_spmd(); -EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, - int16_t UseSharedMemory); -EXTERN void *__kmpc_data_sharing_push_stack(size_t size, - int16_t UseSharedMemory); -EXTERN void __kmpc_data_sharing_pop_stack(void *a); +EXTERN void *__kmpc_alloc_shared(size_t size); +EXTERN void __kmpc_free_shared(void *a); EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); EXTERN void __kmpc_end_sharing_variables(); EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs); @@ -445,4 +440,11 @@ EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, int16_t is_shared); +// Deprecated globalization interface +EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, int16_t s); +EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t s); +EXTERN void __kmpc_data_sharing_pop_stack(void *a); +EXTERN void __kmpc_data_sharing_init_stack(); +EXTERN void __kmpc_data_sharing_init_stack_spmd(); + #endif Index: openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu =================================================================== --- openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu +++ openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu @@ -23,125 +23,6 @@ // Runtime functions for trunk data sharing scheme. 
//////////////////////////////////////////////////////////////////////////////// -INLINE static void data_sharing_init_stack_common() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - omptarget_nvptx_TeamDescr *teamDescr = - &omptarget_nvptx_threadPrivateContext->TeamContext(); - - for (int WID = 0; WID < DS_Max_Warp_Number; WID++) { - __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID); - DataSharingState.SlotPtr[WID] = RootS; - DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; - } -} - -// Initialize data sharing data structure. This function needs to be called -// once at the beginning of a data sharing context (coincides with the kernel -// initialization). This function is called only by the MASTER thread of each -// team in non-SPMD mode. -EXTERN void __kmpc_data_sharing_init_stack() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - // This function initializes the stack pointer with the pointer to the - // statically allocated shared memory slots. The size of a shared memory - // slot is pre-determined to be 256 bytes. - data_sharing_init_stack_common(); - omptarget_nvptx_globalArgs.Init(); -} - -// Initialize data sharing data structure. This function needs to be called -// once at the beginning of a data sharing context (coincides with the kernel -// initialization). This function is called in SPMD mode only. -EXTERN void __kmpc_data_sharing_init_stack_spmd() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - // This function initializes the stack pointer with the pointer to the - // statically allocated shared memory slots. The size of a shared memory - // slot is pre-determined to be 256 bytes. 
- if (GetThreadIdInBlock() == 0) - data_sharing_init_stack_common(); - - __kmpc_impl_threadfence_block(); -} - -INLINE static void *data_sharing_push_stack_common(size_t PushSize) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - - // Only warp active master threads manage the stack. - bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0; - - // Add worst-case padding to DataSize so that future stack allocations are - // correctly aligned. - const size_t Alignment = 8; - PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment; - - // Frame pointer must be visible to all workers in the same warp. - const unsigned WID = GetWarpId(); - void *FrameP = 0; - __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); - - if (IsWarpMaster) { - // SlotP will point to either the shared memory slot or an existing - // global memory slot. - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - void *&StackP = DataSharingState.StackPtr[WID]; - - // Check if we have room for the data in the current slot. - const uintptr_t StartAddress = (uintptr_t)StackP; - const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd; - const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize; - - // If we requested more data than there is room for in the rest - // of the slot then we need to either re-use the next slot, if one exists, - // or create a new slot. - if (EndAddress < RequestedEndAddress) { - __kmpc_data_sharing_slot *NewSlot = 0; - size_t NewSize = PushSize; - - // Allocate at least the default size for each type of slot. - // Master is a special case and even though there is only one thread, - // it can share more things with the workers. For uniformity, it uses - // the full size of a worker warp slot. 
- size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size; - if (DefaultSlotSize > NewSize) - NewSize = DefaultSlotSize; - NewSlot = (__kmpc_data_sharing_slot *)SafeMalloc( - sizeof(__kmpc_data_sharing_slot) + NewSize, - "Global memory slot allocation."); - - NewSlot->Next = 0; - NewSlot->Prev = SlotP; - NewSlot->PrevSlotStackPtr = StackP; - NewSlot->DataEnd = &NewSlot->Data[0] + NewSize; - - // Make previous slot point to the newly allocated slot. - SlotP->Next = NewSlot; - // The current slot becomes the new slot. - SlotP = NewSlot; - // The stack pointer always points to the next free stack frame. - StackP = &NewSlot->Data[0] + PushSize; - // The frame pointer always points to the beginning of the frame. - FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0]; - } else { - // Add the data chunk to the current slot. The frame pointer is set to - // point to the start of the new frame held in StackP. - FrameP = DataSharingState.FramePtr[WID] = StackP; - // Reset stack pointer to the requested address. - StackP = (void *)RequestedEndAddress; - } - } - // Get address from lane 0. - int *FP = (int *)&FrameP; - FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0); - if (sizeof(FrameP) == 8) - FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0); - - return FrameP; -} - -EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize, - int16_t UseSharedMemory) { - return data_sharing_push_stack_common(DataSize); -} - // Called at the time of the kernel initialization. This is used to initilize // the list of references to shared variables and to pre-allocate global storage // for holding the globalized variables. @@ -149,61 +30,12 @@ // By default the globalized variables are stored in global memory. If the // UseSharedMemory is set to true, the runtime will attempt to use shared memory // as long as the size requested fits the pre-allocated size. 
-EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize, - int16_t UseSharedMemory) { - // Compute the total memory footprint of the requested data. - // The master thread requires a stack only for itself. A worker - // thread (which at this point is a warp master) will require - // space for the variables of each thread in the warp, - // i.e. one DataSize chunk per warp lane. - // TODO: change WARPSIZE to the number of active threads in the warp. - size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode())) - ? DataSize - : WARPSIZE * DataSize; - - // Compute the start address of the frame of each thread in the warp. - uintptr_t FrameStartAddress = - (uintptr_t)data_sharing_push_stack_common(PushSize); - FrameStartAddress += (uintptr_t)(GetLaneId() * DataSize); - return (void *)FrameStartAddress; +EXTERN void *__kmpc_alloc_shared(size_t DataSize) { + return (void *)SafeMalloc(DataSize, "Alloc Shared"); } -// Pop the stack and free any memory which can be reclaimed. -// -// When the pop operation removes the last global memory slot, -// reclaim all outstanding global memory slots since it is -// likely we have reached the end of the kernel. -EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - - __kmpc_impl_threadfence_block(); - - if (GetThreadIdInBlock() % WARPSIZE == 0) { - unsigned WID = GetWarpId(); - - // Current slot - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - - // Pointer to next available stack. - void *&StackP = DataSharingState.StackPtr[WID]; - - // Pop the frame. - StackP = FrameStart; - - // If the current slot is empty, we need to free the slot after the - // pop. - bool SlotEmpty = (StackP == &SlotP->Data[0]); - - if (SlotEmpty && SlotP->Prev) { - // Before removing the slot we need to reset StackP. - StackP = SlotP->PrevSlotStackPtr; - - // Remove the slot. 
- SlotP = SlotP->Prev; - SafeFree(SlotP->Next, "Free slot."); - SlotP->Next = 0; - } - } +EXTERN void __kmpc_free_shared(void *FrameStart) { + SafeFree(FrameStart, "Free Shared"); } // Begin a data sharing context. Maintain a list of references to shared @@ -277,4 +109,21 @@ omptarget_nvptx_simpleMemoryManager.Release(); } +// Deprecated globalization code +EXTERN void __kmpc_data_sharing_init_stack() {} +EXTERN void __kmpc_data_sharing_init_stack_spmd() {} + +EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize, + int16_t) { + return (void *)SafeMalloc(DataSize, "Alloc Deprecated"); +} + +EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize, int16_t) { + return (void *)SafeMalloc(DataSize, "Alloc Deprecated"); +} + +EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { + SafeFree(FrameStart, "Free Shared"); +} + #pragma omp end declare target Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp =================================================================== --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1111,9 +1111,8 @@ } void analysisGlobalization() { - RuntimeFunction GlobalizationRuntimeIDs[] = { - OMPRTL___kmpc_data_sharing_coalesced_push_stack, - OMPRTL___kmpc_data_sharing_push_stack}; + RuntimeFunction GlobalizationRuntimeIDs[] = {OMPRTL___kmpc_alloc_shared, + OMPRTL___kmpc_free_shared}; for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) { auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID]; Index: llvm/include/llvm/Frontend/OpenMP/OMPKinds.def =================================================================== --- llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -425,12 +425,9 @@ GlobalListPtr, GlobalListPtr, GlobalListPtr, GlobalListPtr) __OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16) -__OMP_RTL(__kmpc_data_sharing_init_stack, false, Void, ) -__OMP_RTL(__kmpc_data_sharing_init_stack_spmd, false, Void, ) 
-__OMP_RTL(__kmpc_data_sharing_coalesced_push_stack, false, VoidPtr, SizeTy, Int16) -__OMP_RTL(__kmpc_data_sharing_push_stack, false, VoidPtr, SizeTy, Int16) -__OMP_RTL(__kmpc_data_sharing_pop_stack, false, Void, VoidPtr) +__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy) +__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr) __OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy) __OMP_RTL(__kmpc_end_sharing_variables, false, Void, ) __OMP_RTL(__kmpc_get_shared_variables, false, Void, VoidPtrPtrPtr) Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.h =================================================================== --- clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -440,15 +440,14 @@ /// The data for the single globalized variable. struct MappedVarData { /// Corresponding field in the global record. - const FieldDecl *FD = nullptr; + llvm::Value *GlobalizedVal = nullptr; /// Corresponding address. Address PrivateAddr = Address::invalid(); /// true, if only one element is required (for latprivates in SPMD mode), /// false, if need to create based on the warp-size. bool IsOnePerTeam = false; MappedVarData() = delete; - MappedVarData(const FieldDecl *FD, bool IsOnePerTeam = false) - : FD(FD), IsOnePerTeam(IsOnePerTeam) {} + MappedVarData(bool IsOnePerTeam = false) : IsOnePerTeam(IsOnePerTeam) {} }; /// The map of local variables to their addresses in the global memory. 
using DeclToAddrMapTy = llvm::MapVector<const Decl *, MappedVarData>; @@ -460,29 +459,12 @@ EscapedParamsTy EscapedParameters; llvm::SmallVector<const ValueDecl*, 4> EscapedVariableLengthDecls; llvm::SmallVector<llvm::Value *, 4> EscapedVariableLengthDeclsAddrs; - const RecordDecl *GlobalRecord = nullptr; - llvm::Optional<const RecordDecl *> SecondaryGlobalRecord = llvm::None; - llvm::Value *GlobalRecordAddr = nullptr; llvm::Value *IsInSPMDModeFlag = nullptr; std::unique_ptr<CodeGenFunction::OMPMapVars> MappedParams; }; /// Maps the function to the list of the globalized variables with their /// addresses. llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls; - /// List of records for the globalized variables in target/teams/distribute - /// contexts. Inner records are going to be joined into the single record, - /// while those resulting records are going to be joined into the single - /// union. This resulting union (one per CU) is the entry point for the static - /// memory management runtime functions. - struct GlobalPtrSizeRecsTy { - llvm::GlobalVariable *UseSharedMemory = nullptr; - llvm::GlobalVariable *RecSize = nullptr; - llvm::GlobalVariable *Buffer = nullptr; - SourceLocation Loc; - llvm::SmallVector<const RecordDecl *, 2> Records; - unsigned RegionCounter = 0; - }; - llvm::SmallVector<GlobalPtrSizeRecsTy, 8> GlobalizedRecords; llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr; /// List of the records with the list of fields for the reductions across the /// teams. Used to build the intermediate buffer for the fast teams Index: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp =================================================================== --- clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1096,17 +1096,6 @@ } Action(EST, WST); CodeGen.setAction(Action); IsInTTDRegion = true; - // Reserve place for the globalized memory. 
- GlobalizedRecords.emplace_back(); - if (!KernelStaticGlobalized) { - KernelStaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, - llvm::GlobalValue::InternalLinkage, - llvm::UndefValue::get(CGM.VoidPtrTy), - "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr, - llvm::GlobalValue::NotThreadLocal, - CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); - } emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); IsInTTDRegion = false; @@ -1156,10 +1145,6 @@ CGM.getModule(), OMPRTL___kmpc_kernel_init), Args); - // For data sharing, we need to initialize the stack. - CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack)); - emitGenericVarsProlog(CGF, WST.Loc); } @@ -1228,17 +1213,6 @@ } Action(*this, EST, D); CodeGen.setAction(Action); IsInTTDRegion = true; - // Reserve place for the globalized memory. - GlobalizedRecords.emplace_back(); - if (!KernelStaticGlobalized) { - KernelStaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, - llvm::GlobalValue::InternalLinkage, - llvm::UndefValue::get(CGM.VoidPtrTy), - "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr, - llvm::GlobalValue::NotThreadLocal, - CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); - } emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); IsInTTDRegion = false; @@ -1260,12 +1234,6 @@ CGM.getModule(), OMPRTL___kmpc_spmd_kernel_init), Args); - if (RequiresFullRuntime) { - // For data sharing, we need to initialize the stack. 
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack_spmd)); - } - CGF.EmitBranch(ExecuteBB); CGF.EmitBlock(ExecuteBB); @@ -1671,7 +1639,6 @@ static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); if (GlobalizedRD) { auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; - I->getSecond().GlobalRecord = GlobalizedRD; I->getSecond().MappedParams = std::make_unique<CodeGenFunction::OMPMapVars>(); DeclToAddrMapTy &Data = I->getSecond().LocalVarData; @@ -1679,8 +1646,7 @@ assert(Pair.getFirst()->isCanonicalDecl() && "Expected canonical declaration"); Data.insert(std::make_pair(Pair.getFirst(), - MappedVarData(Pair.getSecond(), - /*IsOnePerTeam=*/true))); + MappedVarData(/*IsOnePerTeam=*/true))); } } Rt.emitGenericVarsProlog(CGF, Loc); @@ -1709,281 +1675,68 @@ const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); if (I == FunctionGlobalizedDecls.end()) return; - if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) { - QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord); - QualType SecGlobalRecTy; - // Recover pointer to this function's global record. The runtime will - // handle the specifics of the allocation of the memory. - // Use actual memory size of the record including the padding - // for alignment purposes. 
- unsigned Alignment = - CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity(); - unsigned GlobalRecordSize = - CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity(); - GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); - - llvm::PointerType *GlobalRecPtrTy = - CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo(); - llvm::Value *GlobalRecCastAddr; - llvm::Value *IsTTD = nullptr; - if (!IsInTTDRegion && - (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) { - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); - llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd"); - llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); - if (I->getSecond().SecondaryGlobalRecord.hasValue()) { - llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); - llvm::Value *ThreadID = getThreadID(CGF, Loc); - llvm::Value *PL = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), - OMPRTL___kmpc_parallel_level), - {RTLoc, ThreadID}); - IsTTD = Bld.CreateIsNull(PL); - } - llvm::Value *IsSPMD = Bld.CreateIsNotNull( - CGF.EmitNounwindRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_is_spmd_exec_mode))); - Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(SPMDBB); - Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy), - CharUnits::fromQuantity(Alignment)); - CGF.EmitBranch(ExitBB); - // There is no need to emit line number for unconditional branch. 
- (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(NonSPMDBB); - llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize); - if (const RecordDecl *SecGlobalizedVarsRecord = - I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) { - SecGlobalRecTy = - CGM.getContext().getRecordType(SecGlobalizedVarsRecord); - - // Recover pointer to this function's global record. The runtime will - // handle the specifics of the allocation of the memory. - // Use actual memory size of the record including the padding - // for alignment purposes. - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity(); - unsigned GlobalRecordSize = - CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity(); - GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); - Size = Bld.CreateSelect( - IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size); - } - // TODO: allow the usage of shared memory to be controlled by - // the user, for now, default to global. 
- llvm::Value *GlobalRecordSizeArg[] = { - Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; - llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack), - GlobalRecordSizeArg); - GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, GlobalRecPtrTy); - CGF.EmitBlock(ExitBB); - auto *Phi = Bld.CreatePHI(GlobalRecPtrTy, - /*NumReservedValues=*/2, "_select_stack"); - Phi->addIncoming(RecPtr.getPointer(), SPMDBB); - Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB); - GlobalRecCastAddr = Phi; - I->getSecond().GlobalRecordAddr = Phi; - I->getSecond().IsInSPMDModeFlag = IsSPMD; - } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) { - assert(GlobalizedRecords.back().Records.size() < 2 && - "Expected less than 2 globalized records: one for target and one " - "for teams."); - unsigned Offset = 0; - for (const RecordDecl *RD : GlobalizedRecords.back().Records) { - QualType RDTy = CGM.getContext().getRecordType(RD); - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(RDTy).getQuantity(); - unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity(); - Offset = - llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment); - } - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity(); - Offset = llvm::alignTo(Offset, Alignment); - GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord); - ++GlobalizedRecords.back().RegionCounter; - if (GlobalizedRecords.back().Records.size() == 1) { - assert(KernelStaticGlobalized && - "Kernel static pointer must be initialized already."); - auto *UseSharedMemory = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true, - llvm::GlobalValue::InternalLinkage, nullptr, - "_openmp_static_kernel$is_shared"); - UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); - QualType Int16Ty = 
CGM.getContext().getIntTypeForBitwidth( - /*DestWidth=*/16, /*Signed=*/0); - llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar( - Address(UseSharedMemory, - CGM.getContext().getTypeAlignInChars(Int16Ty)), - /*Volatile=*/false, Int16Ty, Loc); - auto *StaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false, - llvm::GlobalValue::CommonLinkage, nullptr); - auto *RecSize = new llvm::GlobalVariable( - CGM.getModule(), CGM.SizeTy, /*isConstant=*/true, - llvm::GlobalValue::InternalLinkage, nullptr, - "_openmp_static_kernel$size"); - RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); - llvm::Value *Ld = CGF.EmitLoadOfScalar( - Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false, - CGM.getContext().getSizeType(), Loc); - llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - KernelStaticGlobalized, CGM.VoidPtrPtrTy); - llvm::Value *GlobalRecordSizeArg[] = { - llvm::ConstantInt::get( - CGM.Int16Ty, - getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 
1 : 0), - StaticGlobalized, Ld, IsInSharedMemory, ResAddr}; - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_get_team_static_memory), - GlobalRecordSizeArg); - GlobalizedRecords.back().Buffer = StaticGlobalized; - GlobalizedRecords.back().RecSize = RecSize; - GlobalizedRecords.back().UseSharedMemory = UseSharedMemory; - GlobalizedRecords.back().Loc = Loc; - } - assert(KernelStaticGlobalized && "Global address must be set already."); - Address FrameAddr = CGF.EmitLoadOfPointer( - Address(KernelStaticGlobalized, CGM.getPointerAlign()), - CGM.getContext() - .getPointerType(CGM.getContext().VoidPtrTy) - .castAs<PointerType>()); - llvm::Value *GlobalRecValue = - Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer(); - I->getSecond().GlobalRecordAddr = GlobalRecValue; - I->getSecond().IsInSPMDModeFlag = nullptr; - GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo()); - } else { - // TODO: allow the usage of shared memory to be controlled by - // the user, for now, default to global. - bool UseSharedMemory = - IsInTTDRegion && GlobalRecordSize <= SharedMemorySize; - llvm::Value *GlobalRecordSizeArg[] = { - llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), - CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)}; - llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), - IsInTTDRegion ? 
OMPRTL___kmpc_data_sharing_push_stack - : OMPRTL___kmpc_data_sharing_coalesced_push_stack), - GlobalRecordSizeArg); - GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, GlobalRecPtrTy); - I->getSecond().GlobalRecordAddr = GlobalRecValue; - I->getSecond().IsInSPMDModeFlag = nullptr; - } - LValue Base = - CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy); - - // Emit the "global alloca" which is a GEP from the global declaration - // record using the pointer returned by the runtime. - LValue SecBase; - decltype(I->getSecond().LocalVarData)::const_iterator SecIt; - if (IsTTD) { - SecIt = I->getSecond().SecondaryLocalVarData->begin(); - llvm::PointerType *SecGlobalRecPtrTy = - CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo(); - SecBase = CGF.MakeNaturalAlignPointeeAddrLValue( - Bld.CreatePointerBitCastOrAddrSpaceCast( - I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy), - SecGlobalRecTy); + for (auto &Rec : I->getSecond().LocalVarData) { + const auto *VD = cast<VarDecl>(Rec.first); + bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first); + QualType VarTy = VD->getType(); + + // Get the local allocation of a firstprivate variable before sharing + llvm::Value *ParValue; + if (EscapedParam) { + LValue ParLVal = + CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); + ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc); } - for (auto &Rec : I->getSecond().LocalVarData) { - bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first); - llvm::Value *ParValue; - if (EscapedParam) { - const auto *VD = cast<VarDecl>(Rec.first); - LValue ParLVal = - CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); - ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc); - } - LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD); - // Emit VarAddr basing on lane-id if required. 
- QualType VarTy; - if (Rec.second.IsOnePerTeam) { - VarTy = Rec.second.FD->getType(); - } else { - llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP( - VarAddr.getAddress(CGF).getPointer(), - {Bld.getInt32(0), getNVPTXLaneID(CGF)}); - VarTy = - Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType(); - VarAddr = CGF.MakeAddrLValue( - Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy, - AlignmentSource::Decl); - } - Rec.second.PrivateAddr = VarAddr.getAddress(CGF); - if (!IsInTTDRegion && - (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) { - assert(I->getSecond().IsInSPMDModeFlag && - "Expected unknown execution mode or required SPMD check."); - if (IsTTD) { - assert(SecIt->second.IsOnePerTeam && - "Secondary glob data must be one per team."); - LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD); - VarAddr.setAddress( - Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF), - VarAddr.getPointer(CGF)), - VarAddr.getAlignment())); - Rec.second.PrivateAddr = VarAddr.getAddress(CGF); - } - Address GlobalPtr = Rec.second.PrivateAddr; - Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName()); - Rec.second.PrivateAddr = Address( - Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag, - LocalAddr.getPointer(), GlobalPtr.getPointer()), - LocalAddr.getAlignment()); - } - if (EscapedParam) { - const auto *VD = cast<VarDecl>(Rec.first); - CGF.EmitStoreOfScalar(ParValue, VarAddr); - I->getSecond().MappedParams->setVarAddr(CGF, VD, - VarAddr.getAddress(CGF)); - } - if (IsTTD) - ++SecIt; + + // Get the size needed in the stack. Logic of how much to allocate + // and which part to give to wich thread is inside the runtime function. 
+ llvm::Value *Size = CGF.getTypeSize(VD->getType()); + llvm::Value *VoidPtr = + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_alloc_shared), + {Size}); + + Rec.second.GlobalizedVal = VoidPtr; + + // Cast the void pointer and get the address of the globalized variable. + llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo(); + llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( + VoidPtr, VarPtrTy, VD->getName() + "_on_stack"); + LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy); + Rec.second.PrivateAddr = VarAddr.getAddress(CGF); + + // Assign the local allocation to the newly globalized location. + if (EscapedParam) { + CGF.EmitStoreOfScalar(ParValue, VarAddr); + I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF)); } } - for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) { - // Recover pointer to this function's global record. The runtime will - // handle the specifics of the allocation of the memory. - // Use actual memory size of the record including the padding + for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) { + // Use actual memory size of the VLA object including the padding // for alignment purposes. - CGBuilderTy &Bld = CGF.Builder; llvm::Value *Size = CGF.getTypeSize(VD->getType()); CharUnits Align = CGM.getContext().getDeclAlign(VD); Size = Bld.CreateNUWAdd( Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1)); llvm::Value *AlignVal = llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity()); + Size = Bld.CreateUDiv(Size, AlignVal); Size = Bld.CreateNUWMul(Size, AlignVal); - // TODO: allow the usage of shared memory to be controlled by - // the user, for now, default to global. 
- llvm::Value *GlobalRecordSizeArg[] = { - Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; - llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack), - GlobalRecordSizeArg); - llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo()); - LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(), + + // Allocate space for this VLA object to be globalized + llvm::Value *VoidPtr = + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_alloc_shared), + {Size}); + + I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(VoidPtr); + LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(), CGM.getContext().getDeclAlign(VD), AlignmentSource::Decl); I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD), Base.getAddress(CGF)); - I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue); } I->getSecond().MappedParams->apply(CGF); } @@ -1996,60 +1749,20 @@ const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); if (I != FunctionGlobalizedDecls.end()) { - I->getSecond().MappedParams->restore(CGF); - if (!CGF.HaveInsertPoint()) - return; + // Deallocate the memory for each globalized VLA object for (llvm::Value *Addr : llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) { - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack), - Addr); + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_free_shared), + Addr); } - if (I->getSecond().GlobalRecordAddr) { - if (!IsInTTDRegion && - (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) { - CGBuilderTy &Bld = CGF.Builder; - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); - llvm::BasicBlock *NonSPMDBB = 
CGF.createBasicBlock(".non-spmd"); - Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(NonSPMDBB); - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack), - CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr)); - CGF.EmitBlock(ExitBB); - } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) { - assert(GlobalizedRecords.back().RegionCounter > 0 && - "region counter must be > 0."); - --GlobalizedRecords.back().RegionCounter; - // Emit the restore function only in the target region. - if (GlobalizedRecords.back().RegionCounter == 0) { - QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth( - /*DestWidth=*/16, /*Signed=*/0); - llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar( - Address(GlobalizedRecords.back().UseSharedMemory, - CGM.getContext().getTypeAlignInChars(Int16Ty)), - /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc); - llvm::Value *Args[] = { - llvm::ConstantInt::get( - CGM.Int16Ty, - getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 
1 : 0), - IsInSharedMemory}; - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_restore_team_static_memory), - Args); - } - } else { - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack), - I->getSecond().GlobalRecordAddr); - } + // Deallocate the memory for each globalized value + for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) { + I->getSecond().MappedParams->restore(CGF); + + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_free_shared), + {Rec.second.GlobalizedVal}); } } } @@ -4327,6 +4040,7 @@ } if (!Body) return; + CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second); VarChecker.Visit(Body); const RecordDecl *GlobalizedVarsRecord = @@ -4340,7 +4054,6 @@ auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; I->getSecond().MappedParams = std::make_unique<CodeGenFunction::OMPMapVars>(); - I->getSecond().GlobalRecord = GlobalizedVarsRecord; I->getSecond().EscapedParameters.insert( VarChecker.getEscapedParameters().begin(), VarChecker.getEscapedParameters().end()); @@ -4349,21 +4062,16 @@ DeclToAddrMapTy &Data = I->getSecond().LocalVarData; for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { assert(VD->isCanonicalDecl() && "Expected canonical declaration"); - const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD); - Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion))); + Data.insert(std::make_pair(VD, MappedVarData(IsInTTDRegion))); } if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) { CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None); VarChecker.Visit(Body); - I->getSecond().SecondaryGlobalRecord = - VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true); I->getSecond().SecondaryLocalVarData.emplace(); DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue(); for (const ValueDecl *VD : 
VarChecker.getEscapedDecls()) { assert(VD->isCanonicalDecl() && "Expected canonical declaration"); - const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD); - Data.insert( - std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true))); + Data.insert(std::make_pair(VD, MappedVarData(/*IsInTTDRegion=*/true))); } } if (!NeedToDelayGlobalization) { @@ -4654,185 +4362,8 @@ CGOpenMPRuntime::processRequiresDirective(D); } -/// Get number of SMs and number of blocks per SM. -static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) { - std::pair<unsigned, unsigned> Data; - if (CGM.getLangOpts().OpenMPCUDANumSMs) - Data.first = CGM.getLangOpts().OpenMPCUDANumSMs; - if (CGM.getLangOpts().OpenMPCUDABlocksPerSM) - Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM; - if (Data.first && Data.second) - return Data; - switch (getCudaArch(CGM)) { - case CudaArch::SM_20: - case CudaArch::SM_21: - case CudaArch::SM_30: - case CudaArch::SM_32: - case CudaArch::SM_35: - case CudaArch::SM_37: - case CudaArch::SM_50: - case CudaArch::SM_52: - case CudaArch::SM_53: - return {16, 16}; - case CudaArch::SM_60: - case CudaArch::SM_61: - case CudaArch::SM_62: - return {56, 32}; - case CudaArch::SM_70: - case CudaArch::SM_72: - case CudaArch::SM_75: - case CudaArch::SM_80: - case CudaArch::SM_86: - return {84, 32}; - case CudaArch::GFX600: - case CudaArch::GFX601: - case CudaArch::GFX602: - case CudaArch::GFX700: - case CudaArch::GFX701: - case CudaArch::GFX702: - case CudaArch::GFX703: - case CudaArch::GFX704: - case CudaArch::GFX705: - case CudaArch::GFX801: - case CudaArch::GFX802: - case CudaArch::GFX803: - case CudaArch::GFX805: - case CudaArch::GFX810: - case CudaArch::GFX900: - case CudaArch::GFX902: - case CudaArch::GFX904: - case CudaArch::GFX906: - case CudaArch::GFX908: - case CudaArch::GFX909: - case CudaArch::GFX90a: - case CudaArch::GFX90c: - case CudaArch::GFX1010: - case CudaArch::GFX1011: - case CudaArch::GFX1012: - case CudaArch::GFX1030: - case 
CudaArch::GFX1031: - case CudaArch::GFX1032: - case CudaArch::GFX1033: - case CudaArch::UNUSED: - case CudaArch::UNKNOWN: - break; - case CudaArch::LAST: - llvm_unreachable("Unexpected Cuda arch."); - } - llvm_unreachable("Unexpected NVPTX target without ptx feature."); -} - void CGOpenMPRuntimeGPU::clear() { - if (!GlobalizedRecords.empty() && - !CGM.getLangOpts().OpenMPCUDATargetParallel) { - ASTContext &C = CGM.getContext(); - llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs; - llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs; - RecordDecl *StaticRD = C.buildImplicitRecord( - "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union); - StaticRD->startDefinition(); - RecordDecl *SharedStaticRD = C.buildImplicitRecord( - "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union); - SharedStaticRD->startDefinition(); - for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) { - if (Records.Records.empty()) - continue; - unsigned Size = 0; - unsigned RecAlignment = 0; - for (const RecordDecl *RD : Records.Records) { - QualType RDTy = C.getRecordType(RD); - unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity(); - RecAlignment = std::max(RecAlignment, Alignment); - unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity(); - Size = - llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment); - } - Size = llvm::alignTo(Size, RecAlignment); - llvm::APInt ArySize(/*numBits=*/64, Size); - QualType SubTy = C.getConstantArrayType( - C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0); - const bool UseSharedMemory = Size <= SharedMemorySize; - auto *Field = - FieldDecl::Create(C, UseSharedMemory ? 
SharedStaticRD : StaticRD, - SourceLocation(), SourceLocation(), nullptr, SubTy, - C.getTrivialTypeSourceInfo(SubTy, SourceLocation()), - /*BW=*/nullptr, /*Mutable=*/false, - /*InitStyle=*/ICIS_NoInit); - Field->setAccess(AS_public); - if (UseSharedMemory) { - SharedStaticRD->addDecl(Field); - SharedRecs.push_back(&Records); - } else { - StaticRD->addDecl(Field); - GlobalRecs.push_back(&Records); - } - Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size)); - Records.UseSharedMemory->setInitializer( - llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0)); - } - // Allocate SharedMemorySize buffer for the shared memory. - // FIXME: nvlink does not handle weak linkage correctly (object with the - // different size are reported as erroneous). - // Restore this code as sson as nvlink is fixed. - if (!SharedStaticRD->field_empty()) { - llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize); - QualType SubTy = C.getConstantArrayType( - C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0); - auto *Field = FieldDecl::Create( - C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy, - C.getTrivialTypeSourceInfo(SubTy, SourceLocation()), - /*BW=*/nullptr, /*Mutable=*/false, - /*InitStyle=*/ICIS_NoInit); - Field->setAccess(AS_public); - SharedStaticRD->addDecl(Field); - } - SharedStaticRD->completeDefinition(); - if (!SharedStaticRD->field_empty()) { - QualType StaticTy = C.getRecordType(SharedStaticRD); - llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy); - auto *GV = new llvm::GlobalVariable( - CGM.getModule(), LLVMStaticTy, - /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage, - llvm::UndefValue::get(LLVMStaticTy), - "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr, - llvm::GlobalValue::NotThreadLocal, - C.getTargetAddressSpace(LangAS::cuda_shared)); - auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( - GV, CGM.VoidPtrTy); - for (const GlobalPtrSizeRecsTy 
*Rec : SharedRecs) { - Rec->Buffer->replaceAllUsesWith(Replacement); - Rec->Buffer->eraseFromParent(); - } - } - StaticRD->completeDefinition(); - if (!StaticRD->field_empty()) { - QualType StaticTy = C.getRecordType(StaticRD); - std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM); - llvm::APInt Size1(32, SMsBlockPerSM.second); - QualType Arr1Ty = - C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal, - /*IndexTypeQuals=*/0); - llvm::APInt Size2(32, SMsBlockPerSM.first); - QualType Arr2Ty = - C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal, - /*IndexTypeQuals=*/0); - llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty); - // FIXME: nvlink does not handle weak linkage correctly (object with the - // different size are reported as erroneous). - // Restore CommonLinkage as soon as nvlink is fixed. - auto *GV = new llvm::GlobalVariable( - CGM.getModule(), LLVMArr2Ty, - /*isConstant=*/false, llvm::GlobalValue::InternalLinkage, - llvm::Constant::getNullValue(LLVMArr2Ty), - "_openmp_static_glob_rd_$_"); - auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( - GV, CGM.VoidPtrTy); - for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) { - Rec->Buffer->replaceAllUsesWith(Replacement); - Rec->Buffer->eraseFromParent(); - } - } - } + if (!TeamsReductions.empty()) { ASTContext &C = CGM.getContext(); RecordDecl *StaticRD = C.buildImplicitRecord(
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits