https://github.com/efwright updated https://github.com/llvm/llvm-project/pull/91261
>From 4b76d56f38baf86f6b65ef7e610ad266ba3d69b1 Mon Sep 17 00:00:00 2001 From: Eric Francis Wright <wright...@rzansel61.coral.llnl.gov> Date: Mon, 6 May 2024 12:20:44 -0700 Subject: [PATCH 1/4] OpenMP offload 'simd' directive --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 2 + clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 8 +- clang/lib/CodeGen/CGStmtOpenMP.cpp | 185 +++-- clang/lib/CodeGen/CodeGenFunction.cpp | 2 +- .../target_teams_generic_loop_codegen.cpp | 18 +- .../llvm/Frontend/OpenMP/OMPIRBuilder.h | 27 +- .../include/llvm/Frontend/OpenMP/OMPKinds.def | 12 + llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 655 +++++++++++++++++- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 13 +- offload/DeviceRTL/include/Interface.h | 11 + offload/DeviceRTL/include/Mapping.h | 7 + offload/DeviceRTL/src/Kernel.cpp | 4 +- offload/DeviceRTL/src/Mapping.cpp | 34 + offload/DeviceRTL/src/Parallelism.cpp | 25 +- offload/DeviceRTL/src/Reduction.cpp | 48 ++ offload/DeviceRTL/src/State.cpp | 7 +- offload/DeviceRTL/src/Synchronization.cpp | 4 + offload/DeviceRTL/src/Workshare.cpp | 44 ++ 18 files changed, 1023 insertions(+), 83 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 3747b00d4893ad..836253ab1a7d8b 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1035,6 +1035,7 @@ static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC, CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM) : CGM(CGM), OMPBuilder(CGM.getModule()) { + KmpCriticalNameTy = llvm::ArrayType::get(CGM.Int32Ty, /*NumElements*/ 8); llvm::OpenMPIRBuilderConfig Config( CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(), @@ -1056,6 +1057,7 @@ CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM) } void CGOpenMPRuntime::clear() { + InternalVars.clear(); // Clean non-target variable declarations possibly used only in debug info. 
for (const auto &Data : EmittedNonTargetVariables) { diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 35ff75416cb776..16aff085579807 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -262,6 +262,7 @@ class CheckVarsEscapingDeclContext final bool IsCombinedParallelRegion) { if (!S) return; + for (const CapturedStmt::Capture &C : S->captures()) { if (C.capturesVariable() && !C.capturesVariableByCopy()) { const ValueDecl *VD = C.getCapturedVar(); @@ -336,13 +337,15 @@ class CheckVarsEscapingDeclContext final return; if (!D->hasAssociatedStmt()) return; + if (const auto *S = dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) { // Do not analyze directives that do not actually require capturing, // like `omp for` or `omp simd` directives. llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions; getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind()); - if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) { + if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown && + D->getDirectiveKind() != OMPD_simd) { VisitStmt(S->getCapturedStmt()); return; } @@ -1661,6 +1664,7 @@ void CGOpenMPRuntimeGPU::emitReduction( bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind); bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind); bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind); + bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind); ASTContext &C = CGM.getContext(); @@ -1755,7 +1759,7 @@ void CGOpenMPRuntimeGPU::emitReduction( CGF.Builder.restoreIP(OMPBuilder.createReductionsGPU( OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction, - DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang, + DistributeReduction, SimdReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang, CGF.getTarget().getGridValue(), 
C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc)); return; diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 71a27d0c6bc1fb..b4e699c1d003b8 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -1454,6 +1454,7 @@ void CodeGenFunction::EmitOMPReductionClauseInit( } const auto *VD = cast<VarDecl>(cast<DeclRefExpr>(TaskRedRef)->getDecl()); +llvm::dbgs() << "Emitting " << VD->getName() << " " << VD << "\n"; EmitVarDecl(*VD); EmitStoreOfScalar(ReductionDesc, GetAddrOfLocalVar(VD), /*Volatile=*/false, TaskRedRef->getType()); @@ -1494,7 +1495,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal( bool WithNowait = D.getSingleClause<OMPNowaitClause>() || isOpenMPParallelDirective(EKind) || TeamsLoopCanBeParallel || ReductionKind == OMPD_simd; - bool SimpleReduction = ReductionKind == OMPD_simd; + bool SimpleReduction = (CGM.getLangOpts().OpenMPIsTargetDevice ? false : ReductionKind == OMPD_simd); // Emit nowait reduction if nowait clause is present or directive is a // parallel directive (it always has implicit barrier). CGM.getOpenMPRuntime().emitReduction( @@ -2736,59 +2737,139 @@ GetAlignedMapping(const OMPLoopDirective &S, CodeGenFunction &CGF) { // available for "loop bind(thread)", which maps to "simd". static void emitOMPSimdDirective(const OMPLoopDirective &S, CodeGenFunction &CGF, CodeGenModule &CGM) { - bool UseOMPIRBuilder = - CGM.getLangOpts().OpenMPIRBuilder && isSimdSupportedByOpenMPIRBuilder(S); - if (UseOMPIRBuilder) { - auto &&CodeGenIRBuilder = [&S, &CGM, UseOMPIRBuilder](CodeGenFunction &CGF, - PrePostActionTy &) { - // Use the OpenMPIRBuilder if enabled. - if (UseOMPIRBuilder) { - llvm::MapVector<llvm::Value *, llvm::Value *> AlignedVars = - GetAlignedMapping(S, CGF); - // Emit the associated statement and get its loop representation. 
- const Stmt *Inner = S.getRawStmt(); - llvm::CanonicalLoopInfo *CLI = - CGF.EmitOMPCollapsedCanonicalLoopNest(Inner, 1); - - llvm::OpenMPIRBuilder &OMPBuilder = - CGM.getOpenMPRuntime().getOMPBuilder(); - // Add SIMD specific metadata - llvm::ConstantInt *Simdlen = nullptr; - if (const auto *C = S.getSingleClause<OMPSimdlenClause>()) { - RValue Len = CGF.EmitAnyExpr(C->getSimdlen(), AggValueSlot::ignored(), - /*ignoreResult=*/true); - auto *Val = cast<llvm::ConstantInt>(Len.getScalarVal()); - Simdlen = Val; - } - llvm::ConstantInt *Safelen = nullptr; - if (const auto *C = S.getSingleClause<OMPSafelenClause>()) { - RValue Len = CGF.EmitAnyExpr(C->getSafelen(), AggValueSlot::ignored(), - /*ignoreResult=*/true); - auto *Val = cast<llvm::ConstantInt>(Len.getScalarVal()); - Safelen = Val; - } - llvm::omp::OrderKind Order = llvm::omp::OrderKind::OMP_ORDER_unknown; - if (const auto *C = S.getSingleClause<OMPOrderClause>()) { - if (C->getKind() == OpenMPOrderClauseKind::OMPC_ORDER_concurrent) { - Order = llvm::omp::OrderKind::OMP_ORDER_concurrent; - } - } - // Add simd metadata to the collapsed loop. Do not generate - // another loop for if clause. Support for if clause is done earlier. - OMPBuilder.applySimd(CLI, AlignedVars, - /*IfCond*/ nullptr, Order, Simdlen, Safelen); - return; - } + bool UseOMPIRBuilder = CGM.getLangOpts().OpenMPIsTargetDevice; + if(UseOMPIRBuilder) { + auto *CS = dyn_cast<CapturedStmt>(S.getAssociatedStmt()); + auto *CL = dyn_cast<OMPCanonicalLoop>(CS->getCapturedStmt()); + CGCapturedStmtInfo CGSI(*CS, CR_OpenMP); + + CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(*this, &CGSI); + llvm::OpenMPIRBuilder::InsertPointTy AllocaIP( + AllocaInsertPt->getParent(), AllocaInsertPt->getIterator()); + + llvm::OpenMPIRBuilder &OMPBuilder = CGM.getOpenMPRuntime().getOMPBuilder(); + + using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; + + // Callback function for generating the trip count of the loop. 
+ // This function should assign values to the TripCount and Signed variables + llvm::Value *LoopVar; + std::string LoopVarName; + EmittedClosureTy LoopVarClosure; + + auto DistanceCB = [&](llvm::BasicBlock *AllocaBB, + InsertPointTy CodeGenIP) -> llvm::Value* { + InsertPointTy AllocaIP(AllocaBB, AllocaBB->getTerminator()->getIterator()); + OMPBuilderCBHelpers::OutlinedRegionBodyRAII IRB( + *this, AllocaIP, *(CodeGenIP.getBlock())); + Builder.restoreIP(CodeGenIP); + + // Emit the loop variable, needed for the distance func + const auto *For = dyn_cast<ForStmt>(CL->getLoopStmt()); + if(const Stmt *InitStmt = For->getInit()) + EmitStmt(InitStmt); + + auto *LoopVarRef = CL->getLoopVarRef(); + LValue LCVal = EmitLValue(LoopVarRef); + //Address LoopVarAddress = LCVal.getAddress(*this); + //LoopVar = dyn_cast<llvm::Instruction>(LoopVarAddress.getPointer()); + LoopVar = dyn_cast<llvm::Instruction>(LCVal.getPointer(*this)); + LoopVarName = LoopVarRef->getNameInfo().getAsString(); + + // Emit the distance func from the CanonicalLoop + const CapturedStmt *DistanceFunc = CL->getDistanceFunc(); + EmittedClosureTy DistanceClosure = emitCapturedStmtFunc(*this, DistanceFunc); + + // Load the output and store it in the TripCount + QualType LogicalTy = DistanceFunc->getCapturedDecl() + ->getParam(0) + ->getType() + .getNonReferenceType(); + + //Address CountAddr = CreateMemTemp(LogicalTy, ".count.addr"); + RawAddress CountAddr = CreateMemTemp(LogicalTy, ".count.addr"); + + emitCapturedStmtCall(*this, DistanceClosure, {CountAddr.getPointer()}); + auto *TripCount = Builder.CreateLoad(CountAddr, ".count"); + + const CapturedStmt *LoopVarFunc = CL->getLoopVarFunc(); + LoopVarClosure = emitCapturedStmtFunc(*this, LoopVarFunc); + + return TripCount; }; - { - auto LPCRegion = - CGOpenMPRuntime::LastprivateConditionalRAII::disable(CGF, S); - OMPLexicalScope Scope(CGF, S, OMPD_unknown); - CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_simd, - CodeGenIRBuilder); - } + + auto FiniCB = 
[this](InsertPointTy IP) { + OMPBuilderCBHelpers::FinalizeOMPRegion(*this, IP); + }; + + auto PrivCB = [](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + llvm::Value &, llvm::Value &Val, llvm::Value *&ReplVal) { + ReplVal = &Val; + return CodeGenIP; + }; + + auto BodyGenCB = [&] + (//InsertPointTy OuterAllocaIP, + llvm::BasicBlock *OuterAllocaBB, + InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + InsertPointTy Prolog, InsertPointTy ReductionEpilog, + llvm::Value *Virtual) { + + llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock(); + InsertPointTy OuterAllocaIP(OuterAllocaBB, OuterAllocaBB->getTerminator()->getIterator()); + + OMPBuilderCBHelpers::OutlinedRegionBodyRAII IRB( + *this, OuterAllocaIP, *(Prolog.getBlock())); + Builder.restoreIP(Prolog); + + OMPPrivateScope PrivateScope(*this); + EmitOMPFirstprivateClause(S, PrivateScope); + EmitOMPPrivateClause(S, PrivateScope); + EmitOMPReductionClauseInit(S, PrivateScope); + PrivateScope.Privatize(); + + const CapturedStmt *LoopVarFunc = CL->getLoopVarFunc(); + + Builder.restoreIP(CodeGenIP); + emitCapturedStmtCall(*this, LoopVarClosure, + {LoopVar, Virtual}); + + // Generate the body of the loop + OMPBuilderCBHelpers::EmitOMPOutlinedRegionBody( + *this, + S.getBody(), + AllocaIP, + CodeGenIP, + "simd"); + + llvm::BasicBlock *RedEpilogBB = ReductionEpilog.getBlock(); + llvm::Instruction *RedEpilogTerminator = RedEpilogBB->getTerminator(); + llvm::BasicBlock *FinalBlock = RedEpilogBB->getSingleSuccessor(); + + Builder.restoreIP(ReductionEpilog); + EmitOMPReductionClauseFinal(S, OMPD_simd); + + llvm::BasicBlock *ReductionThenBB = Builder.GetInsertBlock(); + + if(!(ReductionThenBB->getTerminator())) { + RedEpilogTerminator->eraseFromParent(); + Builder.CreateBr(FinalBlock); + } + + }; + + Builder.restoreIP( + OMPBuilder.createSimdLoop( + Builder, + AllocaIP, + BodyGenCB, + DistanceCB, + PrivCB, + FiniCB + )); + return; - } + } CodeGenFunction::ParentLoopDirectiveForScanRegion ScanRegion(CGF, S); CGF.OMPFirstScanLoop 
= true; diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 2306043c90f406..4e3350db14b1d2 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -102,7 +102,7 @@ CodeGenFunction::~CodeGenFunction() { // seems to be a reasonable spot. We do it here, as opposed to the deletion // time of the CodeGenModule, because we have to ensure the IR has not yet // been "emitted" to the outside, thus, modifications are still sensible. - if (CGM.getLangOpts().OpenMPIRBuilder && CurFn) + if ((CGM.getLangOpts().OpenMPIsTargetDevice || CGM.getLangOpts().OpenMPIRBuilder) && CurFn) CGM.getOpenMPRuntime().getOMPBuilder().finalize(CurFn); } diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp index e05b3209f9eff2..4194bdec549dd4 100644 --- a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp +++ b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp @@ -481,6 +481,7 @@ int foo() { // IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5) // IR-GPU-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // IR-GPU-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr +// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) // IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr // IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr // IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 @@ -498,7 +499,6 @@ int foo() { // IR-GPU-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100 // IR-GPU-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]] // IR-GPU: body: -// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) // IR-GPU-NEXT: call void 
@__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4:[0-9]+]] to ptr), i32 [[TMP2]]) // IR-GPU-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // IR-GPU-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -513,12 +513,11 @@ int foo() { // IR-GPU: else: // IR-GPU-NEXT: br label [[IFCONT]] // IR-GPU: ifcont: -// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) // IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]]) // IR-GPU-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4 // IR-GPU-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]] // IR-GPU-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// IR-GPU: then3: +// IR-GPU: then2: // IR-GPU-NEXT: [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // IR-GPU-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0 // IR-GPU-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8 @@ -526,9 +525,9 @@ int foo() { // IR-GPU-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4 // IR-GPU-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 // IR-GPU-NEXT: br label [[IFCONT4:%.*]] -// IR-GPU: else4: +// IR-GPU: else3: // IR-GPU-NEXT: br label [[IFCONT4]] -// IR-GPU: ifcont5: +// IR-GPU: ifcont4: // IR-GPU-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1 // IR-GPU-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4 // IR-GPU-NEXT: br label [[PRECOND]] @@ -627,6 +626,7 @@ int foo() { // IR-GPU-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5) // IR-GPU-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // IR-GPU-NEXT: [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr +// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 
@__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) // IR-GPU-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr // IR-GPU-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr // IR-GPU-NEXT: store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8 @@ -644,7 +644,6 @@ int foo() { // IR-GPU-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 100 // IR-GPU-NEXT: br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]] // IR-GPU: body: -// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) // IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]]) // IR-GPU-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // IR-GPU-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] @@ -659,12 +658,11 @@ int foo() { // IR-GPU: else: // IR-GPU-NEXT: br label [[IFCONT]] // IR-GPU: ifcont: -// IR-GPU-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) // IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[TMP2]]) // IR-GPU-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4 // IR-GPU-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]] // IR-GPU-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]] -// IR-GPU: then3: +// IR-GPU: then2: // IR-GPU-NEXT: [[TMP15:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]] // IR-GPU-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0 // IR-GPU-NEXT: [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8 @@ -672,9 +670,9 @@ int foo() { // IR-GPU-NEXT: [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4 // IR-GPU-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 
4 // IR-GPU-NEXT: br label [[IFCONT4:%.*]] -// IR-GPU: else4: +// IR-GPU: else3: // IR-GPU-NEXT: br label [[IFCONT4]] -// IR-GPU: ifcont5: +// IR-GPU: ifcont4: // IR-GPU-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1 // IR-GPU-NEXT: store i32 [[TMP20]], ptr [[DOTCNT_ADDR_ASCAST]], align 4 // IR-GPU-NEXT: br label [[PRECOND]] diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 8834c3b1f50115..82041a7b2a03fb 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -285,7 +285,7 @@ class OffloadEntriesInfoManager { /// Return true if a there are no entries defined. bool empty() const; /// Return number of entries defined so far. - unsigned size() const { return OffloadingEntriesNum; } + unsigned size() const { return OffloadingEntriesNum /*OffloadEntriesTargetRegion.size()*/ /*OffloadingEntriesNum*/; } OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {} @@ -514,6 +514,11 @@ class OpenMPIRBuilder { /// all functions are finalized. void finalize(Function *Fn = nullptr); + CallInst *globalizeAlloca(AllocaInst *Alloca, SmallVector<Instruction*, 32>&); + void globalizeParallelVars(Function *CurFn); + SmallPtrSet<Value*, 32> VarsNeedingGlobalization; + void globalizeVars(Function *CurFn); + /// Add attributes known for \p FnID to \p Fn. 
void addAttributes(omp::RuntimeFunction FnID, Function &Fn); @@ -592,6 +597,18 @@ class OpenMPIRBuilder { using BodyGenCallbackTy = function_ref<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>; + using LoopBodyCallbackTy = + function_ref<void( + BasicBlock *OuterAllocaBB, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, + InsertPointTy PrologIP, InsertPointTy ReductionEpilogIP, + Value *IterationNum + )>; + + using TripCountCallbackTy = + function_ref< + Value*(llvm::BasicBlock *AllocaBB, InsertPointTy CodeGenIP) + >; + // This is created primarily for sections construct as llvm::function_ref // (BodyGenCallbackTy) is not storable (as described in the comments of // function_ref class - function_ref contains non-ownable reference @@ -672,6 +689,13 @@ class OpenMPIRBuilder { InsertPointTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective); + IRBuilder<>::InsertPoint + createSimdLoop(const LocationDescription &Loc, InsertPointTy AllocaIP, + LoopBodyCallbackTy BodyGenCB, + TripCountCallbackTy DistanceCB, + PrivatizeCallbackTy PrivCB, + FinalizeCallbackTy FiniCB); + /// Generator for '#omp parallel' /// /// \param Loc The insert and source location description. 
@@ -1876,6 +1900,7 @@ class OpenMPIRBuilder { InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait = false, bool IsTeamsReduction = false, bool HasDistribute = false, + bool IsSimdReduction = false, ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR, std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024, Value *SrcLocInfo = nullptr); diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index d8f3c8fa06b747..81dc9299207693 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -124,6 +124,9 @@ __OMP_FUNCTION_TYPE(ShuffleReduce, false, Void, VoidPtr, Int16, Int16, Int16) __OMP_FUNCTION_TYPE(InterWarpCopy, false, Void, VoidPtr, Int32) __OMP_FUNCTION_TYPE(GlobalList, false, Void, VoidPtr, Int32, VoidPtr) +__OMP_FUNCTION_TYPE(LoopTask, false, Void, Int64, VoidPtrPtr) +__OMP_FUNCTION_TYPE(SimdTask, false, Void, VoidPtrPtr) + #undef __OMP_FUNCTION_TYPE #undef OMP_FUNCTION_TYPE @@ -204,6 +207,7 @@ __ICV_RT_GET(proc_bind, omp_get_proc_bind) __OMP_RTL(__kmpc_barrier, false, Void, IdentPtr, Int32) +__OMP_RTL(__kmpc_simd_barrier, false, Void, ) __OMP_RTL(__kmpc_cancel, false, Int32, IdentPtr, Int32, Int32) __OMP_RTL(__kmpc_cancel_barrier, false, Int32, IdentPtr, Int32) __OMP_RTL(__kmpc_error, false, Void, IdentPtr, Int32, Int8Ptr) @@ -227,6 +231,7 @@ __OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, ) __OMP_RTL(__kmpc_get_warp_size, false, Int32, ) __OMP_RTL(omp_get_thread_num, false, Int32, ) +__OMP_RTL(omp_get_simd_lane, false, Int32, ) __OMP_RTL(omp_get_num_threads, false, Int32, ) __OMP_RTL(omp_get_max_threads, false, Int32, ) __OMP_RTL(omp_in_parallel, false, Int32, ) @@ -484,6 +489,8 @@ __OMP_RTL(__kmpc_kernel_end_parallel, false, Void, ) __OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32) __OMP_RTL(__kmpc_end_serialized_parallel, false, Void, IdentPtr, Int32) 
__OMP_RTL(__kmpc_shuffle_int32, false, Int32, Int32, Int16, Int16) +__OMP_RTL(__kmpc_nvptx_simd_reduce_nowait_v2, false, Int32, IdentPtr, + Int64, VoidPtr, ShuffleReducePtr, InterWarpCopyPtr) __OMP_RTL(__kmpc_nvptx_parallel_reduce_nowait_v2, false, Int32, IdentPtr, Int64, VoidPtr, ShuffleReducePtr, InterWarpCopyPtr) __OMP_RTL(__kmpc_nvptx_teams_reduce_nowait_v2, false, Int32, IdentPtr, @@ -509,6 +516,10 @@ __OMP_RTL(__kmpc_syncwarp, false, Void, Int64) __OMP_RTL(__llvm_profile_register_function, false, Void, VoidPtr) __OMP_RTL(__llvm_profile_register_names_function, false, Void, VoidPtr, Int64) +__OMP_RTL(__kmpc_simd_4u, false, Void, IdentPtr, LoopTaskPtr, Int32, VoidPtrPtr) +__OMP_RTL(__kmpc_simd_8u, false, Void, IdentPtr, LoopTaskPtr, Int64, VoidPtrPtr) +__OMP_RTL(__kmpc_simd, false, Void, IdentPtr, SimdTaskPtr, VoidPtrPtr, Int32) + __OMP_RTL(__last, false, Void, ) #undef __OMP_RTL @@ -715,6 +726,7 @@ __OMP_RTL_ATTRS(__kmpc_get_hardware_num_threads_in_block, GetterAttrs, ZExt, Par __OMP_RTL_ATTRS(__kmpc_get_warp_size, GetterAttrs, ZExt, ParamAttrs()) __OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, SExt, ParamAttrs()) +__OMP_RTL_ATTRS(omp_get_simd_lane, GetterAttrs, SExt, ParamAttrs()) __OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, SExt, ParamAttrs()) __OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, SExt, ParamAttrs()) __OMP_RTL_ATTRS(omp_in_parallel, GetterAttrs, SExt, ParamAttrs()) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 172812a3802d33..1d5b24475d1d1b 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -150,6 +150,8 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) { } #endif +Function *GLOBAL_ReductionFunc = nullptr; + static const omp::GV &getGridValue(const Triple &T, Function *Kernel) { if (T.isAMDGPU()) { StringRef Features = @@ -798,6 +800,8 @@ void OpenMPIRBuilder::finalize(Function *Fn) { for (Function *F : 
ConstantAllocaRaiseCandidates) raiseUserConstantDataAllocasToEntryBlock(Builder, F); + //globalizeVars(Fn); + EmitMetadataErrorReportFunctionTy &&ErrorReportFn = [](EmitMetadataErrorKind Kind, const TargetRegionEntryInfo &EntryInfo) -> void { @@ -806,7 +810,7 @@ void OpenMPIRBuilder::finalize(Function *Fn) { "OMPIRBuilder finalization \n"; }; - if (!OffloadInfoManager.empty()) + if (!OffloadInfoManager.empty()) createOffloadEntriesAndInfoMetadata(ErrorReportFn); if (Config.EmitLLVMUsedMetaInfo.value_or(false)) { @@ -814,6 +818,159 @@ void OpenMPIRBuilder::finalize(Function *Fn) { M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")}; emitUsed("llvm.compiler.used", LLVMCompilerUsed); } + +} + +CallInst * OpenMPIRBuilder::globalizeAlloca( + AllocaInst *Alloca, + SmallVector<Instruction*, 32> &ToBeDeleted +) { + FunctionCallee AllocFn = getOrCreateRuntimeFunctionPtr( + OMPRTL___kmpc_alloc_shared + ); + + Builder.SetInsertPoint(Alloca); + Value *SharedAllocArgs[] = { + //ConstantInt::get(Int64, Alloca->getType()->getScalarSizeInBits()/8) + + //ConstantInt::get(Int64, Alloca->getAllocationSize(M.getDataLayout())); + //ConstantExpr::getSizeOf(Alloca->getAllocatedType()) + ConstantInt::get(Int64, Alloca->getAllocationSize(M.getDataLayout())->getFixedValue()) + }; + + CallInst *AllocSharedCall = Builder.CreateCall(AllocFn, ArrayRef<Value*>(SharedAllocArgs, 1)); + AllocSharedCall->setName(Alloca->getName() + "_on_stack"); + //Value *ReplValue = Builder.CreateBitcast(AllocSharedCall, Alloca->getType(), Alloca->getName() + "_on_stack"); + + dbgs() << "Created " << *AllocSharedCall << "\n"; + dbgs() << *(Alloca->getType()) << "\n"; + dbgs() << *(AllocSharedCall->getType()) << "\n"; + + //Type *CastType = PointerType::get(Alloca->getAllocatedType(), 0); + //dbgs() << " " << *CastType << "\n"; + //llvm::Value *CastedSharedAlloc = Builder.CreateBitCast( + // AllocSharedCall, CastType, Alloca->getName()+"_on_stack" + //); + + //dbgs() << " Casted " << 
*CastedSharedAlloc << "\n"; + + //Alloca->replaceAllUsesWith(AllocSharedCall); + + // If the Alloca was allocated in address space 5 (local) we need to + // account for a type mismatch between it and the return from __kmpc_shared_alloc + + for(auto U = Alloca->user_begin(); U != Alloca->user_end(); U++) { + dbgs () << " User - " << *(*U) << "\n"; + } + + if(Alloca->hasOneUser() && isa<AddrSpaceCastInst>(Alloca->user_back())) { + auto AddrSpaceCast = dyn_cast<AddrSpaceCastInst>(Alloca->user_back()); + dbgs() << *(AddrSpaceCast->getType()) << "\n"; + AddrSpaceCast->replaceAllUsesWith(AllocSharedCall); + //AddrSpaceCast->removeFromParent(); + ToBeDeleted.push_back(AddrSpaceCast); + } else { + Alloca->replaceAllUsesWith(AllocSharedCall); + } + ToBeDeleted.push_back(Alloca); + //Alloca->removeFromParent(); + + //for(auto U = AllocSharedCall->user_begin(); U != AllocSharedCall->user_end(); U++) { + // if(auto AddrSpaceCast = dyn_cast<AddrSpaceCastInst>(*U)) { + // if(AddrSpaceCast->getSrcAddressSpace() == AddrSpaceCast->getDestAddressSpace()) { + // AddrSpaceCast->replaceAllUsesWith(CastedSharedAlloc); + // AddrSpaceCast->removeFromParent(); + // } + // } + //} + + //Alloca->removeFromParent(); + + dbgs() << " var globalized!\n"; + + return AllocSharedCall; + +} + +void OpenMPIRBuilder::globalizeParallelVars( + llvm::Function *CurFn +) { + SmallVector<Instruction*, 32> ToBeDeleted; + std::stack<CallInst*> GlobalizedVars; + + dbgs() << " Exploring: " << CurFn->getName() << "\n"; + for(auto BB = CurFn->begin(); BB != CurFn->end(); BB++) + { + for(auto I = BB->begin(); I != BB->end(); I++) + { + if(auto Alloca = dyn_cast<AllocaInst>(I)) { + dbgs() << " Found Alloca: " << *Alloca << "\n"; + CallInst * GlobalizedAlloca = globalizeAlloca(Alloca, ToBeDeleted); + GlobalizedVars.push(GlobalizedAlloca); + } else if(auto FnCall = dyn_cast<CallInst>(I)) { + dbgs() << " Found Function Call: " << *FnCall << "\n"; + } + } + } + + BasicBlock &EndBlock = CurFn->back(); + 
Builder.SetInsertPoint(EndBlock.begin()); + while(!GlobalizedVars.empty()) { + CallInst *SharedAlloc = GlobalizedVars.top(); + GlobalizedVars.pop(); + FunctionCallee FreeFn = getOrCreateRuntimeFunctionPtr( + OMPRTL___kmpc_free_shared + ); + + Value *SharedFreeArgs[] = { + SharedAlloc, + SharedAlloc->getArgOperand(0) + }; + + CallInst *SharedFreeCall = Builder.CreateCall(FreeFn, ArrayRef<Value*>(SharedFreeArgs, 2)); + dbgs() << " Freed - " << *SharedFreeCall << "\n"; + } + + for(auto I : ToBeDeleted) + I->removeFromParent(); + +} + +// Globalize any variables that are needed in a lower level of +// the parallel hierarchy. +// Only Vars used in 'simd' regions are supported right now. +void OpenMPIRBuilder::globalizeVars(llvm::Function *CurFn) +{ + + std::stack<llvm::AllocaInst> Allocas; + SmallPtrSet<AllocaInst*, 32> EscapedVars; + + //dbgs() << "Function: " << CurFn->getName() << "\n"; + + for(auto BB = CurFn->begin(); BB != CurFn->end(); BB++) + { + for(auto I = BB->begin(); I != BB->end(); I++) + { + //dbgs() << " Instruction: " << *I << "\n"; + if(auto FnCall = dyn_cast<CallInst>(I)) + { + //dbgs() << " Found call: " << *FnCall << "\n"; + if(auto Fn = FnCall->getCalledFunction()) { + //dbgs() << " " << Fn->getName() << "\n"; + if(Fn->getName() == "__kmpc_parallel_51") { + //dbgs() << " Parallel!\n"; + + Function *OutlinedFn = dyn_cast<Function>(FnCall->getArgOperand(5)); + assert(OutlinedFn && "failed to find GPU parallel outlined fn"); + + + dbgs() << "Found a parallel region\n"; + globalizeParallelVars(OutlinedFn); + } + } + } + } + } } OpenMPIRBuilder::~OpenMPIRBuilder() { @@ -975,9 +1132,11 @@ OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind, uint32_t SrcLocStrSize; Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); - Value *Args[] = { - getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags), - getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))}; + if (!ThreadID) + ThreadID = 
getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize)); + + Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags), + ThreadID}; // If we are in a cancellable parallel region, barriers are cancellation // points. @@ -1355,6 +1514,467 @@ hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, } } +IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop( + const LocationDescription &Loc, InsertPointTy OuterAllocaIP, + LoopBodyCallbackTy BodyGenCB, + TripCountCallbackTy DistanceCB, + PrivatizeCallbackTy PrivCB, + FinalizeCallbackTy FiniCB +) +{ + assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous"); + + if (!updateToLocation(Loc)) + return Loc.IP; + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); + + BasicBlock *InsertBB = Builder.GetInsertBlock(); + Function *OuterFn = InsertBB->getParent(); + + LLVM_DEBUG(dbgs() << "At the start of createSimdLoop:\n" << *OuterFn << "\n"); + + // Save the outer alloca block because the insertion iterator may get + // invalidated and we still need this later. + BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock(); + + // Vector to remember instructions we used only during the modeling but which + // we want to delete at the end. + SmallVector<Instruction *, 16> ToBeDeleted; + + // Create an artificial insertion point that will also ensure the blocks we + // are about to split are not degenerated. 
+ auto *UI = new UnreachableInst(Builder.getContext(), InsertBB); + + Instruction *ThenTI = UI, *ElseTI = nullptr; + + BasicBlock *ThenBB = ThenTI->getParent(); + + // Alloca block for simd + BasicBlock *EntryBB = ThenBB->splitBasicBlock(ThenTI, "omp.simd.entry"); + + // Block for setup related to simd + // i.e variable privatizaiton, trip count, reductions + BasicBlock *PrologBB = EntryBB->splitBasicBlock(ThenTI, "omp.simd.prolog"); + + // Entry block for the outlined loop body + // Allocas from the loop body should be done here + BasicBlock *LoopEntryBB = PrologBB->splitBasicBlock(ThenTI, "omp.simd.loop.entry"); + + // Block for generating the loop body + BasicBlock *LoopBodyBB = LoopEntryBB->splitBasicBlock(ThenTI, "omp.simd.loop.body"); + + BasicBlock *LoopPreFiniBB = + LoopBodyBB->splitBasicBlock(ThenTI, "omp.simd.loop.pre_finalize"); + + BasicBlock *LoopExitBB = + LoopPreFiniBB->splitBasicBlock(ThenTI, "omp.simd.loop.outlined.exit"); + + // Block for finalizing any reductions + BasicBlock *ReductionEpilogBB = + LoopExitBB->splitBasicBlock(ThenTI, "omp.reduction.epilog"); + + BasicBlock *FinalizeBB = + ReductionEpilogBB->splitBasicBlock(ThenTI, "omp.simd.finalize"); + + auto FiniCBWrapper = [&](InsertPointTy IP) { + // Hide "open-ended" blocks from the given FiniCB by setting the right jump + // target to the region exit blocks + if (IP.getBlock()->end() == IP.getPoint()) { + IRBuilder<>::InsertPointGuard IPG(Builder); + Builder.restoreIP(IP); + Instruction *I = Builder.CreateBr(FinalizeBB); //PRegExitBB); + IP = InsertPointTy(I->getParent(), I->getIterator()); + } + assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 && + IP.getBlock()->getTerminator()->getSuccessor(0) == FinalizeBB && //PRegExitBB && + "Unexpected insertion point for finalization call!"); + return FiniCB(IP); + }; + + FinalizationStack.push_back({FiniCBWrapper, OMPD_simd, false}); + + // Compute the loop trip count + // Insert after the outer alloca to ensure all variables needed 
+ // in its calculation are ready + + InsertPointTy DistanceIP(PrologBB, PrologBB->getTerminator()->getIterator()); + assert(DistanceCB && "expected loop trip count callback function!"); + Value *DistVal = DistanceCB(EntryBB, DistanceIP); + assert(DistVal && "trip count call back should return integer trip count"); + Type *DistValType = DistVal->getType(); + assert(DistValType->isIntegerTy() && "trip count should be integer type"); + + LLVM_DEBUG(dbgs() << "After DistanceCB:\n" << *PrologBB << "\n"); + LLVM_DEBUG(dbgs() << "Trip count variable: " << *DistVal << "\n"); + + // Create the virtual iteration variable that will be pulled into + // the outlined function. + //Builder.restoreIP(OuterAllocaIP); + Builder.SetInsertPoint(EntryBB, EntryBB->begin()); + AllocaInst *OMPIVAlloca = Builder.CreateAlloca(DistValType, nullptr, "omp.iv.tmp"); + Instruction *OMPIV = Builder.CreateLoad(DistValType, OMPIVAlloca, "omp.iv"); + //InsertPointTy MidAllocaIP = Builder.saveIP(); + + // Generate the privatization allocas in the block that will become the entry + // of the outlined function. 
+// Builder.SetInsertPoint(LoopEntryBB->getTerminator()); + Builder.SetInsertPoint(LoopEntryBB, LoopEntryBB->begin()); + // Use omp.iv in the outlined region so it gets captured during the outline + Instruction *OMPIVUse = dyn_cast<Instruction>( + Builder.CreateAdd(OMPIV, OMPIV, "omp.iv.tobedeleted")); + InsertPointTy InnerAllocaIP = Builder.saveIP(); + + // All of the temporary omp.iv variables need to be deleted later + // Order matters + ToBeDeleted.push_back(OMPIVUse); + ToBeDeleted.push_back(OMPIV); + ToBeDeleted.push_back(OMPIVAlloca); + + LLVM_DEBUG(dbgs() << "omp.iv variable generated:\n" << *OuterFn << "\n"); + + LLVM_DEBUG(dbgs() << "Before body codegen:\n" << *OuterFn << "\n"); + assert(BodyGenCB && "Expected body generation callback!"); + InsertPointTy CodeGenIP(LoopBodyBB, LoopBodyBB->getTerminator()->getIterator()); //LoopBodyBB->begin()); + + InsertPointTy PrologIP(PrologBB, PrologBB->getTerminator()->getIterator()); + InsertPointTy ReductionEpilogIP(ReductionEpilogBB, ReductionEpilogBB->begin()); + + // Generate the body of the loop. The omp.iv variable is a value between + // 0 <= omp.iv < TripCount + // If a loop variable is needed, then this callback function can initialize + // it based on the omp.iv. + BodyGenCB(EntryBB, InnerAllocaIP, CodeGenIP, PrologIP, ReductionEpilogIP, OMPIV); + + LLVM_DEBUG(dbgs() << "After body codegen:\n" << *OuterFn << "\n"); + + // Determine what runtime function should be called based on the type + // of the trip count + //FunctionCallee RTLFn; + + // Outline 1 + { + OutlineInfo OI; + + // Adjust the finalization stack, verify the adjustment, and call the + // finalize function a last time to finalize values between the pre-fini + // block and the exit block if we left the parallel "the normal way". 
+ //auto FiniInfo = FinalizationStack.pop_back_val(); + //(void)FiniInfo; + //assert(FiniInfo.DK == OMPD_simd && + // "Unexpected finalization stack state!"); + + Instruction *LoopPreFiniTI = LoopPreFiniBB->getTerminator(); + + InsertPointTy PreFiniIP(LoopPreFiniBB, LoopPreFiniTI->getIterator()); + FiniCB(PreFiniIP); + + OI.OuterAllocaBB = EntryBB; //OuterAllocaBlock; + OI.EntryBB = LoopEntryBB; + OI.ExitBB = LoopExitBB; + + SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet; + SmallVector<BasicBlock *, 32> Blocks; + OI.collectBlocks(ParallelRegionBlockSet, Blocks); + + // Ensure a single exit node for the outlined region by creating one. + // We might have multiple incoming edges to the exit now due to finalizations, + // e.g., cancel calls that cause the control flow to leave the region. + //BasicBlock *PRegOutlinedExitBB = PRegExitBB; + //PRegExitBB = LRegExitBB; + //PRegOutlinedExitBB->setName("omp.loop.outlined.exit"); + + Blocks.push_back(LoopExitBB); + + CodeExtractorAnalysisCache CEAC(*OuterFn); + + CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr, + /* AggregateArgs */ true, + /* BlockFrequencyInfo */ nullptr, + /* BranchProbabilityInfo */ nullptr, + /* AssumptionCache */ nullptr, + /* AllowVarArgs */ false, + /* AllowAlloca */ true, + /* AllocationBlock */ EntryBB, //OuterAllocaBlock, + /* Suffix */ ".omp_simd"); + + BasicBlock *CommonExit = nullptr; + SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands; + Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit); + Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands); + + auto PrivHelper = [&](Value &V) { + // Exclude omp.iv from aggregate + if (&V == OMPIV) { + OI.ExcludeArgsFromAggregate.push_back(&V); + return; + } + + // Get all uses of value that are inside of the outlined region + SetVector<Use *> Uses; + for (Use &U : V.uses()) + if (auto *UserI = dyn_cast<Instruction>(U.getUser())) + if (ParallelRegionBlockSet.count(UserI->getParent())) + Uses.insert(&U); 
+ + Value *Inner = &V; + + // If the value isn't a pointer type, store it in a pointer + // Unpack it inside the outlined region + if (!V.getType()->isPointerTy()) { + IRBuilder<>::InsertPointGuard Guard(Builder); + LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n"); + + Builder.restoreIP(OuterAllocaIP); + Value *Ptr = Builder.CreateAlloca( + V.getType(), nullptr, V.getName() + ".reloaded"); + + // Store to stack at end of the block that currently branches to the entry + // block of the to-be-outlined region. + Builder.SetInsertPoint( + InsertBB, InsertBB->getTerminator()->getIterator()); + Builder.CreateStore(&V, Ptr); + + // Load back next to allocations in the to-be-outlined region. + Builder.restoreIP(InnerAllocaIP); + Inner = Builder.CreateLoad(V.getType(), Ptr); + } + + Value *ReplacementValue = nullptr; + Builder.restoreIP( + PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue)); + assert(ReplacementValue && + "Expected copy/create callback to set replacement value!"); + if (ReplacementValue == &V) + return; + + for (Use *UPtr : Uses) + UPtr->set(ReplacementValue); + + }; + + LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n"); + + InnerAllocaIP = IRBuilder<>::InsertPoint( + OMPIV->getParent(), OMPIV->getNextNode()->getIterator()); + + // Reset the outer alloca insertion point to the entry of the relevant block + // in case it was invalidated. 
+ OuterAllocaIP = IRBuilder<>::InsertPoint( + OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt()); + + for (Value *Input : Inputs) { + PrivHelper(*Input); + } + + assert(Outputs.empty() && + "OpenMP outlining should not produce live-out values!"); + + LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n"); + for (auto *BB : Blocks) { + LLVM_DEBUG(dbgs() << " PBR: " << BB->getName() << "\n"); + } + + int NumInputs = Inputs.size()-1; // One argument is always omp.iv + OI.PostOutlineCB = [=](Function &OutlinedFn) { + + OutlinedFn.addFnAttr(Attribute::NoUnwind); + OutlinedFn.addFnAttr(Attribute::NoRecurse); + + assert(OutlinedFn.arg_size() == 2 && + "Expected omp.iv & structArg as arguments"); + + CallInst *CI = cast<CallInst>(OutlinedFn.user_back()); + BasicBlock *CallBlock = CI->getParent(); + CallBlock->setName("omp_loop"); + Builder.SetInsertPoint(CI); + + Value * StructArg = CI->getArgOperand(1); // 0 should be omp.iv + + Value *SimdArgs[] = { + Ident, + Builder.CreateBitCast(&OutlinedFn, LoopTaskPtr), + DistVal, + Builder.CreateCast(Instruction::BitCast, StructArg, Int8PtrPtr)}; + + SmallVector<Value *, 16> RealArgs; + RealArgs.append(std::begin(SimdArgs), std::end(SimdArgs)); + + FunctionCallee RTLFn = getOrCreateRuntimeFunctionPtr( + (DistValType->isIntegerTy(32) ? 
OMPRTL___kmpc_simd_4u : + OMPRTL___kmpc_simd_8u)); + Builder.CreateCall(RTLFn, RealArgs); + + LLVM_DEBUG(dbgs() << "With kmpc_simd_4u call placed: " << *Builder.GetInsertBlock()->getParent() << "\n"); + + CI->eraseFromParent(); + + for (Instruction *I : ToBeDeleted) + I->eraseFromParent(); + + }; + + addOutlineInfo(std::move(OI)); + } + + +// Outline 2 + if(false) { // if(!SPMD) { + OutlineInfo OI; + + OI.OuterAllocaBB = OuterAllocaBlock; + OI.EntryBB = EntryBB; //LoopEntryBB; + OI.ExitBB = FinalizeBB; //LoopExitBB; + + SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet; + SmallVector<BasicBlock *, 32> Blocks; + OI.collectBlocks(ParallelRegionBlockSet, Blocks); + + CodeExtractorAnalysisCache CEAC(*OuterFn); + + CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr, + /* AggregateArgs */ true, + /* BlockFrequencyInfo */ nullptr, + /* BranchProbabilityInfo */ nullptr, + /* AssumptionCache */ nullptr, + /* AllowVarArgs */ false, + /* AllowAlloca */ true, + /* AllocationBlock */ OuterAllocaBlock, + /* Suffix */ ".omp_simd"); + + BasicBlock *CommonExit = nullptr; + SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands; + Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit); + Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands); + + auto PrivHelper = [&](Value &V) { + // Exclude omp.iv from aggregate + //if (&V == OMPIV) { + // OI.ExcludeArgsFromAggregate.push_back(&V); + // return; + //} + + // Get all uses of value that are inside of the outlined region + SetVector<Use *> Uses; + for (Use &U : V.uses()) + if (auto *UserI = dyn_cast<Instruction>(U.getUser())) + if (ParallelRegionBlockSet.count(UserI->getParent())) + Uses.insert(&U); + + Value *Inner = &V; + + // If the value isn't a pointer type, store it in a pointer + // Unpack it inside the outlined region + if (!V.getType()->isPointerTy()) { + IRBuilder<>::InsertPointGuard Guard(Builder); + LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n"); + + 
Builder.restoreIP(OuterAllocaIP); + Value *Ptr = Builder.CreateAlloca( + V.getType(), nullptr, V.getName() + ".reloaded"); + + // Store to stack at end of the block that currently branches to the entry + // block of the to-be-outlined region. + Builder.SetInsertPoint( + InsertBB, InsertBB->getTerminator()->getIterator()); + Builder.CreateStore(&V, Ptr); + + // Load back next to allocations in the to-be-outlined region. + Builder.restoreIP(InnerAllocaIP); + Inner = Builder.CreateLoad(V.getType(), Ptr); + } + + Value *ReplacementValue = nullptr; + Builder.restoreIP( + PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue)); + assert(ReplacementValue && + "Expected copy/create callback to set replacement value!"); + if (ReplacementValue == &V) + return; + + for (Use *UPtr : Uses) + UPtr->set(ReplacementValue); + + }; + + LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n"); + + InnerAllocaIP = IRBuilder<>::InsertPoint( + OMPIV->getParent(), OMPIV->getNextNode()->getIterator()); + + // Reset the outer alloca insertion point to the entry of the relevant block + // in case it was invalidated. 
+ OuterAllocaIP = IRBuilder<>::InsertPoint( + OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt()); + + for (Value *Input : Inputs) { + PrivHelper(*Input); + } + + assert(Outputs.empty() && + "OpenMP outlining should not produce live-out values!"); + + LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n"); + for (auto *BB : Blocks) { + LLVM_DEBUG(dbgs() << " PBR: " << BB->getName() << "\n"); + } + + int NumInputs = Inputs.size(); + + OI.PostOutlineCB = [=](Function &OutlinedFn) { + + OutlinedFn.addFnAttr(Attribute::NoUnwind); + OutlinedFn.addFnAttr(Attribute::NoRecurse); + + assert(OutlinedFn.arg_size() == 1 && + "Expected structArg as arguments"); + + CallInst *CI = cast<CallInst>(OutlinedFn.user_back()); + BasicBlock *CallBlock = CI->getParent(); + CallBlock->setName("omp_simd"); + Builder.SetInsertPoint(CI); + + Value * StructArg = CI->getArgOperand(0); + + Value *SimdArgs[] = { + Ident, + Builder.CreateBitCast(&OutlinedFn, SimdTaskPtr), + Builder.CreateCast(Instruction::BitCast, StructArg, Int8PtrPtr), + Builder.getInt32(NumInputs)}; + + SmallVector<Value *, 16> RealArgs; + RealArgs.append(std::begin(SimdArgs), std::end(SimdArgs)); + + FunctionCallee RTLFn = getOrCreateRuntimeFunctionPtr( + OMPRTL___kmpc_simd); + Builder.CreateCall(RTLFn, RealArgs); + + LLVM_DEBUG(dbgs() << "With __kmpc_simd call placed: " << *Builder.GetInsertBlock()->getParent() << "\n"); + + CI->eraseFromParent(); + + for (Instruction *I : ToBeDeleted) + I->eraseFromParent(); + + }; + + addOutlineInfo(std::move(OI)); + } + + + + + + InsertPointTy AfterIP(FinalizeBB, FinalizeBB->end()); //UI->getParent(), UI->getParent()->end()); + UI->eraseFromParent(); + + return AfterIP; + +} + + IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( const LocationDescription &Loc, InsertPointTy OuterAllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, @@ -1652,7 +2272,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( LLVM_DEBUG(dbgs() << "After 
privatization: " << *OuterFn << "\n"); LLVM_DEBUG({ for (auto *BB : Blocks) - dbgs() << " PBR: " << BB->getName() << "\n"; + LLVM_DEBUG(dbgs() << " PBR: " << BB->getName() << "\n"); }); // Adjust the finalization stack, verify the adjustment, and call the @@ -3495,7 +4115,18 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU( } Value *ReductionDataSize = Builder.getInt64(MaxDataSize * ReductionInfos.size()); - if (!IsTeamsReduction) { + if(IsSimdReduction) { + Value *SarFuncCast = + Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy); + Value *WcFuncCast = + Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy); + Value *Args[] = {RTLoc, ReductionDataSize, RL, SarFuncCast, WcFuncCast}; + //Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr( + // RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2); + Function *SimdReduceFn = getOrCreateRuntimeFunctionPtr( + RuntimeFunction::OMPRTL___kmpc_nvptx_simd_reduce_nowait_v2); + Res = Builder.CreateCall(SimdReduceFn, Args); + } else if (!IsTeamsReduction) { Value *SarFuncCast = Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy); Value *WcFuncCast = @@ -3616,6 +4247,9 @@ OpenMPIRBuilder::createReductions(const LocationDescription &Loc, if (!updateToLocation(Loc)) return InsertPointTy(); + if (ReductionInfos.size() == 0) + return Builder.saveIP(); + BasicBlock *InsertBlock = Loc.IP.getBlock(); BasicBlock *ContinuationBlock = InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize"); @@ -3656,7 +4290,7 @@ OpenMPIRBuilder::createReductions(const LocationDescription &Loc, const DataLayout &DL = Module->getDataLayout(); unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy); Constant *RedArraySize = Builder.getInt64(RedArrayByteSize); - Function *ReductionFunc = getFreshReductionFunc(*Module); + Function *ReductionFunc = getFreshReductionFunc(M); Value *Lock = getOMPCriticalRegionLock(".reduction"); Function *ReduceFunc = getOrCreateRuntimeFunctionPtr( IsNoWait ? 
RuntimeFunction::OMPRTL___kmpc_reduce_nowait @@ -8585,6 +9219,7 @@ void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata( [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString]( const TargetRegionEntryInfo &EntryInfo, const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) { + // Generate metadata for target regions. Each entry of this metadata // contains: // - Entry 0 -> Kind of this type of metadata (0). @@ -8922,7 +9557,6 @@ void OpenMPIRBuilder::registerTargetGlobalVariable( VarSize = M.getDataLayout().getPointerSize(); Linkage = GlobalValue::WeakAnyLinkage; } - OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize, Flags, Linkage); } @@ -9010,6 +9644,7 @@ bool OffloadEntriesInfoManager::empty() const { unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount( const TargetRegionEntryInfo &EntryInfo) const { + auto It = OffloadEntriesTargetRegionCount.find( getTargetRegionEntryCountKey(EntryInfo)); if (It == OffloadEntriesTargetRegionCount.end()) @@ -9019,6 +9654,7 @@ unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount( void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount( const TargetRegionEntryInfo &EntryInfo) { + OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] = EntryInfo.Count + 1; } @@ -9026,6 +9662,7 @@ void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount( /// Initialize target region entry. 
void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo( const TargetRegionEntryInfo &EntryInfo, unsigned Order) { + OffloadEntriesTargetRegion[EntryInfo] = OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr, OMPTargetRegionEntryTargetRegion); @@ -9035,6 +9672,7 @@ void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo( void OffloadEntriesInfoManager::registerTargetRegionEntryInfo( TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags) { + assert(EntryInfo.Count == 0 && "expected default EntryInfo"); // Update the EntryInfo with the next available count for this location. @@ -9082,6 +9720,7 @@ bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo( void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo( const OffloadTargetRegionEntryInfoActTy &Action) { + // Scan all target region entries and perform the provided action. for (const auto &It : OffloadEntriesTargetRegion) { Action(It.first, It.second); diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 15b26a38cc28ef..0ad2b3a055f6c4 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -1194,7 +1194,8 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, } StructType *StructArgTy = nullptr; - AllocaInst *Struct = nullptr; + //AllocaInst *Struct = nullptr; + Instruction *Struct = nullptr; unsigned NumAggregatedInputs = 0; if (AggregateArgs && !StructValues.empty()) { std::vector<Type *> ArgTypes; @@ -1210,12 +1211,16 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, if (ArgsInZeroAddressSpace && DL.getAllocaAddrSpace() != 0) { auto *StructSpaceCast = new AddrSpaceCastInst( - Struct, PointerType ::get(Context, 0), "structArg.ascast"); + Struct, PointerType ::get(Context, 0), "structArg.ascast"); StructSpaceCast->insertAfter(Struct); - params.push_back(StructSpaceCast); + // 
There isn't really a point in generating this cast if you + // just aren't going to use it... + Struct = StructSpaceCast; + //params.push_back(StructSpaceCast); } else { - params.push_back(Struct); + //params.push_back(Struct); } + params.push_back(Struct); // Store aggregated inputs in the struct. for (unsigned i = 0, e = StructValues.size(); i != e; ++i) { if (inputs.contains(StructValues[i])) { diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h index c4bfaaa2404b4f..b970d078fc0ddb 100644 --- a/offload/DeviceRTL/include/Interface.h +++ b/offload/DeviceRTL/include/Interface.h @@ -167,6 +167,9 @@ double omp_get_wtick(void); double omp_get_wtime(void); ///} + +int omp_get_simd_lane(void); + } extern "C" { @@ -233,6 +236,12 @@ void __kmpc_target_deinit(); ///{ void *__kmpc_reduction_get_fixed_buffer(); +int32_t __kmpc_nvptx_simd_reduce_nowait_v2(IdentTy *Loc, + uint64_t reduce_data_size, + void *reduce_data, + ShuffleReductFnTy shflFct, + InterWarpCopyFnTy cpyFct); + int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc, uint64_t reduce_data_size, void *reduce_data, @@ -257,6 +266,8 @@ int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId); void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId); +void __kmpc_simd_barrier(void); + void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId); void __kmpc_barrier_simple_generic(IdentTy *Loc_ref, int32_t TId); diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h index 2fb87abe5418c0..d81ccff436cc2c 100644 --- a/offload/DeviceRTL/include/Mapping.h +++ b/offload/DeviceRTL/include/Mapping.h @@ -105,6 +105,13 @@ uint32_t getMaxTeamThreads(bool IsSPMD); /// Return the number of processing elements on the device. 
uint32_t getNumberOfProcessorElements(); +uint32_t getSimdLen(); +uint32_t getSimdGroup(); +uint32_t getSimdLane(); +bool isSimdLeader(); +uint32_t getNumSimdGroups(); +LaneMaskTy simdmask(); + } // namespace mapping } // namespace ompx diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp index 8bb275eae776c6..e6c01d1741821a 100644 --- a/offload/DeviceRTL/src/Kernel.cpp +++ b/offload/DeviceRTL/src/Kernel.cpp @@ -42,9 +42,9 @@ inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, static void genericStateMachine(IdentTy *Ident) { uint32_t TId = mapping::getThreadIdInBlock(); + do { ParallelRegionFnTy WorkFn = nullptr; - // Wait for the signal that we have a new work function. synchronize::threads(atomic::seq_cst); @@ -100,7 +100,9 @@ int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, } if (mapping::isInitialThreadInLevel0(IsSPMD)) + { return -1; + } // Enter the generic state machine if enabled and if this thread can possibly // be an active worker thread. 
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp index 3aefcff68e1956..41f47ad7a78539 100644 --- a/offload/DeviceRTL/src/Mapping.cpp +++ b/offload/DeviceRTL/src/Mapping.cpp @@ -322,6 +322,40 @@ uint32_t mapping::getNumberOfProcessorElements() { return static_cast<uint32_t>(config::getHardwareParallelism()); } +uint32_t mapping::getSimdLen() { + return 1; +} + +uint32_t mapping::getSimdGroup() { + uint32_t SimdGroup = mapping::getThreadIdInBlock() / mapping::getSimdLen(); + return SimdGroup; +} + +uint32_t mapping::getSimdLane() { + uint32_t SimdId = mapping::getThreadIdInWarp() % mapping::getSimdLen(); + return SimdId; +} + +bool mapping::isSimdLeader() { + return !mapping::getSimdLane(); +} + +uint32_t mapping::getNumSimdGroups() { + //uint32_t NumGroups = mapping::getBlockSize() / mapping::getSimdLen(); + uint32_t NumGroups = state::getEffectivePTeamSize() / mapping::getSimdLen(); + return NumGroups; +} + +LaneMaskTy mapping::simdmask() { + uint32_t GroupSize = mapping::getSimdLen(); + uint32_t Group = mapping::getSimdGroup(); + uint32_t WarpSize = mapping::getWarpSize(); + LaneMaskTy Mask = ~(LaneMaskTy)0; + Mask = Mask >> (sizeof(LaneMaskTy)*8 - GroupSize); + Mask = Mask << (Group * GroupSize) % WarpSize; + return Mask; +} + ///} /// Execution mode diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp index 5286d53b623f0a..fe6eb3590d92e8 100644 --- a/offload/DeviceRTL/src/Parallelism.cpp +++ b/offload/DeviceRTL/src/Parallelism.cpp @@ -87,8 +87,10 @@ extern "C" { int32_t num_threads, void *fn, void **args, const int64_t nargs) { + //printf("SPMD mode\n"); uint32_t TId = mapping::getThreadIdInBlock(); uint32_t NumThreads = determineNumberOfThreads(num_threads); + NumThreads = NumThreads / mapping::getSimdLen(); uint32_t PTeamSize = NumThreads == mapping::getMaxTeamThreads() ? 
0 : NumThreads; // Avoid the race between the read of the `icv::Level` above and the write @@ -101,6 +103,9 @@ extern "C" { state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize, 1u, TId == 0, ident, /*ForceTeamState=*/true); + //state::ValueRAII SimdLengthRAII(state::SimdLength, StaticSimdLen, + // 1u, TId == 0, ident, + // /*ForceTeamState=*/true); state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident, /*ForceTeamState=*/true); state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident, @@ -119,7 +124,7 @@ extern "C" { // assumptions above. synchronize::threadsAligned(atomic::relaxed); - if (!PTeamSize || TId < PTeamSize) + if (!PTeamSize || (TId < PTeamSize*mapping::getSimdLen())) invokeMicrotask(TId, 0, fn, args, nargs); // Synchronize all threads at the end of a parallel region. @@ -141,6 +146,8 @@ extern "C" { return; } + + [[clang::always_inline]] void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, int32_t num_threads, int proc_bind, void *fn, @@ -166,6 +173,14 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, // From this point forward we know that there is no thread state used. ASSERT(state::HasThreadState == false, nullptr); + //printf("num_threads=%i\n", num_threads); + uint32_t NumThreads = determineNumberOfThreads(num_threads); + //printf("NumThreads=%i\n", NumThreads); + NumThreads = NumThreads / mapping::getSimdLen(); + //printf("New NumThreads=%i\n", NumThreads); + uint32_t MaxTeamThreads = mapping::getMaxTeamThreads(); + uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads; + //printf("PTeamSize=%i\n", PTeamSize); if (mapping::isSPMDMode()) { // This was moved to its own routine so it could be called directly // in certain situations to avoid resource consumption of unused @@ -185,7 +200,7 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, // set, but they do not have individual ThreadStates yet. 
If they ever // modify the ICVs beyond this point a ThreadStates will be allocated. - bool IsActiveParallelRegion = NumThreads > 1; + bool IsActiveParallelRegion = NumThreads*mapping::getSimdLen() > 1; if (!IsActiveParallelRegion) { state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident); invokeMicrotask(TId, 0, fn, args, nargs); @@ -254,12 +269,16 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, } { + //printf("Generic execution\n"); // Note that the order here is important. `icv::Level` has to be updated // last or the other updates will cause a thread specific state to be // created. state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize, 1u, true, ident, /*ForceTeamState=*/true); + //state::ValueRAII SimdLengthRAII(state::SimdLength, StaticSimdLen, + // 1u, TId == 0, ident, + // /*ForceTeamState=*/true); state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn, (void *)nullptr, true, ident, /*ForceTeamState=*/true); @@ -288,7 +307,7 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, // Set to true for workers participating in the parallel region. uint32_t TId = mapping::getThreadIdInBlock(); - bool ThreadIsActive = TId < state::getEffectivePTeamSize(); + bool ThreadIsActive = TId < state::getEffectivePTeamSize()*mapping::getSimdLen(); return ThreadIsActive; } diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp index 57df159d3f28e5..402c33c7779eb4 100644 --- a/offload/DeviceRTL/src/Reduction.cpp +++ b/offload/DeviceRTL/src/Reduction.cpp @@ -164,9 +164,57 @@ uint32_t roundToWarpsize(uint32_t s) { uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? 
x : y; } +static int32_t nvptx_simd_reduce_nowait(void *reduce_data, + ShuffleReductFnTy shflFct, + InterWarpCopyFnTy cpyFct) { + uint32_t SimdId = mapping::getSimdLane(); + uint32_t NumThreads = mapping::getSimdLen(); + if(NumThreads == 1) + return 1; + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + if (NumThreads == mapping::getWarpSize()) + gpu_regular_warp_reduce(reduce_data, shflFct); + else + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/NumThreads, + /*LaneId=*/mapping::getSimdLane()); +#else + __kmpc_impl_lanemask_t Liveness = mapping::simdmask(); + if (Liveness == lanes::All) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/utils::popc(Liveness), + /*LaneId=*/mapping::getSimdLane()); +#endif + + return mapping::isSimdLeader(); +} + + + + + + + + + + + + + } // namespace extern "C" { +int32_t __kmpc_nvptx_simd_reduce_nowait_v2(IdentTy *Loc, + uint64_t reduce_data_size, + void *reduce_data, + ShuffleReductFnTy shflFct, + InterWarpCopyFnTy cpyFct) { + return nvptx_simd_reduce_nowait(reduce_data, shflFct, cpyFct); +} + int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc, uint64_t reduce_data_size, void *reduce_data, diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp index 855c74fa58e0a5..8c96d4cfd6d011 100644 --- a/offload/DeviceRTL/src/State.cpp +++ b/offload/DeviceRTL/src/State.cpp @@ -364,13 +364,18 @@ void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) { } int omp_get_ancestor_thread_num(int Level) { - return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0); + //return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0); + return returnValIfLevelIsActive(Level, mapping::getSimdGroup(), 0); } int omp_get_thread_num(void) { return omp_get_ancestor_thread_num(omp_get_level()); } +int omp_get_simd_lane(void) { + return mapping::getSimdLane(); +} + int omp_get_team_size(int 
Level) { return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1); } diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp index 9ea8d171cc830e..4d56b7fc149448 100644 --- a/offload/DeviceRTL/src/Synchronization.cpp +++ b/offload/DeviceRTL/src/Synchronization.cpp @@ -533,6 +533,10 @@ void __kmpc_barrier(IdentTy *Loc, int32_t TId) { impl::namedBarrier(); } +void __kmpc_simd_barrier(void) { + synchronize::warp(mapping::simdmask()); +} + [[clang::noinline]] void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) { synchronize::threadsAligned(atomic::OrderingTy::seq_cst); } diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp index ad60e66548be90..5086eb4966179c 100644 --- a/offload/DeviceRTL/src/Workshare.cpp +++ b/offload/DeviceRTL/src/Workshare.cpp @@ -518,6 +518,28 @@ void workshare::init(bool IsSPMD) { ThreadDST = nullptr; } +template<typename IType> +void SimdLoop( + IdentTy *ident, void *WorkFn, IType TripCount, + void **Args +) { + ASSERT(WorkFn, "expected valid outlined function"); + __kmpc_impl_lanemask_t SimdMask = mapping::simdmask(); + uint32_t Step = mapping::getSimdLen(); + + //printf("Thread=%i : Lane=%i : Len=%i : TripCount=%i\n", + // mapping::getThreadIdInBlock(), mapping::getSimdLane(), mapping::getSimdLen(), TripCount); + + synchronize::warp(SimdMask); + for(IType omp_iv = (IType) mapping::getSimdLane(); + omp_iv < TripCount; + omp_iv += Step + ) { + ((void (*)(IType, void**))WorkFn)(omp_iv, Args); + } + synchronize::warp(SimdMask); +} + extern "C" { // init @@ -683,6 +705,28 @@ void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid, void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {} void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {} + +void __kmpc_simd_4u( + IdentTy *ident, void *WorkFn, uint32_t TripCount, + void **Args +) { + SimdLoop<uint32_t>(ident, WorkFn, TripCount, Args); +} + +void 
__kmpc_simd_8u( + IdentTy *ident, void *WorkFn, uint64_t TripCount, + void **Args +) { + SimdLoop<uint64_t>(ident, WorkFn, TripCount, Args); +} + +void __kmpc_simd( + IdentTy *ident, void *WorkFn, void **Args, uint32_t nargs +) { + ASSERT(WorkFn, "expected valid outlined function"); + ((void (*)(void**))WorkFn)(Args); +} + } namespace ompx { >From ec9fa48937da7f88aceaeb06e5efaed0d42dabd7 Mon Sep 17 00:00:00 2001 From: Eric Francis Wright <wright...@rzvernal10.llnl.gov> Date: Fri, 14 Jun 2024 08:44:22 -0700 Subject: [PATCH 2/4] Changed where certain variables are allocated so that it is consistent --- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 1d5b24475d1d1b..07557d66cde398 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -1602,7 +1602,8 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop( InsertPointTy DistanceIP(PrologBB, PrologBB->getTerminator()->getIterator()); assert(DistanceCB && "expected loop trip count callback function!"); - Value *DistVal = DistanceCB(EntryBB, DistanceIP); + //Value *DistVal = DistanceCB(EntryBB, DistanceIP); + Value *DistVal = DistanceCB(OuterAllocaBlock, DistanceIP); assert(DistVal && "trip count call back should return integer trip count"); Type *DistValType = DistVal->getType(); assert(DistValType->isIntegerTy() && "trip count should be integer type"); >From 5fba287fb12a2b18c55270fa51d5f587b53f0c89 Mon Sep 17 00:00:00 2001 From: Eric Francis Wright <wright...@rzansel61.coral.llnl.gov> Date: Thu, 3 Oct 2024 23:40:20 -0700 Subject: [PATCH 3/4] Added SimdLen to the team state and removed unneeded globalization code --- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 154 ---------------------- offload/DeviceRTL/include/Mapping.h | 15 +++ offload/DeviceRTL/include/State.h | 7 + offload/DeviceRTL/src/Mapping.cpp | 2 +- 
offload/DeviceRTL/src/Parallelism.cpp | 25 ++-- offload/DeviceRTL/src/State.cpp | 5 +- 6 files changed, 40 insertions(+), 168 deletions(-) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 07557d66cde398..0798ea353c5d19 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -800,8 +800,6 @@ void OpenMPIRBuilder::finalize(Function *Fn) { for (Function *F : ConstantAllocaRaiseCandidates) raiseUserConstantDataAllocasToEntryBlock(Builder, F); - //globalizeVars(Fn); - EmitMetadataErrorReportFunctionTy &&ErrorReportFn = [](EmitMetadataErrorKind Kind, const TargetRegionEntryInfo &EntryInfo) -> void { @@ -821,158 +819,6 @@ void OpenMPIRBuilder::finalize(Function *Fn) { } -CallInst * OpenMPIRBuilder::globalizeAlloca( - AllocaInst *Alloca, - SmallVector<Instruction*, 32> &ToBeDeleted -) { - FunctionCallee AllocFn = getOrCreateRuntimeFunctionPtr( - OMPRTL___kmpc_alloc_shared - ); - - Builder.SetInsertPoint(Alloca); - Value *SharedAllocArgs[] = { - //ConstantInt::get(Int64, Alloca->getType()->getScalarSizeInBits()/8) - - //ConstantInt::get(Int64, Alloca->getAllocationSize(M.getDataLayout())); - //ConstantExpr::getSizeOf(Alloca->getAllocatedType()) - ConstantInt::get(Int64, Alloca->getAllocationSize(M.getDataLayout())->getFixedValue()) - }; - - CallInst *AllocSharedCall = Builder.CreateCall(AllocFn, ArrayRef<Value*>(SharedAllocArgs, 1)); - AllocSharedCall->setName(Alloca->getName() + "_on_stack"); - //Value *ReplValue = Builder.CreateBitcast(AllocSharedCall, Alloca->getType(), Alloca->getName() + "_on_stack"); - - dbgs() << "Created " << *AllocSharedCall << "\n"; - dbgs() << *(Alloca->getType()) << "\n"; - dbgs() << *(AllocSharedCall->getType()) << "\n"; - - //Type *CastType = PointerType::get(Alloca->getAllocatedType(), 0); - //dbgs() << " " << *CastType << "\n"; - //llvm::Value *CastedSharedAlloc = Builder.CreateBitCast( - // AllocSharedCall, CastType, 
Alloca->getName()+"_on_stack" - //); - - //dbgs() << " Casted " << *CastedSharedAlloc << "\n"; - - //Alloca->replaceAllUsesWith(AllocSharedCall); - - // If the Alloca was allocated in address space 5 (local) we need to - // account for a type mismatch between it and the return from __kmpc_shared_alloc - - for(auto U = Alloca->user_begin(); U != Alloca->user_end(); U++) { - dbgs () << " User - " << *(*U) << "\n"; - } - - if(Alloca->hasOneUser() && isa<AddrSpaceCastInst>(Alloca->user_back())) { - auto AddrSpaceCast = dyn_cast<AddrSpaceCastInst>(Alloca->user_back()); - dbgs() << *(AddrSpaceCast->getType()) << "\n"; - AddrSpaceCast->replaceAllUsesWith(AllocSharedCall); - //AddrSpaceCast->removeFromParent(); - ToBeDeleted.push_back(AddrSpaceCast); - } else { - Alloca->replaceAllUsesWith(AllocSharedCall); - } - ToBeDeleted.push_back(Alloca); - //Alloca->removeFromParent(); - - //for(auto U = AllocSharedCall->user_begin(); U != AllocSharedCall->user_end(); U++) { - // if(auto AddrSpaceCast = dyn_cast<AddrSpaceCastInst>(*U)) { - // if(AddrSpaceCast->getSrcAddressSpace() == AddrSpaceCast->getDestAddressSpace()) { - // AddrSpaceCast->replaceAllUsesWith(CastedSharedAlloc); - // AddrSpaceCast->removeFromParent(); - // } - // } - //} - - //Alloca->removeFromParent(); - - dbgs() << " var globalized!\n"; - - return AllocSharedCall; - -} - -void OpenMPIRBuilder::globalizeParallelVars( - llvm::Function *CurFn -) { - SmallVector<Instruction*, 32> ToBeDeleted; - std::stack<CallInst*> GlobalizedVars; - - dbgs() << " Exploring: " << CurFn->getName() << "\n"; - for(auto BB = CurFn->begin(); BB != CurFn->end(); BB++) - { - for(auto I = BB->begin(); I != BB->end(); I++) - { - if(auto Alloca = dyn_cast<AllocaInst>(I)) { - dbgs() << " Found Alloca: " << *Alloca << "\n"; - CallInst * GlobalizedAlloca = globalizeAlloca(Alloca, ToBeDeleted); - GlobalizedVars.push(GlobalizedAlloca); - } else if(auto FnCall = dyn_cast<CallInst>(I)) { - dbgs() << " Found Function Call: " << *FnCall << "\n"; - } - 
} - } - - BasicBlock &EndBlock = CurFn->back(); - Builder.SetInsertPoint(EndBlock.begin()); - while(!GlobalizedVars.empty()) { - CallInst *SharedAlloc = GlobalizedVars.top(); - GlobalizedVars.pop(); - FunctionCallee FreeFn = getOrCreateRuntimeFunctionPtr( - OMPRTL___kmpc_free_shared - ); - - Value *SharedFreeArgs[] = { - SharedAlloc, - SharedAlloc->getArgOperand(0) - }; - - CallInst *SharedFreeCall = Builder.CreateCall(FreeFn, ArrayRef<Value*>(SharedFreeArgs, 2)); - dbgs() << " Freed - " << *SharedFreeCall << "\n"; - } - - for(auto I : ToBeDeleted) - I->removeFromParent(); - -} - -// Globalize any variables that are needed in a lower level of -// the parallel hierarchy. -// Only Vars used in 'simd' regions are supported right now. -void OpenMPIRBuilder::globalizeVars(llvm::Function *CurFn) -{ - - std::stack<llvm::AllocaInst> Allocas; - SmallPtrSet<AllocaInst*, 32> EscapedVars; - - //dbgs() << "Function: " << CurFn->getName() << "\n"; - - for(auto BB = CurFn->begin(); BB != CurFn->end(); BB++) - { - for(auto I = BB->begin(); I != BB->end(); I++) - { - //dbgs() << " Instruction: " << *I << "\n"; - if(auto FnCall = dyn_cast<CallInst>(I)) - { - //dbgs() << " Found call: " << *FnCall << "\n"; - if(auto Fn = FnCall->getCalledFunction()) { - //dbgs() << " " << Fn->getName() << "\n"; - if(Fn->getName() == "__kmpc_parallel_51") { - //dbgs() << " Parallel!\n"; - - Function *OutlinedFn = dyn_cast<Function>(FnCall->getArgOperand(5)); - assert(OutlinedFn && "failed to find GPU parallel outlined fn"); - - - dbgs() << "Found a parallel region\n"; - globalizeParallelVars(OutlinedFn); - } - } - } - } - } -} - OpenMPIRBuilder::~OpenMPIRBuilder() { assert(OutlineInfos.empty() && "There must be no outstanding outlinings"); } diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h index d81ccff436cc2c..c5b73f8cc71f9b 100644 --- a/offload/DeviceRTL/include/Mapping.h +++ b/offload/DeviceRTL/include/Mapping.h @@ -105,11 +105,26 @@ uint32_t 
getMaxTeamThreads(bool IsSPMD); /// Return the number of processing elements on the device. uint32_t getNumberOfProcessorElements(); +/// Return the number of threads reserved for simd loops per parallel thread. +/// The value lies in the range [1, getWarpSize()]. uint32_t getSimdLen(); + +/// Return which simd group the thread belongs to. uint32_t getSimdGroup(); + +/// Return the thread ID within its simd group, in [0, getSimdLen()). uint32_t getSimdLane(); + +/// Return true if the thread is simd lane 0, i.e., if this is the +/// thread that executes parallel regions. bool isSimdLeader(); + +/// Return the number of simd groups in the team. This is +/// getMaxTeamThreads() / getSimdLen(). uint32_t getNumSimdGroups(); + +/// Return the lane mask that covers all threads within +/// the simd group. LaneMaskTy simdmask(); } // namespace mapping diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h index 565235cd48a913..24578ae1385aba 100644 --- a/offload/DeviceRTL/include/State.h +++ b/offload/DeviceRTL/include/State.h @@ -84,6 +84,7 @@ struct TeamStateTy { ///} uint32_t ParallelTeamSize; + uint32_t SimdLength; uint32_t HasThreadState; ParallelRegionFnTy ParallelRegionFnVar; }; @@ -140,6 +141,7 @@ enum ValueKind { VK_RunSchedChunk, VK_ParallelRegionFn, VK_ParallelTeamSize, + VK_SimdLength, VK_HasThreadState, }; @@ -217,6 +219,8 @@ lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) { ForceTeamState); case state::VK_ParallelTeamSize: return TeamState.ParallelTeamSize; + case state::VK_SimdLength: + return TeamState.SimdLength; case state::VK_HasThreadState: return TeamState.HasThreadState; default: @@ -340,6 +344,9 @@ inline state::Value<uint32_t, state::VK_RunSchedChunk> RunSchedChunk; /// TODO inline state::Value<uint32_t, state::VK_ParallelTeamSize> ParallelTeamSize; +/// Team-wide simd length, i.e. the value returned by mapping::getSimdLen(). +inline state::Value<uint32_t, state::VK_SimdLength> SimdLength; + /// TODO inline state::Value<uint32_t, state::VK_HasThreadState>
HasThreadState; diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp index 41f47ad7a78539..c491708b6225d0 100644 --- a/offload/DeviceRTL/src/Mapping.cpp +++ b/offload/DeviceRTL/src/Mapping.cpp @@ -323,7 +323,7 @@ uint32_t mapping::getNumberOfProcessorElements() { } uint32_t mapping::getSimdLen() { - return 1; + return state::SimdLength; } uint32_t mapping::getSimdGroup() { diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp index fe6eb3590d92e8..85baf580465f30 100644 --- a/offload/DeviceRTL/src/Parallelism.cpp +++ b/offload/DeviceRTL/src/Parallelism.cpp @@ -87,10 +87,12 @@ extern "C" { int32_t num_threads, void *fn, void **args, const int64_t nargs) { - //printf("SPMD mode\n"); uint32_t TId = mapping::getThreadIdInBlock(); uint32_t NumThreads = determineNumberOfThreads(num_threads); - NumThreads = NumThreads / mapping::getSimdLen(); + + // Any threads leftover from the team max vs. what's used in the + // parallel region are reserved for simd loops + uint32_t SimdLen = mapping::getMaxTeamThreads() / NumThreads; uint32_t PTeamSize = NumThreads == mapping::getMaxTeamThreads() ? 0 : NumThreads; // Avoid the race between the read of the `icv::Level` above and the write @@ -103,9 +105,9 @@ extern "C" { state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize, 1u, TId == 0, ident, /*ForceTeamState=*/true); - //state::ValueRAII SimdLengthRAII(state::SimdLength, StaticSimdLen, - // 1u, TId == 0, ident, - // /*ForceTeamState=*/true); + state::ValueRAII SimdLengthRAII(state::SimdLength, SimdLen, + 1u, TId == 0, ident, + /*ForceTeamState=*/true); state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0, ident, /*ForceTeamState=*/true); state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0, ident, @@ -173,14 +175,11 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, // From this point forward we know that there is no thread state used. 
ASSERT(state::HasThreadState == false, nullptr); - //printf("num_threads=%i\n", num_threads); uint32_t NumThreads = determineNumberOfThreads(num_threads); - //printf("NumThreads=%i\n", NumThreads); NumThreads = NumThreads / mapping::getSimdLen(); - //printf("New NumThreads=%i\n", NumThreads); + uint32_t SimdLen = mapping::getMaxTeamThreads() / NumThreads; uint32_t MaxTeamThreads = mapping::getMaxTeamThreads(); uint32_t PTeamSize = NumThreads == MaxTeamThreads ? 0 : NumThreads; - //printf("PTeamSize=%i\n", PTeamSize); if (mapping::isSPMDMode()) { // This was moved to its own routine so it could be called directly // in certain situations to avoid resource consumption of unused @@ -200,6 +199,8 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, // set, but they do not have individual ThreadStates yet. If they ever // modify the ICVs beyond this point a ThreadStates will be allocated. + + // bool IsActiveParallelRegion = NumThreads*mapping::getSimdLen() > 1; if (!IsActiveParallelRegion) { state::ValueRAII LevelRAII(icv::Level, 1u, 0u, true, ident); @@ -276,9 +277,9 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, PTeamSize, 1u, true, ident, /*ForceTeamState=*/true); - //state::ValueRAII SimdLengthRAII(state::SimdLength, StaticSimdLen, - // 1u, TId == 0, ident, - // /*ForceTeamState=*/true); + state::ValueRAII SimdLengthRAII(state::SimdLength, SimdLen, + 1u, true, ident, + /*ForceTeamState=*/true); state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn, (void *)nullptr, true, ident, /*ForceTeamState=*/true); diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp index 8c96d4cfd6d011..91b1689e0ac57e 100644 --- a/offload/DeviceRTL/src/State.cpp +++ b/offload/DeviceRTL/src/State.cpp @@ -207,6 +207,7 @@ void state::TeamStateTy::init(bool IsSPMD) { ICVState.RunSchedVar = omp_sched_static; ICVState.RunSchedChunkVar = 1; ParallelTeamSize =
1; + SimdLength = 1; HasThreadState = false; ParallelRegionFnVar = nullptr; } @@ -214,12 +215,14 @@ void state::TeamStateTy::init(bool IsSPMD) { bool state::TeamStateTy::operator==(const TeamStateTy &Other) const { return (ICVState == Other.ICVState) & (HasThreadState == Other.HasThreadState) & - (ParallelTeamSize == Other.ParallelTeamSize); + (ParallelTeamSize == Other.ParallelTeamSize) & + (SimdLength == Other.SimdLength); } void state::TeamStateTy::assertEqual(TeamStateTy &Other) const { ICVState.assertEqual(Other.ICVState); ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr); + ASSERT(SimdLength == Other.SimdLength, nullptr); ASSERT(HasThreadState == Other.HasThreadState, nullptr); } >From ccbde9005b0f432e87bbd360b2216d3f221810c4 Mon Sep 17 00:00:00 2001 From: Eric Francis Wright <wright...@rzansel61.coral.llnl.gov> Date: Fri, 4 Oct 2024 08:58:50 -0700 Subject: [PATCH 4/4] Removing extra whitespaces and comments throughout --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 2 - clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 2 - clang/lib/CodeGen/CGStmtOpenMP.cpp | 8 +-- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 58 ++++----------------- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 8 +-- offload/DeviceRTL/src/Kernel.cpp | 2 - offload/DeviceRTL/src/Parallelism.cpp | 1 - offload/DeviceRTL/src/Workshare.cpp | 3 -- 8 files changed, 12 insertions(+), 72 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 836253ab1a7d8b..3747b00d4893ad 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1035,7 +1035,6 @@ static FieldDecl *addFieldToRecordDecl(ASTContext &C, DeclContext *DC, CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM) : CGM(CGM), OMPBuilder(CGM.getModule()) { - KmpCriticalNameTy = llvm::ArrayType::get(CGM.Int32Ty, /*NumElements*/ 8); llvm::OpenMPIRBuilderConfig Config( CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(), @@ -1057,7 +1056,6 @@ 
CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM) } void CGOpenMPRuntime::clear() { - InternalVars.clear(); // Clean non-target variable declarations possibly used only in debug info. for (const auto &Data : EmittedNonTargetVariables) { diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 16aff085579807..e1b2b499c9bbcb 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -262,7 +262,6 @@ class CheckVarsEscapingDeclContext final bool IsCombinedParallelRegion) { if (!S) return; - for (const CapturedStmt::Capture &C : S->captures()) { if (C.capturesVariable() && !C.capturesVariableByCopy()) { const ValueDecl *VD = C.getCapturedVar(); @@ -337,7 +336,6 @@ class CheckVarsEscapingDeclContext final return; if (!D->hasAssociatedStmt()) return; - if (const auto *S = dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) { // Do not analyze directives that do not actually require capturing, diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index b4e699c1d003b8..52812ba6ab2451 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -2771,8 +2771,6 @@ static void emitOMPSimdDirective(const OMPLoopDirective &S, auto *LoopVarRef = CL->getLoopVarRef(); LValue LCVal = EmitLValue(LoopVarRef); - //Address LoopVarAddress = LCVal.getAddress(*this); - //LoopVar = dyn_cast<llvm::Instruction>(LoopVarAddress.getPointer()); LoopVar = dyn_cast<llvm::Instruction>(LCVal.getPointer(*this)); LoopVarName = LoopVarRef->getNameInfo().getAsString(); @@ -2786,7 +2784,6 @@ static void emitOMPSimdDirective(const OMPLoopDirective &S, ->getType() .getNonReferenceType(); - //Address CountAddr = CreateMemTemp(LogicalTy, ".count.addr"); RawAddress CountAddr = CreateMemTemp(LogicalTy, ".count.addr"); emitCapturedStmtCall(*this, DistanceClosure, {CountAddr.getPointer()}); @@ -2809,8 +2806,7 @@ static void emitOMPSimdDirective(const 
OMPLoopDirective &S, }; auto BodyGenCB = [&] - (//InsertPointTy OuterAllocaIP, - llvm::BasicBlock *OuterAllocaBB, + (llvm::BasicBlock *OuterAllocaBB, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, InsertPointTy Prolog, InsertPointTy ReductionEpilog, llvm::Value *Virtual) { @@ -2869,7 +2865,7 @@ static void emitOMPSimdDirective(const OMPLoopDirective &S, )); return; - } + } CodeGenFunction::ParentLoopDirectiveForScanRegion ScanRegion(CGF, S); CGF.OMPFirstScanLoop = true; diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 0798ea353c5d19..e6ac271b442ecc 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -978,11 +978,7 @@ OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind, uint32_t SrcLocStrSize; Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); - if (!ThreadID) - ThreadID = getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize)); - - Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags), - ThreadID}; + Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags)}; // If we are in a cancellable parallel region, barriers are cancellation // points. @@ -1448,7 +1444,6 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop( InsertPointTy DistanceIP(PrologBB, PrologBB->getTerminator()->getIterator()); assert(DistanceCB && "expected loop trip count callback function!"); - //Value *DistVal = DistanceCB(EntryBB, DistanceIP); Value *DistVal = DistanceCB(OuterAllocaBlock, DistanceIP); assert(DistVal && "trip count call back should return integer trip count"); Type *DistValType = DistVal->getType(); @@ -1459,15 +1454,12 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop( // Create the virtual iteration variable that will be pulled into // the outlined function. 
- //Builder.restoreIP(OuterAllocaIP); Builder.SetInsertPoint(EntryBB, EntryBB->begin()); AllocaInst *OMPIVAlloca = Builder.CreateAlloca(DistValType, nullptr, "omp.iv.tmp"); Instruction *OMPIV = Builder.CreateLoad(DistValType, OMPIVAlloca, "omp.iv"); - //InsertPointTy MidAllocaIP = Builder.saveIP(); // Generate the privatization allocas in the block that will become the entry // of the outlined function. -// Builder.SetInsertPoint(LoopEntryBB->getTerminator()); Builder.SetInsertPoint(LoopEntryBB, LoopEntryBB->begin()); // Use omp.iv in the outlined region so it gets captured during the outline Instruction *OMPIVUse = dyn_cast<Instruction>( @@ -1484,7 +1476,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop( LLVM_DEBUG(dbgs() << "Before body codegen:\n" << *OuterFn << "\n"); assert(BodyGenCB && "Expected body generation callback!"); - InsertPointTy CodeGenIP(LoopBodyBB, LoopBodyBB->getTerminator()->getIterator()); //LoopBodyBB->begin()); + InsertPointTy CodeGenIP(LoopBodyBB, LoopBodyBB->getTerminator()->getIterator()); InsertPointTy PrologIP(PrologBB, PrologBB->getTerminator()->getIterator()); InsertPointTy ReductionEpilogIP(ReductionEpilogBB, ReductionEpilogBB->begin()); @@ -1505,20 +1497,12 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop( { OutlineInfo OI; - // Adjust the finalization stack, verify the adjustment, and call the - // finalize function a last time to finalize values between the pre-fini - // block and the exit block if we left the parallel "the normal way". 
- //auto FiniInfo = FinalizationStack.pop_back_val(); - //(void)FiniInfo; - //assert(FiniInfo.DK == OMPD_simd && - // "Unexpected finalization stack state!"); - Instruction *LoopPreFiniTI = LoopPreFiniBB->getTerminator(); InsertPointTy PreFiniIP(LoopPreFiniBB, LoopPreFiniTI->getIterator()); FiniCB(PreFiniIP); - OI.OuterAllocaBB = EntryBB; //OuterAllocaBlock; + OI.OuterAllocaBB = EntryBB; OI.EntryBB = LoopEntryBB; OI.ExitBB = LoopExitBB; @@ -1526,13 +1510,6 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop( SmallVector<BasicBlock *, 32> Blocks; OI.collectBlocks(ParallelRegionBlockSet, Blocks); - // Ensure a single exit node for the outlined region by creating one. - // We might have multiple incoming edges to the exit now due to finalizations, - // e.g., cancel calls that cause the control flow to leave the region. - //BasicBlock *PRegOutlinedExitBB = PRegExitBB; - //PRegExitBB = LRegExitBB; - //PRegOutlinedExitBB->setName("omp.loop.outlined.exit"); - Blocks.push_back(LoopExitBB); CodeExtractorAnalysisCache CEAC(*OuterFn); @@ -1621,7 +1598,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop( LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n"); for (auto *BB : Blocks) { - LLVM_DEBUG(dbgs() << " PBR: " << BB->getName() << "\n"); + LLVM_DEBUG(dbgs() << " PBR: " << BB->getName() << "\n"); } int NumInputs = Inputs.size()-1; // One argument is always omp.iv @@ -1672,8 +1649,8 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop( OutlineInfo OI; OI.OuterAllocaBB = OuterAllocaBlock; - OI.EntryBB = EntryBB; //LoopEntryBB; - OI.ExitBB = FinalizeBB; //LoopExitBB; + OI.EntryBB = EntryBB; + OI.ExitBB = FinalizeBB; SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet; SmallVector<BasicBlock *, 32> Blocks; @@ -1697,12 +1674,6 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop( Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands); auto PrivHelper = [&](Value &V) { - // Exclude omp.iv from aggregate - //if (&V == OMPIV) { - //
OI.ExcludeArgsFromAggregate.push_back(&V); - // return; - //} - // Get all uses of value that are inside of the outlined region SetVector<Use *> Uses; for (Use &U : V.uses()) @@ -1810,11 +1781,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop( addOutlineInfo(std::move(OI)); } - - - - - InsertPointTy AfterIP(FinalizeBB, FinalizeBB->end()); //UI->getParent(), UI->getParent()->end()); + InsertPointTy AfterIP(FinalizeBB, FinalizeBB->end()); UI->eraseFromParent(); return AfterIP; @@ -3876,6 +3843,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU( const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait, bool IsTeamsReduction, bool HasDistribute, + bool IsSimdReduction, ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue, unsigned ReductionBufNum, Value *SrcLocInfo) { if (!updateToLocation(Loc)) @@ -3967,9 +3935,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU( Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy); Value *WcFuncCast = Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy); - Value *Args[] = {RTLoc, ReductionDataSize, RL, SarFuncCast, WcFuncCast}; - //Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr( - // RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2); + Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast, WcFuncCast}; Function *SimdReduceFn = getOrCreateRuntimeFunctionPtr( RuntimeFunction::OMPRTL___kmpc_nvptx_simd_reduce_nowait_v2); Res = Builder.CreateCall(SimdReduceFn, Args); @@ -9066,7 +9032,6 @@ void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata( [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString]( const TargetRegionEntryInfo &EntryInfo, const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) { - // Generate metadata for target regions. Each entry of this metadata // contains: // - Entry 0 -> Kind of this type of metadata (0). 
@@ -9491,7 +9456,6 @@ bool OffloadEntriesInfoManager::empty() const { unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount( const TargetRegionEntryInfo &EntryInfo) const { - auto It = OffloadEntriesTargetRegionCount.find( getTargetRegionEntryCountKey(EntryInfo)); if (It == OffloadEntriesTargetRegionCount.end()) @@ -9501,7 +9465,6 @@ unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount( void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount( const TargetRegionEntryInfo &EntryInfo) { - OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] = EntryInfo.Count + 1; } @@ -9509,7 +9472,6 @@ void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount( /// Initialize target region entry. void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo( const TargetRegionEntryInfo &EntryInfo, unsigned Order) { - OffloadEntriesTargetRegion[EntryInfo] = OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr, OMPTargetRegionEntryTargetRegion); @@ -9519,7 +9481,6 @@ void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo( void OffloadEntriesInfoManager::registerTargetRegionEntryInfo( TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags) { - assert(EntryInfo.Count == 0 && "expected default EntryInfo"); // Update the EntryInfo with the next available count for this location. @@ -9567,7 +9528,6 @@ bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo( void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo( const OffloadTargetRegionEntryInfoActTy &Action) { - // Scan all target region entries and perform the provided action. 
for (const auto &It : OffloadEntriesTargetRegion) { Action(It.first, It.second); diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 0ad2b3a055f6c4..7d72330f53975a 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -1194,7 +1194,6 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, } StructType *StructArgTy = nullptr; - //AllocaInst *Struct = nullptr; Instruction *Struct = nullptr; unsigned NumAggregatedInputs = 0; if (AggregateArgs && !StructValues.empty()) { @@ -1211,14 +1210,9 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction, if (ArgsInZeroAddressSpace && DL.getAllocaAddrSpace() != 0) { auto *StructSpaceCast = new AddrSpaceCastInst( - Struct, PointerType ::get(Context, 0), "structArg.ascast"); + Struct, PointerType ::get(Context, 0), "structArg.ascast"); StructSpaceCast->insertAfter(Struct); - // There isn't really a point in generating this cast if you - // just aren't going to use it... Struct = StructSpaceCast; - //params.push_back(StructSpaceCast); - } else { - //params.push_back(Struct); } params.push_back(Struct); // Store aggregated inputs in the struct. diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp index e6c01d1741821a..0ca6798fe128e6 100644 --- a/offload/DeviceRTL/src/Kernel.cpp +++ b/offload/DeviceRTL/src/Kernel.cpp @@ -100,9 +100,7 @@ int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, } if (mapping::isInitialThreadInLevel0(IsSPMD)) - { return -1; - } // Enter the generic state machine if enabled and if this thread can possibly // be an active worker thread. 
diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp index 85baf580465f30..b5f60f5b2080db 100644 --- a/offload/DeviceRTL/src/Parallelism.cpp +++ b/offload/DeviceRTL/src/Parallelism.cpp @@ -270,7 +270,6 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, } { - //printf("Generic execution\n"); // Note that the order here is important. `icv::Level` has to be updated // last or the other updates will cause a thread specific state to be // created. diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp index 5086eb4966179c..c1747a70473b67 100644 --- a/offload/DeviceRTL/src/Workshare.cpp +++ b/offload/DeviceRTL/src/Workshare.cpp @@ -527,9 +527,6 @@ void SimdLoop( __kmpc_impl_lanemask_t SimdMask = mapping::simdmask(); uint32_t Step = mapping::getSimdLen(); - //printf("Thread=%i : Lane=%i : Len=%i : TripCount=%i\n", - // mapping::getThreadIdInBlock(), mapping::getSimdLane(), mapping::getSimdLen(), TripCount); - synchronize::warp(SimdMask); for(IType omp_iv = (IType) mapping::getSimdLane(); omp_iv < TripCount; _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits