https://github.com/chandraghale updated https://github.com/llvm/llvm-project/pull/134709
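The series below implements Clang codegen for OpenMP 6.0 reductions over privatized variables, i.e. the original(private) modifier on the reduction clause. As a minimal illustration, the construct the new code path lowers looks like this (adapted from the lit test added by the first patch; the comment summarizes the intended semantics):

    void do_red(int n, int *v, int &sum_v) {
      sum_v = 0;
      // Each thread accumulates into its own private copy of sum_v; because
      // the original list item is private, the partial results must be
      // combined at the end of the worksharing loop and the final value
      // broadcast back into every thread's private copy.
      #pragma omp for reduction(original(private), +: sum_v)
      for (int i = 0; i < n; i++)
        sum_v += v[i];
    }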
>From a05af192052de8503fb4945bfb853b3f2c14e4c9 Mon Sep 17 00:00:00 2001 From: Chandra Ghale <gh...@pe31.hpc.amslabs.hpecorp.net> Date: Mon, 7 Apr 2025 13:58:25 -0500 Subject: [PATCH 1/5] Codegen for Reduction over private variables with reduction clause --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 147 +++++++++++ clang/lib/CodeGen/CGOpenMPRuntime.h | 14 ++ clang/lib/CodeGen/CGStmtOpenMP.cpp | 12 +- .../OpenMP/for_private_reduction_codegen.cpp | 236 ++++++++++++++++++ 4 files changed, 406 insertions(+), 3 deletions(-) create mode 100644 clang/test/OpenMP/for_private_reduction_codegen.cpp diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 06a652c146fb9..3424227e5da79 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4899,6 +4899,150 @@ void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF, } } +void CGOpenMPRuntime::emitPrivateReduction( + CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates, + ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs, + ArrayRef<const Expr *> ReductionOps) { + + if (LHSExprs.empty() || Privates.empty() || ReductionOps.empty()) + return; + + if (LHSExprs.size() != Privates.size() || + LHSExprs.size() != ReductionOps.size()) + return; + + QualType PrivateType = Privates[0]->getType(); + llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType); + + BinaryOperatorKind MainBO = BO_Comma; + if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionOps[0])) { + if (const auto *RHSExpr = BinOp->getRHS()) { + if (const auto *BORHS = + dyn_cast<BinaryOperator>(RHSExpr->IgnoreParenImpCasts())) { + MainBO = BORHS->getOpcode(); + } + } + } + + llvm::Constant *InitVal = llvm::Constant::getNullValue(LLVMType); + const Expr *Private = Privates[0]; + + if (const auto *DRE = dyn_cast<DeclRefExpr>(Private)) { + if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) { + if (const Expr *Init = VD->getInit()) { + if (Init->isConstantInitializer(CGF.getContext(), false)) { + Expr::EvalResult Result; + if (Init->EvaluateAsRValue(Result, CGF.getContext())) { + APValue &InitValue = Result.Val; + if (InitValue.isInt()) { + InitVal = llvm::ConstantInt::get(LLVMType, InitValue.getInt()); + } + } + } + } + } + } + + // Create an internal shared variable + std::string SharedName = getName({"internal_private_var"}); + llvm::GlobalVariable *SharedVar = new llvm::GlobalVariable( + CGM.getModule(), LLVMType, false, llvm::GlobalValue::CommonLinkage, + InitVal, ".omp.reduction." 
+ SharedName, nullptr, + llvm::GlobalVariable::NotThreadLocal); + + SharedVar->setAlignment( + llvm::MaybeAlign(CGF.getContext().getTypeAlign(PrivateType) / 8)); + + Address SharedResult(SharedVar, SharedVar->getValueType(), + CGF.getContext().getTypeAlignInChars(PrivateType)); + + llvm::Value *ThreadId = getThreadID(CGF, Loc); + llvm::Value *BarrierLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE); + llvm::Value *BarrierArgs[] = {BarrierLoc, ThreadId}; + + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_barrier), + BarrierArgs); + + llvm::BasicBlock *InitBB = CGF.createBasicBlock("init"); + llvm::BasicBlock *InitEndBB = CGF.createBasicBlock("init.end"); + + llvm::Value *IsWorker = CGF.Builder.CreateICmpEQ( + ThreadId, llvm::ConstantInt::get(ThreadId->getType(), 0)); + CGF.Builder.CreateCondBr(IsWorker, InitBB, InitEndBB); + + CGF.EmitBlock(InitBB); + CGF.Builder.CreateStore(InitVal, SharedResult); + CGF.Builder.CreateBr(InitEndBB); + + CGF.EmitBlock(InitEndBB); + + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_barrier), + BarrierArgs); + + for (unsigned I = 0; I < ReductionOps.size(); ++I) { + if (I >= LHSExprs.size()) { + break; + } + + const auto *BinOp = dyn_cast<BinaryOperator>(ReductionOps[I]); + if (!BinOp || BinOp->getOpcode() != BO_Assign) + continue; + + const Expr *RHSExpr = BinOp->getRHS(); + if (!RHSExpr) + continue; + + BinaryOperatorKind BO = BO_Comma; + if (const auto *BORHS = + dyn_cast<BinaryOperator>(RHSExpr->IgnoreParenImpCasts())) { + BO = BORHS->getOpcode(); + } + + LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType); + LValue LHSLV = CGF.EmitLValue(LHSExprs[I]); + RValue PrivateRV = CGF.EmitLoadOfLValue(LHSLV, Loc); + auto &&UpdateOp = [&CGF, PrivateRV, BinOp, BO](RValue OldVal) { + if (BO == BO_Mul) { + llvm::Value *OldScalar = OldVal.getScalarVal(); + llvm::Value *PrivateScalar = PrivateRV.getScalarVal(); + llvm::Value *Result = CGF.Builder.CreateMul(OldScalar, PrivateScalar); + return RValue::get(Result); + } else { + OpaqueValueExpr OVE(BinOp->getLHS()->getExprLoc(), + BinOp->getLHS()->getType(), + ExprValueKind::VK_PRValue); + CodeGenFunction::OpaqueValueMapping OldValMapping(CGF, &OVE, OldVal); + return CGF.EmitAnyExpr(BinOp->getRHS()); + } + }; + + (void)CGF.EmitOMPAtomicSimpleUpdateExpr( + SharedLV, PrivateRV, BO, true, + llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp); + } + + // Final barrier + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_barrier), + BarrierArgs); + + // Broadcast final result + llvm::Value *FinalResult = CGF.Builder.CreateLoad(SharedResult); + + // Update private variables with final result + for (unsigned I = 0; I < Privates.size(); ++I) { + LValue LHSLV = CGF.EmitLValue(LHSExprs[I]); + CGF.Builder.CreateStore(FinalResult, LHSLV.getAddress()); + } + + // Final synchronization + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_barrier), + BarrierArgs); +} + void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates, ArrayRef<const Expr *> LHSExprs, @@ -5201,6 +5345,9 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, CGF.EmitBranch(DefaultBB); CGF.EmitBlock(DefaultBB, /*IsFinished=*/true); + if (Options.IsPrivateVarReduction) { + emitPrivateReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, ReductionOps); + } } /// Generates unique name for artificial 
threadprivate variables. diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index 4321712e1521d..50ba28b565b6d 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -1201,8 +1201,22 @@ class CGOpenMPRuntime { struct ReductionOptionsTy { bool WithNowait; bool SimpleReduction; + bool IsPrivateVarReduction; OpenMPDirectiveKind ReductionKind; }; + + /// Emits code for private variable reduction + /// \param Privates List of private copies for original reduction arguments. + /// \param LHSExprs List of LHS in \a ReductionOps reduction operations. + /// \param RHSExprs List of RHS in \a ReductionOps reduction operations. + /// \param ReductionOps List of reduction operations in form 'LHS binop RHS' + /// or 'operator binop(LHS, RHS)'. + void emitPrivateReduction(CodeGenFunction &CGF, SourceLocation Loc, + ArrayRef<const Expr *> Privates, + ArrayRef<const Expr *> LHSExprs, + ArrayRef<const Expr *> RHSExprs, + ArrayRef<const Expr *> ReductionOps); + /// Emit a code for reduction clause. Next code should be emitted for /// reduction: /// \code diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index e4d1db264aac9..720a88e075ddd 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -1470,6 +1470,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal( llvm::SmallVector<const Expr *, 8> LHSExprs; llvm::SmallVector<const Expr *, 8> RHSExprs; llvm::SmallVector<const Expr *, 8> ReductionOps; + llvm::SmallVector<bool, 8> IsPrivate; bool HasAtLeastOneReduction = false; bool IsReductionWithTaskMod = false; for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) { @@ -1480,6 +1481,8 @@ void CodeGenFunction::EmitOMPReductionClauseFinal( Privates.append(C->privates().begin(), C->privates().end()); LHSExprs.append(C->lhs_exprs().begin(), C->lhs_exprs().end()); RHSExprs.append(C->rhs_exprs().begin(), C->rhs_exprs().end()); + IsPrivate.append(C->private_var_reduction_flags().begin(), + C->private_var_reduction_flags().end()); ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end()); IsReductionWithTaskMod = IsReductionWithTaskMod || C->getModifier() == OMPC_REDUCTION_task; @@ -1499,9 +1502,11 @@ void CodeGenFunction::EmitOMPReductionClauseFinal( bool SimpleReduction = ReductionKind == OMPD_simd; // Emit nowait reduction if nowait clause is present or directive is a // parallel directive (it always has implicit barrier). 
+ bool IsPrivateVarReduction = + llvm::any_of(IsPrivate, [](bool IsPriv) { return IsPriv; }); CGM.getOpenMPRuntime().emitReduction( *this, D.getEndLoc(), Privates, LHSExprs, RHSExprs, ReductionOps, - {WithNowait, SimpleReduction, ReductionKind}); + {WithNowait, SimpleReduction, IsPrivateVarReduction, ReductionKind}); } } @@ -3943,7 +3948,8 @@ static void emitScanBasedDirective( PrivScope.Privatize(); CGF.CGM.getOpenMPRuntime().emitReduction( CGF, S.getEndLoc(), Privates, LHSs, RHSs, ReductionOps, - {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_unknown}); + {/*WithNowait=*/true, /*SimpleReduction=*/true, + /*IsPrivateVarReduction */ false, OMPD_unknown}); } llvm::Value *NextIVal = CGF.Builder.CreateNUWSub(IVal, llvm::ConstantInt::get(CGF.SizeTy, 1)); @@ -5747,7 +5753,7 @@ void CodeGenFunction::EmitOMPScanDirective(const OMPScanDirective &S) { } CGM.getOpenMPRuntime().emitReduction( *this, ParentDir.getEndLoc(), Privates, LHSs, RHSs, ReductionOps, - {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_simd}); + {/*WithNowait=*/true, /*SimpleReduction=*/true, false, OMPD_simd}); for (unsigned I = 0, E = CopyArrayElems.size(); I < E; ++I) { const Expr *PrivateExpr = Privates[I]; LValue DestLVal; diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp new file mode 100644 index 0000000000000..be50991ca193e --- /dev/null +++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp @@ -0,0 +1,236 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --prefix-filecheck-ir-name _ --version 5 +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=60 -x c++ -std=c++17 -emit-llvm %s -o - | FileCheck %s +// expected-no-diagnostics +#define N 10 +void do_red(int n, int *v, int &sum_v) + { + sum_v = 0; + #pragma omp for reduction(original(private),+: sum_v) + for (int i = 0; i < n; i++) + { + sum_v += v[i]; + } + } + int main(void) + { + int v[N]; + for (int i = 0; i < N; i++) + v[i] = i; + #pragma omp parallel num_threads(4) + { + int s_v; + do_red(N, v, s_v); + } + return 0; + } +//. +// CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" +// CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +// CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +// CHECK: @.gomp_critical_user_.reduction.var = common global [8 x i32] zeroinitializer, align 8 +// CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 18, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +// CHECK: @.omp.reduction..internal_private_var = common global i32 0, align 4 +// CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +//. 
+// CHECK-LABEL: define dso_local void @_Z6do_rediPiRi( +// CHECK-SAME: i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[SUM_V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[SUM_V4:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP5:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) +// CHECK-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK-NEXT: store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK-NEXT: br i1 [[CMP]], label %[[OMP_PRECOND_THEN:.*]], label %[[OMP_PRECOND_END:.*]] +// CHECK: [[OMP_PRECOND_THEN]]: +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK-NEXT: store i32 0, ptr [[SUM_V4]], align 4 +// CHECK-NEXT: store ptr [[SUM_V4]], ptr [[_TMP5]], align 8 +// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] +// CHECK-NEXT: br i1 [[CMP7]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CHECK: [[COND_TRUE]]: +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: br label 
%[[COND_END:.*]] +// CHECK: [[COND_FALSE]]: +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: br label %[[COND_END]] +// CHECK: [[COND_END]]: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], %[[COND_TRUE]] ], [ [[TMP11]], %[[COND_FALSE]] ] +// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// CHECK: [[OMP_INNER_FOR_COND]]: +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[CMP8:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] +// CHECK-NEXT: br i1 [[CMP8]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]] +// CHECK: [[OMP_INNER_FOR_BODY]]: +// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I6]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[V_ADDR]], align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[I6]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP5]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 +// CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP20]], [[TMP18]] +// CHECK-NEXT: store i32 [[ADD9]], ptr [[TMP19]], align 4 +// CHECK-NEXT: br label %[[OMP_BODY_CONTINUE:.*]] +// CHECK: [[OMP_BODY_CONTINUE]]: +// CHECK-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// CHECK: [[OMP_INNER_FOR_INC]]: +// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK-NEXT: store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND]] +// CHECK: [[OMP_INNER_FOR_END]]: +// CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.*]] +// CHECK: [[OMP_LOOP_EXIT]]: +// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK-NEXT: store ptr [[SUM_V4]], ptr [[TMP22]], align 8 +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP0]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: switch i32 [[TMP23]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [ +// CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]] +// CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]] +// CHECK-NEXT: ] +// CHECK: [[_OMP_REDUCTION_CASE1:.*:]] +// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP7]], align 4 +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SUM_V4]], align 4 +// CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK-NEXT: store i32 [[ADD11]], ptr [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: [[_OMP_REDUCTION_CASE2:.*:]] +// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[SUM_V4]], align 4 +// CHECK-NEXT: [[TMP27:%.*]] = 
atomicrmw add ptr [[TMP7]], i32 [[TMP26]] monotonic, align 4 +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]] +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[TMP28]], label %[[INIT:.*]], label %[[INIT_END:.*]] +// CHECK: [[INIT]]: +// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_private_var, align 4 +// CHECK-NEXT: br label %[[INIT_END]] +// CHECK: [[INIT_END]]: +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP7]], align 4 +// CHECK-NEXT: [[TMP30:%.*]] = atomicrmw add ptr @.omp.reduction..internal_private_var, i32 [[TMP29]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr @.omp.reduction..internal_private_var, align 4 +// CHECK-NEXT: store i32 [[TMP31]], ptr [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK-NEXT: br label %[[OMP_PRECOND_END]] +// CHECK: [[OMP_PRECOND_END]]: +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define internal void @_Z6do_rediPiRi.omp.reduction.reduction_func( +// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define dso_local noundef i32 @main( +// CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[V:%.*]] = alloca [10 x i32], align 16 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) +// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK-NEXT: br label %[[FOR_COND:.*]] +// CHECK: [[FOR_COND]]: +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10 +// CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +// CHECK: [[FOR_BODY]]: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[V]], i64 0, i64 [[IDXPROM]] 
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: br label %[[FOR_INC:.*]] +// CHECK: [[FOR_INC]]: +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4 +// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// CHECK: [[FOR_END]]: +// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB2]], i32 [[TMP0]], i32 4) +// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @main.omp_outlined, ptr [[V]]) +// CHECK-NEXT: ret i32 0 +// +// +// CHECK-LABEL: define internal void @main.omp_outlined( +// CHECK-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[V:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[S_V:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[V_ADDR]], align 8 +// CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 0 +// CHECK-NEXT: call void @_Z6do_rediPiRi(i32 noundef 10, ptr noundef [[ARRAYDECAY]], ptr noundef nonnull align 4 dereferenceable(4) [[S_V]]) +// CHECK-NEXT: ret void >From 4e6eea6ff066320a584d606ab258c375b1d887be Mon Sep 17 00:00:00 2001 From: Chandra Ghale <gh...@pe31.hpc.amslabs.hpecorp.net> Date: Tue, 8 Apr 2025 11:26:53 -0500 Subject: [PATCH 2/5] review comment changes incorporated --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 3424227e5da79..13b070f898a1c 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4981,7 +4981,8 @@ void CGOpenMPRuntime::emitPrivateReduction( CGM.getModule(), OMPRTL___kmpc_barrier), BarrierArgs); - for (unsigned I = 0; I < ReductionOps.size(); ++I) { + for (unsigned I : + llvm::seq<unsigned>(std::min(ReductionOps.size(), LHSExprs.size()))) { if (I >= LHSExprs.size()) { break; } @@ -5003,7 +5004,7 @@ void CGOpenMPRuntime::emitPrivateReduction( LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType); LValue LHSLV = CGF.EmitLValue(LHSExprs[I]); RValue PrivateRV = CGF.EmitLoadOfLValue(LHSLV, Loc); - auto &&UpdateOp = [&CGF, PrivateRV, BinOp, BO](RValue OldVal) { + auto UpdateOp = [&](RValue OldVal) { if (BO == BO_Mul) { llvm::Value *OldScalar = OldVal.getScalarVal(); llvm::Value *PrivateScalar = PrivateRV.getScalarVal(); @@ -5032,7 +5033,7 @@ void CGOpenMPRuntime::emitPrivateReduction( llvm::Value *FinalResult = CGF.Builder.CreateLoad(SharedResult); // Update private variables with final result - for (unsigned I = 0; I < Privates.size(); ++I) { + for (unsigned I : llvm::seq<unsigned>(Privates.size())) { LValue LHSLV = CGF.EmitLValue(LHSExprs[I]); CGF.Builder.CreateStore(FinalResult, LHSLV.getAddress()); } @@ -5345,9 +5346,8 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, CGF.EmitBranch(DefaultBB); CGF.EmitBlock(DefaultBB, 
/*IsFinished=*/true); - if (Options.IsPrivateVarReduction) { + if (Options.IsPrivateVarReduction) emitPrivateReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, ReductionOps); - } } /// Generates unique name for artificial threadprivate variables. >From 18e1708275ffbeec5a68b264f8c584b3c9a72704 Mon Sep 17 00:00:00 2001 From: Chandra Ghale <gh...@pe31.hpc.amslabs.hpecorp.net> Date: Wed, 9 Apr 2025 11:28:59 -0500 Subject: [PATCH 3/5] review comment , removing redundant code --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 13b070f898a1c..93d7280408002 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4983,9 +4983,6 @@ void CGOpenMPRuntime::emitPrivateReduction( for (unsigned I : llvm::seq<unsigned>(std::min(ReductionOps.size(), LHSExprs.size()))) { - if (I >= LHSExprs.size()) { - break; - } const auto *BinOp = dyn_cast<BinaryOperator>(ReductionOps[I]); if (!BinOp || BinOp->getOpcode() != BO_Assign) >From 59ab4be637f31f04e684fe759d287841c9a11746 Mon Sep 17 00:00:00 2001 From: Chandra Ghale <gh...@pe31.hpc.amslabs.hpecorp.net> Date: Thu, 10 Apr 2025 11:44:44 -0500 Subject: [PATCH 4/5] fix for user-defined reduction op --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 93d7280408002..3fd0a0489f38e 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4993,9 +4993,11 @@ void CGOpenMPRuntime::emitPrivateReduction( continue; BinaryOperatorKind BO = BO_Comma; - if (const auto *BORHS = - dyn_cast<BinaryOperator>(RHSExpr->IgnoreParenImpCasts())) { + const Expr *StripRHS = RHSExpr->IgnoreParenImpCasts(); + if (const auto *BORHS = dyn_cast<BinaryOperator>(StripRHS)) { BO = BORHS->getOpcode(); + } else if (const auto *OpCall = dyn_cast<CXXOperatorCallExpr>(StripRHS)) { + BO = BinaryOperator::getOverloadedOpcode(OpCall->getOperator()); } LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType); >From e45c30a3e6489d2c40f4267c4444d557683be6a7 Mon Sep 17 00:00:00 2001 From: Chandra Ghale <gh...@pe31.hpc.amslabs.hpecorp.net> Date: Thu, 1 May 2025 06:10:20 -0500 Subject: [PATCH 5/5] Handle user-defined reduction and updated lit test --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 134 ++++--- .../OpenMP/for_private_reduction_codegen.cpp | 344 ++++++++++++++++-- 2 files changed, 393 insertions(+), 85 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 3fd0a0489f38e..bea9f6af080dd 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4914,30 +4914,22 @@ void CGOpenMPRuntime::emitPrivateReduction( QualType PrivateType = Privates[0]->getType(); llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType); - BinaryOperatorKind MainBO = BO_Comma; - if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionOps[0])) { - if (const auto *RHSExpr = BinOp->getRHS()) { - if (const auto *BORHS = - dyn_cast<BinaryOperator>(RHSExpr->IgnoreParenImpCasts())) { - MainBO = BORHS->getOpcode(); - } - } - } - llvm::Constant *InitVal = llvm::Constant::getNullValue(LLVMType); - const Expr *Private = Privates[0]; - - if (const auto *DRE = dyn_cast<DeclRefExpr>(Private)) { + const Expr *InitExpr = nullptr; + if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates[0])) { if (const 
auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) { - if (const Expr *Init = VD->getInit()) { - if (Init->isConstantInitializer(CGF.getContext(), false)) { + InitExpr = VD->getInit(); + if (InitExpr && !PrivateType->isAggregateType()) { + if (InitExpr->isConstantInitializer(CGF.getContext(), false)) { Expr::EvalResult Result; - if (Init->EvaluateAsRValue(Result, CGF.getContext())) { + if (InitExpr->EvaluateAsRValue(Result, CGF.getContext())) { APValue &InitValue = Result.Val; if (InitValue.isInt()) { InitVal = llvm::ConstantInt::get(LLVMType, InitValue.getInt()); } } + } else { + InitVal = llvm::Constant::getNullValue(LLVMType); } } } @@ -4972,7 +4964,25 @@ void CGOpenMPRuntime::emitPrivateReduction( CGF.Builder.CreateCondBr(IsWorker, InitBB, InitEndBB); CGF.EmitBlock(InitBB); - CGF.Builder.CreateStore(InitVal, SharedResult); + if (InitExpr) { + RValue RV = CGF.EmitAnyExpr(InitExpr); + if (RV.isAggregate()) { + CGF.Builder.CreateMemCpy(SharedResult, RV.getAggregateAddress(), + llvm::ConstantInt::get(CGF.IntPtrTy, 4), + /*IsVolatile=*/false); + } else { + CGF.Builder.CreateStore(RV.getScalarVal(), SharedResult); + } + } else { + if (PrivateType->isAggregateType()) { + CGF.Builder.CreateMemSet(SharedResult, + llvm::ConstantInt::get(CGM.Int8Ty, 0), + llvm::ConstantInt::get(CGF.IntPtrTy, 4), + /*IsVolatile=*/false); + } else { + CGF.Builder.CreateStore(InitVal, SharedResult); + } + } CGF.Builder.CreateBr(InitEndBB); CGF.EmitBlock(InitEndBB); @@ -4983,46 +4993,75 @@ void CGOpenMPRuntime::emitPrivateReduction( for (unsigned I : llvm::seq<unsigned>(std::min(ReductionOps.size(), LHSExprs.size()))) { + const Expr *ReductionClauseExpr = ReductionOps[I]->IgnoreParenCasts(); + if (const auto *Cleanup = dyn_cast<ExprWithCleanups>(ReductionClauseExpr)) + ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts(); + const Expr *AssignRHS = nullptr; + const Expr *AssignLHS = nullptr; + + if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionClauseExpr)) { + if (BinOp->getOpcode() == BO_Assign) { + AssignLHS = BinOp->getLHS(); + AssignRHS = BinOp->getRHS(); + } + } else if (const auto *OpCall = + dyn_cast<CXXOperatorCallExpr>(ReductionClauseExpr)) { + if (OpCall->getOperator() == OO_Equal) { + AssignLHS = OpCall->getArg(0); + AssignRHS = OpCall->getArg(1); + } + } - const auto *BinOp = dyn_cast<BinaryOperator>(ReductionOps[I]); - if (!BinOp || BinOp->getOpcode() != BO_Assign) - continue; - - const Expr *RHSExpr = BinOp->getRHS(); - if (!RHSExpr) + if (!AssignRHS || !AssignLHS) { continue; + } - BinaryOperatorKind BO = BO_Comma; - const Expr *StripRHS = RHSExpr->IgnoreParenImpCasts(); - if (const auto *BORHS = dyn_cast<BinaryOperator>(StripRHS)) { - BO = BORHS->getOpcode(); - } else if (const auto *OpCall = dyn_cast<CXXOperatorCallExpr>(StripRHS)) { - BO = BinaryOperator::getOverloadedOpcode(OpCall->getOperator()); + const Expr *ReductionCombinerExpr = AssignRHS->IgnoreParenImpCasts(); + if (const auto *MTE = + dyn_cast<MaterializeTemporaryExpr>(ReductionCombinerExpr)) { + ReductionCombinerExpr = MTE->getSubExpr()->IgnoreParenImpCasts(); } + BinaryOperatorKind BO = BO_Assign; LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType); LValue LHSLV = CGF.EmitLValue(LHSExprs[I]); RValue PrivateRV = CGF.EmitLoadOfLValue(LHSLV, Loc); - auto UpdateOp = [&](RValue OldVal) { - if (BO == BO_Mul) { - llvm::Value *OldScalar = OldVal.getScalarVal(); - llvm::Value *PrivateScalar = PrivateRV.getScalarVal(); - llvm::Value *Result = CGF.Builder.CreateMul(OldScalar, PrivateScalar); - return RValue::get(Result); - 
} else { - OpaqueValueExpr OVE(BinOp->getLHS()->getExprLoc(), - BinOp->getLHS()->getType(), - ExprValueKind::VK_PRValue); - CodeGenFunction::OpaqueValueMapping OldValMapping(CGF, &OVE, OldVal); - return CGF.EmitAnyExpr(BinOp->getRHS()); - } - }; + if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionCombinerExpr)) { + BO = BinOp->getOpcode(); + auto UpdateOp = [&](RValue OldVal) { + if (BO == BO_Mul) { + llvm::Value *OldScalar = OldVal.getScalarVal(); + llvm::Value *PrivateScalar = PrivateRV.getScalarVal(); + llvm::Value *Result = CGF.Builder.CreateMul(OldScalar, PrivateScalar); + return RValue::get(Result); + } else { + OpaqueValueExpr OVE(BinOp->getLHS()->getExprLoc(), + BinOp->getLHS()->getType(), + ExprValueKind::VK_PRValue); + CodeGenFunction::OpaqueValueMapping OldValMapping(CGF, &OVE, OldVal); + return CGF.EmitAnyExpr(BinOp->getRHS()); + } + }; - (void)CGF.EmitOMPAtomicSimpleUpdateExpr( - SharedLV, PrivateRV, BO, true, - llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp); + (void)CGF.EmitOMPAtomicSimpleUpdateExpr( + SharedLV, PrivateRV, BO, true, + llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp); + } else if (const auto *OpCall = dyn_cast<CallExpr>(ReductionClauseExpr)) { + auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) { + Action.Enter(CGF); + CharUnits Alignment = CGF.getContext().getTypeAlignInChars(PrivateType); + Address TempResult = + CGF.CreateMemTemp(PrivateType, "reduction.temp.result"); + ReturnValueSlot RVS(TempResult, /*IsVolatile=*/false); + RValue ResultRV = CGF.EmitCallExpr(OpCall, RVS, nullptr); + CGF.Builder.CreateMemCpy(SharedResult, ResultRV.getAggregateAddress(), + llvm::ConstantInt::get(CGF.IntPtrTy, 4), + Alignment.getQuantity()); + }; + std::string CriticalName = getName({"reduction_critical"}); + emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc); + } } - // Final barrier CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( CGM.getModule(), OMPRTL___kmpc_barrier), @@ -5042,7 +5081,6 @@ void CGOpenMPRuntime::emitPrivateReduction( CGM.getModule(), OMPRTL___kmpc_barrier), BarrierArgs); } - void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates, ArrayRef<const Expr *> LHSExprs, diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp index be50991ca193e..dcacc4140bbdb 100644 --- a/clang/test/OpenMP/for_private_reduction_codegen.cpp +++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp @@ -2,38 +2,307 @@ // RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=60 -x c++ -std=c++17 -emit-llvm %s -o - | FileCheck %s // expected-no-diagnostics #define N 10 +class Sum { + int val; +public: + Sum(int v = 0) : val(v) {} + Sum operator+(const Sum& rhs) const { + return Sum(val + rhs.val); + } +}; + +void func_red(){ + Sum result(0); + Sum array[N]; + + for(int i = 0; i < 10; i++) { + array[i] = Sum(i); + } + + #pragma omp parallel private(result) num_threads(4) + { + #pragma omp for reduction(+:result) + for(int i = 0; i < 10; i++) { + result = result + array[i]; + } + } +} + void do_red(int n, int *v, int &sum_v) { - sum_v = 0; - #pragma omp for reduction(original(private),+: sum_v) - for (int i = 0; i < n; i++) - { - sum_v += v[i]; - } + sum_v = 0; + #pragma omp for reduction(original(private),+: sum_v) + for (int i = 0; i < n; i++) + { + sum_v += v[i]; + } } int main(void) { - int v[N]; - for (int i = 0; i < N; i++) - v[i] = i; - #pragma omp parallel num_threads(4) - 
{ - int s_v; - do_red(N, v, s_v); - } - return 0; + int v[N]; + for (int i = 0; i < N; i++) + v[i] = i; + #pragma omp parallel num_threads(4) + { + int s_v; + do_red(N, v, s_v); + } + return 0; } //. // CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" // CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 -// CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 // CHECK: @.gomp_critical_user_.reduction.var = common global [8 x i32] zeroinitializer, align 8 -// CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 18, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 -// CHECK: @.omp.reduction..internal_private_var = common global i32 0, align 4 +// CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 18, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +// CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +// CHECK: @.gomp_critical_user_.atomic_reduction.var = common global [8 x i32] zeroinitializer, align 8 +// CHECK: @.omp.reduction..internal_private_var = common global %class.Sum zeroinitializer, align 4 +// CHECK: @.gomp_critical_user_.reduction_critical.var = common global [8 x i32] zeroinitializer, align 8 // CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +// CHECK: @.omp.reduction..internal_private_var.1 = common global i32 0, align 4 //. +// CHECK-LABEL: define dso_local void @_Z8func_redv( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*]]: +// CHECK-NEXT: [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4 +// CHECK-NEXT: [[ARRAY:%.*]] = alloca [10 x %class.Sum], align 16 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], i32 noundef 0) +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i32 0, i32 0 +// CHECK-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAY_BEGIN]], i64 10 +// CHECK-NEXT: br label %[[ARRAYCTOR_LOOP:.*]] +// CHECK: [[ARRAYCTOR_LOOP]]: +// CHECK-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], %[[ENTRY]] ], [ [[ARRAYCTOR_NEXT:%.*]], %[[ARRAYCTOR_LOOP]] ] +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]], i32 noundef 0) +// CHECK-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAYCTOR_CUR]], i64 1 +// CHECK-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]] +// CHECK-NEXT: br i1 [[ARRAYCTOR_DONE]], label %[[ARRAYCTOR_CONT:.*]], label %[[ARRAYCTOR_LOOP]] +// CHECK: [[ARRAYCTOR_CONT]]: +// CHECK-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK-NEXT: br label %[[FOR_COND:.*]] +// CHECK: [[FOR_COND]]: +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10 +// CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +// CHECK: [[FOR_BODY]]: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 
dereferenceable(4) [[REF_TMP]], i32 noundef [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARRAYIDX]], ptr align 4 [[REF_TMP]], i64 4, i1 false) +// CHECK-NEXT: br label %[[FOR_INC:.*]] +// CHECK: [[FOR_INC]]: +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4 +// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// CHECK: [[FOR_END]]: +// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4) +// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @_Z8func_redv.omp_outlined, ptr [[ARRAY]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define linkonce_odr void @_ZN3SumC1Ei( +// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store i32 [[V]], ptr [[V_ADDR]], align 4 +// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4 +// CHECK-NEXT: call void @_ZN3SumC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define internal void @_Z8func_redv.omp_outlined( +// CHECK-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[ARRAY:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[ARRAY_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4 +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[RESULT1:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[REF_TMP4:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[REF_TMP7:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[AGG_TEMP:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[REDUCTION_TEMP_RESULT:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[REF_TMP10:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: store ptr [[ARRAY]], ptr [[ARRAY_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAY_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull 
align 4 dereferenceable(4) [[RESULT]], i32 noundef 0) +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]], i32 noundef 0) +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP2]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CHECK: [[COND_TRUE]]: +// CHECK-NEXT: br label %[[COND_END:.*]] +// CHECK: [[COND_FALSE]]: +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: br label %[[COND_END]] +// CHECK: [[COND_END]]: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ] +// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// CHECK: [[OMP_INNER_FOR_COND]]: +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]] +// CHECK-NEXT: br i1 [[CMP2]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]] +// CHECK: [[OMP_INNER_FOR_BODY]]: +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK-NEXT: [[CALL:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]], ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYIDX]]) +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP]], i32 0, i32 0 +// CHECK-NEXT: store i32 [[CALL]], ptr [[COERCE_DIVE]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT1]], ptr align 4 [[REF_TMP]], i64 4, i1 false) +// CHECK-NEXT: br label %[[OMP_BODY_CONTINUE:.*]] +// CHECK: [[OMP_BODY_CONTINUE]]: +// CHECK-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// CHECK: [[OMP_INNER_FOR_INC]]: +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND]] +// CHECK: [[OMP_INNER_FOR_END]]: +// CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.*]] +// CHECK: [[OMP_LOOP_EXIT]]: +// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK-NEXT: store ptr [[RESULT1]], ptr 
[[TMP11]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z8func_redv.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: switch i32 [[TMP12]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [ +// CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]] +// CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]] +// CHECK-NEXT: ] +// CHECK: [[_OMP_REDUCTION_CASE1:.*:]] +// CHECK-NEXT: [[CALL5:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]]) +// CHECK-NEXT: [[COERCE_DIVE6:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP4]], i32 0, i32 0 +// CHECK-NEXT: store i32 [[CALL5]], ptr [[COERCE_DIVE6]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT]], ptr align 4 [[REF_TMP4]], i64 4, i1 false) +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: [[_OMP_REDUCTION_CASE2:.*:]] +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.atomic_reduction.var) +// CHECK-NEXT: [[CALL8:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]]) +// CHECK-NEXT: [[COERCE_DIVE9:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP7]], i32 0, i32 0 +// CHECK-NEXT: store i32 [[CALL8]], ptr [[COERCE_DIVE9]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT]], ptr align 4 [[REF_TMP7]], i64 4, i1 false) +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.atomic_reduction.var) +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]] +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[TMP13]], label %[[INIT:.*]], label %[[INIT_END:.*]] +// CHECK: [[INIT]]: +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[AGG_TEMP]], i32 noundef 0) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 @.omp.reduction..internal_private_var, ptr align 4 [[AGG_TEMP]], i64 4, i1 false) +// CHECK-NEXT: br label %[[INIT_END]] +// CHECK: [[INIT_END]]: +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP14:%.*]] = load [[CLASS_SUM]], ptr [[RESULT]], align 4 +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: [[CALL11:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]]) +// CHECK-NEXT: [[COERCE_DIVE12:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP10]], i32 0, i32 0 +// CHECK-NEXT: store i32 [[CALL11]], ptr [[COERCE_DIVE12]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT]], ptr align 4 [[REF_TMP10]], i64 4, i1 false) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 @.omp.reduction..internal_private_var, ptr align 16 [[RESULT]], i64 4, i1 true) +// 
CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP15:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_private_var, align 4 +// CHECK-NEXT: store [[CLASS_SUM]] [[TMP15]], ptr [[RESULT]], align 4 +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define linkonce_odr i32 @_ZNK3SumplERKS_( +// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[CLASS_SUM:%.*]], align 4 +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[RHS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[RHS]], ptr [[RHS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VAL]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[RHS_ADDR]], align 8 +// CHECK-NEXT: [[VAL2:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[TMP1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VAL2]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP2]] +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RETVAL]], i32 noundef [[ADD]]) +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[COERCE_DIVE]], align 4 +// CHECK-NEXT: ret i32 [[TMP3]] +// +// +// CHECK-LABEL: define internal void @_Z8func_redv.omp_outlined.omp.reduction.reduction_func( +// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM:%.*]], align 4 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[TMP7]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP5]]) +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP]], i32 0, i32 0 +// CHECK-NEXT: store i32 [[CALL]], ptr [[COERCE_DIVE]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP7]], ptr align 4 [[REF_TMP]], i64 4, i1 false) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define linkonce_odr void @_ZN3SumC2Ei( +// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 
noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store i32 [[V]], ptr [[V_ADDR]], align 4 +// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[VAL]], align 4 +// CHECK-NEXT: ret void +// +// // CHECK-LABEL: define dso_local void @_Z6do_rediPiRi( -// CHECK-SAME: i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-SAME: i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8 @@ -52,7 +321,7 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: [[_TMP5:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[I6:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) // CHECK-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 // CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8 // CHECK-NEXT: store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8 @@ -130,7 +399,7 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 // CHECK-NEXT: store ptr [[SUM_V4]], ptr [[TMP22]], align 8 -// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP0]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK-NEXT: switch i32 [[TMP23]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [ // CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]] // CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]] @@ -140,28 +409,28 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SUM_V4]], align 4 // CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] // CHECK-NEXT: store i32 [[ADD11]], ptr [[TMP7]], align 4 -// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) // CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] // CHECK: [[_OMP_REDUCTION_CASE2:.*:]] // CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[SUM_V4]], align 4 // CHECK-NEXT: [[TMP27:%.*]] = atomicrmw add ptr [[TMP7]], i32 [[TMP26]] monotonic, align 4 -// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 
[[TMP0]], ptr @.gomp_critical_user_.reduction.var) // CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] // CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]] -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP0]], 0 // CHECK-NEXT: br i1 [[TMP28]], label %[[INIT:.*]], label %[[INIT_END:.*]] // CHECK: [[INIT]]: -// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_private_var, align 4 +// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_private_var.1, align 4 // CHECK-NEXT: br label %[[INIT_END]] // CHECK: [[INIT_END]]: -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP7]], align 4 -// CHECK-NEXT: [[TMP30:%.*]] = atomicrmw add ptr @.omp.reduction..internal_private_var, i32 [[TMP29]] seq_cst, align 4 -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr @.omp.reduction..internal_private_var, align 4 +// CHECK-NEXT: [[TMP30:%.*]] = atomicrmw add ptr @.omp.reduction..internal_private_var.1, i32 [[TMP29]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.1, align 4 // CHECK-NEXT: store i32 [[TMP31]], ptr [[TMP7]], align 4 -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: br label %[[OMP_PRECOND_END]] // CHECK: [[OMP_PRECOND_END]]: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]]) @@ -169,7 +438,7 @@ void do_red(int n, int *v, int &sum_v) // // // CHECK-LABEL: define internal void @_Z6do_rediPiRi.omp.reduction.reduction_func( -// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -189,12 +458,12 @@ void do_red(int n, int *v, int &sum_v) // // // CHECK-LABEL: define dso_local noundef i32 @main( -// CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +// CHECK-SAME: ) #[[ATTR6:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[V:%.*]] = alloca [10 x i32], align 16 // CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) // CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK-NEXT: store i32 0, ptr [[I]], align 4 // CHECK-NEXT: br label %[[FOR_COND:.*]] @@ -213,15 +482,15 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4 -// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]] // CHECK: [[FOR_END]]: -// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB2]], i32 [[TMP0]], i32 4) -// CHECK-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @main.omp_outlined, ptr [[V]]) +// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4) +// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @main.omp_outlined, ptr [[V]]) // CHECK-NEXT: ret i32 0 // // // CHECK-LABEL: define internal void @main.omp_outlined( -// CHECK-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[V:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[V:%.*]]) #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8 @@ -234,3 +503,4 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 0 // CHECK-NEXT: call void @_Z6do_rediPiRi(i32 noundef 10, ptr noundef [[ARRAYDECAY]], ptr noundef nonnull align 4 dereferenceable(4) [[S_V]]) // CHECK-NEXT: ret void +
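After the usual __kmpc_reduce epilogue, the new emitPrivateReduction hook emits a barrier-delimited init/combine/broadcast sequence over an internal global (@.omp.reduction..internal_private_var in the tests above). A sketch of that sequence for the scalar '+' case, with the runtime-call arguments elided and the hypothetical combine_atomically standing in for the emitted atomicrmw (or for the __kmpc_critical region used when the combiner is a user-defined operator like Sum::operator+):

    static int shared_result;        // zero-initialized internal global slot

    void private_reduction_epilogue(int tid, int &priv) {
      __kmpc_barrier(...);           // all threads have finished __kmpc_reduce
      if (tid == 0)
        shared_result = 0;           // thread 0 (re)initializes the shared slot
      __kmpc_barrier(...);           // make the initialization visible to all
      combine_atomically(&shared_result, priv); // atomicrmw add ... seq_cst
      __kmpc_barrier(...);           // all partial results are combined
      priv = shared_result;          // broadcast final value to each private copy
      __kmpc_barrier(...);
    }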