Author: Robert Imschweiler Date: 2026-06-10T10:04:22+02:00 New Revision: 1dcb977d17611721e3b3b51b6557197c45762275
URL: https://github.com/llvm/llvm-project/commit/1dcb977d17611721e3b3b51b6557197c45762275 DIFF: https://github.com/llvm/llvm-project/commit/1dcb977d17611721e3b3b51b6557197c45762275.diff LOG: [clang][OpenMP] Improve loop structure for distributed loops (pt 1: reductions) (#201670) This is part of a series of patches that rework OpenMP cross-team reductions. This patch wires the existing `kmp_sched_distr_static_chunk_sched_static_chunkone` to be used by CodeGen (this patch is restricted to reduction loops). Example of the intended change of this patch: ``` target teams distribute parallel for reduction(+:s) for (i = 0; i < N; i++) s += a[i]; ``` Before: ``` __kmpc_distribute_static_init(91) for (team_lb = team*nthreads; team_lb < N; team_lb += nteams*nthreads) { __kmpc_for_static_init(33) for (iv = team_lb + tid; iv < team_lb + nthreads; iv += nthreads) { priv += a[iv]; } __kmpc_nvptx_parallel_reduce_nowait_v2 } __kmpc_nvptx_teams_reduce_nowait_v2 ``` After: ``` __kmpc_for_static_init(93) for (iv = team*nthreads + tid; iv < N; iv += nteams*nthreads) { priv += a[iv]; } __kmpc_nvptx_parallel_reduce_nowait_v2 __kmpc_nvptx_teams_reduce_nowait_v2 ``` Performance: All performance tests can be reproduced with https://github.com/ro-i/xteam-test @ commit 6025e5afc14dd6e65ee2658e5001c16e9b9245ff. To reproduce, simply create a `local.mk` file in the cloned directory with a suitable `OFFLOAD_ARCH` for your machine and `CXX_trunk` + `CXX_trunk_cg` set to the paths of the clang++ binaries for llvm/main and this patch. (llvm/main should ideally be at the commit that is currently the base for this PR; at the moment, this is 69f7aeb52e71ebb7d264bc9e613bc4bc90cb0c47.) Then, run `make trunk trunk_cg` to build the benchmark binaries for 208 and 10400 teams. Run them with `./run_bench.sh -rq -n10 red_trunk_208 red_trunk_cg_208 red_trunk_10400 red_trunk_cg_10400` to get the average performance numbers over 10 rounds. 
This tests multiple reduction workloads, including reductions that run in the Generic-SPMD mode, with 208 teams and with 10400 teams, each with 512 threads, and with a reduction array size of 177,777,777. I tested on a gfx942 and found the following numbers showing the performance of this patch relative to the baseline: ``` red_comb_sep_arr_32 double change for 208 teams: +0.01% change for 10400 teams: +5.53% red_sum_arr_32 double change for 208 teams: +570.47% change for 10400 teams: -2.23% red_comb double change for 208 teams: +350.30% change for 10400 teams: +0.72% red_comb_sep double change for 208 teams: +4.82% change for 10400 teams: +2.18% red_dot double change for 208 teams: +202.45% change for 10400 teams: +3.48% red_indirect double change for 208 teams: +239.33% change for 10400 teams: +4.63% red_kernel_part double change for 208 teams: +3.30% change for 10400 teams: +3.43% red_max double change for 208 teams: +273.46% change for 10400 teams: +5.12% red_mult double change for 208 teams: +239.50% change for 10400 teams: +5.23% red_sum double change for 208 teams: +239.47% change for 10400 teams: +5.15% red_pi double change for 208 teams: +90.06% change for 10400 teams: +78.67% red_comb_sep_arr_32 uint change for 208 teams: -0.16% change for 10400 teams: +26.98% red_sum_arr_32 uint change for 208 teams: +139.64% change for 10400 teams: -14.55% red_dot uint change for 208 teams: +202.92% change for 10400 teams: +5.11% red_max uint change for 208 teams: +221.41% change for 10400 teams: +6.54% red_sum uint change for 208 teams: +220.83% change for 10400 teams: +7.80% red_comb_sep_arr_32 ulong change for 208 teams: -0.19% change for 10400 teams: +5.80% red_sum_arr_32 ulong change for 208 teams: +523.98% change for 10400 teams: -3.17% red_dot ulong change for 208 teams: +232.14% change for 10400 teams: +3.57% red_max ulong change for 208 teams: +279.87% change for 10400 teams: +6.17% red_sum ulong change for 208 teams: +261.54% change for 10400 teams: +5.72% 
red_comb_sep_arr_32 Value change for 208 teams: +0.22% change for 10400 teams: +0.04% red_sum_arr_32 Value change for 208 teams: +423.38% change for 10400 teams: +9.08% red_dot Value change for 208 teams: +153.87% change for 10400 teams: -2.62% red_max Value change for 208 teams: +1097.62% change for 10400 teams: +261.16% red_sum Value change for 208 teams: +358.88% change for 10400 teams: +21.44% ``` Claude assisted with this patch. Added: Modified: clang/include/clang/Basic/OpenMPKinds.h clang/lib/CodeGen/CGOpenMPRuntime.cpp clang/lib/CodeGen/CGStmtOpenMP.cpp clang/test/OpenMP/target_teams_generic_loop_codegen.cpp Removed: ################################################################################ diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h index 9fd5150921207..40f80802167fa 100644 --- a/clang/include/clang/Basic/OpenMPKinds.h +++ b/clang/include/clang/Basic/OpenMPKinds.h @@ -188,6 +188,9 @@ struct OpenMPScheduleTy final { OpenMPScheduleClauseKind Schedule = OMPC_SCHEDULE_unknown; OpenMPScheduleClauseModifier M1 = OMPC_SCHEDULE_MODIFIER_unknown; OpenMPScheduleClauseModifier M2 = OMPC_SCHEDULE_MODIFIER_unknown; + /// Request the fused distr_static_chunk + static_chunkone runtime schedule + /// in `for_static_init`. The outer `distribute_static_init` is skipped. + bool UseFusedDistChunkSchedule = false; }; /// OpenMP modifiers for 'reduction' clause. diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 92763d72877f4..eb2f92cdbf972 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -546,6 +546,12 @@ enum OpenMPSchedType { /// dist_schedule types OMP_dist_sch_static_chunked = 91, OMP_dist_sch_static = 92, + /// Fused distribute+for static schedule (entityId = team*nthreads + tid, + /// num_entities = nteams*nthreads). One for_static_init call, no + /// surrounding distribute_static_init. 
Matches + /// kmp_sched_distr_static_chunk_sched_static_chunkone in the device RTL + /// (openmp/device/include/DeviceTypes.h). + OMP_dist_sch_static_chunked_sch_static_chunkone = 93, /// Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. /// Set if the monotonic schedule modifier was present. OMP_sch_modifier_monotonic = (1 << 29), @@ -2633,7 +2639,8 @@ static int addMonoNonMonoModifier(CodeGenModule &CGM, OpenMPSchedType Schedule, Schedule == OMP_sch_static_balanced_chunked || Schedule == OMP_ord_static_chunked || Schedule == OMP_ord_static || Schedule == OMP_dist_sch_static_chunked || - Schedule == OMP_dist_sch_static)) + Schedule == OMP_dist_sch_static || + Schedule == OMP_dist_sch_static_chunked_sch_static_chunkone)) Modifier = OMP_sch_modifier_nonmonotonic; } return Schedule | Modifier; @@ -2695,7 +2702,8 @@ static void emitForStaticInitCall( Schedule == OMP_sch_static_balanced_chunked || Schedule == OMP_ord_static || Schedule == OMP_ord_static_chunked || Schedule == OMP_dist_sch_static || - Schedule == OMP_dist_sch_static_chunked); + Schedule == OMP_dist_sch_static_chunked || + Schedule == OMP_dist_sch_static_chunked_sch_static_chunkone); // Call __kmpc_for_static_init( // ident_t *loc, kmp_int32 tid, kmp_int32 schedtype, @@ -2713,7 +2721,8 @@ static void emitForStaticInitCall( assert((Schedule == OMP_sch_static_chunked || Schedule == OMP_sch_static_balanced_chunked || Schedule == OMP_ord_static_chunked || - Schedule == OMP_dist_sch_static_chunked) && + Schedule == OMP_dist_sch_static_chunked || + Schedule == OMP_dist_sch_static_chunked_sch_static_chunkone) && "expected static chunked schedule"); } llvm::Value *Args[] = { @@ -2736,8 +2745,11 @@ void CGOpenMPRuntime::emitForStaticInit(CodeGenFunction &CGF, OpenMPDirectiveKind DKind, const OpenMPScheduleTy &ScheduleKind, const StaticRTInput &Values) { - OpenMPSchedType ScheduleNum = getRuntimeSchedule( - ScheduleKind.Schedule, Values.Chunk != nullptr, Values.Ordered); + OpenMPSchedType 
ScheduleNum = + ScheduleKind.UseFusedDistChunkSchedule + ? OMP_dist_sch_static_chunked_sch_static_chunkone + : getRuntimeSchedule(ScheduleKind.Schedule, Values.Chunk != nullptr, + Values.Ordered); assert((isOpenMPWorksharingDirective(DKind) || (DKind == OMPD_loop)) && "Expected loop-based or sections-based directive."); llvm::Value *UpdatedLocation = emitUpdateLocation(CGF, Loc, diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 1eaf8efa142c5..314f4e14dd1d2 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -50,6 +50,22 @@ static const VarDecl *getBaseDecl(const Expr *Ref); static OpenMPDirectiveKind getEffectiveDirectiveKind(const OMPExecutableDirective &S); +/// Whether a combined `distribute parallel for` may use the fused +/// distr_static_chunk + static_chunkone schedule (enum 93): one +/// for_static_init, no surrounding distribute_static_init. +static bool canEmitGPUFusedDistSchedule(const CodeGenModule &CGM, + const OMPLoopDirective &S, + OpenMPDirectiveKind DKind) { + // Reduction-only for now. Non-reduction cases might follow in the future, but + // need more analysis for maximum profit. + return CGM.getLangOpts().OpenMPIsTargetDevice && CGM.getTriple().isGPU() && + isOpenMPLoopBoundSharingDirective(DKind) && + S.hasClausesOfKind<OMPReductionClause>() && + !S.getSingleClause<OMPDistScheduleClause>() && + !S.getSingleClause<OMPScheduleClause>() && + !S.getSingleClause<OMPOrderedClause>(); +} + namespace { /// Lexical scope for OpenMP executable constructs, that handles correct codegen /// for captured expressions. @@ -3879,6 +3895,12 @@ bool CodeGenFunction::EmitOMPWorksharingLoop( RT.isStaticChunked(ScheduleKind.Schedule, /* Chunked */ Chunk != nullptr) && HasChunkSizeOne && isOpenMPLoopBoundSharingDirective(EKind); + // GPU combined `distribute parallel for`: emit a single + // for_static_init with the fused distr_static_chunk + static_chunkone + // schedule (enum 93). 
The surrounding EmitOMPDistributeLoop must skip + // its distribute_static_init under the same conditions. + if (StaticChunkedOne && canEmitGPUFusedDistSchedule(CGM, S, EKind)) + ScheduleKind.UseFusedDistChunkSchedule = true; bool IsMonotonic = Ordered || (ScheduleKind.Schedule == OMPC_SCHEDULE_static && @@ -6275,102 +6297,113 @@ void CodeGenFunction::EmitOMPDistributeLoop(const OMPLoopDirective &S, const unsigned IVSize = getContext().getTypeSize(IVExpr->getType()); const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation(); - // OpenMP [2.10.8, distribute Construct, Description] - // If dist_schedule is specified, kind must be static. If specified, - // iterations are divided into chunks of size chunk_size, chunks are - // assigned to the teams of the league in a round-robin fashion in the - // order of the team number. When no chunk_size is specified, the - // iteration space is divided into chunks that are approximately equal - // in size, and at most one chunk is distributed to each team of the - // league. The size of the chunks is unspecified in this case. - bool StaticChunked = - RT.isStaticChunked(ScheduleKind, /* Chunked */ Chunk != nullptr) && - isOpenMPLoopBoundSharingDirective(S.getDirectiveKind()); - if (RT.isStaticNonchunked(ScheduleKind, - /* Chunked */ Chunk != nullptr) || - StaticChunked) { - CGOpenMPRuntime::StaticRTInput StaticInit( - IVSize, IVSigned, /* Ordered = */ false, IL.getAddress(), - LB.getAddress(), UB.getAddress(), ST.getAddress(), - StaticChunked ? Chunk : nullptr); - RT.emitDistributeStaticInit(*this, S.getBeginLoc(), ScheduleKind, - StaticInit); + // GPU fused schedule: omit the outer distribute loop and let the inner + // worksharing loop schedule the flattened team/thread iteration space. 
+ if (canEmitGPUFusedDistSchedule(CGM, S, S.getDirectiveKind())) { JumpDest LoopExit = getJumpDestInCurrentScope(createBasicBlock("omp.loop.exit")); - // UB = min(UB, GlobalUB); - EmitIgnoredExpr(isOpenMPLoopBoundSharingDirective(S.getDirectiveKind()) - ? S.getCombinedEnsureUpperBound() - : S.getEnsureUpperBound()); - // IV = LB; - EmitIgnoredExpr(isOpenMPLoopBoundSharingDirective(S.getDirectiveKind()) - ? S.getCombinedInit() - : S.getInit()); - - const Expr *Cond = - isOpenMPLoopBoundSharingDirective(S.getDirectiveKind()) - ? S.getCombinedCond() - : S.getCond(); - - if (StaticChunked) - Cond = S.getCombinedDistCond(); - - // For static unchunked schedules generate: - // - // 1. For distribute alone, codegen - // while (idx <= UB) { - // BODY; - // ++idx; - // } - // - // 2. When combined with 'for' (e.g. as in 'distribute parallel for') - // while (idx <= UB) { - // <CodeGen rest of pragma>(LB, UB); - // idx += ST; - // } - // - // For static chunk one schedule generate: - // - // while (IV <= GlobalUB) { - // <CodeGen rest of pragma>(LB, UB); - // LB += ST; - // UB += ST; - // UB = min(UB, GlobalUB); - // IV = LB; - // } - // - emitCommonSimdLoop( - *this, S, - [&S](CodeGenFunction &CGF, PrePostActionTy &) { - if (isOpenMPSimdDirective(S.getDirectiveKind())) - CGF.EmitOMPSimdInit(S); - }, - [&S, &LoopScope, Cond, IncExpr, LoopExit, &CodeGenLoop, - StaticChunked](CodeGenFunction &CGF, PrePostActionTy &) { - CGF.EmitOMPInnerLoop( - S, LoopScope.requiresCleanups(), Cond, IncExpr, - [&S, LoopExit, &CodeGenLoop](CodeGenFunction &CGF) { - CodeGenLoop(CGF, S, LoopExit); - }, - [&S, StaticChunked](CodeGenFunction &CGF) { - if (StaticChunked) { - CGF.EmitIgnoredExpr(S.getCombinedNextLowerBound()); - CGF.EmitIgnoredExpr(S.getCombinedNextUpperBound()); - CGF.EmitIgnoredExpr(S.getCombinedEnsureUpperBound()); - CGF.EmitIgnoredExpr(S.getCombinedInit()); - } - }); - }); + CodeGenLoop(*this, S, LoopExit); EmitBlock(LoopExit.getBlock()); - // Tell the runtime we are done. 
- RT.emitForStaticFinish(*this, S.getEndLoc(), OMPD_distribute); } else { - // Emit the outer loop, which requests its work chunk [LB..UB] from - // runtime and runs the inner loop to process it. - const OMPLoopArguments LoopArguments = { - LB.getAddress(), UB.getAddress(), ST.getAddress(), IL.getAddress(), - Chunk}; - EmitOMPDistributeOuterLoop(ScheduleKind, S, LoopScope, LoopArguments, - CodeGenLoop); + // OpenMP [2.10.8, distribute Construct, Description] + // If dist_schedule is specified, kind must be static. If specified, + // iterations are divided into chunks of size chunk_size, chunks are + // assigned to the teams of the league in a round-robin fashion in the + // order of the team number. When no chunk_size is specified, the + // iteration space is divided into chunks that are approximately equal + // in size, and at most one chunk is distributed to each team of the + // league. The size of the chunks is unspecified in this case. + bool StaticChunked = + RT.isStaticChunked(ScheduleKind, /* Chunked */ Chunk != nullptr) && + isOpenMPLoopBoundSharingDirective(S.getDirectiveKind()); + if (RT.isStaticNonchunked(ScheduleKind, + /* Chunked */ Chunk != nullptr) || + StaticChunked) { + CGOpenMPRuntime::StaticRTInput StaticInit( + IVSize, IVSigned, /* Ordered = */ false, IL.getAddress(), + LB.getAddress(), UB.getAddress(), ST.getAddress(), + StaticChunked ? Chunk : nullptr); + RT.emitDistributeStaticInit(*this, S.getBeginLoc(), ScheduleKind, + StaticInit); + JumpDest LoopExit = + getJumpDestInCurrentScope(createBasicBlock("omp.loop.exit")); + // UB = min(UB, GlobalUB); + EmitIgnoredExpr( + isOpenMPLoopBoundSharingDirective(S.getDirectiveKind()) + ? S.getCombinedEnsureUpperBound() + : S.getEnsureUpperBound()); + // IV = LB; + EmitIgnoredExpr( + isOpenMPLoopBoundSharingDirective(S.getDirectiveKind()) + ? S.getCombinedInit() + : S.getInit()); + + const Expr *Cond = + isOpenMPLoopBoundSharingDirective(S.getDirectiveKind()) + ? 
S.getCombinedCond() + : S.getCond(); + + if (StaticChunked) + Cond = S.getCombinedDistCond(); + + // For static unchunked schedules generate: + // + // 1. For distribute alone, codegen + // while (idx <= UB) { + // BODY; + // ++idx; + // } + // + // 2. When combined with 'for' (e.g. as in 'distribute parallel for') + // while (idx <= UB) { + // <CodeGen rest of pragma>(LB, UB); + // idx += ST; + // } + // + // For static chunk one schedule generate: + // + // while (IV <= GlobalUB) { + // <CodeGen rest of pragma>(LB, UB); + // LB += ST; + // UB += ST; + // UB = min(UB, GlobalUB); + // IV = LB; + // } + // + emitCommonSimdLoop( + *this, S, + [&S](CodeGenFunction &CGF, PrePostActionTy &) { + if (isOpenMPSimdDirective(S.getDirectiveKind())) + CGF.EmitOMPSimdInit(S); + }, + [&S, &LoopScope, Cond, IncExpr, LoopExit, &CodeGenLoop, + StaticChunked](CodeGenFunction &CGF, PrePostActionTy &) { + CGF.EmitOMPInnerLoop( + S, LoopScope.requiresCleanups(), Cond, IncExpr, + [&S, LoopExit, &CodeGenLoop](CodeGenFunction &CGF) { + CodeGenLoop(CGF, S, LoopExit); + }, + [&S, StaticChunked](CodeGenFunction &CGF) { + if (StaticChunked) { + CGF.EmitIgnoredExpr(S.getCombinedNextLowerBound()); + CGF.EmitIgnoredExpr(S.getCombinedNextUpperBound()); + CGF.EmitIgnoredExpr(S.getCombinedEnsureUpperBound()); + CGF.EmitIgnoredExpr(S.getCombinedInit()); + } + }); + }); + EmitBlock(LoopExit.getBlock()); + // Tell the runtime we are done. + RT.emitForStaticFinish(*this, S.getEndLoc(), OMPD_distribute); + } else { + // Emit the outer loop, which requests its work chunk [LB..UB] from + // runtime and runs the inner loop to process it. 
+ const OMPLoopArguments LoopArguments = { + LB.getAddress(), UB.getAddress(), ST.getAddress(), + IL.getAddress(), Chunk}; + EmitOMPDistributeOuterLoop(ScheduleKind, S, LoopScope, LoopArguments, + CodeGenLoop); + } } if (isOpenMPSimdDirective(S.getDirectiveKind())) { EmitOMPSimdFinal(S, [IL, &S](CodeGenFunction &CGF) { diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp index 0c04b3c429d7a..0cc8e46518c9a 100644 --- a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp +++ b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp @@ -122,114 +122,60 @@ int foo() { // IR-GPU-NEXT: store i32 1, ptr [[DOTOMP_STRIDE_ASCAST]], align 4 // IR-GPU-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4 // IR-GPU-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -// IR-GPU-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 -// IR-GPU-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 -// IR-GPU-NEXT: call void @__kmpc_distribute_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), i32 [[TMP3]], i32 91, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_COMB_LB_ASCAST]], ptr [[DOTOMP_COMB_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 [[NVPTX_NUM_THREADS]]) +// IR-GPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4 +// IR-GPU-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 // IR-GPU-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 -// IR-GPU-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 99 -// IR-GPU-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] -// IR-GPU: cond.true: -// IR-GPU-NEXT: br label [[COND_END:%.*]] -// IR-GPU: cond.false: -// IR-GPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 -// IR-GPU-NEXT: br label [[COND_END]] -// IR-GPU: cond.end: -// IR-GPU-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP5]], 
[[COND_FALSE]] ] -// IR-GPU-NEXT: store i32 [[COND]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 -// IR-GPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV_ASCAST]], align 4 -// IR-GPU-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] -// IR-GPU: omp.inner.for.cond: -// IR-GPU-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 -// IR-GPU-NEXT: [[CMP5:%.*]] = icmp slt i32 [[TMP7]], 100 -// IR-GPU-NEXT: br i1 [[CMP5]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] -// IR-GPU: omp.inner.for.body: -// IR-GPU-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4 -// IR-GPU-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 -// IR-GPU-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 -// IR-GPU-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 -// IR-GPU-NEXT: [[TMP12:%.*]] = load i32, ptr [[J3_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP12]], ptr addrspace(5) [[J_CASTED]], align 4 -// IR-GPU-NEXT: [[TMP13:%.*]] = load i64, ptr addrspace(5) [[J_CASTED]], align 8 -// IR-GPU-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0 -// IR-GPU-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP9]] to ptr -// IR-GPU-NEXT: store ptr [[TMP15]], ptr [[TMP14]], align 8 -// IR-GPU-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 1 -// IR-GPU-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP11]] to ptr -// IR-GPU-NEXT: store ptr [[TMP17]], ptr [[TMP16]], align 8 -// IR-GPU-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 2 -// IR-GPU-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP13]] to ptr -// IR-GPU-NEXT: store ptr [[TMP19]], ptr [[TMP18]], align 8 -// IR-GPU-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 3 -// IR-GPU-NEXT: store ptr [[SUM1_ASCAST]], ptr [[TMP20]], 
align 8 -// IR-GPU-NEXT: [[TMP21:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 -// IR-GPU-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 -// IR-GPU-NEXT: call void @__kmpc_parallel_60(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP22]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 4, i32 0) -// IR-GPU-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] -// IR-GPU: omp.inner.for.inc: -// IR-GPU-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV_ASCAST]], align 4 -// IR-GPU-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4 -// IR-GPU-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP23]], [[TMP24]] -// IR-GPU-NEXT: store i32 [[ADD]], ptr [[DOTOMP_IV_ASCAST]], align 4 -// IR-GPU-NEXT: [[TMP25:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4 -// IR-GPU-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4 -// IR-GPU-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP25]], [[TMP26]] -// IR-GPU-NEXT: store i32 [[ADD6]], ptr [[DOTOMP_COMB_LB_ASCAST]], align 4 -// IR-GPU-NEXT: [[TMP27:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 -// IR-GPU-NEXT: [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE_ASCAST]], align 4 -// IR-GPU-NEXT: [[ADD7:%.*]] = add nsw i32 [[TMP27]], [[TMP28]] -// IR-GPU-NEXT: store i32 [[ADD7]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 -// IR-GPU-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 -// IR-GPU-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[TMP29]], 99 -// IR-GPU-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] -// IR-GPU: cond.true9: -// IR-GPU-NEXT: br label [[COND_END11:%.*]] -// IR-GPU: cond.false10: -// IR-GPU-NEXT: [[TMP30:%.*]] = load i32, ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 -// IR-GPU-NEXT: br label [[COND_END11]] -// IR-GPU: cond.end11: -// IR-GPU-NEXT: [[COND12:%.*]] = phi i32 [ 99, [[COND_TRUE9]] ], [ [[TMP30]], [[COND_FALSE10]] ] 
-// IR-GPU-NEXT: store i32 [[COND12]], ptr [[DOTOMP_COMB_UB_ASCAST]], align 4 -// IR-GPU-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTOMP_COMB_LB_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP31]], ptr [[DOTOMP_IV_ASCAST]], align 4 -// IR-GPU-NEXT: br label [[OMP_INNER_FOR_COND]] -// IR-GPU: omp.inner.for.end: +// IR-GPU-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// IR-GPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[J3_ASCAST]], align 4 +// IR-GPU-NEXT: store i32 [[TMP6]], ptr addrspace(5) [[J_CASTED]], align 4 +// IR-GPU-NEXT: [[TMP7:%.*]] = load i64, ptr addrspace(5) [[J_CASTED]], align 8 +// IR-GPU-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0 +// IR-GPU-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP3]] to ptr +// IR-GPU-NEXT: store ptr [[TMP9]], ptr [[TMP8]], align 8 +// IR-GPU-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 1 +// IR-GPU-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP5]] to ptr +// IR-GPU-NEXT: store ptr [[TMP11]], ptr [[TMP10]], align 8 +// IR-GPU-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 2 +// IR-GPU-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP7]] to ptr +// IR-GPU-NEXT: store ptr [[TMP13]], ptr [[TMP12]], align 8 +// IR-GPU-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 3 +// IR-GPU-NEXT: store ptr [[SUM1_ASCAST]], ptr [[TMP14]], align 8 +// IR-GPU-NEXT: [[TMP15:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 +// IR-GPU-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 +// IR-GPU-NEXT: call void @__kmpc_parallel_60(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP16]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l22_omp_outlined_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 4, i32 0) // IR-GPU-NEXT: br label [[OMP_LOOP_EXIT:%.*]] // IR-GPU: 
omp.loop.exit: -// IR-GPU-NEXT: [[TMP32:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 -// IR-GPU-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 -// IR-GPU-NEXT: call void @__kmpc_distribute_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB2]] to ptr), i32 [[TMP33]]) -// IR-GPU-NEXT: [[TMP34:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4 -// IR-GPU-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0 -// IR-GPU-NEXT: br i1 [[TMP35]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] +// IR-GPU-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_IS_LAST_ASCAST]], align 4 +// IR-GPU-NEXT: [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0 +// IR-GPU-NEXT: br i1 [[TMP18]], label [[DOTOMP_LASTPRIVATE_THEN:%.*]], label [[DOTOMP_LASTPRIVATE_DONE:%.*]] // IR-GPU: .omp.lastprivate.then: // IR-GPU-NEXT: store i32 10, ptr [[J3_ASCAST]], align 4 -// IR-GPU-NEXT: [[TMP36:%.*]] = load i32, ptr [[J3_ASCAST]], align 4 -// IR-GPU-NEXT: store i32 [[TMP36]], ptr [[J_ADDR_ASCAST]], align 4 +// IR-GPU-NEXT: [[TMP19:%.*]] = load i32, ptr [[J3_ASCAST]], align 4 +// IR-GPU-NEXT: store i32 [[TMP19]], ptr [[J_ADDR_ASCAST]], align 4 // IR-GPU-NEXT: br label [[DOTOMP_LASTPRIVATE_DONE]] // IR-GPU: .omp.lastprivate.done: -// IR-GPU-NEXT: [[TMP37:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0 -// IR-GPU-NEXT: store ptr [[SUM1_ASCAST]], ptr [[TMP37]], align 8 +// IR-GPU-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0 +// IR-GPU-NEXT: store ptr [[SUM1_ASCAST]], ptr [[TMP20]], align 8 // IR-GPU-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer() -// IR-GPU-NEXT: [[TMP38:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, i64 400, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr 
@_omp_reduction_shuffle_and_reduce_func.1, ptr @_omp_reduction_inter_warp_copy_func.2, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func) -// IR-GPU-NEXT: [[TMP39:%.*]] = icmp eq i32 [[TMP38]], 1 -// IR-GPU-NEXT: br i1 [[TMP39]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// IR-GPU-NEXT: [[TMP21:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, i64 400, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func.1, ptr @_omp_reduction_inter_warp_copy_func.2, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func) +// IR-GPU-NEXT: [[TMP22:%.*]] = icmp eq i32 [[TMP21]], 1 +// IR-GPU-NEXT: br i1 [[TMP22]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] // IR-GPU: .omp.reduction.then: -// IR-GPU-NEXT: [[TMP40:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100 -// IR-GPU-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP40]] -// IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE17:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] +// IR-GPU-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100 +// IR-GPU-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP23]] +// IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE8:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]] // IR-GPU: omp.arraycpy.body: // IR-GPU-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ] -// IR-GPU-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST13:%.*]] = phi ptr [ [[TMP0]], 
[[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY]] ] -// IR-GPU-NEXT: [[TMP41:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4 -// IR-GPU-NEXT: [[TMP42:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4 -// IR-GPU-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP41]], [[TMP42]] -// IR-GPU-NEXT: store i32 [[ADD14]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4 -// IR-GPU-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], i32 1 +// IR-GPU-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST5:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT6:%.*]], [[OMP_ARRAYCPY_BODY]] ] +// IR-GPU-NEXT: [[TMP24:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST5]], align 4 +// IR-GPU-NEXT: [[TMP25:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4 +// IR-GPU-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// IR-GPU-NEXT: store i32 [[ADD]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST5]], align 4 +// IR-GPU-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT6]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST5]], i32 1 // IR-GPU-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1 -// IR-GPU-NEXT: [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP40]] -// IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_DONE16]], label [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_BODY]] -// IR-GPU: omp.arraycpy.done17: +// IR-GPU-NEXT: [[OMP_ARRAYCPY_DONE7:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT6]], [[TMP23]] +// IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_DONE7]], label [[OMP_ARRAYCPY_DONE8]], label [[OMP_ARRAYCPY_BODY]] +// IR-GPU: omp.arraycpy.done8: // IR-GPU-NEXT: br label [[DOTOMP_REDUCTION_DONE]] // IR-GPU: .omp.reduction.done: // IR-GPU-NEXT: ret void @@ -304,7 +250,7 @@ int foo() { // IR-GPU: omp.arrayinit.done: // IR-GPU-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 // 
IR-GPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 -// IR-GPU-NEXT: call void @__kmpc_for_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), i32 [[TMP5]], i32 33, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_LB_ASCAST]], ptr [[DOTOMP_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 1) +// IR-GPU-NEXT: call void @__kmpc_for_static_init_4(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), i32 [[TMP5]], i32 93, ptr [[DOTOMP_IS_LAST_ASCAST]], ptr [[DOTOMP_LB_ASCAST]], ptr [[DOTOMP_UB_ASCAST]], ptr [[DOTOMP_STRIDE_ASCAST]], i32 1, i32 1) // IR-GPU-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_LB_ASCAST]], align 4 // IR-GPU-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_IV_ASCAST]], align 4 // IR-GPU-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] @@ -352,7 +298,7 @@ int foo() { // IR-GPU: omp.loop.exit: // IR-GPU-NEXT: [[TMP18:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8 // IR-GPU-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4 -// IR-GPU-NEXT: call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[TMP19]]) +// IR-GPU-NEXT: call void @__kmpc_for_static_fini(ptr addrspacecast (ptr addrspace(1) @[[GLOB2]] to ptr), i32 [[TMP19]]) // IR-GPU-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0 // IR-GPU-NEXT: store ptr [[SUM4_ASCAST]], ptr [[TMP20]], align 8 // IR-GPU-NEXT: [[TMP21:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i64 400, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func) @@ -496,7 +442,7 @@ int foo() { // IR-GPU-NEXT: br i1 [[TMP7]], label [[BODY:%.*]], label [[EXIT:%.*]] // IR-GPU: body: // IR-GPU-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) -// IR-GPU-NEXT: call void 
@__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4:[0-9]+]] to ptr), i32 [[OMP_GLOBAL_THREAD_NUM]]) +// IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB3:[0-9]+]] to ptr), i32 [[OMP_GLOBAL_THREAD_NUM]]) // IR-GPU-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // IR-GPU-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] // IR-GPU: then: @@ -511,7 +457,7 @@ int foo() { // IR-GPU-NEXT: br label [[IFCONT]] // IR-GPU: ifcont: // IR-GPU-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) -// IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[OMP_GLOBAL_THREAD_NUM2]]) +// IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[OMP_GLOBAL_THREAD_NUM2]]) // IR-GPU-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4 // IR-GPU-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP2]], [[TMP13]] // IR-GPU-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN3:%.*]], label [[ELSE4:%.*]] @@ -641,7 +587,7 @@ int foo() { // IR-GPU-NEXT: br i1 [[TMP7]], label [[BODY:%.*]], label [[EXIT:%.*]] // IR-GPU: body: // IR-GPU-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) -// IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[OMP_GLOBAL_THREAD_NUM]]) +// IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[OMP_GLOBAL_THREAD_NUM]]) // IR-GPU-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 // IR-GPU-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] // IR-GPU: then: @@ -656,7 +602,7 @@ int foo() { // IR-GPU-NEXT: br label [[IFCONT]] // IR-GPU: ifcont: // IR-GPU-NEXT: [[OMP_GLOBAL_THREAD_NUM2:%.*]] = call i32 @__kmpc_global_thread_num(ptr 
addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr)) -// IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB4]] to ptr), i32 [[OMP_GLOBAL_THREAD_NUM2]]) +// IR-GPU-NEXT: call void @__kmpc_barrier(ptr addrspacecast (ptr addrspace(1) @[[GLOB3]] to ptr), i32 [[OMP_GLOBAL_THREAD_NUM2]]) // IR-GPU-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTADDR1_ASCAST]], align 4 // IR-GPU-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP2]], [[TMP13]] // IR-GPU-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN3:%.*]], label [[ELSE4:%.*]] _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
