llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clang-codegen

Author: Johannes Doerfert (jdoerfert)

<details>
<summary>Changes</summary>

By associating the kernel environment with the generic kernel we can access middle-end information easily, including the launch bounds ranges that are acceptable. By constraining the number of threads accordingly, we now obey the user-provided bounds that were passed via attributes.

---

Patch is 1.54 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70383.diff

25 Files Affected:

- (modified) clang/lib/CodeGen/CGOpenMPRuntime.cpp (+45-48)
- (modified) clang/lib/CodeGen/CGOpenMPRuntime.h (+9-1)
- (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp (+20-10)
- (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.h (+2-2)
- (modified) clang/test/OpenMP/distribute_simd_codegen.cpp (+508-508)
- (modified) clang/test/OpenMP/target_num_teams_num_threads_attributes.cpp (+11-98)
- (modified) clang/test/OpenMP/target_parallel_codegen.cpp (+110-110)
- (modified) clang/test/OpenMP/target_parallel_debug_codegen.cpp (+420-420)
- (modified) clang/test/OpenMP/target_parallel_for_codegen.cpp (+314-314)
- (modified) clang/test/OpenMP/target_parallel_for_debug_codegen.cpp (+589-589)
- (modified) clang/test/OpenMP/target_parallel_for_simd_codegen.cpp (+932-932)
- (modified) clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp (+589-589)
- (modified) clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp (+998-998)
- (modified) clang/test/OpenMP/teams_distribute_simd_codegen.cpp (+206-206)
- (modified) llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h (+27-26)
- (modified) llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp (+55-60)
- (modified) llvm/lib/Transforms/IPO/OpenMPOpt.cpp (+4-2)
- (modified) openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp (+3-5)
- (modified) openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp (+27-47)
- (modified) 
openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h (+19-20) - (modified) openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp (+3-5) - (modified) openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp (+8-12) - (modified) openmp/libomptarget/test/offloading/default_thread_limit.c (+1-2) - (modified) openmp/libomptarget/test/offloading/thread_state_1.c (+2-2) - (modified) openmp/libomptarget/test/offloading/thread_state_2.c (+2-2) ``````````diff diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 6262b3416a1730a..c1be7c2d0321589 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -6002,6 +6002,42 @@ void CGOpenMPRuntime::emitUsesAllocatorsFini(CodeGenFunction &CGF, {ThreadId, AllocatorVal}); } +void CGOpenMPRuntime::computeMinAndMaxThreadsAndTeams( + const OMPExecutableDirective &D, CodeGenFunction &CGF, + int32_t &MinThreadsVal, int32_t &MaxThreadsVal, int32_t &MinTeamsVal, + int32_t &MaxTeamsVal) { + + getNumTeamsExprForTargetDirective(CGF, D, MinTeamsVal, MaxTeamsVal); + getNumThreadsExprForTargetDirective(CGF, D, MaxThreadsVal, + /*UpperBoundOnly=*/true); + + for (auto *C : D.getClausesOfKind<OMPXAttributeClause>()) { + for (auto *A : C->getAttrs()) { + int32_t AttrMinThreadsVal = 1, AttrMaxThreadsVal = -1; + int32_t AttrMinBlocksVal = 1, AttrMaxBlocksVal = -1; + if (auto *Attr = dyn_cast<CUDALaunchBoundsAttr>(A)) + CGM.handleCUDALaunchBoundsAttr(nullptr, Attr, &AttrMaxThreadsVal, + &AttrMinBlocksVal, &AttrMaxBlocksVal); + else if (auto *Attr = dyn_cast<AMDGPUFlatWorkGroupSizeAttr>(A)) + CGM.handleAMDGPUFlatWorkGroupSizeAttr( + nullptr, Attr, /*ReqdWGS=*/nullptr, &AttrMinThreadsVal, + &AttrMaxThreadsVal); + else + continue; + + MinThreadsVal = std::max(MinThreadsVal, AttrMinThreadsVal); + if (AttrMaxThreadsVal > 0) + MaxThreadsVal = MaxThreadsVal > 0 + ? 
std::min(MaxThreadsVal, AttrMaxThreadsVal) + : AttrMaxThreadsVal; + MinTeamsVal = std::max(MinTeamsVal, AttrMinBlocksVal); + if (AttrMaxBlocksVal > 0) + MaxTeamsVal = MaxTeamsVal > 0 ? std::min(MaxTeamsVal, AttrMaxBlocksVal) + : AttrMaxBlocksVal; + } + } +} + void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper( const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, @@ -6020,47 +6056,8 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper( return CGF.GenerateOpenMPCapturedStmtFunction(CS, D.getBeginLoc()); }; - // Get NumTeams and ThreadLimit attributes - int32_t DefaultValMinTeams = 1; - int32_t DefaultValMaxTeams = -1; - uint32_t DefaultValMinThreads = 1; - uint32_t DefaultValMaxThreads = UINT32_MAX; - - getNumTeamsExprForTargetDirective(CGF, D, DefaultValMinTeams, - DefaultValMaxTeams); - getNumThreadsExprForTargetDirective(CGF, D, DefaultValMaxThreads, - /*UpperBoundOnly=*/true); - - for (auto *C : D.getClausesOfKind<OMPXAttributeClause>()) { - for (auto *A : C->getAttrs()) { - int32_t MinThreadsVal = 1, MaxThreadsVal = 0; - int32_t MinBlocksVal = 1, MaxBlocksVal = -1; - if (auto *Attr = dyn_cast<CUDALaunchBoundsAttr>(A)) - CGM.handleCUDALaunchBoundsAttr(nullptr, Attr, &MaxThreadsVal, - &MinBlocksVal, &MaxBlocksVal); - else if (auto *Attr = dyn_cast<AMDGPUFlatWorkGroupSizeAttr>(A)) - CGM.handleAMDGPUFlatWorkGroupSizeAttr( - nullptr, Attr, /*ReqdWGS=*/nullptr, &MinThreadsVal, &MaxThreadsVal); - else - continue; - - DefaultValMinThreads = - std::max(DefaultValMinThreads, uint32_t(MinThreadsVal)); - DefaultValMaxThreads = - DefaultValMaxThreads - ? std::min(DefaultValMaxThreads, uint32_t(MaxThreadsVal)) - : MaxThreadsVal; - DefaultValMinTeams = DefaultValMinTeams - ? 
std::max(DefaultValMinTeams, MinBlocksVal) - : MinBlocksVal; - DefaultValMaxTeams = std::min(DefaultValMaxTeams, MaxBlocksVal); - } - } - - OMPBuilder.emitTargetRegionFunction( - EntryInfo, GenerateOutlinedFunction, DefaultValMinTeams, - DefaultValMaxTeams, DefaultValMinThreads, DefaultValMaxThreads, - IsOffloadEntry, OutlinedFn, OutlinedFnID); + OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, + IsOffloadEntry, OutlinedFn, OutlinedFnID); if (!OutlinedFn) return; @@ -6306,7 +6303,7 @@ llvm::Value *CGOpenMPRuntime::emitNumTeamsForTargetDirective( /// store the condition in \p CondVal. If \p E, and \p CondVal respectively, are /// nullptr, no expression evaluation is perfomed. static void getNumThreads(CodeGenFunction &CGF, const CapturedStmt *CS, - const Expr **E, uint32_t &UpperBound, + const Expr **E, int32_t &UpperBound, bool UpperBoundOnly, llvm::Value **CondVal) { const Stmt *Child = CGOpenMPRuntime::getSingleCompoundChild( CGF.getContext(), CS->getCapturedStmt()); @@ -6368,10 +6365,10 @@ static void getNumThreads(CodeGenFunction &CGF, const CapturedStmt *CS, UpperBound ? Constant->getZExtValue() : std::min(UpperBound, - static_cast<uint32_t>(Constant->getZExtValue())); + static_cast<int32_t>(Constant->getZExtValue())); // If we haven't found a upper bound, remember we saw a thread limiting // clause. 
- if (UpperBound == UINT32_MAX) + if (UpperBound == -1) UpperBound = 0; if (!E) return; @@ -6397,7 +6394,7 @@ static void getNumThreads(CodeGenFunction &CGF, const CapturedStmt *CS, } const Expr *CGOpenMPRuntime::getNumThreadsExprForTargetDirective( - CodeGenFunction &CGF, const OMPExecutableDirective &D, uint32_t &UpperBound, + CodeGenFunction &CGF, const OMPExecutableDirective &D, int32_t &UpperBound, bool UpperBoundOnly, llvm::Value **CondVal, const Expr **ThreadLimitExpr) { assert((!CGF.getLangOpts().OpenMPIsTargetDevice || UpperBoundOnly) && "Clauses associated with the teams directive expected to be emitted " @@ -6414,11 +6411,11 @@ const Expr *CGOpenMPRuntime::getNumThreadsExprForTargetDirective( if (auto Constant = E->getIntegerConstantExpr(CGF.getContext())) UpperBound = UpperBound ? Constant->getZExtValue() : std::min(UpperBound, - uint32_t(Constant->getZExtValue())); + int32_t(Constant->getZExtValue())); } // If we haven't found a upper bound, remember we saw a thread limiting // clause. - if (UpperBound == UINT32_MAX) + if (UpperBound == -1) UpperBound = 0; if (EPtr) *EPtr = E; @@ -6562,7 +6559,7 @@ llvm::Value *CGOpenMPRuntime::emitNumThreadsForTargetDirective( llvm::Value *CondVal = nullptr; llvm::Value *ThreadLimitVal = nullptr; const Expr *ThreadLimitExpr = nullptr; - uint32_t UpperBound = -1; + int32_t UpperBound = -1; const Expr *NT = getNumThreadsExprForTargetDirective( CGF, D, UpperBound, /* UpperBoundOnly */ false, &CondVal, diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index d2f922da3320924..0c4ad46e881b9c5 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -311,6 +311,14 @@ class CGOpenMPRuntime { /// An OpenMP-IR-Builder instance. llvm::OpenMPIRBuilder OMPBuilder; + /// Helper to determine the min/max number of threads/teams for \p D. 
+ void computeMinAndMaxThreadsAndTeams(const OMPExecutableDirective &D, + CodeGenFunction &CGF, + int32_t &MinThreadsVal, + int32_t &MaxThreadsVal, + int32_t &MinTeamsVal, + int32_t &MaxTeamsVal); + /// Helper to emit outlined function for 'target' directive. /// \param D Directive to emit. /// \param ParentName Name of the function that encloses the target region. @@ -649,7 +657,7 @@ class CGOpenMPRuntime { /// UpperBoundOnly is true, no expression evaluation is perfomed. const Expr *getNumThreadsExprForTargetDirective( CodeGenFunction &CGF, const OMPExecutableDirective &D, - uint32_t &UpperBound, bool UpperBoundOnly, + int32_t &UpperBound, bool UpperBoundOnly, llvm::Value **CondExpr = nullptr, const Expr **ThreadLimitExpr = nullptr); /// Emit an expression that denotes the number of threads a target region diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 152a7511f4dd1b0..9d00ebae702802a 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -757,13 +757,15 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D, // Emit target region as a standalone region. class NVPTXPrePostActionTy : public PrePostActionTy { CGOpenMPRuntimeGPU::EntryFunctionState &EST; + const OMPExecutableDirective &D; public: - NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST) - : EST(EST) {} + NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST, + const OMPExecutableDirective &D) + : EST(EST), D(D) {} void Enter(CodeGenFunction &CGF) override { auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); - RT.emitKernelInit(CGF, EST, /* IsSPMD */ false); + RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false); // Skip target region initialization. 
RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); } @@ -772,7 +774,7 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D, RT.clearLocThreadIdInsertPt(CGF); RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false); } - } Action(EST); + } Action(EST, D); CodeGen.setAction(Action); IsInTTDRegion = true; emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, @@ -780,10 +782,17 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D, IsInTTDRegion = false; } -void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF, +void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D, + CodeGenFunction &CGF, EntryFunctionState &EST, bool IsSPMD) { + int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1, + MaxTeamsVal = -1; + computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal, + MinTeamsVal, MaxTeamsVal); + CGBuilderTy &Bld = CGF.Builder; - Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD)); + Bld.restoreIP(OMPBuilder.createTargetInit( + Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal)); if (!IsSPMD) emitGenericVarsProlog(CGF, EST.Loc); } @@ -815,19 +824,20 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D, CGOpenMPRuntimeGPU::EntryFunctionState &EST; bool IsBareKernel; DataSharingMode Mode; + const OMPExecutableDirective &D; public: NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT, CGOpenMPRuntimeGPU::EntryFunctionState &EST, - bool IsBareKernel) + bool IsBareKernel, const OMPExecutableDirective &D) : RT(RT), EST(EST), IsBareKernel(IsBareKernel), - Mode(RT.CurrentDataSharingMode) {} + Mode(RT.CurrentDataSharingMode), D(D) {} void Enter(CodeGenFunction &CGF) override { if (IsBareKernel) { RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA; return; } - RT.emitKernelInit(CGF, EST, /* IsSPMD */ true); + RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true); // Skip target region initialization. 
RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); } @@ -839,7 +849,7 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D, RT.clearLocThreadIdInsertPt(CGF); RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true); } - } Action(*this, EST, IsBareKernel); + } Action(*this, EST, IsBareKernel, D); CodeGen.setAction(Action); IsInTTDRegion = true; emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h index c4501a1a2a496b0..46e1361f2f895ba 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -60,8 +60,8 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime { void syncCTAThreads(CodeGenFunction &CGF); /// Helper for target directive initialization. - void emitKernelInit(CodeGenFunction &CGF, EntryFunctionState &EST, - bool IsSPMD); + void emitKernelInit(const OMPExecutableDirective &D, CodeGenFunction &CGF, + EntryFunctionState &EST, bool IsSPMD); /// Helper for target directive finalization. 
void emitKernelDeinit(CodeGenFunction &CGF, EntryFunctionState &EST, diff --git a/clang/test/OpenMP/distribute_simd_codegen.cpp b/clang/test/OpenMP/distribute_simd_codegen.cpp index 297f508575d99d9..f74abbe32e454f6 100644 --- a/clang/test/OpenMP/distribute_simd_codegen.cpp +++ b/clang/test/OpenMP/distribute_simd_codegen.cpp @@ -220,7 +220,7 @@ int fint(void) { return ftemplate<int>(); } // CHECK1-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0 // CHECK1-NEXT: br i1 [[TMP32]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] // CHECK1: omp_offload.failed: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z23without_schedule_clausePfS_S_S__l70(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], ptr [[TMP3]]) #[[ATTR4:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z23without_schedule_clausePfS_S_S__l70(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], ptr [[TMP3]]) #[[ATTR3:[0-9]+]] // CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]] // CHECK1: omp_offload.cont: // CHECK1-NEXT: ret void @@ -242,7 +242,7 @@ int fint(void) { return ftemplate<int>(); } // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z23without_schedule_clausePfS_S_S__l70.omp_outlined -// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[C:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[D:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[C:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[D:%.*]]) #[[ATTR1]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: 
[[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -291,45 +291,45 @@ int fint(void) { return ftemplate<int>(); } // CHECK1-NEXT: store i32 [[TMP9]], ptr [[DOTOMP_IV]], align 4 // CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] // CHECK1: omp.inner.for.cond: -// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP13:![0-9]+]] -// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP8:![0-9]+]] +// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP10]], [[TMP11]] // CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] // CHECK1: omp.inner.for.body: -// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 7 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 33, [[MUL]] -// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]] -// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP13]] -// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP8]] +// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP8]] +// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM]] -// 
CHECK1-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP13]] -// CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP13]] -// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK1-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP8]] +// CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP8]] +// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[IDXPROM2:%.*]] = sext i32 [[TMP17]] to i64 // CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM2]] -// CHECK1-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK1-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[MUL4:%.*]] = fmul float [[TMP15]], [[TMP18]] -// CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP13]] -// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP8]] +// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[IDXPROM5:%.*]] = sext i32 [[TMP20]] to i64 // CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[IDXPROM5]] -// CHECK1-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK1-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[MUL7:%.*]] = fmul float [[MUL4]], [[TMP21]] -// CHECK1-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP13]] -// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[I]], 
align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK1-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP8]] +// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP23]] to i64 // CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[IDXPROM8]] -// CHECK1-NEXT: store float [[MUL7]], ptr [[ARRAYIDX9]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK1-NEXT: store float [[MUL7]], ptr [[ARRAYIDX9]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] // CHECK1: omp.body.continue: // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1: omp.inner.for.inc: -// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP13]] +// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP8]] // CHECK1-NEXT: [[ADD10:%.*]] = add... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/70383 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits