llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

If the runtime flat address resolves to a scratch address, 64-bit atomics do not work correctly: the operation is silently dropped. Insert a runtime address-space check (which is quite likely to be uniform) and select between the non-atomic and the real atomic case. Consider `noalias.addrspace` metadata and avoid this expansion when possible; the expansion also has to consult it so the predicated atomic it emits is not expanded again. A hand-written IR sketch of the expansion is appended after the truncated diff below.

---

Patch is 2.29 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109407.diff

22 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+106-33)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll (+11-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll (+11-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll (+8-7)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll (+2-4)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+1129-738)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+1115-724)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+1115-724)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+3243-1098)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll (+297-113)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll (+8745-1168)
- (modified) llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll (+8-7)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll (+260-36)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll (+39-1)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll (+39-1)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll (+39-2)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll (+556-179)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll (+16-2)
- (modified) llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll (+5-4)


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a9754ba357893f..febd741f947ee1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -39,6 +39,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/ModRef.h"
@@ -16236,12 +16237,39 @@ atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
              : TargetLowering::AtomicExpansionKind::CmpXChg;
 }
 
+/// Return if a flat address space atomicrmw can access private memory.
+static bool flatInstrMayAccessPrivate(const Instruction *I) {
+  const MDNode *NoaliasAddrSpaceMD =
+      I->getMetadata(LLVMContext::MD_noalias_addrspace);
+  if (!NoaliasAddrSpaceMD)
+    return true;
+
+  // FIXME: Can this actually fail? Why is this optional?
+  if (std::optional<ConstantRange> CR =
+          getConstantRangeFromMetadata(*NoaliasAddrSpaceMD)) {
+    return !CR->contains(APInt(32, AMDGPUAS::PRIVATE_ADDRESS));
+  }
+
+  llvm_unreachable("Why is getConstantRangeFromMetadata optional");
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
     return AtomicExpansionKind::NotAtomic;
 
+  // 64-bit flat atomics that dynamically reside in private memory will silently
+  // be dropped.
+  //
+  // Note that we will emit a new copy of the original atomic in the expansion,
+  // which will be incrementally relegalized.
+  const DataLayout &DL = RMW->getFunction()->getDataLayout();
+  if (AS == AMDGPUAS::FLAT_ADDRESS &&
+      DL.getTypeSizeInBits(RMW->getType()) == 64 &&
+      flatInstrMayAccessPrivate(RMW))
+    return AtomicExpansionKind::Expand;
+
   auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
     OptimizationRemarkEmitter ORE(RMW->getFunction());
     ORE.emit([=]() {
@@ -16640,20 +16668,34 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
       Op == AtomicRMWInst::Xor) {
-    // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
-    assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
-           "this cannot be replaced with add");
-    AI->setOperation(AtomicRMWInst::Add);
-    return;
+    if (auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
+        ConstVal && ConstVal->isNullValue()) {
+      // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+      AI->setOperation(AtomicRMWInst::Add);
+
+      // TODO: Turn the below private handling into a no-op for idempotent
+      // cases.
+    }
   }
 
-  assert(Subtarget->hasAtomicFaddInsts() &&
-         "target should have atomic fadd instructions");
-  assert(AI->getType()->isFloatTy() &&
-         AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
-         "generic atomicrmw expansion only supports FP32 operand in flat "
-         "address space");
-  assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
+  // The non-flat expansions should only perform the de-canonicalization of
+  // identity values.
+  if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
+    return;
+
+  // FullFlatEmulation is true if we need to issue the private, shared, and
+  // global cases.
+  //
+  // If this is false, we are only dealing with the flat-targeting-private case,
+  // where we only insert a check for private and still use the flat instruction
+  // for global and shared.
+
+  // TODO: Avoid the private check for the fadd case depending on
+  // noalias.addrspace.
+
+  bool FullFlatEmulation = Op == AtomicRMWInst::FAdd &&
+                           Subtarget->hasAtomicFaddInsts() &&
+                           AI->getType()->isFloatTy();
 
   // Given: atomicrmw fadd ptr %addr, float %val ordering
   //
@@ -16693,6 +16735,10 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   //
   // atomicrmw.end:
   //   [...]
+  //
+  //
+  // For 64-bit atomics which may reside in private memory, we perform a simpler
+  // version that only inserts the private check, and uses the flat operation.
 
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
@@ -16704,9 +16750,15 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Function *F = BB->getParent();
   BasicBlock *ExitBB =
       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
-  BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
-  BasicBlock *CheckPrivateBB =
-      BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  BasicBlock *SharedBB = nullptr;
+
+  BasicBlock *CheckPrivateBB = BB;
+  if (FullFlatEmulation) {
+    SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
+    CheckPrivateBB =
+        BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
+  }
+
   BasicBlock *PrivateBB =
       BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
   BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
@@ -16719,23 +16771,26 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   std::prev(BB->end())->eraseFromParent();
   Builder.SetInsertPoint(BB);
 
-  CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
-                                               {Addr}, nullptr, "is.shared");
-  Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
-  Builder.SetInsertPoint(SharedBB);
-  Value *CastToLocal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
+  Value *LoadedShared = nullptr;
+  if (FullFlatEmulation) {
+    CallInst *IsShared = Builder.CreateIntrinsic(
+        Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
+    Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
+    Builder.SetInsertPoint(SharedBB);
+    Value *CastToLocal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
 
-  Instruction *Clone = AI->clone();
-  Clone->insertInto(SharedBB, SharedBB->end());
-  Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
-      .set(CastToLocal);
-  Instruction *LoadedShared = Clone;
+    Instruction *Clone = AI->clone();
+    Clone->insertInto(SharedBB, SharedBB->end());
+    Clone->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToLocal);
+    LoadedShared = Clone;
 
-  Builder.CreateBr(PhiBB);
+    Builder.CreateBr(PhiBB);
+    Builder.SetInsertPoint(CheckPrivateBB);
+  }
 
-  Builder.SetInsertPoint(CheckPrivateBB);
   CallInst *IsPrivate = Builder.CreateIntrinsic(
       Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
@@ -16752,15 +16807,32 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(GlobalBB);
-  Value *CastToGlobal = Builder.CreateAddrSpaceCast(
-      Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
-  Value *LoadedGlobal = AI;
-  AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex()).set(CastToGlobal);
+  // Continue using a flat instruction if we only emitted the check for private.
+  Instruction *LoadedGlobal = AI;
+  if (FullFlatEmulation) {
+    Value *CastToGlobal = Builder.CreateAddrSpaceCast(
+        Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
+    AI->getOperandUse(AtomicRMWInst::getPointerOperandIndex())
+        .set(CastToGlobal);
+  }
 
   AI->removeFromParent();
   AI->insertInto(GlobalBB, GlobalBB->end());
 
+  // The new atomicrmw may go through another round of legalization later.
+  if (!FullFlatEmulation) {
+    // We inserted the runtime check already, make sure we do not try to
+    // re-expand this.
+    // TODO: Should union with any existing metadata.
+ MDBuilder MDB(F->getContext()); + MDNode *RangeNotPrivate = + MDB.createRange(APInt(32, AMDGPUAS::PRIVATE_ADDRESS), + APInt(32, AMDGPUAS::PRIVATE_ADDRESS + 1)); + LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace, + RangeNotPrivate); + } + Builder.CreateBr(PhiBB); Builder.SetInsertPoint(PhiBB); @@ -16768,7 +16840,8 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { if (ReturnValueIsUsed) { PHINode *Loaded = Builder.CreatePHI(ValTy, 3); AI->replaceAllUsesWith(Loaded); - Loaded->addIncoming(LoadedShared, SharedBB); + if (FullFlatEmulation) + Loaded->addIncoming(LoadedShared, SharedBB); Loaded->addIncoming(LoadedPrivate, PrivateBB); Loaded->addIncoming(LoadedGlobal, GlobalBB); Loaded->takeName(AI); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll index df81b926bceb39..eea4fd5c20cec0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll @@ -1343,7 +1343,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %result } @@ -1494,7 +1494,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmax ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2230,3 +2230,4 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_ } !0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll index 53d9bf0751a1d4..f47ea7bd458fb4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll @@ -1343,7 +1343,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %result = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret double %result } @@ -1494,7 +1494,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + %unused = atomicrmw fmin ptr %ptr, double %val syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2230,3 +2230,4 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_ } !0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll index 705bcbddf227a6..f5555f8251b47e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -1657,7 +1657,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 { ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm - %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 ret void } @@ -1759,7 +1759,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 ret void } @@ -1832,7 +1832,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm - %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -1911,7 +1911,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -1990,7 +1990,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -2118,7 +2118,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % %gep.tid = getelementptr i64, ptr %ptr, i32 %id %out.gep = getelementptr i64, ptr %out, i32 %id %gep = getelementptr i64, ptr %gep.tid, i32 5 - %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out.gep, align 4 ret void } @@ -2217,7 +2217,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id %gep = getelementptr i64, ptr %gep.tid, i32 5 - %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -3340,7 +3340,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0 - %result = atomicrmw udec_wrap ptr addrspace(3) 
%arrayidx0, i64 9 syncscope("agent") seq_cst, align 8 + %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i32 %idx.0, ptr addrspace(1) %add_use, align 4 store i64 %result, ptr addrspace(1) %out, align 4 ret void @@ -3349,5 +3349,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, attributes #0 = { nounwind speculatable willreturn memory(none) } attributes #1 = { nounwind } attributes #2 = { nounwind memory(none) } + +!0 = !{i32 5, i32 6} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index b3a7e65f771c43..3090cc4dddaf87 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -2782,7 +2782,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 { ; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm - %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 ret void } @@ -2884,7 +2884,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 ret void } @@ -2986,7 +2986,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out, align 4 ret void } @@ -3059,7 +3059,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm - %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %ptr, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -3138,7 +3138,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -3217,7 +3217,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr %ptr, i32 4 - %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -3345,7 +3345,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % 
%gep.tid = getelementptr i64, ptr %ptr, i32 %id %out.gep = getelementptr i64, ptr %out, i32 %id %gep = getelementptr i64, ptr %gep.tid, i32 5 - %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 store i64 %result, ptr %out.gep, align 4 ret void } @@ -3444,7 +3444,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, ptr %ptr, i32 %id %gep = getelementptr i64, ptr %gep.tid, i32 5 - %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8 + %result = atomicrmw uinc_wrap ptr %gep, i64 42 syncscope("agent") seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -3554,5 +3554,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, attributes #0 = { nounwind speculatable willreturn memory(none) } attributes #1 = { nounwind } attributes #2 = { nounwind memory(none) } + +!0 = !{i32 5, i32 6} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN: {{.*}}... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/109407 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
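For readers who do not want to dig through the truncated patch, here is a minimal hand-written sketch of the control flow the new expansion produces for a 64-bit flat `atomicrmw` that may dynamically point at scratch. It is not the pass's literal output: the function signature, value and block names, and the collapsed phi/exit block are illustrative, and only the private-check-only path is shown (the full shared/global/private emulation used for the f32 fadd case is omitted).

```llvm
; Sketch only. The private (scratch) path uses a plain load/op/store, since
; scratch is per-lane and cannot be raced on by other threads; the other path
; keeps the flat atomic and tags it with !noalias.addrspace so a later
; AtomicExpand run does not expand it again.
define i64 @flat_atomicrmw_add_expanded(ptr %ptr, i64 %val) {
entry:
  %is.private = call i1 @llvm.amdgcn.is.private(ptr %ptr)
  br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global

atomicrmw.private:
  %cast.private = addrspacecast ptr %ptr to ptr addrspace(5)
  %loaded.private = load i64, ptr addrspace(5) %cast.private, align 8
  %new = add i64 %loaded.private, %val
  store i64 %new, ptr addrspace(5) %cast.private, align 8
  br label %atomicrmw.phi

atomicrmw.global:
  %loaded.global = atomicrmw add ptr %ptr, i64 %val seq_cst, align 8, !noalias.addrspace !0
  br label %atomicrmw.phi

atomicrmw.phi:
  %loaded = phi i64 [ %loaded.private, %atomicrmw.private ], [ %loaded.global, %atomicrmw.global ]
  ret i64 %loaded
}

declare i1 @llvm.amdgcn.is.private(ptr)

; Excluded range [5, 6): the pointer does not alias addrspace(5) (private).
!0 = !{i32 5, i32 6}
```

Conversely, when the frontend already knows a flat pointer can never refer to scratch, attaching the same `!noalias.addrspace !{i32 5, i32 6}` metadata to the original atomicrmw (as the updated CodeGen tests in this patch do) makes `flatInstrMayAccessPrivate` return false, so the plain flat atomic is kept and this predication is skipped entirely.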