llvmbot wrote:

@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

When expanding an atomicrmw with a cmpxchg, preserve any metadata attached to it. This will avoid unwanted double expansions in a future commit. The initial load should probably also receive the same metadata (which, for some reason, is not emitted as an atomic load).
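For example, an `atomicrmw` annotated like this (a hand-reduced sketch based on the updated tests below; value and block names are illustrative):

```llvm
define float @example(ptr addrspace(1) %ptr, float %value) {
  %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
  ret float %res
}

!0 = !{}
```

now expands to a compare-exchange loop whose `cmpxchg` keeps the annotation instead of dropping it:

```llvm
define float @example(ptr addrspace(1) %ptr, float %value) {
entry:
  ; Initial load: still emitted without the metadata (and not as an atomic).
  %0 = load float, ptr addrspace(1) %ptr, align 4
  br label %atomicrmw.start

atomicrmw.start:
  %loaded = phi float [ %0, %entry ], [ %tmp5, %atomicrmw.start ]
  %new = fadd float %loaded, %value
  %tmp2 = bitcast float %new to i32
  %tmp3 = bitcast float %loaded to i32
  ; The metadata from the original atomicrmw is now copied here.
  %tmp4 = cmpxchg ptr addrspace(1) %ptr, i32 %tmp3, i32 %tmp2 syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
  %success = extractvalue { i32, i1 } %tmp4, 1
  %newloaded = extractvalue { i32, i1 } %tmp4, 0
  %tmp5 = bitcast i32 %newloaded to float
  br i1 %success, label %atomicrmw.end, label %atomicrmw.start

atomicrmw.end:
  ret float %tmp5
}

!0 = !{}
```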
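Out-of-tree targets that pass their own `CreateCmpXchgInstFun` to `expandAtomicRMWToCmpXchg` will need to accept the new trailing `Instruction *` parameter. A minimal sketch of an updated callback (the function name is illustrative; the body mirrors `createCmpXchgInstFun` in the patch below):

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Sketch of a CreateCmpXchgInstFun with the new signature. MetadataSrc is
// the original atomicrmw being expanded (or null); copying its metadata
// onto the generated cmpxchg is what preserves annotations such as
// !amdgpu.no.fine.grained.memory across the expansion.
static void createMyCmpXchg(IRBuilderBase &Builder, Value *Addr, Value *Loaded,
                            Value *NewVal, Align AddrAlign,
                            AtomicOrdering MemOpOrder, SyncScope::ID SSID,
                            Value *&Success, Value *&NewLoaded,
                            Instruction *MetadataSrc) {
  AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
      Addr, Loaded, NewVal, AddrAlign, MemOpOrder,
      AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);
  if (MetadataSrc)
    Pair->copyMetadata(*MetadataSrc); // propagate !annotations to the cmpxchg
  Success = Builder.CreateExtractValue(Pair, 1, "success");
  NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
}
```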
---

Patch is 863.20 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109409.diff

20 Files Affected:

- (modified) llvm/include/llvm/CodeGen/AtomicExpandUtils.h (+5-4)
- (modified) llvm/lib/CodeGen/AtomicExpandPass.cpp (+18-10)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll (+193-193)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll (+171-171)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll (+68-68)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll (+68-68)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll (+3-3)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll (+15-15)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll (+3-3)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll (+15-15)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll (+4-4)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll (+87-87)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmax.ll (+2-2)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fmin.ll (+2-2)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fsub.ll (+2-2)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll (+99-87)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll (+94-82)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll (+89-79)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll (+84-74)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll (+58-58)

``````````diff
diff --git a/llvm/include/llvm/CodeGen/AtomicExpandUtils.h b/llvm/include/llvm/CodeGen/AtomicExpandUtils.h
index 1cb410a0c31c69..feb05de20b4571 100644
--- a/llvm/include/llvm/CodeGen/AtomicExpandUtils.h
+++ b/llvm/include/llvm/CodeGen/AtomicExpandUtils.h
@@ -20,10 +20,11 @@ class Value;
 
 /// Parameters (see the expansion example below):
 /// (the builder, %addr, %loaded, %new_val, ordering,
-///  /* OUT */ %success, /* OUT */ %new_loaded)
-using CreateCmpXchgInstFun =
-    function_ref<void(IRBuilderBase &, Value *, Value *, Value *, Align,
-                      AtomicOrdering, SyncScope::ID, Value *&, Value *&)>;
+///  /* OUT */ %success, /* OUT */ %new_loaded,
+///  %MetadataSrc)
+using CreateCmpXchgInstFun = function_ref<void(
+    IRBuilderBase &, Value *, Value *, Value *, Align, AtomicOrdering,
+    SyncScope::ID, Value *&, Value *&, Instruction *)>;
 
 /// Expand an atomic RMW instruction into a loop utilizing
 /// cmpxchg. You'll want to make sure your target machine likes cmpxchg
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 3d4e2cb196a16a..5a3e529e5ebd02 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -98,7 +98,7 @@ class AtomicExpandImpl {
       IRBuilderBase &Builder, Type *ResultType, Value *Addr, Align AddrAlign,
       AtomicOrdering MemOpOrder, SyncScope::ID SSID,
       function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
-      CreateCmpXchgInstFun CreateCmpXchg);
+      CreateCmpXchgInstFun CreateCmpXchg, Instruction *MetadataSrc);
 
   bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI);
   bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
@@ -600,7 +600,8 @@ void AtomicExpandImpl::expandAtomicStore(StoreInst *SI) {
 static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
                                  Value *Loaded, Value *NewVal, Align AddrAlign,
                                  AtomicOrdering MemOpOrder, SyncScope::ID SSID,
-                                 Value *&Success, Value *&NewLoaded) {
+                                 Value *&Success, Value *&NewLoaded,
+                                 Instruction *MetadataSrc) {
   Type *OrigTy = NewVal->getType();
 
   // This code can go away when cmpxchg supports FP and vector types.
@@ -612,9 +613,12 @@ static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
     Loaded = Builder.CreateBitCast(Loaded, IntTy);
   }
 
-  Value *Pair = Builder.CreateAtomicCmpXchg(
+  AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
       Addr, Loaded, NewVal, AddrAlign, MemOpOrder,
       AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);
+  if (MetadataSrc)
+    Pair->copyMetadata(*MetadataSrc);
+
   Success = Builder.CreateExtractValue(Pair, 1, "success");
   NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
 
@@ -951,9 +955,9 @@ void AtomicExpandImpl::expandPartwordAtomicRMW(
 
   Value *OldResult;
   if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) {
-    OldResult = insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr,
-                                     PMV.AlignedAddrAlignment, MemOpOrder, SSID,
-                                     PerformPartwordOp, createCmpXchgInstFun);
+    OldResult = insertRMWCmpXchgLoop(
+        Builder, PMV.WordType, PMV.AlignedAddr, PMV.AlignedAddrAlignment,
+        MemOpOrder, SSID, PerformPartwordOp, createCmpXchgInstFun, AI);
   } else {
     assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC);
     OldResult = insertRMWLLSCLoop(Builder, PMV.WordType, PMV.AlignedAddr,
@@ -1591,7 +1595,7 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
     IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,
     AtomicOrdering MemOpOrder, SyncScope::ID SSID,
     function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
-    CreateCmpXchgInstFun CreateCmpXchg) {
+    CreateCmpXchgInstFun CreateCmpXchg, Instruction *MetadataSrc) {
   LLVMContext &Ctx = Builder.getContext();
   BasicBlock *BB = Builder.GetInsertBlock();
   Function *F = BB->getParent();
@@ -1637,7 +1641,7 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
                 MemOpOrder == AtomicOrdering::Unordered
                    ? AtomicOrdering::Monotonic
                    : MemOpOrder,
-                SSID, Success, NewLoaded);
+                SSID, Success, NewLoaded, MetadataSrc);
   assert(Success && NewLoaded);
 
   Loaded->addIncoming(NewLoaded, LoopBB);
@@ -1686,7 +1690,7 @@ bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
             return buildAtomicRMWValue(AI->getOperation(), Builder, Loaded,
                                        AI->getValOperand());
           },
-          CreateCmpXchg);
+          CreateCmpXchg, /*MetadataSrc=*/AI);
 
   AI->replaceAllUsesWith(Loaded);
   AI->eraseFromParent();
@@ -1838,11 +1842,15 @@ void AtomicExpandImpl::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
     expandAtomicRMWToCmpXchg(
         I, [this](IRBuilderBase &Builder, Value *Addr, Value *Loaded,
                   Value *NewVal, Align Alignment, AtomicOrdering MemOpOrder,
-                  SyncScope::ID SSID, Value *&Success, Value *&NewLoaded) {
+                  SyncScope::ID SSID, Value *&Success, Value *&NewLoaded,
+                  Instruction *MetadataSrc) {
           // Create the CAS instruction normally...
           AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
               Addr, Loaded, NewVal, Alignment, MemOpOrder,
               AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder), SSID);
+          if (MetadataSrc)
+            Pair->copyMetadata(*MetadataSrc);
+
           Success = Builder.CreateExtractValue(Pair, 1, "success");
           NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
index d3fb9d8ee522e7..443c5d18e68949 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll
@@ -187,7 +187,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
 ; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
 ; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
 ; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -204,7 +204,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
 ; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
 ; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
 ; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -221,7 +221,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
 ; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
 ; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
 ; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -238,7 +238,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
 ; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
 ; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
 ; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -260,7 +260,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
 ; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
 ; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
 ; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -292,7 +292,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
 ; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
 ; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
 ; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -309,7 +309,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
 ; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
 ; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
 ; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -326,7 +326,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
 ; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
 ; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
 ; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -343,7 +343,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
 ; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
 ; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
 ; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -365,7 +365,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
 ; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
 ; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
 ; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -382,7 +382,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_remote_memory(ptr
 ; GFX11-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
 ; GFX11-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX11-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
 ; GFX11-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; GFX11-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; GFX11-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -409,7 +409,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
 ; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
 ; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -426,7 +426,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
 ; GFX906-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
 ; GFX906-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX906-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX906-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; GFX906-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; GFX906-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; GFX906-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -443,7 +443,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
 ; GFX908-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
 ; GFX908-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX908-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; GFX908-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; GFX908-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; GFX908-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -460,7 +460,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
 ; GFX90A-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
 ; GFX90A-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX90A-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX90A-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; GFX90A-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; GFX90A-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; GFX90A-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -482,7 +482,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
 ; GFX10-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
 ; GFX10-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX10-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX10-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; GFX10-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; GFX10-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; GFX10-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -514,7 +514,7 @@ define float @test_atomicrmw_fadd_f32_global_agent__amdgpu_no_fine_grained_memor
 ; GFX803-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]]
 ; GFX803-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX803-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; GFX803-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
 ; GFX803-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
 ; GFX803-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
 ; GFX803-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
@@ -531,7 +531,7 @@ define float @tes...
[truncated]
``````````

</details>

https://github.com/llvm/llvm-project/pull/109409

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits