https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/150587
We can do it all in finalizeStore if we ensure it always sees the stores. For that, I needed to fix a hidden bug where finalizeStore wouldn't see all stores because sometimes the iterator got out-of-sync and didn't point to the store anymore. This also removes the waits before volatile LDS stores which never needed it, that was a bug until now. >From e4821db935e509479cf505f64e5f9e9ecce769bb Mon Sep 17 00:00:00 2001 From: pvanhout <pierre.vanhoutr...@amd.com> Date: Thu, 24 Jul 2025 12:12:09 +0200 Subject: [PATCH] [AMDGPU][gfx12] Clean-up implementation of waits before SCOPE_SYS stores We can do it all in finalizeStore if we ensure it always sees the stores. For that, I needed to fix a hidden bug where finalizeStore wouldn't see all stores because sometimes the iterator got out-of-sync and didn't point to the store anymore. This also removes the waits before volatile LDS stores which never needed it, that was a bug until now. --- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 20 +++++++++---------- .../AMDGPU/memory-legalizer-local-volatile.ll | 20 ------------------- 2 files changed, 9 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 4456d9b241425..d6337a85a7361 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -321,7 +321,7 @@ class SICacheControl { bool IsNonTemporal, bool IsLastUse = false) const = 0; - virtual bool finalizeStore(MachineBasicBlock::iterator &MI, bool Atomic) const { + virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const { return false; }; @@ -602,7 +602,7 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { bool IsVolatile, bool IsNonTemporal, bool IsLastUse) const override; - bool finalizeStore(MachineBasicBlock::iterator &MI, bool Atomic) const override; + bool finalizeStore(MachineInstr &MI, bool Atomic) const override; bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, @@ -2526,9 +2526,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( if (IsVolatile) { Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); - if (Op == SIMemOp::STORE) - Changed |= insertWaitsBeforeSystemScopeStore(MI); - // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not // request cross address space as only the global address space can be @@ -2541,9 +2538,8 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( return Changed; } -bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI, - bool Atomic) const { - MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); +bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { + MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); if (!CPol) return false; @@ -2558,7 +2554,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI, // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address // space. - if (TII->mayAccessScratchThroughFlat(*MI) && Scope == CPol::SCOPE_CU) + if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU) return setScope(MI, CPol::SCOPE_SE); return false; @@ -2662,6 +2658,8 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, assert(!MI->mayLoad() && MI->mayStore()); bool Changed = false; + // FIXME: Necessary hack because iterator can lose track of the store. + MachineInstr &StoreMI = *MI; if (MOI.isAtomic()) { if (MOI.getOrdering() == AtomicOrdering::Monotonic || @@ -2678,7 +2676,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE); - Changed |= CC->finalizeStore(MI, /*Atomic=*/true); + Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true); return Changed; } @@ -2691,7 +2689,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, // GFX12 specific, scope(desired coherence domain in cache hierarchy) is // instruction field, do not confuse it with atomic scope. - Changed |= CC->finalizeStore(MI, /*Atomic=*/false); + Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false); return Changed; } diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index bc2508411ed6b..5e5e3bf83d610 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -415,11 +415,6 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; @@ -432,11 +427,6 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { @@ -562,11 +552,6 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; @@ -583,11 +568,6 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits