llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> Consider the new atomic metadata when choosing whether to emit the native atomic instruction or to expand to a cmpxchg loop instead. --- Patch is 1.01 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96759.diff 13 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+53-30) - (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+203-130) - (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+203-130) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+148-298) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+148-298) - (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+191-388) - (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+191-388) - (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+634-1766) - (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+634-1766) - (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll (+1786-266) - (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll (+1294-202) - (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll (+888-128) - (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll (+642-96) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index fc34277c580a8..11ebfe7511f7b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16093,6 +16093,34 @@ static bool isBFloat2(Type *Ty) { return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy(); } +/// \returns true if it's valid to emit a native instruction for \p RMW, based +/// on the properties of the target memory. 
+static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, + const AtomicRMWInst *RMW, + bool HasSystemScope) { + // The remote/fine-grained access logic is different from the integer + // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support, + // fine-grained access does not work, even for a device local allocation. + // + // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local + // allocations work. + if (HasSystemScope) { + if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() && + RMW->hasMetadata("amdgpu.no.remote.memory")) + return true; + } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics()) + return true; + + if (RMW->hasMetadata("amdgpu.no.fine.grained.memory")) + return true; + + // TODO: Auto-upgrade this attribute to the metadata in function body and stop + // checking it. + return RMW->getFunction() + ->getFnAttribute("amdgpu-unsafe-fp-atomics") + .getValueAsBool(); +} + TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { unsigned AS = RMW->getPointerAddressSpace(); @@ -16236,37 +16264,32 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { Type *Ty = RMW->getType(); // LDS float and double fmin/fmax were always supported. - if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy())) - return AtomicExpansionKind::None; - - if (unsafeFPAtomicsDisabled(RMW->getFunction())) - return AtomicExpansionKind::CmpXChg; - - // Always expand system scope fp atomics. - if (HasSystemScope) - return AtomicExpansionKind::CmpXChg; + if (AS == AMDGPUAS::LOCAL_ADDRESS) { + return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None + : AtomicExpansionKind::CmpXChg; + } - // For flat and global cases: - // float, double in gfx7. Manual claims denormal support. - // Removed in gfx8. - // float, double restored in gfx10. - // double removed again in gfx11, so only f32 for gfx11/gfx12. 
- // - // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but no - // f32. - // - // FIXME: Check scope and fine grained memory - if (AS == AMDGPUAS::FLAT_ADDRESS) { - if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy()) - return ReportUnsafeHWInst(AtomicExpansionKind::None); - if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy()) - return ReportUnsafeHWInst(AtomicExpansionKind::None); - } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) || - AS == AMDGPUAS::BUFFER_FAT_POINTER) { - if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy()) - return ReportUnsafeHWInst(AtomicExpansionKind::None); - if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy()) - return ReportUnsafeHWInst(AtomicExpansionKind::None); + if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) { + // For flat and global cases: + // float, double in gfx7. Manual claims denormal support. + // Removed in gfx8. + // float, double restored in gfx10. + // double removed again in gfx11, so only f32 for gfx11/gfx12. + // + // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but + // no f32. 
+ if (AS == AMDGPUAS::FLAT_ADDRESS) { + if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) || + AS == AMDGPUAS::BUFFER_FAT_POINTER) { + if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + } } return AtomicExpansionKind::CmpXChg; diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 06dee9c279f2c..2a15cdaede44e 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -796,23 +796,64 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_addk_i32 s4, 0x400 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1 +; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
+; GFX11-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX11-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB3_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: s_addk_i32 s8, 0x400 +; GFX10-NEXT: v_mov_b32_e32 v3, s8 +; GFX10-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX10-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX10-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, v4 +; GFX10-NEXT: v_mov_b32_e32 v1, v5 +; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB3_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: @@ -904,19 +945,60 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s8 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX7-NEXT: s_add_i32 s10, s8, 0x400 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen offset:1024 glc +; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024 +; GFX6-NEXT: s_add_i32 s10, s8, 0x400 +; 
GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, s10 +; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5 +; GFX6-NEXT: v_max_f32_e32 v4, v0, v2 +; GFX6-NEXT: v_mov_b32_e32 v0, v4 +; GFX6-NEXT: v_mov_b32_e32 v1, v5 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB3_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256 @@ -1992,21 +2074,66 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s8 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: s_addk_i32 s8, 0x800 +; GFX10-NEXT: v_mov_b32_e32 v6, s8 +; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v10, v1 +; GFX10-NEXT: v_mov_b32_e32 v9, v0 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX10-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX10-NEXT: v_max_f64 v[7:8], 
v[0:1], v[4:5] +; GFX10-NEXT: v_mov_b32_e32 v0, v7 +; GFX10-NEXT: v_mov_b32_e32 v1, v8 +; GFX10-NEXT: v_mov_b32_e32 v2, v9 +; GFX10-NEXT: v_mov_b32_e32 v3, v10 +; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10] +; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8 +; GFX10-NEXT: s_cbranch_execnz .LBB8_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s8 -; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s8 +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX90A-NEXT: s_add_i32 s10, s8, 0x800 +; GFX90A-NEXT: s_mov_b64 s[8:9], 0 +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: v_mov_b32_e32 v6, s10 +; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[0:1], v[10:11], v[10:11] +; GFX90A-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[10:11], v[10:11] op_sel:[0,1] +; GFX90A-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1 +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX90A-NEXT: 
s_cbranch_execnz .LBB8_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: @@ -2078,19 +2205,68 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remot ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX7-NEXT: s_add_i32 s10, s8, 0x800 +; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s10 +; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v10, v1 +; GFX7-NEXT: v_mov_b32_e32 v9, v0 +; GFX7-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX7-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v7 +; GFX7-NEXT: v_mov_b32_e32 v1, v8 +; GFX7-NEXT: v_mov_b32_e32 v2, v9 +; GFX7-NEXT: v_mov_b32_e32 v3, v10 +; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: 
v_mov_b32_e32 v2, s8 -; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen offset:2048 glc +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_mov_b32_e32 v3, v1 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048 +; GFX6-NEXT: s_add_i32 s10, s8, 0x800 +; GFX6-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX6-NEXT: s_mov_b64 s[8:9], 0 +; GFX6-NEXT: v_mov_b32_e32 v6, s10 +; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v10, v1 +; GFX6-NEXT: v_mov_b32_e32 v9, v0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10] +; GFX6-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v7 +; GFX6-NEXT: v_mov_b32_e32 v1, v8 +; GFX6-NEXT: v_mov_b32_e32 v2, v9 +; GFX6-NEXT: v_mov_b32_e32 v3, v10 +; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10] +; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256 @@ -7943,32 +8119,11 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 -; GFX12-NEXT: s_addk_co_i32 s4, 0x400 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1 -; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024 -; GFX12-NEXT: s_mov_b32 s4, 0 -; GFX12-NEXT: .LBB22_1: ; 
%atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v5, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, s4 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5 -; GFX12-NEXT: v_max_num_f32_e32 v4, v0, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 -; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 -; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: @@ -8003,64 +8158,23 @@ define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_ ; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset__amdgpu_no_fine_grained_memory: ; GFX11: ; %bb.0: ; GFX11-NE... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/96759 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits