llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

This is the most complex atomicrmw support case. Note we don't have accurate remarks for all of the cases, which I'm planning on fixing in a later change with more precise wording. Continue respecting amdgpu-unsafe-fp-atomics until its eventual removal. This also seems to fix a few cases that were not interpreting amdgpu-unsafe-fp-atomics aggressively enough.

---

Patch is 1.02 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96760.diff

37 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+69-81)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll (+5-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll (+61-178)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll (+420-101)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll (+262-17)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll (+110-39)
- (modified) llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll (+51-69)
- (modified) llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll (+11-9)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+236-87)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll (+5-3)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll (+5-3)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+736-958)
- (modified) llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll (+13-50)
- (modified) llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll (+59-156)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll (+73-28)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll (+5-3)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+746-232)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+78-182)
- (modified) llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll (+53-51)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll (+52-676)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll (+182-1186)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll (+4-52)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll (+20-175)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i128.ll (+30-30)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll (+10-22)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll (+45-45)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll (+1471-3143)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-simplify-cfg-CAS-block.ll (+3-3)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll (+33-223)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll (+52-4)
- (modified)
llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll (+59-201) - (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll (+13-1) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 11ebfe7511f7b..f9b5aea144440 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16037,26 +16037,15 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, SNaN, Depth); } -#if 0 -// FIXME: This should be checked before unsafe fp atomics are enabled -// Global FP atomic instructions have a hardcoded FP mode and do not support -// FP32 denormals, and only support v2f16 denormals. -static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) { - const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics(); - auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt); - if (&Flt == &APFloat::IEEEsingle()) - return DenormMode == DenormalMode::getPreserveSign(); - return DenormMode == DenormalMode::getIEEE(); -} -#endif +// On older subtargets, global FP atomic instructions have a hardcoded FP mode +// and do not support FP32 denormals, and only support v2f16/f64 denormals. +static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) { + if (RMW->hasMetadata("amdgpu.ignore.denormal.mode")) + return true; -// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe -// floating point atomic instructions. May generate more efficient code, -// but may not respect rounding and denormal modes, and may give incorrect -// results for certain memory destinations. -bool unsafeFPAtomicsDisabled(Function *F) { - return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() != - "true"; + const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics(); + auto DenormMode = RMW->getFunction()->getDenormalMode(Flt); + return DenormMode == DenormalMode::getPreserveSign(); } static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) { @@ -16185,75 +16174,74 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { return AtomicExpansionKind::CmpXChg; } - if (!AMDGPU::isFlatGlobalAddrSpace(AS) && - AS != AMDGPUAS::BUFFER_FAT_POINTER) - return AtomicExpansionKind::CmpXChg; - - if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy())) - return AtomicExpansionKind::None; - - if (AS == AMDGPUAS::FLAT_ADDRESS) { - // gfx940, gfx12 - // FIXME: Needs to account for no fine-grained memory - if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty)) - return AtomicExpansionKind::None; - } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) { - // gfx90a, gfx940, gfx12 - // FIXME: Needs to account for no fine-grained memory - if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty)) - return AtomicExpansionKind::None; - - // gfx940, gfx12 - // FIXME: Needs to account for no fine-grained memory - if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty)) - return AtomicExpansionKind::None; - } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) { - // gfx90a, gfx940, gfx12 - // FIXME: Needs to account for no fine-grained memory - if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty)) - return AtomicExpansionKind::None; - - // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for - // buffer. gfx12 does have the buffer version. 
- if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty)) - return AtomicExpansionKind::None; - } - - if (unsafeFPAtomicsDisabled(RMW->getFunction())) - return AtomicExpansionKind::CmpXChg; - - // Always expand system scope fp atomics. - if (HasSystemScope) + // LDS atomics respect the denormal mode from the mode register. + // + // Traditionally f32 global/buffer memory atomics would unconditionally + // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never + // flush. + // + // On targets with flat atomic fadd, denormals would flush depending on + // whether the target address resides in LDS or global memory. We consider + // this flat-maybe-flush as will-flush. + if (Ty->isFloatTy() && + !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() && + !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW)) return AtomicExpansionKind::CmpXChg; - // global and flat atomic fadd f64: gfx90a, gfx940. - if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy()) - return ReportUnsafeHWInst(AtomicExpansionKind::None); - - if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) { - // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+. - if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) - return ReportUnsafeHWInst(AtomicExpansionKind::None); - // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+. - if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) - return ReportUnsafeHWInst(AtomicExpansionKind::None); - } + if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) { + if (AS == AMDGPUAS::FLAT_ADDRESS) { + // gfx940, gfx12 + if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty)) + return AtomicExpansionKind::None; + } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) { + // gfx90a, gfx940, gfx12 + if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty)) + return AtomicExpansionKind::None; + + // gfx940, gfx12 + if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty)) + return AtomicExpansionKind::None; + } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) { + // gfx90a, gfx940, gfx12 + if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty)) + return AtomicExpansionKind::None; + + // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for + // buffer. gfx12 does have the buffer version. + if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty)) + return AtomicExpansionKind::None; + } - // flat atomic fadd f32: gfx940, gfx11+. - if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) { - if (Subtarget->hasFlatAtomicFaddF32Inst()) + // global and flat atomic fadd f64: gfx90a, gfx940. + if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy()) return ReportUnsafeHWInst(AtomicExpansionKind::None); - // If it is in flat address space, and the type is float, we will try to - // expand it, if the target supports global and lds atomic fadd. The - // reason we need that is, in the expansion, we emit the check of address - // space. If it is in global address space, we emit the global atomic - // fadd; if it is in shared address space, we emit the LDS atomic fadd. - if (Subtarget->hasLDSFPAtomicAddF32()) { + if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) { + // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+. if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) - return AtomicExpansionKind::Expand; + return ReportUnsafeHWInst(AtomicExpansionKind::None); + // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+. 
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) - return AtomicExpansionKind::Expand; + return ReportUnsafeHWInst(AtomicExpansionKind::None); + } + + // flat atomic fadd f32: gfx940, gfx11+. + if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) { + if (Subtarget->hasFlatAtomicFaddF32Inst()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + + // If it is in flat address space, and the type is float, we will try to + // expand it, if the target supports global and lds atomic fadd. The + // reason we need that is, in the expansion, we emit the check of + // address space. If it is in global address space, we emit the global + // atomic fadd; if it is in shared address space, we emit the LDS atomic + // fadd. + if (Subtarget->hasLDSFPAtomicAddF32()) { + if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) + return AtomicExpansionKind::Expand; + if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) + return AtomicExpansionKind::Expand; + } } } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll index aa9ebb9226cdd..7ce1c17b7ccfd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll @@ -79,7 +79,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %da ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) ; GFX11-NEXT: S_ENDPGM 0 - %ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic + %ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 ret void } @@ -107,10 +107,12 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic + %ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 ret float %ret } declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr, float) attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll index 68d8e3d747b86..869f073f3b1b9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll @@ -55,7 +55,7 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) ; GFX90A_GFX940-NEXT: S_ENDPGM 0 - %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic + %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, 
!amdgpu.no.fine.grained.memory !0 ret void } @@ -78,10 +78,12 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da ; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 - %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic + %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0 ret double %ret } declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr, double) attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll index 632dbd45279fb..710d48be037e0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -34,7 +34,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) { ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm - %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst + %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -50,7 +50,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 { ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_endpgm - %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst + %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -75,7 +75,7 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) { ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 ; GFX940-NEXT: s_setpc_b64 s[30:31] - %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst + %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0 ret float %ret } @@ -235,3 +235,5 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val } attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 66b22bedaf072..429b045e31c12 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1095,32 +1095,20 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB39_3 +; GFX90A-NEXT: s_cbranch_execz .LBB39_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 -; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 -; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX90A-NEXT: v_mov_b32_e32 
v2, 0 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_2 -; GFX90A-NEXT: .LBB39_3: +; GFX90A-NEXT: .LBB39_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: @@ -1146,7 +1134,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX940-NEXT: .LBB39_2: ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst + %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1196,7 +1184,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace( ; GFX940-NEXT: .LBB40_2: ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst + %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1209,32 +1197,20 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX90A-NEXT: s_cbranch_execz .LBB41_3 +; GFX90A-NEXT: s_cbranch_execz .LBB41_2 ; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 -; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 -; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_2 -; GFX90A-NEXT: .LBB41_3: +; GFX90A-NEXT: .LBB41_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: @@ -1260,7 +1236,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX940-NEXT: .LBB41_2: ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst + %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1310,7 +1286,7 @@ define amdgpu_kernel void 
@global_atomic_fadd_f64_noret_pat_flush(ptr addrspace( ; GFX940-NEXT: .LBB42_2: ; GFX940-NEXT: s_endpgm main_body: - %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst + %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1337,26 +1313,13 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat ; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v2 -; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat: @@ -136... [truncated] ``````````

</details>

https://github.com/llvm/llvm-project/pull/96760
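---

A note for readers of the truncated diff: the updated tests opt individual `atomicrmw` instructions into the hardware lowering by attaching metadata, rather than relying on the function-wide `amdgpu-unsafe-fp-atomics` attribute. A minimal sketch of the pattern (illustrative only; the function name is made up, but the metadata names and syntax are taken verbatim from the tests above):

```llvm
; Sketch: asserting that the accessed memory is not fine-grained lets the
; f64 global fadd select the hardware instruction (gfx90a/gfx940) instead
; of expanding to a cmpxchg loop.
define void @sketch_fadd_f64_no_fine_grained(ptr addrspace(1) %ptr, double %val) {
  %unused = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret void
}

; An empty metadata node is all the annotation requires.
!0 = !{}
```

The tests also use `!amdgpu.no.remote.memory !0` in the same position for the cases that instead assert the address is not remote (e.g. PCIe peer) memory.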
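The f32 case additionally depends on the denormal handling in the new `atomicIgnoresDenormalModeOrFPModeIsFTZ` helper: on subtargets without `hasMemoryAtomicFaddF32DenormalSupport`, an f32 `atomicrmw fadd` still expands to cmpxchg unless either the `!amdgpu.ignore.denormal.mode` metadata is present or the function's f32 denormal mode is already preserve-sign, matching the flush behavior of the hardware atomic. A hedged sketch of both escape hatches under that reading (function names are hypothetical; the metadata spelling comes from the patch, and the attribute is LLVM's standard `denormal-fp-math-f32`):

```llvm
; Escape hatch 1: the atomic itself opts out of denormal-mode concerns.
define void @sketch_fadd_f32_ignore_denormals(ptr addrspace(1) %ptr, float %val) {
  %r0 = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
  ret void
}

; Escape hatch 2: the function already runs with f32 denormals flushed
; (preserve-sign), so the hardcoded FP mode of the atomic is not a change
; in behavior.
define void @sketch_fadd_f32_daz(ptr addrspace(1) %ptr, float %val) #0 {
  %r1 = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret void
}

attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
!0 = !{}
```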