llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

---

Patch is 759.08 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109406.diff


4 Files Affected:

- (modified) llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll (+51)
- (added) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll (+6911)
- (added) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll (+9196)
- (added) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll (+1523)


``````````diff
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
index e5dcf9ce309cd8..32cb1056022de2 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
@@ -77,6 +77,29 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d
   ret void
 }
 
+define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw_noprivate(ptr %ptr, double %data) #0 {
+  ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw_noprivate
+  ; GFX90A_GFX940: bb.0 (%ir-block.0):
+  ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX90A_GFX940-NEXT: {{ $}}
+  ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GFX90A_GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+  ; GFX90A_GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GFX90A_GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+  ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+  ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
+  ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+  %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
+  ret void
+}
+
 define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %data) #0 {
   ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw
   ; GFX90A_GFX940: bb.0 (%ir-block.0):
@@ -104,8 +127,36 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da
   ret double %ret
 }
 
+define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw__noprivate(ptr %ptr, double %data) #0 {
+  ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw__noprivate
+  ; GFX90A_GFX940: bb.0 (%ir-block.0):
+  ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX90A_GFX940-NEXT: {{ $}}
+  ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+  ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GFX90A_GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+  ; GFX90A_GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GFX90A_GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+  ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+  ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+  ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+  ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
+  ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0
+  ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1
+  ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
+  ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]]
+  ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+  %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
+  ret double %ret
+}
+
 declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr, double)
 
 attributes #0 = { nounwind }
 
 !0 = !{}
+!1 = !{i32 5, i32 6}
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll
new file mode 100644
index 00000000000000..c0b3adce81342d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll
@@ -0,0 +1,6911 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN1 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN2 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+
+define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
+; GCN1-LABEL: atomic_add_i64_offset:
+; GCN1: ; %bb.0: ; %entry
+; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_add_u32 s0, s0, 32
+; GCN1-NEXT: s_addc_u32 s1, s1, 0
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v0, s2
+; GCN1-NEXT: v_mov_b32_e32 v1, s3
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: atomic_add_i64_offset:
+; GCN2: ; %bb.0: ; %entry
+; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_add_u32 s0, s0, 32
+; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_offset:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_endpgm
+entry:
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
+  ret void
+}
+
+define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
+; GCN1-LABEL: atomic_add_i64_ret_offset:
+; GCN1: ; %bb.0: ; %entry
+; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
+; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v0, s4
+; GCN1-NEXT: s_add_u32 s0, s0, 32
+; GCN1-NEXT: s_addc_u32 s1, s1, 0
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
+; GCN1-NEXT: v_mov_b32_e32 v3, s3
+; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: atomic_add_i64_ret_offset:
+; GCN2: ; %bb.0: ; %entry
+; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: s_add_u32 s0, s0, 32
+; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN2-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_ret_offset:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
+; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT: s_endpgm
+entry:
+  %gep = getelementptr i64, ptr %out, i64 4
+  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
+  store i64 %tmp0, ptr %out2
+  ret void
+}
+
+define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
+; GCN1-LABEL: atomic_add_i64_addr64_offset:
+; GCN1: ; %bb.0: ; %entry
+; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v0, s6
+; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN1-NEXT: s_add_u32 s0, s4, s0
+; GCN1-NEXT: s_addc_u32 s1, s5, s1
+; GCN1-NEXT: s_add_u32 s0, s0, 32
+; GCN1-NEXT: s_addc_u32 s1, s1, 0
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v1, s7
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: atomic_add_i64_addr64_offset:
+; GCN2: ; %bb.0: ; %entry
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
+; GCN2-NEXT: s_add_u32 s0, s0, 32
+; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_addr64_offset:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_endpgm
+entry:
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
+  ret void
+}
+
+define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
+; GCN1-LABEL: atomic_add_i64_ret_addr64_offset:
+; GCN1: ; %bb.0: ; %entry
+; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v0, s4
+; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GCN1-NEXT: s_add_u32 s0, s0, s4
+; GCN1-NEXT: s_addc_u32 s1, s1, s5
+; GCN1-NEXT: s_add_u32 s0, s0, 32
+; GCN1-NEXT: s_addc_u32 s1, s1, 0
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
+; GCN1-NEXT: v_mov_b32_e32 v3, s3
+; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: atomic_add_i64_ret_addr64_offset:
+; GCN2: ; %bb.0: ; %entry
+; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GCN2-NEXT: s_add_u32 s0, s0, s4
+; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_add_u32 s0, s0, 32
+; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN2-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_ret_addr64_offset:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT: s_endpgm
+entry:
+  %ptr = getelementptr i64, ptr %out, i64 %index
+  %gep = getelementptr i64, ptr %ptr, i64 4
+  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
+  store i64 %tmp0, ptr %out2
+  ret void
+}
+
+define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
+; GCN1-LABEL: atomic_add_i64:
+; GCN1: ; %bb.0: ; %entry
+; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v0, s0
+; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
+; GCN1-NEXT: v_mov_b32_e32 v3, s3
+; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: atomic_add_i64:
+; GCN2: ; %bb.0: ; %entry
+; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_endpgm
+entry:
+  %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
+  ret void
+}
+
+define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
+; GCN1-LABEL: atomic_add_i64_ret:
+; GCN1: ; %bb.0: ; %entry
+; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v0, s4
+; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
+; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: atomic_add_i64_ret:
+; GCN2: ; %bb.0: ; %entry
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN2-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_ret:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
+; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT: s_endpgm
+entry:
+  %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
+  store i64 %tmp0, ptr %out2
+  ret void
+}
+
+define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) {
+; GCN1-LABEL: atomic_add_i64_addr64:
+; GCN1: ; %bb.0: ; %entry
+; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v0, s6
+; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN1-NEXT: s_add_u32 s0, s4, s0
+; GCN1-NEXT: s_addc_u32 s1, s5, s1
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v1, s7
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: atomic_add_i64_addr64:
+; GCN2: ; %bb.0: ; %entry
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_addr64:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3... [truncated]
``````````

</details>

https://github.com/llvm/llvm-project/pull/109406
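A note on the metadata these tests revolve around, with a minimal standalone sketch (not taken from the patch; the kernel name and values below are illustrative): `!noalias.addrspace` operands come in pairs of integers, each pair excluding the half-open range `[lo, hi)` of address spaces. On AMDGPU, address space 5 is the private (scratch) space, so `!{i32 5, i32 6}` asserts that the flat pointer never refers to private memory.

```llvm
; Sketch only, assuming the [lo, hi) pair semantics of !noalias.addrspace
; described in the LangRef. The kernel name and constants are made up.
define amdgpu_kernel void @sketch_flat_add_noprivate(ptr %p, i64 %v) {
entry:
  ; The metadata tells codegen this flat access cannot land in
  ; addrspace(5), so no scratch-capable fallback path is needed.
  %old = atomicrmw add ptr %p, i64 %v syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}

!0 = !{i32 5, i32 6} ; excludes [5, 6), i.e. exactly addrspace(5)
```

As I read the tests, without that guarantee a flat 64-bit RMW may have to be expanded with a runtime is-private check (the pattern the new expand-atomicrmw-flat-noalias-addrspace.ll test exercises); with it, and with `!amdgpu.no.fine.grained.memory` in the fadd cases, the checks above show a single flat atomic instruction being selected directly.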