https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/76224
>From 89b94cc98e188142cff11d58f27fe6c25183b376 Mon Sep 17 00:00:00 2001 From: Vang Thao <vang.t...@amd.com> Date: Thu, 21 Dec 2023 11:58:47 +0100 Subject: [PATCH 1/3] [AMDGPU][GFX12] Add Atomic cond_sub_u32 --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 19 +- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 + llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 + llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 4 + .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 3 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 3 + .../Target/AMDGPU/AMDGPUSearchableTables.td | 7 + llvm/lib/Target/AMDGPU/BUFInstructions.td | 14 + llvm/lib/Target/AMDGPU/DSInstructions.td | 27 +- llvm/lib/Target/AMDGPU/FLATInstructions.td | 31 +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1 + llvm/lib/Target/AMDGPU/SIInstructions.td | 1 + llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 254 ++++++++++++++++++ .../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 171 ++++++++++++ llvm/test/MC/AMDGPU/gfx11_unsupported.s | 12 + llvm/test/MC/AMDGPU/gfx12_asm_ds.s | 18 ++ llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s | 66 +++++ llvm/test/MC/AMDGPU/gfx12_asm_vflat.s | 36 +++ .../MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt | 81 ++++++ .../AMDGPU/gfx12_dasm_vbuffer_mubuf.txt | 42 +++ .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt | 18 ++ 23 files changed, 812 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index cb48f54b13a6cd..2d066350ee9f84 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -10,6 +10,10 @@ // //===----------------------------------------------------------------------===// +def flat_ptr_ty : LLVMQualPointerType<0>; +def global_ptr_ty : 
LLVMQualPointerType<1>; +def local_ptr_ty : LLVMQualPointerType<3>; + class AMDGPUReadPreloadRegisterIntrinsic : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; @@ -1243,6 +1247,7 @@ def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic; +def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1279,6 +1284,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic; +def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic; def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1317,6 +1323,7 @@ def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic; +def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -1352,6 +1359,7 @@ def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic; def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic; +def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : AMDGPUStructPtrBufferAtomic; 
def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -2353,10 +2361,10 @@ def int_amdgcn_s_get_waveid_in_workgroup : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; -class AMDGPUAtomicRtn<LLVMType vt> : Intrinsic < +class AMDGPUAtomicRtn<LLVMType vt, LLVMType pt = llvm_anyptr_ty> : Intrinsic < [vt], - [llvm_anyptr_ty, // vaddr - vt], // vdata(VGPR) + [pt, // vaddr + vt], // vdata(VGPR) [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>; @@ -2491,6 +2499,11 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; +def int_amdgcn_flat_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty, flat_ptr_ty>; +def int_amdgcn_global_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty, global_ptr_ty>; + +def int_amdgcn_ds_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty, local_ptr_ty>; + //===----------------------------------------------------------------------===// // Deep learning intrinsics. 
//===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 2b85024a9b40be..801c5fa2e1565d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -264,6 +264,7 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>; def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>; +def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32, SIbuffer_atomic_cond_sub_u32>; def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>; def : GINodeEquiv<G_FPTRUNC_ROUND_UPWARD, SIfptrunc_round_upward>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 4bf4707553e5fe..609c70a60d6463 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5438,6 +5438,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_FADD) NODE_NAME_CASE(BUFFER_ATOMIC_FMIN) NODE_NAME_CASE(BUFFER_ATOMIC_FMAX) + NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32) case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 827fb106b55199..6828db6e0220d5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -589,6 +589,7 @@ enum NodeType : unsigned { BUFFER_ATOMIC_FADD, BUFFER_ATOMIC_FMIN, BUFFER_ATOMIC_FMAX, + BUFFER_ATOMIC_COND_SUB_U32, LAST_AMDGPU_ISD_NUMBER }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index eaf72d7157ee2d..e3c6f46c758e89 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -642,10 +642,14 @@ defm int_amdgcn_global_atomic_fmax : noret_op; defm int_amdgcn_global_atomic_csub : noret_op; defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op; defm int_amdgcn_ds_fadd_v2bf16 : noret_op; +defm int_amdgcn_flat_atomic_cond_sub_u32 : noret_op; +defm int_amdgcn_global_atomic_cond_sub_u32 : noret_op; defm int_amdgcn_flat_atomic_fmin_num : noret_op; defm int_amdgcn_flat_atomic_fmax_num : noret_op; defm int_amdgcn_global_atomic_fmin_num : noret_op; defm int_amdgcn_global_atomic_fmax_num : noret_op; +defm int_amdgcn_ds_cond_sub_u32 : noret_op; +defm int_amdgcn_ds_cond_sub_u32 : local_addr_space_atomic_op; multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> { let HasNoUse = true in diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index fbee2888945185..e605986564f2fc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5882,6 +5882,9 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { case Intrinsic::amdgcn_struct_buffer_atomic_fmax: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; + case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: + case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32; default: llvm_unreachable("unhandled atomic opcode"); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index c9412f720c62ec..51d2bb130774f9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4690,6 +4690,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: case 
Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_flat_atomic_cond_sub_u32: + case Intrinsic::amdgcn_global_atomic_cond_sub_u32: + case Intrinsic::amdgcn_ds_cond_sub_u32: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index beb670669581f1..1a28197fd90265 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -243,17 +243,20 @@ def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>; def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>; +def : SourceOfDivergence<int_amdgcn_global_atomic_cond_sub_u32>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>; +def : SourceOfDivergence<int_amdgcn_flat_atomic_cond_sub_u32>; def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>; def : SourceOfDivergence<int_amdgcn_ds_fadd>; def : SourceOfDivergence<int_amdgcn_ds_fmin>; def : SourceOfDivergence<int_amdgcn_ds_fmax>; def : SourceOfDivergence<int_amdgcn_ds_fadd_v2bf16>; +def : SourceOfDivergence<int_amdgcn_ds_cond_sub_u32>; def : SourceOfDivergence<int_amdgcn_buffer_atomic_swap>; def : SourceOfDivergence<int_amdgcn_buffer_atomic_add>; def : SourceOfDivergence<int_amdgcn_buffer_atomic_sub>; @@ -281,6 +284,7 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>; def : 
SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>; +def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cond_sub_u32>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_swap>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_add>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_sub>; @@ -297,6 +301,7 @@ def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_sub>; @@ -313,6 +318,7 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>; +def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cond_sub_u32>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_swap>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_add>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_sub>; @@ -329,6 +335,7 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32>; def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>; def : SourceOfDivergence<int_amdgcn_ps_live>; def : SourceOfDivergence<int_amdgcn_live_mask>; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td 
b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 43d35fa5291ca0..fb4ef8620b795d 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1245,6 +1245,12 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < "buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag >; +let SubtargetPredicate = isGFX12Plus in { +defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Pseudo_Atomics < + "buffer_atomic_cond_sub_u32", VGPR_32, i32 +>; +} + //===----------------------------------------------------------------------===// // MTBUF Instructions //===----------------------------------------------------------------------===// @@ -1708,6 +1714,13 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">; let SubtargetPredicate = HasAtomicCSubNoRtnInsts in defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>; +let SubtargetPredicate = isGFX12Plus in { + defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["ret"]>; + + let OtherPredicates = [HasAtomicCSubNoRtnInsts] in + defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["noret"]>; +} + let SubtargetPredicate = isGFX6GFX7GFX10Plus in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">; @@ -2610,6 +2623,7 @@ defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x049, defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x034, "buffer_atomic_cmpswap_b32">; defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x042, "buffer_atomic_cmpswap_b64">; defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomic_gfx11_Renamed<0x050, "buffer_atomic_cmpswap_f32">; +defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Real_Atomic_gfx12<0x050>; defm BUFFER_ATOMIC_CSUB : 
MUBUF_Real_Atomic_gfx11_gfx12_Renamed_gfx12_Renamed<0x037, "buffer_atomic_sub_clamp_u32", "buffer_atomic_csub_u32">; def : Mnem_gfx11_gfx12<"buffer_atomic_csub", "buffer_atomic_csub_u32">; defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x040, "buffer_atomic_dec_u32">; diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index bc9049b4ef33c0..fb79d9264a3583 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -438,6 +438,12 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag, let has_gds = 0; } +class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0, + bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), + (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> { + let AddedComplexity = complexity; +} + defm DS_ADD_U32 : DS_1A1D_NORET_mc<"ds_add_u32">; defm DS_SUB_U32 : DS_1A1D_NORET_mc<"ds_sub_u32">; defm DS_RSUB_U32 : DS_1A1D_NORET_mc<"ds_rsub_u32">; @@ -733,9 +739,22 @@ def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">; let SubtargetPredicate = isGFX12Plus in { +defm DS_COND_SUB_U32 : DS_1A1D_NORET_mc<"ds_cond_sub_u32">; +defm DS_COND_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_cond_sub_rtn_u32", VGPR_32, "ds_cond_sub_u32">; defm DS_SUB_CLAMP_U32 : DS_1A1D_NORET_mc<"ds_sub_clamp_u32">; defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_clamp_rtn_u32", VGPR_32, "ds_sub_clamp_u32">; +multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst, + ValueType vt, string frag> { + def : DSAtomicRetPat<inst, vt, + !cast<PatFrag>(frag#"_local_addrspace")>; + + let OtherPredicates = [HasAtomicCSubNoRtnInsts] in + def : DSAtomicRetPat<noRetInst, vt, + !cast<PatFrag>(frag#"_noret_local_addrspace"), /* complexity */ 1>; +} + +defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_ds_cond_sub_u32">; } // let 
SubtargetPredicate = isGFX12Plus //===----------------------------------------------------------------------===// @@ -955,12 +974,6 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">; } // End AddedComplexity = 100 -class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0, - bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), - (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> { - let AddedComplexity = complexity; -} - multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>; @@ -1238,7 +1251,9 @@ defm DS_MIN_NUM_F64 : DS_Real_Renamed_gfx12<0x052, DS_MIN_F64, "ds_min_num defm DS_MAX_NUM_F64 : DS_Real_Renamed_gfx12<0x053, DS_MAX_F64, "ds_max_num_f64">; defm DS_MIN_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x072, DS_MIN_RTN_F64, "ds_min_num_rtn_f64">; defm DS_MAX_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x073, DS_MAX_RTN_F64, "ds_max_num_rtn_f64">; +defm DS_COND_SUB_U32 : DS_Real_gfx12<0x098>; defm DS_SUB_CLAMP_U32 : DS_Real_gfx12<0x099>; +defm DS_COND_SUB_RTN_U32 : DS_Real_gfx12<0x0a8>; defm DS_SUB_CLAMP_RTN_U32 : DS_Real_gfx12<0x0a9>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 0dd2b3f5c2c912..4d5ebf11d7232f 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -801,6 +801,7 @@ let SubtargetPredicate = HasFlatAtomicFaddF32Inst in { let SubtargetPredicate = isGFX12Plus in { defm FLAT_ATOMIC_CSUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_csub_u32", VGPR_32, i32>; + defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_cond_sub_u32", VGPR_32, i32>; } // End SubtargetPredicate = isGFX12Plus defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo 
<"global_load_ubyte", VGPR_32>; @@ -926,6 +927,10 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_usho defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">; defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">; +let SubtargetPredicate = isGFX12Plus in { + defm GLOBAL_ATOMIC_COND_SUB_U32 : FLAT_Global_Atomic_Pseudo <"global_atomic_cond_sub_u32", VGPR_32, i32>; +} // End SubtargetPredicate = isGFX12Plus + } // End is_flat_global = 1 @@ -1268,6 +1273,13 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_"#as, i64 defm : FlatAtomicPat <"FLAT_ATOMIC_XOR_X2", "atomic_load_xor_"#as, i64>; } // end foreach as +let SubtargetPredicate = isGFX12Plus in { + defm : FlatAtomicIntrRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_flat_atomic_cond_sub_u32", i32>; + + let OtherPredicates = [HasAtomicCSubNoRtnInsts] in + defm : FlatAtomicIntrNoRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_flat_atomic_cond_sub_u32", i32>; +} + def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>; def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>; @@ -1377,6 +1389,16 @@ multiclass GlobalFLATAtomicIntrPats<string inst, string node, ValueType vt, defm : GlobalFLATAtomicPats<inst, node, vt, data_vt, /* isIntr */ 1>; } +multiclass GlobalFLATAtomicIntrPatsNoRtn<string inst, string node, ValueType vt, + ValueType data_vt = vt> { + defm : GlobalFLATAtomicPatsNoRtn<inst, node, vt, data_vt, /* isIntr */ 1>; +} + +multiclass GlobalFLATAtomicIntrPatsRtn<string inst, string node, ValueType vt, + ValueType data_vt = vt> { + defm : GlobalFLATAtomicPatsRtn<inst, node, vt, data_vt, /* isIntr */ 1>; +} + multiclass ScratchFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { def : ScratchLoadSignedPat <inst, node, vt> { let AddedComplexity = 25; @@ -1529,6 +1551,13 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", "atomic_swap_global", i64> defm : GlobalFLATAtomicPats 
<"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_global", i64, v2i64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>; +let SubtargetPredicate = isGFX12Plus in { + defm : GlobalFLATAtomicIntrPatsRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_global_atomic_cond_sub_u32", i32>; + + let OtherPredicates = [HasAtomicCSubNoRtnInsts] in + defm : GlobalFLATAtomicIntrPatsNoRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_global_atomic_cond_sub_u32", i32>; +} + let OtherPredicates = [isGFX10Plus] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; @@ -2594,6 +2623,7 @@ defm FLAT_ATOMIC_OR_B64 : VFLAT_Real_Atomics_gfx12<0x04a, "FLAT_ATOMI defm FLAT_ATOMIC_XOR_B64 : VFLAT_Real_Atomics_gfx12<0x04b, "FLAT_ATOMIC_XOR_X2", "flat_atomic_xor_b64", true>; defm FLAT_ATOMIC_INC_U64 : VFLAT_Real_Atomics_gfx12<0x04c, "FLAT_ATOMIC_INC_X2", "flat_atomic_inc_u64", true>; defm FLAT_ATOMIC_DEC_U64 : VFLAT_Real_Atomics_gfx12<0x04d, "FLAT_ATOMIC_DEC_X2", "flat_atomic_dec_u64", true>; +defm FLAT_ATOMIC_COND_SUB_U32 : VFLAT_Real_Atomics_gfx12<0x050, "FLAT_ATOMIC_COND_SUB_U32", "flat_atomic_cond_sub_u32">; defm FLAT_ATOMIC_MIN_NUM_F32 : VFLAT_Real_Atomics_gfx12<0x051, "FLAT_ATOMIC_FMIN", "flat_atomic_min_num_f32", true, "flat_atomic_min_f32">; defm FLAT_ATOMIC_MAX_NUM_F32 : VFLAT_Real_Atomics_gfx12<0x052, "FLAT_ATOMIC_FMAX", "flat_atomic_max_num_f32", true, "flat_atomic_max_f32">; defm FLAT_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056, "FLAT_ATOMIC_ADD_F32", "flat_atomic_add_f32">; @@ -2651,6 +2681,7 @@ defm GLOBAL_ATOMIC_OR_B64 : VGLOBAL_Real_Atomics_gfx12<0x04a, "GLOBAL_A defm GLOBAL_ATOMIC_XOR_B64 : VGLOBAL_Real_Atomics_gfx12<0x04b, "GLOBAL_ATOMIC_XOR_X2", "global_atomic_xor_b64", true>; defm GLOBAL_ATOMIC_INC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04c, "GLOBAL_ATOMIC_INC_X2", "global_atomic_inc_u64", true>; defm GLOBAL_ATOMIC_DEC_U64 : 
VGLOBAL_Real_Atomics_gfx12<0x04d, "GLOBAL_ATOMIC_DEC_X2", "global_atomic_dec_u64", true>; +defm GLOBAL_ATOMIC_COND_SUB_U32 : VGLOBAL_Real_Atomics_gfx12<0x050, "GLOBAL_ATOMIC_COND_SUB_U32", "global_atomic_cond_sub_u32">; defm GLOBAL_ATOMIC_MIN_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_num_f32", true, "global_atomic_min_f32">; defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_num_f32", true, "global_atomic_max_f32">; defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index fc119aa61d01a2..e33f6377c02f76 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1240,12 +1240,15 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_fmax_num: + case Intrinsic::amdgcn_global_atomic_cond_sub_u32: case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmax_num: + case Intrinsic::amdgcn_flat_atomic_cond_sub_u32: case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_ds_cond_sub_u32: case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); @@ -1331,6 +1334,7 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_flat_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_ds_cond_sub_u32: case Intrinsic::amdgcn_global_atomic_csub: { Value *Ptr = II->getArgOperand(0); AccessTy = 
II->getType(); @@ -8431,6 +8435,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_buffer_atomic_dec: case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); + case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: + return lowerRawBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); case Intrinsic::amdgcn_struct_buffer_atomic_swap: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: return lowerStructBufferAtomicIntrin(Op, DAG, @@ -8472,6 +8479,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_struct_buffer_atomic_dec: case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); + case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: + return lowerStructBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32); case Intrinsic::amdgcn_buffer_atomic_cmpswap: { unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 173c877b8d29ef..8cc78a39fc873a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -197,6 +197,7 @@ defm SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">; defm SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">; defm SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">; defm SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">; +defm SIbuffer_atomic_cond_sub_u32 : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32">; def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", SDTypeProfile<1, 9, diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index f9bc623abcd04b..850ee035af58f7 100644 --- 
a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3702,6 +3702,7 @@ def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction; +def G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32 : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction; def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll new file mode 100644 index 00000000000000..61814b39931775 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll @@ -0,0 +1,254 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s + +declare i32 @llvm.amdgcn.flat.atomic.cond.sub.u32(ptr, i32) +declare i32 @llvm.amdgcn.global.atomic.cond.sub.u32(ptr addrspace(1), i32) +declare i32 @llvm.amdgcn.ds.cond.sub.u32(ptr addrspace(3), i32) + +define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { +; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32: +; GFX12-GISEL: ; %bb.0: ; 
%entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr %addr, i32 -4 + %unused = call i32 @llvm.amdgcn.flat.atomic.cond.sub.u32(ptr %gep, i32 %in) + ret void +} + +define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { +; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr %addr, i32 -4 + %unused = call i32 @llvm.amdgcn.flat.atomic.cond.sub.u32(ptr %gep, i32 %in) + ret void +} + +define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr %use) { +; GFX12-SDAG-LABEL: flat_atomic_cond_sub_rtn_u32: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-SDAG-NEXT: 
v_mov_b32_e32 v2, s6 +; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-SDAG-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: flat_atomic_cond_sub_rtn_u32: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5 +; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX12-GISEL-NEXT: flat_store_b32 v[0:1], v2 +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr %addr, i32 4 + %val = call i32 @llvm.amdgcn.flat.atomic.cond.sub.u32(ptr %gep, i32 %in) + store i32 %val, ptr %use + ret void +} + +define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) { +; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_endpgm 
+entry: + %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4 + %unused = call i32 @llvm.amdgcn.global.atomic.cond.sub.u32(ptr addrspace(1) %gep, i32 %in) + ret void +} + +define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { +; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4 + %unused = call i32 @llvm.amdgcn.global.atomic.cond.sub.u32(ptr addrspace(1) %gep, i32 %in) + ret void +} + +define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) { +; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[4:5] offset:16 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX12-SDAG-NEXT: global_store_b32 
v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:16 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(1) %addr, i32 4 + %val = call i32 @llvm.amdgcn.global.atomic.cond.sub.u32(ptr addrspace(1) %gep, i32 %in) + store i32 %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) { +; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 +; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 +; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4 + %unused = 
call i32 @llvm.amdgcn.ds.cond.sub.u32(ptr addrspace(3) %gep, i32 %in) + ret void +} + +define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { +; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 +; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 +; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1 +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4 + %unused = call i32 @llvm.amdgcn.ds.cond.sub.u32(ptr addrspace(3) %gep, i32 %in) + ret void +} + +define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) { +; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-SDAG-NEXT: ds_store_b32 v1, v0 +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: v_dual_mov_b32 
v0, s1 :: v_dual_mov_b32 v1, s0 +; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-GISEL-NEXT: ds_store_b32 v1, v0 +; GFX12-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i32, ptr addrspace(3) %addr, i32 4 + %val = call i32 @llvm.amdgcn.ds.cond.sub.u32(ptr addrspace(3) %gep, i32 %in) + store i32 %val, ptr addrspace(3) %use + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll new file mode 100644 index 00000000000000..9f89aa0ebb9431 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll @@ -0,0 +1,171 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12 + +define float @raw_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +; GFX12-LABEL: raw_buffer_atomic_cond_sub_return: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +define void @raw_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +; GFX12-LABEL: raw_buffer_atomic_cond_sub_no_return: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %unused = call i32 
@llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +define void @raw_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { +; GFX12-LABEL: raw_buffer_atomic_cond_sub_no_return_forced: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + ret void +} + +define float @raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_return: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_mov_b32 s4, 4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_no_return: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_mov_b32 s4, 4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) + ret void +} + +define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inreg 
%rsrc, i32 inreg %data) #1 { +; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_no_return_forced: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-NEXT: s_mov_b32 s4, 4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) + ret void +} + +define float @struct_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +; GFX12-LABEL: struct_buffer_atomic_cond_sub_return: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +define void @struct_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +; GFX12-LABEL: struct_buffer_atomic_cond_sub_no_return: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +define void @struct_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { +; GFX12-LABEL: struct_buffer_atomic_cond_sub_no_return_forced: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + ret void +} + +define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_return: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX12-NEXT: s_mov_b32 s4, 4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +define void @struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_no_return: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: s_mov_b32 s4, 4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_waitcnt vmcnt(0) +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) + ret void +} + +define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { +; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_no_return_forced: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: s_mov_b32 s4, 4 +; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen +; GFX12-NEXT: s_setpc_b64 s[30:31] +main_body: + %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) + ret void +} + +declare i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind "target-features"="+atomic-csub-no-rtn-insts" } + diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported.s b/llvm/test/MC/AMDGPU/gfx11_unsupported.s index 89078c1ad4e049..fa940c2ceae4bc 100644 --- a/llvm/test/MC/AMDGPU/gfx11_unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx11_unsupported.s @@ -2008,8 +2008,20 @@ flat_atomic_csub_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN flat_atomic_sub_clamp_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +ds_cond_sub_rtn_u32 v5, v1, v2 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +ds_cond_sub_u32 v1, v2 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + ds_sub_clamp_rtn_u32 v5, v1, v2 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU ds_sub_clamp_u32 v1, v2 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +flat_atomic_cond_sub_u32 v[0:1], v2 offset:64 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_atomic_cond_sub_u32 v0, v2, s[0:1] offset:64 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_ds.s b/llvm/test/MC/AMDGPU/gfx12_asm_ds.s index ba32dc8820eaad..c89d1ba8a4e540 100644 --- 
a/llvm/test/MC/AMDGPU/gfx12_asm_ds.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_ds.s @@ -1443,6 +1443,24 @@ ds_storexchg_rtn_b64 v[5:6], v1, v[2:3] offset:0 ds_storexchg_rtn_b64 v[254:255], v255, v[254:255] offset:4 // GFX12: [0x04,0x00,0xb4,0xd9,0xff,0xfe,0x00,0xfe] +ds_cond_sub_rtn_u32 v5, v1, v2 +// GFX12: [0x00,0x00,0xa0,0xda,0x01,0x02,0x00,0x05] + +ds_cond_sub_rtn_u32 v5, v1, v2 offset:65535 +// GFX12: [0xff,0xff,0xa0,0xda,0x01,0x02,0x00,0x05] + +ds_cond_sub_rtn_u32 v5, v1, v2 offset:0 +// GFX12: [0x00,0x00,0xa0,0xda,0x01,0x02,0x00,0x05] + +ds_cond_sub_u32 v1, v2 +// GFX12: [0x00,0x00,0x60,0xda,0x01,0x02,0x00,0x00] + +ds_cond_sub_u32 v1, v2 offset:65535 +// GFX12: [0xff,0xff,0x60,0xda,0x01,0x02,0x00,0x00] + +ds_cond_sub_u32 v1, v2 offset:0 +// GFX12: [0x00,0x00,0x60,0xda,0x01,0x02,0x00,0x00] + ds_sub_clamp_rtn_u32 v5, v1, v2 // GFX12: [0x00,0x00,0xa4,0xda,0x01,0x02,0x00,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s index a7a256cfd2b8fe..4ee2e5ed294e9e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s @@ -3178,6 +3178,72 @@ buffer_atomic_sub_clamp_u32 v5, off, s[8:11], s3 offset:8388607 glc dlc buffer_atomic_sub_clamp_u32 v5, off, s[8:11], s3 offset:8388607 glc slc dlc // GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v255, off, s[8:11], s3 offset:8388607 +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, off, s[12:15], s3 offset:8388607 +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, off, s[96:99], s3 offset:8388607 +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] + 
+buffer_atomic_cond_sub_u32 v5, off, s[8:11], s101 offset:8388607 +// GFX12: encoding: [0x65,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], m0 offset:8388607 +// GFX12: encoding: [0x7d,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], 0 offset:8388607 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], -1 offset:8388607 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], 0.5 offset:8388607 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], -4.0 offset:8388607 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cond_sub_u32 v5, v0, s[8:11], s3 idxen offset:8388607 +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, v0, s[8:11], s3 offen offset:8388607 +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:0 +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:7 +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RT_RETURN scope:SCOPE_SE +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, 
off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_CASCADE_NT scope:SCOPE_DEV +// GFX12: encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f] + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 glc +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 slc +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 dlc +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 glc slc dlc +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + buffer_atomic_dec_u32 v5, off, s[8:11], s3 offset:8388607 // GFX12: encoding: [0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s b/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s index c0ffc5247d90e8..727c8be4341fc8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s @@ -117,6 +117,18 @@ flat_atomic_cmpswap_b64 v[1:2], v[3:4], v[5:8] offset:2047 th:TH_ATOMIC_RETURN flat_atomic_cmpswap_b64 v[1:2], v[3:4], v[5:8] offset:2047 th:TH_ATOMIC_RETURN // GFX12: encoding: [0x7c,0x80,0x10,0xec,0x01,0x00,0x90,0x02,0x03,0xff,0x07,0x00] +flat_atomic_cond_sub_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x00,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] + +flat_atomic_cond_sub_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x00,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] + +flat_atomic_cond_sub_u32 v[0:1], v2 offset:-64 +// GFX12: encoding: [0x7c,0x00,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] + +flat_atomic_cond_sub_u32 v[0:1], v2 offset:64 +// GFX12: encoding: [0x7c,0x00,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] + flat_atomic_dec_u32 v1, v[0:1], v2 
offset:-64 th:TH_ATOMIC_RETURN // GFX12: encoding: [0x7c,0x00,0x10,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] @@ -864,6 +876,30 @@ global_atomic_cmpswap_b64 v[1:2], v[3:4], v[5:8], off offset:2047 th:TH_ATOMIC_R global_atomic_cmpswap_b64 v[1:2], v[3:4], v[5:8], off offset:2047 th:TH_ATOMIC_RETURN // GFX12: encoding: [0x7c,0x80,0x10,0xee,0x01,0x00,0x90,0x02,0x03,0xff,0x07,0x00] +global_atomic_cond_sub_u32 v0, v2, s[0:1] offset:-64 +// GFX12: encoding: [0x00,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_cond_sub_u32 v0, v2, s[0:1] offset:64 +// GFX12: encoding: [0x00,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] + +global_atomic_cond_sub_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x00,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_cond_sub_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x00,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] + +global_atomic_cond_sub_u32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_cond_sub_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] + +global_atomic_cond_sub_u32 v[0:1], v2, off offset:-64 +// GFX12: encoding: [0x7c,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] + +global_atomic_cond_sub_u32 v[0:1], v2, off offset:64 +// GFX12: encoding: [0x7c,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] + global_atomic_sub_clamp_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN // GFX12: encoding: [0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt index d3c0e714949907..338442c98c3ef1 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt +++ 
b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_ds.txt @@ -2079,6 +2079,87 @@ # GFX12: ds_rsub_u64 v255, v[2:3] offset:65535 ; encoding: [0xff,0xff,0x08,0xd9,0xff,0x02,0x00,0x00] 0xff,0xff,0x08,0xd9,0xff,0x02,0x00,0x00 +# GFX12: ds_cond_sub_rtn_u32 v0, v1, v2 ; encoding: [0x00,0x00,0xa0,0xda,0x01,0x02,0x00,0x00] +0x00,0x00,0xa0,0xda,0x01,0x02,0x00,0x00 + +# GFX12: ds_cond_sub_rtn_u32 v0, v1, v2 offset:4660 ; encoding: [0x34,0x12,0xa0,0xda,0x01,0x02,0x00,0x00] +0x34,0x12,0xa0,0xda,0x01,0x02,0x00,0x00 + +# GFX12: ds_cond_sub_rtn_u32 v0, v1, v2 offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0x01,0x02,0x00,0x00] +0xff,0xff,0xa0,0xda,0x01,0x02,0x00,0x00 + +# GFX12: ds_cond_sub_rtn_u32 v0, v254, v253 ; encoding: [0x00,0x00,0xa0,0xda,0xfe,0xfd,0x00,0x00] +0x00,0x00,0xa0,0xda,0xfe,0xfd,0x00,0x00 + +# GFX12: ds_cond_sub_rtn_u32 v0, v254, v253 offset:4660 ; encoding: [0x34,0x12,0xa0,0xda,0xfe,0xfd,0x00,0x00] +0x34,0x12,0xa0,0xda,0xfe,0xfd,0x00,0x00 + +# GFX12: ds_cond_sub_rtn_u32 v0, v254, v253 offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0xfe,0xfd,0x00,0x00] +0xff,0xff,0xa0,0xda,0xfe,0xfd,0x00,0x00 + +# GFX12: ds_cond_sub_rtn_u32 v255, v1, v253 ; encoding: [0x00,0x00,0xa0,0xda,0x01,0xfd,0x00,0xff] +0x00,0x00,0xa0,0xda,0x01,0xfd,0x00,0xff + +# GFX12: ds_cond_sub_rtn_u32 v255, v1, v253 offset:4660 ; encoding: [0x34,0x12,0xa0,0xda,0x01,0xfd,0x00,0xff] +0x34,0x12,0xa0,0xda,0x01,0xfd,0x00,0xff + +# GFX12: ds_cond_sub_rtn_u32 v255, v1, v253 offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0x01,0xfd,0x00,0xff] +0xff,0xff,0xa0,0xda,0x01,0xfd,0x00,0xff + +# GFX12: ds_cond_sub_rtn_u32 v255, v254, v2 ; encoding: [0x00,0x00,0xa0,0xda,0xfe,0x02,0x00,0xff] +0x00,0x00,0xa0,0xda,0xfe,0x02,0x00,0xff + +# GFX12: ds_cond_sub_rtn_u32 v255, v254, v2 offset:4660 ; encoding: [0x34,0x12,0xa0,0xda,0xfe,0x02,0x00,0xff] +0x34,0x12,0xa0,0xda,0xfe,0x02,0x00,0xff + +# GFX12: ds_cond_sub_rtn_u32 v255, v254, v2 offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0xfe,0x02,0x00,0xff] 
+0xff,0xff,0xa0,0xda,0xfe,0x02,0x00,0xff + +# GFX12: ds_cond_sub_rtn_u32 v255, v254, v253 ; encoding: [0x00,0x00,0xa0,0xda,0xfe,0xfd,0x00,0xff] +0x00,0x00,0xa0,0xda,0xfe,0xfd,0x00,0xff + +# GFX12: ds_cond_sub_rtn_u32 v255, v254, v253 offset:4660 ; encoding: [0x34,0x12,0xa0,0xda,0xfe,0xfd,0x00,0xff] +0x34,0x12,0xa0,0xda,0xfe,0xfd,0x00,0xff + +# GFX12: ds_cond_sub_rtn_u32 v255, v254, v253 offset:65535 ; encoding: [0xff,0xff,0xa0,0xda,0xfe,0xfd,0x00,0xff] +0xff,0xff,0xa0,0xda,0xfe,0xfd,0x00,0xff + +# GFX12: ds_cond_sub_u32 v0, v1 ; encoding: [0x00,0x00,0x60,0xda,0x00,0x01,0x00,0x00] +0x00,0x00,0x60,0xda,0x00,0x01,0x00,0x00 + +# GFX12: ds_cond_sub_u32 v0, v1 offset:4660 ; encoding: [0x34,0x12,0x60,0xda,0x00,0x01,0x00,0x00] +0x34,0x12,0x60,0xda,0x00,0x01,0x00,0x00 + +# GFX12: ds_cond_sub_u32 v0, v1 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x00,0x01,0x00,0x00] +0xff,0xff,0x60,0xda,0x00,0x01,0x00,0x00 + +# GFX12: ds_cond_sub_u32 v0, v254 ; encoding: [0x00,0x00,0x60,0xda,0x00,0xfe,0x00,0x00] +0x00,0x00,0x60,0xda,0x00,0xfe,0x00,0x00 + +# GFX12: ds_cond_sub_u32 v0, v254 offset:4660 ; encoding: [0x34,0x12,0x60,0xda,0x00,0xfe,0x00,0x00] +0x34,0x12,0x60,0xda,0x00,0xfe,0x00,0x00 + +# GFX12: ds_cond_sub_u32 v0, v254 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0x00,0xfe,0x00,0x00] +0xff,0xff,0x60,0xda,0x00,0xfe,0x00,0x00 + +# GFX12: ds_cond_sub_u32 v255, v1 ; encoding: [0x00,0x00,0x60,0xda,0xff,0x01,0x00,0x00] +0x00,0x00,0x60,0xda,0xff,0x01,0x00,0x00 + +# GFX12: ds_cond_sub_u32 v255, v1 offset:4660 ; encoding: [0x34,0x12,0x60,0xda,0xff,0x01,0x00,0x00] +0x34,0x12,0x60,0xda,0xff,0x01,0x00,0x00 + +# GFX12: ds_cond_sub_u32 v255, v1 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0xff,0x01,0x00,0x00] +0xff,0xff,0x60,0xda,0xff,0x01,0x00,0x00 + +# GFX12: ds_cond_sub_u32 v255, v254 ; encoding: [0x00,0x00,0x60,0xda,0xff,0xfe,0x00,0x00] +0x00,0x00,0x60,0xda,0xff,0xfe,0x00,0x00 + +# GFX12: ds_cond_sub_u32 v255, v254 offset:4660 ; encoding: [0x34,0x12,0x60,0xda,0xff,0xfe,0x00,0x00] 
+0x34,0x12,0x60,0xda,0xff,0xfe,0x00,0x00 + +# GFX12: ds_cond_sub_u32 v255, v254 offset:65535 ; encoding: [0xff,0xff,0x60,0xda,0xff,0xfe,0x00,0x00] +0xff,0xff,0x60,0xda,0xff,0xfe,0x00,0x00 + # GFX12: ds_sub_clamp_rtn_u32 v0, v1, v2 ; encoding: [0x00,0x00,0xa4,0xda,0x01,0x02,0x00,0x00] 0x00,0x00,0xa4,0xda,0x01,0x02,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vbuffer_mubuf.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vbuffer_mubuf.txt index ff8437155e12ed..dce542e93075db 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vbuffer_mubuf.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vbuffer_mubuf.txt @@ -1986,6 +1986,48 @@ # GFX12: buffer_atomic_sub_clamp_u32 v5, off, s[8:11], s3 offset:8388607 scope:SCOPE_SE ; encoding: [0x03,0xc0,0x0d,0xc4,0x05,0x10,0x84,0x00,0x00,0xff,0xff,0x7f] 0x03,0xc0,0x0d,0xc4,0x05,0x10,0x84,0x00,0x00,0xff,0xff,0x7f +# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_cond_sub_u32 v255, off, s[8:11], s3 offset:8388607 ; encoding: [0x03,0x00,0x14,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +0x03,0x00,0x14,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[12:15], s3 offset:8388607 ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f] +0x03,0x00,0x14,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[96:99], s3 offset:8388607 ; encoding: [0x03,0x00,0x14,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f] +0x03,0x00,0x14,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[8:11], s101 offset:8388607 ; encoding: [0x65,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +0x65,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[8:11], 
m0 offset:8388607 ; encoding: [0x7d,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] +0x7d,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_cond_sub_u32 v5, v0, s[8:11], s3 idxen offset:8388607 ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f] +0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_cond_sub_u32 v5, v0, s[8:11], s3 offen offset:8388607 ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f] +0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] +0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00 + +# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00] +0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00 + +# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:7 ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00] +0x03,0x00,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00 + +# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f] +0x03,0x00,0x14,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN scope:SCOPE_SE ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f] +0x03,0x00,0x14,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f + +# GFX12: buffer_atomic_cond_sub_u32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_CASCADE_NT scope:SCOPE_DEV ; encoding: [0x03,0x00,0x14,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f] +0x03,0x00,0x14,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f + # GFX12: buffer_atomic_dec_u32 v5, off, s[8:11], s3 offset:8388607 ; encoding: 
[0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f] 0x03,0x00,0x10,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt index d7f9daf295845a..713d2536cc0957 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt @@ -69,6 +69,12 @@ # GFX12: flat_atomic_cmpswap_b64 v[1:2], v[3:4], v[5:8] offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xec,0x01,0x00,0x90,0x02,0x03,0xff,0x07,0x00] 0x7c,0x80,0x10,0xec,0x01,0x00,0x90,0x02,0x03,0xff,0x07,0x00 +# GFX12: flat_atomic_cond_sub_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX12: flat_atomic_cond_sub_u32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x00,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + # GFX12: flat_atomic_dec_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x10,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] 0x7c,0x00,0x10,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 @@ -513,6 +519,18 @@ # GFX12: global_atomic_cmpswap_b64 v[1:2], v[3:4], v[5:8], off offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xee,0x01,0x00,0x90,0x02,0x03,0xff,0x07,0x00] 0x7c,0x80,0x10,0xee,0x01,0x00,0x90,0x02,0x03,0xff,0x07,0x00 +# GFX12: global_atomic_cond_sub_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX12: global_atomic_cond_sub_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX12: global_atomic_cond_sub_u32 v1, v[0:1], v2, off 
offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX12: global_atomic_cond_sub_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + # GFX12: global_atomic_sub_clamp_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] 0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 >From c6c21789945925923e133a16313e7f0352735a09 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora <mariusz.sik...@amd.com> Date: Fri, 5 Jan 2024 16:14:38 +0100 Subject: [PATCH 2/3] review comments --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 5374875bf72ff2..1c35b02c48d7d1 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -10,9 +10,7 @@ // //===----------------------------------------------------------------------===// -def flat_ptr_ty : LLVMQualPointerType<0>; def global_ptr_ty : LLVMQualPointerType<1>; -def local_ptr_ty : LLVMQualPointerType<3>; class AMDGPUReadPreloadRegisterIntrinsic : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; @@ -2502,10 +2500,9 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; -def int_amdgcn_flat_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty, flat_ptr_ty>; -def int_amdgcn_global_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty, global_ptr_ty>; - -def int_amdgcn_ds_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty, local_ptr_ty>; +def 
int_amdgcn_flat_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty>; +def int_amdgcn_global_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty>; +def int_amdgcn_ds_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty>; //===----------------------------------------------------------------------===// // Deep learning intrinsics. >From 88fa5b2f005a5019d8b644092f2b3ab206678a70 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora <mariusz.sik...@amd.com> Date: Mon, 15 Jan 2024 22:15:28 +0100 Subject: [PATCH 3/3] Add tests for UniformityAnalysis --- .../UniformityAnalysis/AMDGPU/atomics.ll | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll index 59fbd5627ebfc4..2b25eafed86b1f 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll @@ -22,7 +22,55 @@ define amdgpu_kernel void @test_atomic_csub_i32(ptr addrspace(1) %ptr, i32 %val) ret void } +; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.ds.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in) +define amdgpu_kernel void @test_ds_atomic_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) #0 { +entry: + %gep = getelementptr i32, ptr addrspace(3) %addr, i32 4 + %val = call i32 @llvm.amdgcn.ds.cond.sub.u32(ptr addrspace(3) %gep, i32 %in) + store i32 %val, ptr addrspace(3) %use + ret void +} + +; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.flat.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) +define amdgpu_kernel void @test_flat_atomic_cond_sub_u32(ptr %addr, i32 %in, ptr %use) #0 { +entry: + %gep = getelementptr i32, ptr %addr, i32 4 + %val = call i32 @llvm.amdgcn.flat.atomic.cond.sub.u32(ptr %gep, i32 %in) + store i32 %val, ptr %use + ret void +} + +; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.global.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in) +define amdgpu_kernel void @test_global_atomic_cond_u32(ptr 
addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) #0 { +entry: + %gep = getelementptr i32, ptr addrspace(1) %addr, i32 4 + %val = call i32 @llvm.amdgcn.global.atomic.cond.sub.u32(ptr addrspace(1) %gep, i32 %in) + store i32 %val, ptr addrspace(1) %use + ret void +} + +; CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) +define float @test_raw_buffer_atomic_cond_sub_u32(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +entry: + %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + +; CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) +define float @test_struct_buffer_atomic_cond_sub_u32(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { +entry: + %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %r = bitcast i32 %orig to float + ret float %r +} + declare i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) nocapture, i32) #1 +declare i32 @llvm.amdgcn.ds.cond.sub.u32(ptr addrspace(3), i32) #1 +declare i32 @llvm.amdgcn.flat.atomic.cond.sub.u32(ptr, i32) #1 +declare i32 @llvm.amdgcn.global.atomic.cond.sub.u32(ptr addrspace(1), i32) #1 +declare i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32) #1 +declare i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { argmemonly nounwind willreturn } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits