https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/76149
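For orientation, and not part of the patch itself: the new intrinsic takes a global (addrspace(1)) pointer and an i64 operand and returns an i64, matching the declaration exercised by the CodeGen test added below. A minimal IR sketch of a call site follows; the kernel and value names are illustrative only, and the ordering semantics are those of the GFX12 instruction rather than anything this sketch defines.

declare i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1), i64)

define amdgpu_kernel void @ordered_add_example(ptr addrspace(1) %counter, i64 %inc, ptr addrspace(1) %out) {
entry:
  ; call the new intrinsic and store its i64 result (illustrative names, not from the patch)
  %old = call i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1) %counter, i64 %inc)
  store i64 %old, ptr addrspace(1) %out
  ret void
}

For a call like this, the new test below checks that both the SelectionDAG and GlobalISel paths select global_atomic_ordered_add_b64 with th:TH_ATOMIC_RETURN.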
>From b14a554a15e4de88c9afc428f9c6898090e6eb23 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.f...@amd.com>
Date: Thu, 21 Dec 2023 12:00:26 +0000
Subject: [PATCH] [AMDGPU] GFX12 global_atomic_ordered_add_b64 instruction and intrinsic

---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      | 10 ++-
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td  |  1 +
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |  1 +
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |  1 +
 llvm/lib/Target/AMDGPU/FLATInstructions.td    | 11 +++-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  1 +
 ...vm.amdgcn.global.atomic.ordered.add.b64.ll | 65 +++++++++++++++++++
 llvm/test/MC/AMDGPU/gfx11_unsupported.s       |  3 +
 llvm/test/MC/AMDGPU/gfx12_asm_vflat.s         | 24 +++++++
 .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt  | 12 ++++
 10 files changed, 124 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 51bd9b63c127ed..3985c8871e1615 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -10,6 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+def global_ptr_ty : LLVMQualPointerType<1>;
+
 class AMDGPUReadPreloadRegisterIntrinsic
   : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
 
@@ -2353,10 +2355,10 @@ def int_amdgcn_s_get_waveid_in_workgroup :
   Intrinsic<[llvm_i32_ty], [],
     [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
-class AMDGPUGlobalAtomicRtn<LLVMType vt> : Intrinsic <
+class AMDGPUGlobalAtomicRtn<LLVMType vt, LLVMType pt = llvm_anyptr_ty> : Intrinsic <
   [vt],
-  [llvm_anyptr_ty, // vaddr
-   vt],            // vdata(VGPR)
+  [pt,             // vaddr
+   vt],            // vdata(VGPR)
   [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "",
   [SDNPMemOperand]>;
 
@@ -2486,6 +2488,8 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
             [IntrNoMem, IntrConvergent, IntrWillReturn,
              ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
 
+def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUGlobalAtomicRtn<llvm_i64_ty, global_ptr_ty>;
+
 def int_amdgcn_flat_atomic_fmin_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
 def int_amdgcn_flat_atomic_fmax_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
 def int_amdgcn_global_atomic_fmin_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index eaf72d7157ee2d..36e07d944c942c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -642,6 +642,7 @@ defm int_amdgcn_global_atomic_fmax : noret_op;
 defm int_amdgcn_global_atomic_csub : noret_op;
 defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
 defm int_amdgcn_ds_fadd_v2bf16 : noret_op;
+defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
 defm int_amdgcn_flat_atomic_fmin_num : noret_op;
 defm int_amdgcn_flat_atomic_fmax_num : noret_op;
 defm int_amdgcn_global_atomic_fmin_num : noret_op;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index c9412f720c62ec..fba060464a6e74 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4690,6 +4690,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_flat_atomic_fmax_num:
     case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
     case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
+    case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
       return getDefaultMappingAllVGPR(MI);
     case Intrinsic::amdgcn_ds_ordered_add:
     case Intrinsic::amdgcn_ds_ordered_swap:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index beb670669581f1..4cc8871a00fe1f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -243,6 +243,7 @@ def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_ordered_add_b64>;
 def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
 def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
 def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 0dd2b3f5c2c912..615f8cd54d8f9c 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -926,9 +926,11 @@ defm GLOBAL_LOAD_LDS_USHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_usho
 defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sshort">;
 defm GLOBAL_LOAD_LDS_DWORD  : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">;
 
-} // End is_flat_global = 1
-
+let SubtargetPredicate = isGFX12Plus in {
+  defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>;
+} // End SubtargetPredicate = isGFX12Plus
 
+} // End is_flat_global = 1
 
 let SubtargetPredicate = HasFlatScratchInsts in {
 
 defm SCRATCH_LOAD_UBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte", VGPR_32>;
@@ -1529,6 +1531,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", "atomic_swap_global", i64>
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_global", i64, v2i64>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>;
 
+let OtherPredicates = [isGFX12Plus] in {
+  defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ORDERED_ADD_B64", "int_amdgcn_global_atomic_ordered_add_b64", i64, i64, /* isIntr */ 1>;
+}
+
 let OtherPredicates = [isGFX10Plus] in {
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
@@ -2654,6 +2660,7 @@ defm GLOBAL_ATOMIC_DEC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04d, "GLOBAL_A
 defm GLOBAL_ATOMIC_MIN_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_num_f32", true, "global_atomic_min_f32">;
 defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_num_f32", true, "global_atomic_max_f32">;
 defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">;
+defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073, "GLOBAL_ATOMIC_ORDERED_ADD_B64", "global_atomic_ordered_add_b64">;
 
 // ENC_VSCRATCH.
 defm SCRATCH_LOAD_U8 : VSCRATCH_Real_AllAddr_gfx12<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4f4bc45e49b43e..715f67876994af 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1243,6 +1243,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::amdgcn_global_atomic_fmax:
   case Intrinsic::amdgcn_global_atomic_fmin_num:
   case Intrinsic::amdgcn_global_atomic_fmax_num:
+  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
   case Intrinsic::amdgcn_flat_atomic_fadd:
   case Intrinsic::amdgcn_flat_atomic_fmin:
   case Intrinsic::amdgcn_flat_atomic_fmax:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
new file mode 100644
index 00000000000000..6a6c5b33e0dd8f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s
+
+declare i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1), i64)
+
+define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1) %addr, i64 %in) {
+; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_no_rtn:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-SDAG-NEXT:    global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:-32 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_no_rtn:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT:    global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:-32 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT:    s_endpgm
+entry:
+  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 -4
+  %unused = call i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1) %gep, i64 %in)
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %addr, i64 %in, ptr addrspace(1) %use) {
+; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_rtn:
+; GFX12-SDAG:       ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6
+; GFX12-SDAG-NEXT:    global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-SDAG-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT:    s_nop 0
+; GFX12-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_rtn:
+; GFX12-GISEL:       ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT:    s_clause 0x1
+; GFX12-GISEL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT:    global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT:    s_nop 0
+; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT:    s_endpgm
entry:
+  %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
+  %val = call i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1) %gep, i64 %in)
+  store i64 %val, ptr addrspace(1) %use
+  ret void
+}
diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported.s b/llvm/test/MC/AMDGPU/gfx11_unsupported.s
index 89078c1ad4e049..e01eb05e85588d 100644
--- a/llvm/test/MC/AMDGPU/gfx11_unsupported.s
+++ b/llvm/test/MC/AMDGPU/gfx11_unsupported.s
@@ -2013,3 +2013,6 @@ ds_sub_clamp_rtn_u32 v5, v1, v2
 
 ds_sub_clamp_u32 v1, v2
 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+global_atomic_ordered_add_b64 v0, v[2:3], s[0:1] offset:64
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s b/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s
index c0ffc5247d90e8..95d352b421a284 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s
@@ -1266,6 +1266,30 @@ global_atomic_or_b64 v[1:2], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN
 global_atomic_or_b64 v[1:2], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN
 // GFX12: encoding: [0x7c,0x80,0x12,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
 
+global_atomic_ordered_add_b64 v0, v[2:3], s[0:1] offset:-64
+// GFX12: encoding: [0x00,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+
+global_atomic_ordered_add_b64 v0, v[2:3], s[0:1] offset:64
+// GFX12: encoding: [0x00,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+
+global_atomic_ordered_add_b64 v[0:1], v[2:3], off offset:-64
+// GFX12: encoding: [0x7c,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
+
+global_atomic_ordered_add_b64 v[0:1], v[2:3], off offset:64
+// GFX12: encoding: [0x7c,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+
+global_atomic_ordered_add_b64 v[1:2], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x00,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+
+global_atomic_ordered_add_b64 v[1:2], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x00,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+
+global_atomic_ordered_add_b64 v[1:2], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x7c,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff]
+
+global_atomic_ordered_add_b64 v[1:2], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN
+// GFX12: encoding: [0x7c,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+
 global_atomic_sub_u32 v0, v2, s[0:1] offset:-64
 // GFX12: encoding: [0x00,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt
index d7f9daf295845a..f4038cf10f50dc 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt
@@ -837,6 +837,18 @@
 # GFX12: global_atomic_xor_b64 v[1:2], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x12,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
 0x7c,0xc0,0x12,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
 
+# GFX12: global_atomic_ordered_add_b64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX12: global_atomic_ordered_add_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x1c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00
+
+# GFX12: global_atomic_ordered_add_b64 v[1:2], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x00,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
+# GFX12: global_atomic_ordered_add_b64 v[1:2], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00]
+0x7c,0xc0,0x1c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00
+
 # GFX12: global_load_addtid_b32 v1, off offset:64 ; encoding: [0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00]
 0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00