https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/116681
Enforcing this limit in the clang builtin will come later. >From f5657c9cc25cfed321ced807510a21dc374bcfe3 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <matthew.arsena...@amd.com> Date: Thu, 18 Jan 2024 16:18:05 +0700 Subject: [PATCH] AMDGPU: Handle gfx950 96/128-bit buffer_load_lds Enforcing this limit in the clang builtin will come later. --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +- .../AMDGPU/AMDGPUInstructionSelector.cpp | 18 ++ llvm/lib/Target/AMDGPU/BUFInstructions.td | 24 ++- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 16 ++ .../llvm.amdgcn.global.load.lds.gfx950.ll | 8 + ...m.amdgcn.raw.ptr.buffer.load.lds.gfx950.ll | 176 ++++++++++++++++ ...mdgcn.struct.ptr.buffer.load.lds.gfx950.ll | 196 ++++++++++++++++++ llvm/test/MC/AMDGPU/mubuf-gfx950.s | 32 +++ llvm/test/MC/Disassembler/AMDGPU/gfx950.txt | 19 ++ 9 files changed, 485 insertions(+), 12 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.gfx950.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll create mode 100644 llvm/test/MC/AMDGPU/mubuf-gfx950.s diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index f43ab50d2ea441..360af786c5160d 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1674,7 +1674,7 @@ class AMDGPURawBufferLoadLDS : Intrinsic < [], [llvm_v4i32_ty, // rsrc(SGPR) LLVMQualPointerType<3>, // LDS base offset - llvm_i32_ty, // Data byte size: 1/2/4 + llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) @@ -1693,7 +1693,7 @@ class AMDGPURawPtrBufferLoadLDS : Intrinsic < [], [AMDGPUBufferRsrcTy, // rsrc(SGPR) LLVMQualPointerType<3>, // LDS base offset - llvm_i32_ty, // Data byte size: 1/2/4 + llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) @@ -1715,7 +1715,7 @@ class AMDGPUStructBufferLoadLDS : Intrinsic < [], [llvm_v4i32_ty, // rsrc(SGPR) LLVMQualPointerType<3>, // LDS base offset - llvm_i32_ty, // Data byte size: 1/2/4 + llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) @@ -1735,7 +1735,7 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic < [], [AMDGPUBufferRsrcTy, // rsrc(SGPR) LLVMQualPointerType<3>, // LDS base offset - llvm_i32_ty, // Data byte size: 1/2/4 + llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index a6ef0069f134bd..3522ece24f1c45 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3240,6 +3240,24 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET; break; + case 12: + if (!Subtarget->hasLDSLoadB96_B128()) + return false; + + Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN + : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN + : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN + : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET; + break; + case 16: + if (!Subtarget->hasLDSLoadB96_B128()) + return false; + + Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN + : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN + : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN + : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET; + break; } MachineBasicBlock *MBB = MI.getParent(); diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 79d6a825f60b03..7283733dea22db 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -573,9 +573,17 @@ multiclass MUBUF_Pseudo_Loads<string opName, ValueType load_vt = i32, } } -multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32> { +multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32, Predicate LDSPred = TruePredicate> { defm NAME : MUBUF_Pseudo_Loads<opName, load_vt>; - defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, 0, 1>; + + if !ne(LDSPred, TruePredicate) then { + let SubtargetPredicate = LDSPred in { + defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, 0, 1>; + } + } else { + defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, 0, 1>; + } + } multiclass MUBUF_Pseudo_Loads_LDSOpc<string opName, @@ -956,11 +964,11 @@ defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads_Lds < defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads < "buffer_load_dwordx2", v2i32 >; -defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads < - "buffer_load_dwordx3", v3i32 +defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads_Lds < + "buffer_load_dwordx3", v3i32, /*LDSPred=*/HasGFX950Insts >; -defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads < - "buffer_load_dwordx4", v4i32 +defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads_Lds < + "buffer_load_dwordx4", v4i32, /*LDSPred=*/HasGFX950Insts >; defm BUFFER_LOAD_LDS_B32 : MUBUF_Pseudo_Loads_LDSOpc < @@ -3231,8 +3239,8 @@ defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_vi <0x12>; defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_vi <0x13>; defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_vi <0x14>; defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_vi <0x15>; -defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_vi <0x16>; -defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_vi <0x17>; +defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_Lds_vi <0x16>; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_Lds_vi <0x17>; defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_vi <0x18>; defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x19>; defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_vi <0x1a>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0f7764906527d0..5b02f9bf80d3fc 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9825,6 +9825,22 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET; break; + case 12: + if (!Subtarget->hasLDSLoadB96_B128()) + return SDValue(); + Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN + : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN + : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN + : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET; + break; + case 16: + if (!Subtarget->hasLDSLoadB96_B128()) + return SDValue(); + Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN + : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN + : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN + : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET; + break; } SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3)); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll index b7819ea0431588..8f67375a09cb72 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.gfx950.ll @@ -2,6 +2,14 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s + +; ERR-SDAG: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.global.load.lds + +; ERR-GISEL: LLVM ERROR: cannot select: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.global.load.lds), + + declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux) ;---------------------------------------------------------------------y diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.gfx950.ll new file mode 100644 index 00000000000000..58b1d0da4a5f35 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.lds.gfx950.ll @@ -0,0 +1,176 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s + +; FIXME: Not a great error +; ERR-SDAG: LLVM ERROR: Do not know how to expand this operator's operand! +; ERR-GISEL: LLVM ERROR: cannot select: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.ptr.buffer.load.lds), + +declare void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux) + +;---------------------------------------------------------------------y +; dwordx3 +;--------------------------------------------------------------------- + +define amdgpu_ps float @buffer_load_lds_dwordx3(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { +; GFX950-LABEL: buffer_load_lds_dwordx3: +; GFX950: ; %bb.0: ; %main_body +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX950-NEXT: buffer_load_dword off, s[0:3], 0 offset:4 sc0 lds +; GFX950-NEXT: buffer_load_dword off, s[0:3], 0 offset:8 nt lds +; GFX950-NEXT: v_mov_b32_e32 v0, s4 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ds_read_b32 v0, v0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; return to shader part epilog +main_body: + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 4, i32 1) + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 8, i32 2) + %res = load float, ptr addrspace(3) %lds + ret float %res +} + +define amdgpu_ps void @buffer_load_lds_dwordx3_imm_voffset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { +; GFX950-LABEL: buffer_load_lds_dwordx3_imm_voffset: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 2048, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dwordx3_v_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset) { +; GFX950-LABEL: buffer_load_lds_dwordx3_v_offset: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 offen lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %voffset, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dwordx3_s_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 inreg %soffset) { +; GFX950-LABEL: buffer_load_lds_dwordx3_s_offset: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx3 off, s[0:3], s5 lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 0, i32 %soffset, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dwordx3_vs_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset, i32 inreg %soffset) { +; GFX950-LABEL: buffer_load_lds_dwordx3_vs_offset: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], s5 offen lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dwordx3_vs_imm_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset, i32 inreg %soffset) { +; GFX950-LABEL: buffer_load_lds_dwordx3_vs_imm_offset: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], s5 offen offset:2048 lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %voffset, i32 %soffset, i32 2048, i32 0) + ret void +} + +;---------------------------------------------------------------------y +; dwordx4 +;--------------------------------------------------------------------- + +define amdgpu_ps float @buffer_load_lds_dwordx4(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { +; GFX950-LABEL: buffer_load_lds_dwordx4: +; GFX950: ; %bb.0: ; %main_body +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dword off, s[0:3], 0 lds +; GFX950-NEXT: buffer_load_dword off, s[0:3], 0 offset:4 sc0 lds +; GFX950-NEXT: buffer_load_dword off, s[0:3], 0 offset:8 nt lds +; GFX950-NEXT: v_mov_b32_e32 v0, s4 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ds_read_b32 v0, v0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; return to shader part epilog +main_body: + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 4, i32 1) + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 4, i32 0, i32 0, i32 8, i32 2) + %res = load float, ptr addrspace(3) %lds + ret float %res +} + +define amdgpu_ps void @buffer_load_lds_dwordx4_imm_voffset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { +; GFX950-LABEL: buffer_load_lds_dwordx4_imm_voffset: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 offen lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 2048, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dwordx4_v_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset) { +; GFX950-LABEL: buffer_load_lds_dwordx4_v_offset: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 offen lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %voffset, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dwordx4_s_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 inreg %soffset) { +; GFX950-LABEL: buffer_load_lds_dwordx4_s_offset: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx4 off, s[0:3], s5 lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 0, i32 %soffset, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dwordx4_vs_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset, i32 inreg %soffset) { +; GFX950-LABEL: buffer_load_lds_dwordx4_vs_offset: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], s5 offen lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dwordx4_vs_imm_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %voffset, i32 inreg %soffset) { +; GFX950-LABEL: buffer_load_lds_dwordx4_vs_imm_offset: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], s5 offen offset:2048 lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %voffset, i32 %soffset, i32 2048, i32 0) + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX950-GISEL: {{.*}} +; GFX950-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll new file mode 100644 index 00000000000000..cfe9545b074e3c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.lds.gfx950.ll @@ -0,0 +1,196 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s +; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s +; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -filetype=null < %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s + +; ERR-SDAG: LLVM ERROR: Do not know how to expand this operator's operand! +; ERR-GISEL: LLVM ERROR: cannot select: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.ptr.buffer.load.lds), + +declare void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %vindex, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux) + +;---------------------------------------------------------------------y +; dwordx3 +;--------------------------------------------------------------------- + +define amdgpu_ps float @buffer_load_lds_dwordx3(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { +; GFX950-SDAG-LABEL: buffer_load_lds_dwordx3: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 8 +; GFX950-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen lds +; GFX950-SDAG-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:4 sc0 lds +; GFX950-SDAG-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:8 nt lds +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: ds_read_b32 v0, v0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: buffer_load_lds_dwordx3: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_mov_b32 m0, s4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 8 +; GFX950-GISEL-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen lds +; GFX950-GISEL-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:4 sc0 lds +; GFX950-GISEL-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:8 nt lds +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: ds_read_b32 v0, v0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog + call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 8, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 8, i32 0, i32 0, i32 4, i32 1) + call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 8, i32 0, i32 0, i32 8, i32 2) + %res = load float, ptr addrspace(3) %lds + ret float %res +} + +define amdgpu_ps void @buffer_load_lds_dwordx3_imm_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex) { +; GFX950-LABEL: buffer_load_lds_dwordx3_imm_offset: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], 0 idxen offset:2048 lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %vindex, i32 0, i32 0, i32 2048, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dwordx3_v_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset) { +; GFX950-LABEL: buffer_load_lds_dwordx3_v_offset: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx3 v[0:1], s[0:3], 0 idxen offen lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dwordx3_s_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 inreg %soffset) { +; GFX950-LABEL: buffer_load_lds_dwordx3_s_offset: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx3 v0, s[0:3], s5 idxen lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %vindex, i32 0, i32 %soffset, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dwordx3_vs_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; GFX950-LABEL: buffer_load_lds_dwordx3_vs_offset: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx3 v[0:1], s[0:3], s5 idxen offen lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %vindex, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dwordx3_vs_imm_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; GFX950-LABEL: buffer_load_lds_dwordx3_vs_imm_offset: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx3 v[0:1], s[0:3], s5 idxen offen offset:2048 lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 12, i32 %vindex, i32 %voffset, i32 %soffset, i32 2048, i32 0) + ret void +} + +;---------------------------------------------------------------------y +; dwordx4 +;--------------------------------------------------------------------- + +define amdgpu_ps float @buffer_load_lds_dwordx4(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds) { +; GFX950-SDAG-LABEL: buffer_load_lds_dwordx4: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 8 +; GFX950-SDAG-NEXT: s_mov_b32 m0, s4 +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen lds +; GFX950-SDAG-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:4 sc0 lds +; GFX950-SDAG-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:8 nt lds +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: ds_read_b32 v0, v0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: buffer_load_lds_dwordx4: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_mov_b32 m0, s4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 8 +; GFX950-GISEL-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen lds +; GFX950-GISEL-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:4 sc0 lds +; GFX950-GISEL-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:8 nt lds +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: ds_read_b32 v0, v0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog + call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 8, i32 0, i32 0, i32 0, i32 0) + call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 8, i32 0, i32 0, i32 4, i32 1) + call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 8, i32 0, i32 0, i32 8, i32 2) + %res = load float, ptr addrspace(3) %lds + ret float %res +} + +define amdgpu_ps void @buffer_load_lds_dwordx4_imm_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex) { +; GFX950-LABEL: buffer_load_lds_dwordx4_imm_offset: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], 0 idxen offset:2048 lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %vindex, i32 0, i32 0, i32 2048, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dwordx4_v_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset) { +; GFX950-LABEL: buffer_load_lds_dwordx4_v_offset: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx4 v[0:1], s[0:3], 0 idxen offen lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dwordx4_s_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 inreg %soffset) { +; GFX950-LABEL: buffer_load_lds_dwordx4_s_offset: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx4 v0, s[0:3], s5 idxen lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %vindex, i32 0, i32 %soffset, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dwordx4_vs_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; GFX950-LABEL: buffer_load_lds_dwordx4_vs_offset: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx4 v[0:1], s[0:3], s5 idxen offen lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %vindex, i32 %voffset, i32 %soffset, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @buffer_load_lds_dwordx4_vs_imm_offset(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) inreg %lds, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; GFX950-LABEL: buffer_load_lds_dwordx4_vs_imm_offset: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_mov_b32 m0, s4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: buffer_load_dwordx4 v[0:1], s[0:3], s5 idxen offen offset:2048 lds +; GFX950-NEXT: s_endpgm + call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) %rsrc, ptr addrspace(3) %lds, i32 16, i32 %vindex, i32 %voffset, i32 %soffset, i32 2048, i32 0) + ret void +} diff --git a/llvm/test/MC/AMDGPU/mubuf-gfx950.s b/llvm/test/MC/AMDGPU/mubuf-gfx950.s new file mode 100644 index 00000000000000..0ba6f2ca4f6c4e --- /dev/null +++ b/llvm/test/MC/AMDGPU/mubuf-gfx950.s @@ -0,0 +1,32 @@ +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck -check-prefix=GFX950 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx940 -show-encoding %s 2>&1 | FileCheck -check-prefix=ERR %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx803 -show-encoding %s 2>&1 | FileCheck -check-prefix=ERR %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1030 -show-encoding %s 2>&1 | FileCheck -check-prefix=ERR %s + +// FIXME: Bad diagnostics on unsupported subtarget + +// GFX950: buffer_load_dwordx3 off, s[8:11], s3 lds ; encoding: [0x00,0x00,0x59,0xe0,0x00,0x00,0x02,0x03] +// ERR: :[[@LINE+1]]:21: error: invalid operand for instruction +buffer_load_dwordx3 off, s[8:11], s3 lds + +// GFX950: buffer_load_dwordx3 off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x59,0xe0,0x00,0x00,0x02,0x03] +// ERR: :[[@LINE+1]]:38: error: not a valid operand +buffer_load_dwordx3 off, s[8:11], s3 offset:4095 lds + +// GFX950: buffer_load_dwordx3 v0, s[8:11], s101 offen lds ; encoding: [0x00,0x10,0x59,0xe0,0x00,0x00,0x02,0x65] +// ERR: :[[@LINE+1]]:39: error: invalid operand for instruction +buffer_load_dwordx3 v0, s[8:11], s101 offen lds + + + +// GFX950: buffer_load_dwordx4 off, s[8:11], s3 lds ; encoding: [0x00,0x00,0x5d,0xe0,0x00,0x00,0x02,0x03] +// ERR: :[[@LINE+1]]:21: error: invalid operand for instruction +buffer_load_dwordx4 off, s[8:11], s3 lds + +// GFX950: buffer_load_dwordx4 off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x5d,0xe0,0x00,0x00,0x02,0x03] +// ERR: :[[@LINE+1]]:38: error: not a valid operand +buffer_load_dwordx4 off, s[8:11], s3 offset:4095 lds + +// GFX950: buffer_load_dwordx4 v0, s[8:11], s101 offen lds ; encoding: [0x00,0x10,0x5d,0xe0,0x00,0x00,0x02,0x65] +// ERR: :[[@LINE+1]]:39: error: invalid operand for instruction +buffer_load_dwordx4 v0, s[8:11], s101 offen lds diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950.txt index a9f28332860ee5..ce37e228f03fa3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx950.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950.txt @@ -23,3 +23,22 @@ # GFX950: global_load_lds_dwordx4 v[2:3], off sc0 nt sc1 ; encoding: [0x00,0x80,0xf7,0xdf,0x02,0x00,0x7f,0x00] 0x00,0x80,0xf7,0xdf,0x02,0x00,0x7f,0x00 + + +# GFX950: buffer_load_dwordx3 off, s[8:11], s3 lds ; encoding: [0x00,0x00,0x59,0xe0,0x00,0x00,0x02,0x03] +0x00,0x00,0x59,0xe0,0x00,0x00,0x02,0x03 + +# GFX950: buffer_load_dwordx3 off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x59,0xe0,0x00,0x00,0x02,0x03] +0xff,0x0f,0x59,0xe0,0x00,0x00,0x02,0x03 + +# GFX950: buffer_load_dwordx3 v0, s[8:11], s101 offen lds ; encoding: [0x00,0x10,0x59,0xe0,0x00,0x00,0x02,0x65] +0x00,0x10,0x59,0xe0,0x00,0x00,0x02,0x65 + +# GFX950: buffer_load_dwordx4 off, s[8:11], s3 lds ; encoding: [0x00,0x00,0x5d,0xe0,0x00,0x00,0x02,0x03] +0x00,0x00,0x5d,0xe0,0x00,0x00,0x02,0x03 + +# GFX950: buffer_load_dwordx4 off, s[8:11], s3 offset:4095 lds ; encoding: [0xff,0x0f,0x5d,0xe0,0x00,0x00,0x02,0x03] +0xff,0x0f,0x5d,0xe0,0x00,0x00,0x02,0x03 + +# GFX950: buffer_load_dwordx4 v0, s[8:11], s101 offen lds ; encoding: [0x00,0x10,0x5d,0xe0,0x00,0x00,0x02,0x65] +0x00,0x10,0x5d,0xe0,0x00,0x00,0x02,0x65 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits