llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Changpeng Fang (changpeng)

<details>
<summary>Changes</summary>

---

Patch is 42.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/146636.diff

20 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+5)
- (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+26)
- (added) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl (+46)
- (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl (+11)
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+31)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+30)
- (modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+23-6)
- (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp (+3-7)
- (modified) llvm/lib/Target/AMDGPU/MIMGInstructions.td (+94)
- (modified) llvm/lib/Target/AMDGPU/SIDefines.h (+1-2)
- (modified) llvm/lib/Target/AMDGPU/SIInstrFormats.td (+4-2)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+13)
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (+1)
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.td (+1)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+24)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+7)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll (+245)
- (added) llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s (+34)
- (added) llvm/test/MC/AMDGPU/gfx1250_asm_vimage_err.s (+25)
- (added) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt (+25)


``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 5133947c498ca..fb358297a5eed 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -640,6 +640,11 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
 // GFX1250+ only builtins.
 //===----------------------------------------------------------------------===//
+TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
+
 TARGET_BUILTIN(__builtin_amdgcn_global_load_tr4_b64_v2i32, "V2iV2i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_global_load_tr8_b64_v2i32, "V2iV2i*1", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_global_load_tr6_b96_v3i32, "V3iV3i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index f09b3b92c4ea0..1fc2d57d4941c 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -621,6 +621,32 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
     return Builder.CreateCall(F, {Addr});
   }
+  case AMDGPU::BI__builtin_amdgcn_tensor_load_to_lds:
+  case AMDGPU::BI__builtin_amdgcn_tensor_load_to_lds_d2:
+  case AMDGPU::BI__builtin_amdgcn_tensor_store_from_lds:
+  case AMDGPU::BI__builtin_amdgcn_tensor_store_from_lds_d2: {
+    Intrinsic::ID IID;
+    switch (BuiltinID) {
+    case AMDGPU::BI__builtin_amdgcn_tensor_load_to_lds:
+      IID = Intrinsic::amdgcn_tensor_load_to_lds;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_tensor_load_to_lds_d2:
+      IID = Intrinsic::amdgcn_tensor_load_to_lds_d2;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_tensor_store_from_lds:
+      IID = Intrinsic::amdgcn_tensor_store_from_lds;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_tensor_store_from_lds_d2:
+      IID = Intrinsic::amdgcn_tensor_store_from_lds_d2;
+      break;
+    }
+
+    SmallVector<Value *, 5> Args;
+    for (int i = 0, e = E->getNumArgs(); i != e; ++i)
+      Args.push_back(EmitScalarExpr(E->getArg(i)));
+    llvm::Function *F = CGM.getIntrinsic(IID, {});
+    return Builder.CreateCall(F, {Args});
+  }
   case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
     // Should this have asan instrumentation?
     return emitBuiltinWithOneOverloadedType<5>(*this, E,
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl
new file mode 100644
index 0000000000000..49ffbf4517160
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl
@@ -0,0 +1,46 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250
+
+typedef int v4i __attribute__((ext_vector_type(4)));
+typedef int v8i __attribute__((ext_vector_type(8)));
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 0)
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_tensor_load_to_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
+{
+  __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, 0);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds_d2(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    tail call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 27)
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_tensor_load_to_lds_d2(v4i sg0, v8i sg1)
+{
+  __builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, 27);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 22)
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_tensor_store_from_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
+{
+  __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, 22);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds_d2(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    tail call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 0)
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_tensor_store_from_lds_d2(v4i sg0, v8i sg1)
+{
+  __builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, 0);
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
index 69857087bae08..3ba0d50e79031 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
@@ -1,6 +1,9 @@
 // REQUIRES: amdgpu-registered-target
 // RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s
 
+typedef int v4i __attribute__((ext_vector_type(4)));
+typedef int v8i __attribute__((ext_vector_type(8)));
+
 void test_setprio_inc_wg(short a) {
   __builtin_amdgcn_s_setprio_inc_wg(a); // expected-error {{'__builtin_amdgcn_s_setprio_inc_wg' must be a constant integer}}
 }
@@ -16,3 +19,11 @@ void test__builtin_amdgcn_cvt_f16_fp8(int a, int b) {
 void test__builtin_amdgcn_cvt_f16_bf8(int a, int b) {
   __builtin_amdgcn_cvt_f16_bf8(a, b); // expected-error {{'__builtin_amdgcn_cvt_f16_bf8' must be a constant integer}}
 }
+
+void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol)
+{
+  __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds' must be a constant integer}}
+  __builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds_d2' must be a constant integer}}
+  __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds' must be a constant integer}}
+  __builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds_d2' must be a constant integer}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index a0a81568424f5..2aabf6109022f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3580,6 +3580,37 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
   [IntrNoMem, IntrSpeculatable]
 >;
 
+class AMDGPUTensorLoadStore:
+  Intrinsic<
+    [],
+    [llvm_v4i32_ty, // D# group 0
+     llvm_v8i32_ty, // D# group 1
+     llvm_v4i32_ty, // D# group 2
+     llvm_v4i32_ty, // D# group 3
+     llvm_i32_ty],  // cachepolicy:
+                    //   bits [0-2] = th
+                    //   bits [3-4] = scope
+    [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<4>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
+    "", [SDNPMemOperand]
+  >;
+
+class AMDGPUTensorLoadStoreD2:
+  Intrinsic<
+    [],
+    [llvm_v4i32_ty, // D# group 0
+     llvm_v8i32_ty, // D# group 1
+     llvm_i32_ty],  // cachepolicy:
+                    //   bits [0-2] = th
+                    //   bits [3-4] = scope
+    [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<2>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
+    "", [SDNPMemOperand]
+  >;
+
+def int_amdgcn_tensor_load_to_lds : AMDGPUTensorLoadStore;
+def int_amdgcn_tensor_store_from_lds : AMDGPUTensorLoadStore;
+def int_amdgcn_tensor_load_to_lds_d2 : AMDGPUTensorLoadStoreD2;
+def int_amdgcn_tensor_store_from_lds_d2 : AMDGPUTensorLoadStoreD2;
+
 /// Emit an addrspacecast without null pointer checking.
 /// Should only be inserted by a pass based on analysis of an addrspacecast's src.
 def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 6f6d7b8d99af5..353fb23fa1520 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3348,6 +3348,20 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     MI.eraseFromParent();
     return;
   }
+  case Intrinsic::amdgcn_tensor_load_to_lds:
+  case Intrinsic::amdgcn_tensor_store_from_lds: {
+    constrainOpWithReadfirstlane(B, MI, 1);
+    constrainOpWithReadfirstlane(B, MI, 2);
+    constrainOpWithReadfirstlane(B, MI, 3);
+    constrainOpWithReadfirstlane(B, MI, 4);
+    return;
+  }
+  case Intrinsic::amdgcn_tensor_load_to_lds_d2:
+  case Intrinsic::amdgcn_tensor_store_from_lds_d2: {
+    constrainOpWithReadfirstlane(B, MI, 1);
+    constrainOpWithReadfirstlane(B, MI, 2);
+    return;
+  }
   default: {
     if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
             AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -5354,6 +5368,22 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   }
   case Intrinsic::amdgcn_pops_exiting_wave_id:
     return getDefaultMappingSOP(MI);
+  case Intrinsic::amdgcn_tensor_load_to_lds_d2:
+  case Intrinsic::amdgcn_tensor_store_from_lds_d2:
+  case Intrinsic::amdgcn_tensor_load_to_lds:
+  case Intrinsic::amdgcn_tensor_store_from_lds: {
+    // Lie and claim everything is legal, even all operands need to be
+    // SGPRs. applyMapping will have to deal with it with readfirstlane.
+    for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
+      if (MI.getOperand(I).isReg()) {
+        Register Reg = MI.getOperand(I).getReg();
+        auto OpBank = getRegBankID(Reg, MRI);
+        unsigned Size = getSizeInBits(Reg, MRI, *TRI);
+        OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
+      }
+    }
+    break;
+  }
   case Intrinsic::amdgcn_s_prefetch_data: {
     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 14fec71996a0e..c429e95f52a9d 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1784,6 +1784,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
   bool validateMIMGAddrSize(const MCInst &Inst, const SMLoc &IDLoc);
   bool validateMIMGD16(const MCInst &Inst);
   bool validateMIMGDim(const MCInst &Inst, const OperandVector &Operands);
+  bool validateTensorR128(const MCInst &Inst);
   bool validateMIMGMSAA(const MCInst &Inst);
   bool validateOpSel(const MCInst &Inst);
   bool validateTrue16OpSel(const MCInst &Inst);
@@ -4280,6 +4281,20 @@ bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
   return true;
 }
+bool AMDGPUAsmParser::validateTensorR128(const MCInst &Inst) {
+  const unsigned Opc = Inst.getOpcode();
+  const MCInstrDesc &Desc = MII.get(Opc);
+
+  if ((Desc.TSFlags & SIInstrFlags::TENSOR_CNT) == 0)
+    return true;
+
+  int R128Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::r128);
+  if (R128Idx >= 0 && Inst.getOperand(R128Idx).getImm())
+    return false;
+
+  return true;
+}
+
 static bool IsRevOpcode(const unsigned Opcode)
 {
   switch (Opcode) {
@@ -5113,14 +5128,11 @@ bool AMDGPUAsmParser::validateTHAndScopeBits(const MCInst &Inst,
     return PrintError("scope and th combination is not valid");
   }
 
-  bool IsStore = TID.mayStore();
-  bool IsAtomic =
-      TID.TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet);
-
-  if (IsAtomic) {
+  unsigned THType = AMDGPU::getTemporalHintType(TID);
+  if (THType == AMDGPU::CPol::TH_TYPE_ATOMIC) {
     if (!(CPol & AMDGPU::CPol::TH_TYPE_ATOMIC))
       return PrintError("invalid th value for atomic instructions");
-  } else if (IsStore) {
+  } else if (THType == AMDGPU::CPol::TH_TYPE_STORE) {
     if (!(CPol & AMDGPU::CPol::TH_TYPE_STORE))
       return PrintError("invalid th value for store instructions");
   } else {
@@ -5205,6 +5217,11 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
       Error(IDLoc, "missing dim operand");
       return false;
     }
+    if (!validateTensorR128(Inst)) {
+      Error(getImmLoc(AMDGPUOperand::ImmTyD16, Operands),
+            "instruction must set modifier r128=0");
+      return false;
+    }
     if (!validateMIMGMSAA(Inst)) {
       Error(getImmLoc(AMDGPUOperand::ImmTyDim, Operands),
             "invalid dim; must be MSAA type");
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index a6ce42dca92be..fa1474d153834 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -173,13 +173,12 @@ void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope,
   const unsigned Opcode = MI->getOpcode();
   const MCInstrDesc &TID = MII.get(Opcode);
 
-  bool IsStore = TID.mayStore();
-  bool IsAtomic =
-      TID.TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet);
+  unsigned THType = AMDGPU::getTemporalHintType(TID);
+  bool IsStore = (THType == AMDGPU::CPol::TH_TYPE_STORE);
 
   O << " th:";
 
-  if (IsAtomic) {
+  if (THType == AMDGPU::CPol::TH_TYPE_ATOMIC) {
     O << "TH_ATOMIC_";
     if (TH & AMDGPU::CPol::TH_ATOMIC_CASCADE) {
       if (Scope >= AMDGPU::CPol::SCOPE_DEV)
@@ -196,9 +195,6 @@ void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope,
     if (!IsStore && TH == AMDGPU::CPol::TH_RESERVED)
       O << formatHex(TH);
     else {
-      // This will default to printing load variants when neither MayStore nor
-      // MayLoad flag is present which is the case with instructions like
-      // image_get_resinfo.
       O << (IsStore ? "TH_STORE_" : "TH_LOAD_");
       switch (TH) {
       case AMDGPU::CPol::TH_NT:
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 8d94d73bc1aab..531fae3ceff59 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -2019,3 +2019,97 @@ def : MIMGG16Mapping<IMAGE_SAMPLE_CD_O_nortn, IMAGE_SAMPLE_CD_O_G16_nortn>;
 def : MIMGG16Mapping<IMAGE_SAMPLE_CD_CL_O_nortn, IMAGE_SAMPLE_CD_CL_O_G16_nortn>;
 def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_O_nortn, IMAGE_SAMPLE_C_CD_O_G16_nortn>;
 def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL_O_nortn, IMAGE_SAMPLE_C_CD_CL_O_G16_nortn>;
+
+//===----------------------------------------------------------------------===//
+// VIMAGE Tensor Instructions
+//===----------------------------------------------------------------------===//
+
+class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> :
+    InstSI<(outs ), (ins ), "", []>,
+    SIMCInstr<opName#!if(_UpTo2D, "_D2", ""), SIEncodingFamily.NONE> {
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  string Mnemonic = opName;
+
+  let VALU = 1;
+  let maybeAtomic = 0;
+  let TENSOR_CNT = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+  let Uses = [EXEC, TENSORcnt];
+  let Defs = [TENSORcnt];
+  let SchedRW = [WriteVMEM, WriteLDS];
+  let UseNamedOperandTable = 1;
+  let hasSideEffects = 0;
+
+  bit UpTo2D = _UpTo2D;
+  let InOperandList = !if(UpTo2D, (ins SReg_128:$vaddr0, SReg_256:$vaddr1, R128A16:$r128, CPol:$cpol),
+                                  (ins SReg_128:$vaddr0, SReg_256:$vaddr1, SReg_128:$vaddr2,
+                                       SReg_128:$vaddr3, R128A16:$r128, CPol:$cpol));
+  string AsmOperands = " $vaddr0, $vaddr1"#!if(UpTo2D, "", ", $vaddr2, $vaddr3")#"$r128$cpol";
+}
+
+let SubtargetPredicate = isGFX1250Plus in {
+def TENSOR_LOAD_TO_LDS : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds">;
+def TENSOR_STORE_FROM_LDS : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds">;
+def TENSOR_LOAD_TO_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds", 1>;
+def TENSOR_STORE_FROM_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds", 1>;
+} // End SubtargetPredicate = isGFX1250Plus.
+
+class TensorPat <VIMAGE_TENSOR_Pseudo inst, SDPatternOperator node> : GCNPat <
+  (node v4i32:$vaddr0, v8i32:$vaddr1, v4i32:$vaddr2, v4i32:$vaddr3, (i32 timm:$cpol)),
+  (inst $vaddr0, $vaddr1, $vaddr2, $vaddr3, 0, $cpol)
+>;
+
+class TensorD2Pat <VIMAGE_TENSOR_Pseudo inst, SDPatternOperator node> : GCNPat <
+  (node v4i32:$vaddr0, v8i32:$vaddr1, (i32 timm:$cpol)),
+  (inst $vaddr0, $vaddr1, 0, $cpol)
+>;
+
+let SubtargetPredicate = isGFX1250Plus in {
+def : TensorPat <TENSOR_LOAD_TO_LDS, int_amdgcn_tensor_load_to_lds>;
+def : TensorPat <TENSOR_STORE_FROM_LDS, int_amdgcn_tensor_store_from_lds>;
+def : TensorD2Pat <TENSOR_LOAD_TO_LDS_D2, int_amdgcn_tensor_load_to_lds_d2>;
+def : TensorD2Pat <TENSOR_STORE_FROM_LDS_D2, int_amdgcn_tensor_store_from_lds_d2>;
+}
+
+class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = ps.Mnemonic> :
+  InstSI <ps.OutOperandList, ps.InOperandList, opName # ps.AsmOperands, []>,
+  VIMAGEe<op> {
+
+  // copy relevant pseudo op flags
+  let SubtargetPredicate = ps.SubtargetPredicate;
+  let TSFlags = ps.TSFlags;
+  let mayLoad = ps.mayLoad;
+  let mayStore = ps.mayStore;
+  let UseNamedOperandTable = ps.UseNamedOperandTable;
+  let SchedRW = ps.SchedRW;
+
+  // D# group 2 and 3 set to NULL for 2D or less.
+  let vaddr2 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
+  let vaddr3 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
+
+  // set to 0 based on SPG.
+  let vaddr4 = 0;
+  let rsrc = 0;
+  let vdata = 0;
+  let d16 = 0;
+  let a16 = 0;
+  let tfe = 0;
+  let dmask = 1; // sp3
+  let dim = 1; // sp3
+}
+
+multiclass VIMAGE_TENSOR_Real_gfx1250<bits<8> op> {
+  let AssemblerPredicate = isGFX1250Plus, DecoderNamespace = "GFX1250" in {
+    foreach DSuffix = ["_D2", ""] in {
+      defvar ps = !cast<VIMAGE_TENSOR_Pseudo>(NAME # DSuffix);
+      def DSuffix # _gfx1250 : VIMAGE_TENSOR_Real<op, ps, ps.Mnemonic>,
+          SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX1250>;
+    }
+  }
+}
+
+defm TENSOR_LOAD_TO_LDS : VIMAGE_TENSOR_Real_gfx1250<0xc4>;
+defm TENSOR_STORE_FROM_LDS : VIMAGE_TENSOR_Real_gfx1250<0xc5>;
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 4b72f66abbd76..76e29e4393206 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -107,8 +107,7 @@ enum : uint64_t {
   DisableWQM = UINT64_C(1) << 36,
   Gather4 = UINT64_C(1) << 37,
 
-  // Reserved, must be 0.
-  Reserved0 = UINT64_C(1) << 38,
+  TENSOR_CNT = UINT64_C(1) << 38,
 
   SCALAR_STORE = UINT64_C(1) << 39,
   FIXED_SIZE = UINT64_C(1) << 40,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 42aae35112cac..c27d4e0df6fc5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -68,6 +68,9 @@ class InstSI <dag outs, dag ins, string asm = "",
   field bit Gather4 = 0;
 
+  // wait count to manage tensor loads/stores.
+  field bit TENSOR_CNT = 0;
+
   // This is an s_store_dword* instruction that requires a cache flush
   // on wave termination. It is necessary to distinguish from mayStore
   // SMEM instructions like the cache flush ones.
@@ -201,8 +204,7 @@ class InstSI <dag outs, dag ins, string asm = "",
   let TSFlags{36} = DisableWQM;
   let TSFlags{37} = Gather4;
 
-  // Reserved, must be 0.
-  let TSFlags{38} = 0;
+  let TSFlags{38} = TENSOR_CNT;
 
   let TSFlags{39} = ScalarStore;
   let TSFlags{40} = FixedSize;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/li...
[truncated]
``````````

</details>

https://github.com/llvm/llvm-project/pull/146636
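For readers skimming the diff, here is a minimal usage sketch of the new builtins, derived only from the signatures and the CodeGen test above. The kernel name and the idea of passing the D# descriptor groups in as kernel arguments are placeholder assumptions; the patch does not specify how those groups are populated. The final cache-policy argument must be a compile-time constant (bits [0-2] = th, bits [3-4] = scope, per the intrinsic definition).

```c
// Illustrative sketch only: how the D# descriptor groups are filled in is a
// hardware/runtime detail not covered by this patch.
typedef int v4i __attribute__((ext_vector_type(4)));
typedef int v8i __attribute__((ext_vector_type(8)));

kernel void tensor_copy_example(v4i sg0, v8i sg1, v4i sg2, v4i sg3) {
  // Full form: four D# groups plus an immediate cache policy (0 = default).
  __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, 0);

  // _d2 form: only D# groups 0 and 1 are passed; per MIMGInstructions.td,
  // groups 2 and 3 are encoded as null for tensors of 2 dimensions or fewer.
  __builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, 0);
}
```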