llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-transforms

Author: Changpeng Fang (changpeng)

<details>
<summary>Changes</summary>

The intrinsic takes five arguments for the tensor descriptor (D#); the fifth is reserved for future targets and is silently ignored in codegen for gfx1250. For tensors up to 2D, only the first two D# groups are meaningful and the rest should be zero-initialized.

---

Patch is 44.81 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/182334.diff

14 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.td (+3-4)
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl (+38-8)
- (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl (+3-5)
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+7-24)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+35)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h (+1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp (-21)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+44)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h (+1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+1-8)
- (modified) llvm/lib/Target/AMDGPU/MIMGInstructions.td (-17)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll (+49-33)
- (modified) llvm/test/CodeGen/AMDGPU/waitcnt-debug-output-crash.ll (+1-1)
- (removed) llvm/test/Transforms/InstCombine/AMDGPU/tensor-load-store-lds.ll (-185)


``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index 86b10eba55e8e..966a176a6882d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -752,10 +752,9 @@ def __builtin_amdgcn_global_store_async_from_lds_b128 : AMDGPUBuiltin<"void(_Ext
 def __builtin_amdgcn_ds_atomic_async_barrier_arrive_b64 : AMDGPUBuiltin<"void(long int address_space<3> *)", [Const], "gfx1250-insts">;
 def __builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64 : AMDGPUBuiltin<"long int(long int address_space<3> *, long int)", [Const], "gfx1250-insts">;
 
-def __builtin_amdgcn_tensor_load_to_lds : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_tensor_load_to_lds_d2 : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_tensor_store_from_lds : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_tensor_store_from_lds_d2 : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_tensor_load_to_lds : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_tensor_store_from_lds : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
+
 
 def __builtin_amdgcn_global_load_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">;
 def __builtin_amdgcn_global_load_tr8_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *)", [Const], "gfx1250-insts,wavefrontsize32">;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl
index 49ffbf4517160..cb106805d24bd 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl
@@ -5,42 +5,72 @@
 typedef int v4i __attribute__((ext_vector_type(4)));
 typedef int v8i __attribute__((ext_vector_type(8)));
 
+static v4i v4i_zeros = (v4i){0,0,0,0};
+static v8i v8i_zeros = (v8i){0,0,0,0,0,0,0,0};
+
 // CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds(
 // CHECK-GFX1250-NEXT:  entry:
-// CHECK-GFX1250-NEXT:    tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 0)
+// CHECK-GFX1250-NEXT:    tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> zeroinitializer, i32 0)
 // CHECK-GFX1250-NEXT:    ret void
 //
 void test_amdgcn_tensor_load_to_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
 {
-  __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, 0);
+  __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, v8i_zeros, 0);
 }
 
 // CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds_d2(
 // CHECK-GFX1250-NEXT:  entry:
-// CHECK-GFX1250-NEXT:    tail call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 27)
+// CHECK-GFX1250-NEXT:    tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 27)
 // CHECK-GFX1250-NEXT:    ret void
 //
 void test_amdgcn_tensor_load_to_lds_d2(v4i sg0, v8i sg1)
 {
-  __builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, 27);
+  __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, v4i_zeros, v4i_zeros, v8i_zeros, 27);
 }
 
 // CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds(
 // CHECK-GFX1250-NEXT:  entry:
-// CHECK-GFX1250-NEXT:    tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 22)
+// CHECK-GFX1250-NEXT:    tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> zeroinitializer, i32 22)
 // CHECK-GFX1250-NEXT:    ret void
 //
 void test_amdgcn_tensor_store_from_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
 {
-  __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, 22);
+  __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, v8i_zeros, 22);
 }
 
 // CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds_d2(
 // CHECK-GFX1250-NEXT:  entry:
-// CHECK-GFX1250-NEXT:    tail call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 0)
+// CHECK-GFX1250-NEXT:    tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0)
 // CHECK-GFX1250-NEXT:    ret void
 //
 void test_amdgcn_tensor_store_from_lds_d2(v4i sg0, v8i sg1)
 {
-  __builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, 0);
+  __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, v4i_zeros, v4i_zeros, v8i_zeros, 0);
+}
+
+//=======================================================================
+// It is fine to pass 5 arguments as the tensor descriptor, but the fifth
+// one will be ignored by LLVM CodeGen for gfx1250, which only supports D#
+// up to 4 groups.
+//========================================================================
+
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds_d5(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> [[SG4:%.*]], i32 0)
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_tensor_load_to_lds_d5(v4i sg0, v8i sg1, v4i sg2, v4i sg3, v8i sg4)
+{
+  __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, sg4, 0);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds_d5(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> [[SG4:%.*]], i32 0)
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_tensor_store_from_lds_d5(v4i sg0, v8i sg1, v4i sg2, v4i sg3, v8i sg4)
+{
+  __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, sg4, 0);
 }
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
index 8ab4f43d70c40..295707b53ed18 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
@@ -183,12 +183,10 @@ void test_amdgcn_async_load_store_lds_cpol(global char* gaddr8, global int *gadd
   __builtin_amdgcn_global_store_async_from_lds_b128(gaddr128, laddr128, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b128' must be a constant integer}}
 }
 
-void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol)
+void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, v8i sg4, int cpol)
 {
-  __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds' must be a constant integer}}
-  __builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds_d2' must be a constant integer}}
-  __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds' must be a constant integer}}
-  __builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds_d2' must be a constant integer}}
+  __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, sg4, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds' must be a constant integer}}
+  __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, sg4, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds' must be a constant integer}}
 }
 
 void test_prefetch(generic void *fptr, global void *gptr, int cpol) {
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 66591519de73e..9101666c2a49c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -4194,41 +4194,24 @@ def int_amdgcn_swmmac_f16_16x16x128_bf8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm
 def int_amdgcn_swmmac_i32_16x16x128_iu8 : AMDGPUSWmmacIntrinsicABIdxClamp<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
 }
 
-
 class AMDGPUTensorLoadStore:
   Intrinsic<
     [],
     [llvm_v4i32_ty, // D# group 0
      llvm_v8i32_ty, // D# group 1
-     llvm_v4i32_ty, // D# group 2
-     llvm_v4i32_ty, // D# group 3
+     llvm_v4i32_ty, // D# group 2: groups 2 and 3 should be zero-initialized for tensors up to 2D.
+     llvm_v4i32_ty, // D# group 3:
+     llvm_v8i32_ty, // D# group 4: reserved for future targets, use <8 x i32> zeroinitializer for now.
+                    // This argument will be silently ignored.
      llvm_i32_ty],  // cachepolicy:
                     //   bits [0-2] = th
                     //   bits [3-4] = scope
-    [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<4>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
-    "", [SDNPMemOperand]
-  >;
-
-class AMDGPUTensorLoadStoreD2:
-  Intrinsic<
-    [],
-    [llvm_v4i32_ty, // D# group 0
-     llvm_v8i32_ty, // D# group 1
-     llvm_i32_ty],  // cachepolicy:
-                    //   bits [0-2] = th
-                    //   bits [3-4] = scope
-    [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<2>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
+    [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
     "", [SDNPMemOperand]
   >;
 
-def int_amdgcn_tensor_load_to_lds :
-  ClangBuiltin<"__builtin_amdgcn_tensor_load_to_lds">, AMDGPUTensorLoadStore;
-def int_amdgcn_tensor_store_from_lds :
-  ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds">, AMDGPUTensorLoadStore;
-def int_amdgcn_tensor_load_to_lds_d2 :
-  ClangBuiltin<"__builtin_amdgcn_tensor_load_to_lds_d2">, AMDGPUTensorLoadStoreD2;
-def int_amdgcn_tensor_store_from_lds_d2 :
-  ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds_d2">, AMDGPUTensorLoadStoreD2;
+def int_amdgcn_tensor_load_to_lds : ClangBuiltin<"__builtin_amdgcn_tensor_load_to_lds">, AMDGPUTensorLoadStore;
+def int_amdgcn_tensor_store_from_lds : ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds">, AMDGPUTensorLoadStore;
 
 class AMDGPUClusterLoad<LLVMType ptr_ty>:
   Intrinsic<
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 238f06fbd33c0..b0a26495c014d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3005,6 +3005,37 @@ void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
 }
 
+void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
+  bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds;
+  unsigned Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS :
+                          AMDGPU::TENSOR_STORE_FROM_LDS;
+
+  SmallVector<SDValue, 7> TensorOps;
+  // First two groups
+  TensorOps.push_back(N->getOperand(2)); // D# group 0
+  TensorOps.push_back(N->getOperand(3)); // D# group 1
+
+  // Use _D2 version if both groups 2 and 3 are zero-initialized.
+  SDValue Group2 = N->getOperand(4);
+  SDValue Group3 = N->getOperand(5);
+  if (ISD::isBuildVectorAllZeros(Group2.getNode()) &&
+      ISD::isBuildVectorAllZeros(Group3.getNode())) {
+    Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D2 :
+                   AMDGPU::TENSOR_STORE_FROM_LDS_D2;
+  } else { // Has at least 4 groups
+    TensorOps.push_back(Group2); // D# group 2
+    TensorOps.push_back(Group3); // D# group 3
+  }
+
+  // TODO: Handle the fifth group: N->getOperand(6), which is silently ignored
+  // for now because all existing targets only support up to 4 groups.
+  TensorOps.push_back(CurDAG->getTargetConstant(0, SDLoc(N), MVT::i1)); // r128
+  TensorOps.push_back(N->getOperand(7)); // cache policy
+  TensorOps.push_back(N->getOperand(0)); // chain
+
+  (void)CurDAG->SelectNodeTo(N, Opc, MVT::Other, TensorOps);
+}
+
 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
   switch (IntrID) {
   case Intrinsic::amdgcn_ds_gws_init:
@@ -3287,6 +3318,10 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
   case Intrinsic::amdgcn_ds_gws_sema_release_all:
     SelectDS_GWS(N, IntrID);
     return;
+  case Intrinsic::amdgcn_tensor_load_to_lds:
+  case Intrinsic::amdgcn_tensor_store_from_lds:
+    SelectTensorLoadStore(N, IntrID);
+    return;
   default:
     break;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index a86b75458923e..ffeb6dfdb3f90 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -285,6 +285,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   void SelectFP_EXTEND(SDNode *N);
   void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
   void SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID);
+  void SelectTensorLoadStore(SDNode *N, unsigned IntrID);
   void SelectDS_GWS(SDNode *N, unsigned IntrID);
   void SelectInterpP1F16(SDNode *N);
   void SelectINTRINSIC_W_CHAIN(SDNode *N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 0ebe69de56fa9..02879a7bba897 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1821,27 +1821,6 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     NewII->takeName(&II);
     return IC.replaceInstUsesWith(II, NewII);
   }
-  case Intrinsic::amdgcn_tensor_load_to_lds:
-  case Intrinsic::amdgcn_tensor_store_from_lds: {
-    Value *D2 = II.getArgOperand(2);
-    Value *D3 = II.getArgOperand(3);
-    // We know that not passing the second and third tensor DMA groups is
-    // equivalent to passing zeroes for those registers, so we rewrite to the
-    // shorter form here. Undef or poison are replaced by 0.
-    auto Pred = m_CombineOr(m_Zero(), m_Undef());
-    if (!match(D2, Pred) || !match(D3, Pred))
-      return std::nullopt;
-
-    auto ShortIntrinsic = IID == Intrinsic::amdgcn_tensor_load_to_lds
-                              ? Intrinsic::amdgcn_tensor_load_to_lds_d2
-                              : Intrinsic::amdgcn_tensor_store_from_lds_d2;
-    CallInst *NewII = IC.Builder.CreateIntrinsic(
-        ShortIntrinsic,
-        {II.getArgOperand(0), II.getArgOperand(1), II.getArgOperand(4)});
-    NewII->takeName(&II);
-    NewII->copyMetadata(II);
-    return IC.eraseInstFromFunction(II);
-  }
   case Intrinsic::amdgcn_wave_shuffle: {
     if (!ST->hasDPP())
       return std::nullopt;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 7f913cfca5d7c..14c50a52b08a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2393,6 +2393,9 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
   case Intrinsic::amdgcn_global_load_lds:
   case Intrinsic::amdgcn_global_load_async_lds:
     return selectGlobalLoadLds(I);
+  case Intrinsic::amdgcn_tensor_load_to_lds:
+  case Intrinsic::amdgcn_tensor_store_from_lds:
+    return selectTensorLoadStore(I, IntrinsicID);
   case Intrinsic::amdgcn_asyncmark:
   case Intrinsic::amdgcn_wait_asyncmark:
     // FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
@@ -3787,6 +3790,47 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
   return true;
 }
 
+bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
+                                                      Intrinsic::ID IID) const {
+  bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
+  unsigned Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS :
+                          AMDGPU::TENSOR_STORE_FROM_LDS;
+  int NumGroups = 4;
+
+  // A lambda function to check whether an operand is a vector of all 0s.
+  const auto isAllZeros = [&](MachineOperand &Opnd) {
+    const MachineInstr *DefMI = MRI->getVRegDef(Opnd.getReg());
+    if (!DefMI)
+      return false;
+    return llvm::isBuildVectorAllZeros(*DefMI, *MRI, true);
+  };
+
+  // Use _D2 version if both groups 2 and 3 are zero-initialized.
+  if (isAllZeros(MI.getOperand(3)) && isAllZeros(MI.getOperand(4))) {
+    NumGroups = 2;
+    Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D2 :
+                   AMDGPU::TENSOR_STORE_FROM_LDS_D2;
+  }
+
+  // TODO: Handle the fifth group: MI.getOperand(5), which is silently ignored
+  // for now because all existing targets only support up to 4 groups.
+  MachineBasicBlock *MBB = MI.getParent();
+  auto MIB = BuildMI(*MBB, &MI, MI.getDebugLoc(), TII.get(Opc))
+                 .add(MI.getOperand(1))  // D# group 0
+                 .add(MI.getOperand(2)); // D# group 1
+
+  if (NumGroups >= 4) { // Has at least 4 groups
+    MIB.add(MI.getOperand(3))  // D# group 2
+       .add(MI.getOperand(4)); // D# group 3
+  }
+
+  MIB.addImm(0)              // r128
+     .add(MI.getOperand(6)); // cpol
+
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
     MachineInstr &MI) const {
   unsigned OpcodeOpIdx =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 627cce277ae38..98c4e7837a1ff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -145,6 +145,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const;
   bool selectBufferLoadLds(MachineInstr &MI) const;
   bool selectGlobalLoadLds(MachineInstr &MI) const;
+  bool selectTensorLoadStore(MachineInstr &MI, Intrinsic::ID IID) const;
   bool selectBVHIntersectRayIntrinsic(MachineInstr &I) const;
   bool selectSMFMACIntrin(MachineInstr &I) const;
   bool selectPermlaneSwapIntrin(MachineInstr &I, Intrinsic::ID IntrID) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index e8f316d332321..7e047278fe78f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3388,12 +3388,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     constrainOpWithReadfirstlane(B, MI, 2);
     constrainOpWithReadfirstlane(B, MI, 3);
     constrainOpWithReadfirstlane(B, MI, 4);
-    return;
-  }
-  case Intrinsic::amdgcn_tensor_load_to_lds_d2:
-  case Intrinsic::amdgcn_tensor_store_from_lds_d2: {
-    constrainOpWithReadfirstlane(B, MI, 1);
-    constrainOpWithReadfirstlane(B, MI, 2);
+    constrainOpWithReadfirstlane(B, MI, 5);
     return;
   }
   default: {
@@ -5636,8 +5631,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   }
   case Intrinsic::amdgcn_pops_exiting_wave_id:
     return getDefaultMappingSOP(MI);
-  case Intrinsic::amdgcn_tensor_load_to_lds_d2:
-  case Intrinsic::amdgcn_tensor_store_from_lds_d2:
   case Intrinsic::amdgcn_tensor_load_to_lds:
   case Intrinsic::amdgcn_tensor_store_from_lds: {
     // Lie and claim everything is legal, even all operands need to be
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index b023c96296b2c..0521e199c31dd 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions....
[truncated]
``````````

</details>

https://github.com/llvm/llvm-project/pull/182334
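For downstream code that used the removed `_d2` builtins, migration is mechanical: pass explicit zero vectors for the trailing descriptor groups. A minimal sketch in OpenCL C, based on the updated tests in this patch (the `load_2d_tensor` wrapper name is hypothetical):

```c
typedef int v4i __attribute__((ext_vector_type(4)));
typedef int v8i __attribute__((ext_vector_type(8)));

// Replaces the removed __builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, 0).
// Groups 2 and 3 must be zero-initialized for tensors up to 2D; group 4 is
// reserved and silently ignored by gfx1250 codegen. Per this patch, the
// backend still selects the shorter _D2 instruction when it can prove that
// groups 2 and 3 are all zeros.
void load_2d_tensor(v4i sg0, v8i sg1)
{
  __builtin_amdgcn_tensor_load_to_lds(sg0, sg1,
                                      (v4i){0, 0, 0, 0},             // D# group 2
                                      (v4i){0, 0, 0, 0},             // D# group 3
                                      (v8i){0, 0, 0, 0, 0, 0, 0, 0}, // D# group 4 (reserved)
                                      0);  // cachepolicy, must be a constant integer
}
```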
