https://github.com/changpeng created https://github.com/llvm/llvm-project/pull/146289
None >From fc2039dcf338f04977b2a0b43e8714cb5eb0f440 Mon Sep 17 00:00:00 2001 From: Changpeng Fang <changpeng.f...@amd.com> Date: Fri, 27 Jun 2025 14:59:33 -0700 Subject: [PATCH] AMDGPU: Implement intrinsic/builtins for gfx1250 load transpose instructions --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 13 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 36 ++ .../builtins-amdgcn-gfx1250-load-tr.cl | 130 +++++++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 6 + .../Target/AMDGPU/AMDGPUSearchableTables.td | 6 + llvm/lib/Target/AMDGPU/DSInstructions.td | 17 +- llvm/lib/Target/AMDGPU/FLATInstructions.td | 5 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 12 + .../UniformityAnalysis/AMDGPU/intrinsics.ll | 72 ++++ .../AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll | 322 ++++++++++++++++++ 11 files changed, 622 insertions(+), 3 deletions(-) create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 1d1f5a4ee3f9f..4e28f3bb7ef81 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -640,6 +640,19 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16 // GFX1250+ only builtins. 
//===----------------------------------------------------------------------===// +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr4_b64_v2i32, "V2iV2i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr8_b64_v2i32, "V2iV2i*1", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr6_b96_v3i32, "V3iV3i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr16_b128_v8i16, "V8sV8s*1", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr16_b128_v8f16, "V8hV8h*1", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr16_b128_v8bf16, "V8yV8y*1", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr4_b64_v2i32, "V2iV2i*3", "nc", "transpose-load-f4f6-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr6_b96_v3i32, "V3iV3i*3", "nc", "transpose-load-f4f6-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8i16, "V8sV8s*3", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8f16, "V8hV8h*3", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8bf16, "V8yV8y*3", "nc", "gfx1250-insts,wavefrontsize32") + TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 982e5cd37ffd1..f09b3b92c4ea0 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -545,6 +545,18 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case 
AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16: case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32: case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32: case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32: @@ -555,6 +567,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32: IID = Intrinsic::amdgcn_global_load_tr_b64; break; case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: @@ -563,8 +576,31 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16: IID = Intrinsic::amdgcn_global_load_tr_b128; break; + case 
AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32: + IID = Intrinsic::amdgcn_global_load_tr4_b64; + break; + case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32: + IID = Intrinsic::amdgcn_global_load_tr6_b96; + break; + case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32: + IID = Intrinsic::amdgcn_ds_load_tr4_b64; + break; + case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32: + IID = Intrinsic::amdgcn_ds_load_tr6_b96; + break; + case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32: + IID = Intrinsic::amdgcn_ds_load_tr8_b64; + break; + case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16: + IID = Intrinsic::amdgcn_ds_load_tr16_b128; + break; case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32: IID = Intrinsic::amdgcn_ds_read_tr4_b64; break; diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl new file mode 100644 index 0000000000000..1e3a88a41f90e --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl @@ -0,0 +1,130 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v3i __attribute__((ext_vector_type(3))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef __bf16 v8y __attribute__((ext_vector_type(8))); + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr4_b64_v2i32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32(ptr 
addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]] +// +v2i test_amdgcn_global_load_tr4_b64_v2i32(global v2i* inptr) +{ + return __builtin_amdgcn_global_load_tr4_b64_v2i32(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr8_b64_v2i32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]] +// +v2i test_amdgcn_global_load_tr8_b64_v2i32(global v2i* inptr) +{ + return __builtin_amdgcn_global_load_tr8_b64_v2i32(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr6_b96_v3i32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <3 x i32> [[TMP0]] +// +v3i test_amdgcn_global_load_tr6_b96_v3i32(global v3i* inptr) +{ + return __builtin_amdgcn_global_load_tr6_b96_v3i32(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr16_b128_v8i16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <8 x i16> [[TMP0]] +// +v8s test_amdgcn_global_load_tr16_b128_v8i16(global v8s* inptr) +{ + return __builtin_amdgcn_global_load_tr16_b128_v8i16(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr16_b128_v8f16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <8 x half> [[TMP0]] +// +v8h test_amdgcn_global_load_tr16_b128_v8f16(global v8h* inptr) +{ + return __builtin_amdgcn_global_load_tr16_b128_v8f16(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr16_b128_v8bf16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call 
<8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <8 x bfloat> [[TMP0]] +// +v8y test_amdgcn_global_load_tr16_b128_v8bf16(global v8y* inptr) +{ + return __builtin_amdgcn_global_load_tr16_b128_v8bf16(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr4_b64_v2i32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32(ptr addrspace(3) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]] +// +v2i test_amdgcn_ds_load_tr4_b64_v2i32(local v2i* inptr) +{ + return __builtin_amdgcn_ds_load_tr4_b64_v2i32(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr8_b64_v2i32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]] +// +v2i test_amdgcn_ds_load_tr8_b64_v2i32(local v2i* inptr) +{ + return __builtin_amdgcn_ds_load_tr8_b64_v2i32(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr6_b96_v3i32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32(ptr addrspace(3) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <3 x i32> [[TMP0]] +// +v3i test_amdgcn_ds_load_tr6_b96_v3i32(local v3i* inptr) +{ + return __builtin_amdgcn_ds_load_tr6_b96_v3i32(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr16_b128_v8i16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16(ptr addrspace(3) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <8 x i16> [[TMP0]] +// +v8s test_amdgcn_ds_load_tr16_b128_v8i16(local v8s* inptr) +{ + return __builtin_amdgcn_ds_load_tr16_b128_v8i16(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr16_b128_v8f16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> 
@llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <8 x half> [[TMP0]] +// +v8h test_amdgcn_ds_load_tr16_b128_v8f16(local v8h* inptr) +{ + return __builtin_amdgcn_ds_load_tr16_b128_v8f16(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr16_b128_v8bf16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16(ptr addrspace(3) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <8 x bfloat> [[TMP0]] +// +v8y test_amdgcn_ds_load_tr16_b128_v8bf16(local v8y* inptr) +{ + return __builtin_amdgcn_ds_load_tr16_b128_v8bf16(inptr); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 6f974c97361de..ce37702b91486 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2996,6 +2996,12 @@ class AMDGPULoadIntrinsic<LLVMType ptr_ty>: def int_amdgcn_global_load_tr_b64 : AMDGPULoadIntrinsic<global_ptr_ty>; def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>; +def int_amdgcn_global_load_tr4_b64 : AMDGPULoadIntrinsic<global_ptr_ty>; +def int_amdgcn_global_load_tr6_b96 : AMDGPULoadIntrinsic<global_ptr_ty>; +def int_amdgcn_ds_load_tr8_b64 : AMDGPULoadIntrinsic<local_ptr_ty>; +def int_amdgcn_ds_load_tr16_b128 : AMDGPULoadIntrinsic<local_ptr_ty>; +def int_amdgcn_ds_load_tr4_b64 : AMDGPULoadIntrinsic<local_ptr_ty>; +def int_amdgcn_ds_load_tr6_b96 : AMDGPULoadIntrinsic<local_ptr_ty>; def int_amdgcn_ds_read_tr4_b64 : AMDGPULoadIntrinsic<local_ptr_ty>; def int_amdgcn_ds_read_tr6_b96 : AMDGPULoadIntrinsic<local_ptr_ty>; def int_amdgcn_ds_read_tr8_b64 : AMDGPULoadIntrinsic<local_ptr_ty>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index b20760c356263..6874657a4ffe7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ 
-5105,6 +5105,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: + case Intrinsic::amdgcn_global_load_tr4_b64: + case Intrinsic::amdgcn_global_load_tr6_b96: + case Intrinsic::amdgcn_ds_load_tr8_b64: + case Intrinsic::amdgcn_ds_load_tr16_b128: + case Intrinsic::amdgcn_ds_load_tr4_b64: + case Intrinsic::amdgcn_ds_load_tr6_b96: case Intrinsic::amdgcn_ds_read_tr4_b64: case Intrinsic::amdgcn_ds_read_tr6_b96: case Intrinsic::amdgcn_ds_read_tr8_b64: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 3b62dcf3c92cd..1f6002a3c6a20 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -344,6 +344,12 @@ def : SourceOfDivergence<intr>; def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>; def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>; +def : SourceOfDivergence<int_amdgcn_global_load_tr4_b64>; +def : SourceOfDivergence<int_amdgcn_global_load_tr6_b96>; +def : SourceOfDivergence<int_amdgcn_ds_load_tr8_b64>; +def : SourceOfDivergence<int_amdgcn_ds_load_tr16_b128>; +def : SourceOfDivergence<int_amdgcn_ds_load_tr4_b64>; +def : SourceOfDivergence<int_amdgcn_ds_load_tr6_b96>; def : SourceOfDivergence<int_amdgcn_ds_read_tr4_b64>; def : SourceOfDivergence<int_amdgcn_ds_read_tr6_b96>; diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 445ba9a26d336..f824253ce0f35 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -789,12 +789,12 @@ let WaveSizePredicate = isWave32, mayStore = 0 in { let OtherPredicates = [HasTransposeLoadF4F6Insts] in { defm DS_LOAD_TR4_B64 : DS_1A_RET_NoM0<"ds_load_tr4_b64", VReg_64>; defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VReg_96>; -} // 
let OtherPredicates = [HasTransposeLoadF4F6Insts] +} // End OtherPredicates = [HasTransposeLoadF4F6Insts] defm DS_LOAD_TR8_B64 : DS_1A_RET_NoM0<"ds_load_tr8_b64", VReg_64>; defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VReg_128>; -} // let WaveSizePredicate = isWave32, mayStore = 0 +} // End WaveSizePredicate = isWave32, mayStore = 0 -} // let SubtargetPredicate = isGFX1250Plus +} // End SubtargetPredicate = isGFX1250Plus let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in { defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", VReg_64>; @@ -1276,6 +1276,17 @@ class DSLoadTrPat <DS_Pseudo inst, ValueType vt, SDPatternOperator node> : GCNPa (inst $ptr, Offset:$offset, (i1 0)) >; +let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus in { +let OtherPredicates = [HasTransposeLoadF4F6Insts] in { + def : DSLoadTrPat <DS_LOAD_TR4_B64, v2i32, int_amdgcn_ds_load_tr4_b64>; + def : DSLoadTrPat <DS_LOAD_TR6_B96, v3i32, int_amdgcn_ds_load_tr6_b96>; +} // End OtherPredicates = [HasTransposeLoadF4F6Insts] + + def : DSLoadTrPat <DS_LOAD_TR8_B64, v2i32, int_amdgcn_ds_load_tr8_b64>; + foreach vt = [v8i16, v8f16, v8bf16] in + def : DSLoadTrPat <DS_LOAD_TR16_B128, vt, int_amdgcn_ds_load_tr16_b128>; +} // End WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus + let SubtargetPredicate = HasGFX950Insts in { def : DSLoadTrPat <DS_READ_B64_TR_B4, v2i32, int_amdgcn_ds_read_tr4_b64>; def : DSLoadTrPat <DS_READ_B64_TR_B8, v2i32, int_amdgcn_ds_read_tr8_b64>; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index c4db88b6e5105..dc6dbcef1f033 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1848,6 +1848,11 @@ let WaveSizePredicate = isWave64, OtherPredicates = [isGFX12PlusNot12_50] in { defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, vt>; } +let WaveSizePredicate = 
isWave32, OtherPredicates = [HasTransposeLoadF4F6Insts] in { + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR4_B64, int_amdgcn_global_load_tr4_b64, v2i32>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR6_B96, int_amdgcn_global_load_tr6_b96, v3i32>; +} + let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8d7dcf8c4a064..bb1de58e04fbc 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1444,6 +1444,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MOVolatile; return true; } + case Intrinsic::amdgcn_ds_load_tr6_b96: + case Intrinsic::amdgcn_ds_load_tr4_b64: + case Intrinsic::amdgcn_ds_load_tr8_b64: + case Intrinsic::amdgcn_ds_load_tr16_b128: + case Intrinsic::amdgcn_global_load_tr6_b96: + case Intrinsic::amdgcn_global_load_tr4_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: case Intrinsic::amdgcn_ds_read_tr4_b64: @@ -1548,6 +1554,10 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: + case Intrinsic::amdgcn_ds_load_tr8_b64: + case Intrinsic::amdgcn_ds_load_tr16_b128: + case Intrinsic::amdgcn_ds_load_tr4_b64: + case Intrinsic::amdgcn_ds_load_tr6_b96: case Intrinsic::amdgcn_ds_read_tr4_b64: case Intrinsic::amdgcn_ds_read_tr6_b96: case Intrinsic::amdgcn_ds_read_tr8_b64: @@ -1562,6 +1572,8 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_global_load_tr_b64: case 
Intrinsic::amdgcn_global_load_tr_b128: + case Intrinsic::amdgcn_global_load_tr4_b64: + case Intrinsic::amdgcn_global_load_tr6_b96: Ptr = II->getArgOperand(0); break; case Intrinsic::amdgcn_load_to_lds: diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 1028cc9ebb342..bd7464577b7db 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -261,6 +261,70 @@ bb: ret void } +; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32(ptr addrspace(1) %addr) +define amdgpu_kernel void @global_load_tr4_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32(ptr addrspace(1) %addr) + store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32(ptr addrspace(1) %addr) +define amdgpu_kernel void @global_load_tr6_b96_v3i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32(ptr addrspace(1) %addr) + store <3 x i32> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) %addr) +define amdgpu_kernel void @ds_load_tr8_b64_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) %addr) + store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32(ptr addrspace(3) %addr) +define amdgpu_kernel void @ds_load_tr4_b64_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32(ptr addrspace(3) %addr) + store <2 x i32> %tmp0, ptr addrspace(1) %out, 
align 8 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32(ptr addrspace(3) %addr) +define amdgpu_kernel void @ds_load_tr6_b96_v3i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32(ptr addrspace(3) %addr) + store <3 x i32> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16(ptr addrspace(3) %addr) +define amdgpu_kernel void @ds_load_tr16_b128_v8i16(ptr addrspace(3) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16(ptr addrspace(3) %addr) + store <8 x i16> %tmp0, ptr addrspace(1) %out, align 16 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) %addr) +define amdgpu_kernel void @ds_load_tr16_b128_v8f16(ptr addrspace(3) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) %addr) + store <8 x half> %tmp0, ptr addrspace(1) %out, align 16 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16(ptr addrspace(3) %addr) +define amdgpu_kernel void @ds_load_tr16_b128_v8bf16(ptr addrspace(3) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16(ptr addrspace(3) %addr) + store <8 x bfloat> %tmp0, ptr addrspace(1) %out, align 16 + ret void +} + declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3)) ; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32(ptr addrspace(3) %gep) @@ -563,6 +627,14 @@ declare i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1)) declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1)) declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1)) declare <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr 
addrspace(1)) +declare <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32(ptr addrspace(1)) +declare <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32(ptr addrspace(1)) +declare <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3)) +declare <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32(ptr addrspace(3)) +declare <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32(ptr addrspace(3)) +declare <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16(ptr addrspace(3)) +declare <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3)) +declare <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16(ptr addrspace(3)) declare i32 @llvm.amdgcn.dead.i32() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll new file mode 100644 index 0000000000000..d91b03ca4461d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll @@ -0,0 +1,322 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +declare <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32.p1(ptr addrspace(1)) +declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1)) +declare <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1)) +declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1)) +declare <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1)) +declare <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1)) + +declare <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32.p3(ptr addrspace(3)) +declare <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32.p3(ptr addrspace(3)) +declare <3 x i32> 
@llvm.amdgcn.ds.load.tr6.b96.v3i32.p3(ptr addrspace(3)) +declare <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16.p3(ptr addrspace(3)) +declare <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16.p3(ptr addrspace(3)) +declare <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16.p3(ptr addrspace(3)) + + +define amdgpu_ps void @global_load_tr4_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_tr4_b64_vaddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_load_tr4_b64 v[0:1], v[0:1], off offset:32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32.p1(ptr addrspace(1) %gep) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_tr4_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_tr4_b64_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_load_tr4_b64 v[2:3], v2, s[0:1] offset:32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32.p1(ptr addrspace(1) %gep) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_tr8_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_tr8_b64_vaddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_load_tr8_b64 v[0:1], v[0:1], off offset:32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1) %gep) + 
store <2 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_tr8_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_tr8_b64_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_load_tr8_b64 v[2:3], v2, s[0:1] offset:32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1) %gep) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_tr6_b96_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_tr6_b96_vaddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_load_tr6_b96 v[4:6], v[0:1], off offset:32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b96 v[2:3], v[4:6], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep) + store <3 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_tr6_b96_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_tr6_b96_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_load_tr6_b96 v[2:4], v2, s[0:1] offset:32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b96 v[0:1], v[2:4], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32.p1(ptr addrspace(1) %gep) + store <3 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_tr16_b128_v8i16_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: 
global_load_tr16_b128_v8i16_vaddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_load_tr16_b128 v[4:7], v[0:1], off offset:32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1) %gep) + store <8 x i16> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_tr16_b128_v8i16_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_tr16_b128_v8i16_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_load_tr16_b128 v[2:5], v2, s[0:1] offset:32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1) %gep) + store <8 x i16> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_tr16_b128_v8f16_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_tr16_b128_v8f16_vaddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_load_tr16_b128 v[4:7], v[0:1], off offset:32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1) %gep) + store <8 x half> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_tr16_b128_v8f16_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_tr16_b128_v8f16_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_load_tr16_b128 v[2:5], v2, s[0:1] offset:32 
+; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1) %gep) + store <8 x half> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_tr16_b128_v8bf16_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_tr16_b128_v8bf16_vaddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_load_tr16_b128 v[4:7], v[0:1], off offset:32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1) %gep) + store <8 x bfloat> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @global_load_tr16_b128_v8bf16_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: global_load_tr16_b128_v8bf16_saddr: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: global_load_tr16_b128 v[2:5], v2, s[0:1] offset:32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1) %gep) + store <8 x bfloat> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @ds_load_tr4_b64(ptr addrspace(3) %addr, ptr addrspace(1) %use) { +; GFX1250-SDAG-LABEL: ds_load_tr4_b64: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: ds_load_tr4_b64 v[0:1], v0 offset:32 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-SDAG-NEXT: 
s_endpgm +; +; GFX1250-GISEL-LABEL: ds_load_tr4_b64: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: ds_load_tr4_b64 v[0:1], v0 offset:32 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %val = call <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32.p3(ptr addrspace(3) %gep) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @ds_load_tr8_b64(ptr addrspace(3) %addr, ptr addrspace(1) %use) { +; GFX1250-SDAG-LABEL: ds_load_tr8_b64: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: ds_load_tr8_b64 v[0:1], v0 offset:32 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: ds_load_tr8_b64: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: ds_load_tr8_b64 v[0:1], v0 offset:32 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %val = call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32.p3(ptr addrspace(3) %gep) + store <2 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @ds_load_tr6_b96(ptr addrspace(3) %addr, ptr addrspace(1) %use) { +; GFX1250-SDAG-LABEL: ds_load_tr6_b96: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX1250-SDAG-NEXT: ds_load_tr6_b96 v[0:2], v0 offset:32 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_store_b96 v[4:5], v[0:2], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: 
ds_load_tr6_b96: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: ds_load_tr6_b96 v[0:2], v0 offset:32 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_store_b96 v[4:5], v[0:2], off +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %val = call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32.p3(ptr addrspace(3) %gep) + store <3 x i32> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @ds_load_tr16_b128_v8i16(ptr addrspace(3) %addr, ptr addrspace(1) %use) { +; GFX1250-SDAG-LABEL: ds_load_tr16_b128_v8i16: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX1250-SDAG-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: ds_load_tr16_b128_v8i16: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %val = call <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16.p3(ptr addrspace(3) %gep) + store <8 x i16> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @ds_load_tr16_b128_v8f16(ptr addrspace(3) %addr, ptr addrspace(1) %use) { +; GFX1250-SDAG-LABEL: ds_load_tr16_b128_v8f16: +; GFX1250-SDAG: ; %bb.0: ; %entry +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX1250-SDAG-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; 
GFX1250-GISEL-LABEL: ds_load_tr16_b128_v8f16: +; GFX1250-GISEL: ; %bb.0: ; %entry +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX1250-GISEL-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %val = call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16.p3(ptr addrspace(3) %gep) + store <8 x half> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_ps void @ds_load_tr16_b128_v8bf16(ptr addrspace(3) %addr, ptr addrspace(1) %use) { +; GFX1250-LABEL: ds_load_tr16_b128_v8bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX1250-NEXT: ds_load_tr16_b128 v[0:3], v0 offset:32 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX1250-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 + %val = call <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16.p3(ptr addrspace(3) %gep) + store <8 x bfloat> %val, ptr addrspace(1) %use + ret void +} _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits