llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: Changpeng Fang (changpeng) <details> <summary>Changes</summary> --- Patch is 37.64 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/146289.diff 11 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+13) - (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+36) - (added) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl (+130) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+6) - (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+6) - (modified) llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td (+6) - (modified) llvm/lib/Target/AMDGPU/DSInstructions.td (+14-3) - (modified) llvm/lib/Target/AMDGPU/FLATInstructions.td (+5) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+12) - (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll (+72) - (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll (+322) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 1d1f5a4ee3f9f..4e28f3bb7ef81 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -640,6 +640,19 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16 // GFX1250+ only builtins. //===----------------------------------------------------------------------===// +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr4_b64_v2i32, "V2iV2i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr8_b64_v2i32, "V2iV2i*1", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr6_b96_v3i32, "V3iV3i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr16_b128_v8i16, "V8sV8s*1", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr16_b128_v8f16, "V8hV8h*1", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr16_b128_v8bf16, "V8yV8y*1", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr4_b64_v2i32, "V2iV2i*3", "nc", "transpose-load-f4f6-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr6_b96_v3i32, "V3iV3i*3", "nc", "transpose-load-f4f6-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8i16, "V8sV8s*3", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8f16, "V8hV8h*3", "nc", "gfx1250-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8bf16, "V8yV8y*3", "nc", "gfx1250-insts,wavefrontsize32") + TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 982e5cd37ffd1..f09b3b92c4ea0 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -545,6 +545,18 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16: case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32: case AMDGPU::BI__builtin_amdgcn_ds_read_tr8_b64_v2i32: case AMDGPU::BI__builtin_amdgcn_ds_read_tr6_b96_v3i32: @@ -555,6 +567,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr8_b64_v2i32: IID = Intrinsic::amdgcn_global_load_tr_b64; break; case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: @@ -563,8 +576,31 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr16_b128_v8bf16: IID = Intrinsic::amdgcn_global_load_tr_b128; break; + case AMDGPU::BI__builtin_amdgcn_global_load_tr4_b64_v2i32: + IID = Intrinsic::amdgcn_global_load_tr4_b64; + break; + case AMDGPU::BI__builtin_amdgcn_global_load_tr6_b96_v3i32: + IID = Intrinsic::amdgcn_global_load_tr6_b96; + break; + case AMDGPU::BI__builtin_amdgcn_ds_load_tr4_b64_v2i32: + IID = Intrinsic::amdgcn_ds_load_tr4_b64; + break; + case AMDGPU::BI__builtin_amdgcn_ds_load_tr6_b96_v3i32: + IID = Intrinsic::amdgcn_ds_load_tr6_b96; + break; + case AMDGPU::BI__builtin_amdgcn_ds_load_tr8_b64_v2i32: + IID = Intrinsic::amdgcn_ds_load_tr8_b64; + break; + case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_ds_load_tr16_b128_v8bf16: + IID = Intrinsic::amdgcn_ds_load_tr16_b128; + break; case AMDGPU::BI__builtin_amdgcn_ds_read_tr4_b64_v2i32: IID = Intrinsic::amdgcn_ds_read_tr4_b64; break; diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl new file mode 100644 index 0000000000000..1e3a88a41f90e --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl @@ -0,0 +1,130 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v3i __attribute__((ext_vector_type(3))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef __bf16 v8y __attribute__((ext_vector_type(8))); + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr4_b64_v2i32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]] +// +v2i test_amdgcn_global_load_tr4_b64_v2i32(global v2i* inptr) +{ + return __builtin_amdgcn_global_load_tr4_b64_v2i32(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr8_b64_v2i32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]] +// +v2i test_amdgcn_global_load_tr8_b64_v2i32(global v2i* inptr) +{ + return __builtin_amdgcn_global_load_tr8_b64_v2i32(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr6_b96_v3i32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <3 x i32> [[TMP0]] +// +v3i test_amdgcn_global_load_tr6_b96_v3i32(global v3i* inptr) +{ + return __builtin_amdgcn_global_load_tr6_b96_v3i32(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr16_b128_v8i16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <8 x i16> [[TMP0]] +// +v8s test_amdgcn_global_load_tr16_b128_v8i16(global v8s* inptr) +{ + return __builtin_amdgcn_global_load_tr16_b128_v8i16(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr16_b128_v8f16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <8 x half> [[TMP0]] +// +v8h test_amdgcn_global_load_tr16_b128_v8f16(global v8h* inptr) +{ + return __builtin_amdgcn_global_load_tr16_b128_v8f16(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr16_b128_v8bf16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <8 x bfloat> [[TMP0]] +// +v8y test_amdgcn_global_load_tr16_b128_v8bf16(global v8y* inptr) +{ + return __builtin_amdgcn_global_load_tr16_b128_v8bf16(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr4_b64_v2i32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.load.tr4.b64.v2i32(ptr addrspace(3) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]] +// +v2i test_amdgcn_ds_load_tr4_b64_v2i32(local v2i* inptr) +{ + return __builtin_amdgcn_ds_load_tr4_b64_v2i32(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr8_b64_v2i32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]] +// +v2i test_amdgcn_ds_load_tr8_b64_v2i32(local v2i* inptr) +{ + return __builtin_amdgcn_ds_load_tr8_b64_v2i32(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr6_b96_v3i32( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <3 x i32> @llvm.amdgcn.ds.load.tr6.b96.v3i32(ptr addrspace(3) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <3 x i32> [[TMP0]] +// +v3i test_amdgcn_ds_load_tr6_b96_v3i32(local v3i* inptr) +{ + return __builtin_amdgcn_ds_load_tr6_b96_v3i32(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr16_b128_v8i16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.ds.load.tr16.b128.v8i16(ptr addrspace(3) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <8 x i16> [[TMP0]] +// +v8s test_amdgcn_ds_load_tr16_b128_v8i16(local v8s* inptr) +{ + return __builtin_amdgcn_ds_load_tr16_b128_v8i16(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr16_b128_v8f16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <8 x half> [[TMP0]] +// +v8h test_amdgcn_ds_load_tr16_b128_v8f16(local v8h* inptr) +{ + return __builtin_amdgcn_ds_load_tr16_b128_v8f16(inptr); +} + +// CHECK-GFX1250-LABEL: @test_amdgcn_ds_load_tr16_b128_v8bf16( +// CHECK-GFX1250-NEXT: entry: +// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16(ptr addrspace(3) [[INPTR:%.*]]) +// CHECK-GFX1250-NEXT: ret <8 x bfloat> [[TMP0]] +// +v8y test_amdgcn_ds_load_tr16_b128_v8bf16(local v8y* inptr) +{ + return __builtin_amdgcn_ds_load_tr16_b128_v8bf16(inptr); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 6f974c97361de..ce37702b91486 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2996,6 +2996,12 @@ class AMDGPULoadIntrinsic<LLVMType ptr_ty>: def int_amdgcn_global_load_tr_b64 : AMDGPULoadIntrinsic<global_ptr_ty>; def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>; +def int_amdgcn_global_load_tr4_b64 : AMDGPULoadIntrinsic<global_ptr_ty>; +def int_amdgcn_global_load_tr6_b96 : AMDGPULoadIntrinsic<global_ptr_ty>; +def int_amdgcn_ds_load_tr8_b64 : AMDGPULoadIntrinsic<local_ptr_ty>; +def int_amdgcn_ds_load_tr16_b128 : AMDGPULoadIntrinsic<local_ptr_ty>; +def int_amdgcn_ds_load_tr4_b64 : AMDGPULoadIntrinsic<local_ptr_ty>; +def int_amdgcn_ds_load_tr6_b96 : AMDGPULoadIntrinsic<local_ptr_ty>; def int_amdgcn_ds_read_tr4_b64 : AMDGPULoadIntrinsic<local_ptr_ty>; def int_amdgcn_ds_read_tr6_b96 : AMDGPULoadIntrinsic<local_ptr_ty>; def int_amdgcn_ds_read_tr8_b64 : AMDGPULoadIntrinsic<local_ptr_ty>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index b20760c356263..6874657a4ffe7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5105,6 +5105,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: + case Intrinsic::amdgcn_global_load_tr4_b64: + case Intrinsic::amdgcn_global_load_tr6_b96: + case Intrinsic::amdgcn_ds_load_tr8_b64: + case Intrinsic::amdgcn_ds_load_tr16_b128: + case Intrinsic::amdgcn_ds_load_tr4_b64: + case Intrinsic::amdgcn_ds_load_tr6_b96: case Intrinsic::amdgcn_ds_read_tr4_b64: case Intrinsic::amdgcn_ds_read_tr6_b96: case Intrinsic::amdgcn_ds_read_tr8_b64: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 3b62dcf3c92cd..1f6002a3c6a20 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -344,6 +344,12 @@ def : SourceOfDivergence<intr>; def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>; def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>; +def : SourceOfDivergence<int_amdgcn_global_load_tr4_b64>; +def : SourceOfDivergence<int_amdgcn_global_load_tr6_b96>; +def : SourceOfDivergence<int_amdgcn_ds_load_tr8_b64>; +def : SourceOfDivergence<int_amdgcn_ds_load_tr16_b128>; +def : SourceOfDivergence<int_amdgcn_ds_load_tr4_b64>; +def : SourceOfDivergence<int_amdgcn_ds_load_tr6_b96>; def : SourceOfDivergence<int_amdgcn_ds_read_tr4_b64>; def : SourceOfDivergence<int_amdgcn_ds_read_tr6_b96>; diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 445ba9a26d336..f824253ce0f35 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -789,12 +789,12 @@ let WaveSizePredicate = isWave32, mayStore = 0 in { let OtherPredicates = [HasTransposeLoadF4F6Insts] in { defm DS_LOAD_TR4_B64 : DS_1A_RET_NoM0<"ds_load_tr4_b64", VReg_64>; defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VReg_96>; -} // let OtherPredicates = [HasTransposeLoadF4F6Insts] +} // End OtherPredicates = [HasTransposeLoadF4F6Insts] defm DS_LOAD_TR8_B64 : DS_1A_RET_NoM0<"ds_load_tr8_b64", VReg_64>; defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VReg_128>; -} // let WaveSizePredicate = isWave32, mayStore = 0 +} // End WaveSizePredicate = isWave32, mayStore = 0 -} // let SubtargetPredicate = isGFX1250Plus +} // End SubtargetPredicate = isGFX1250Plus let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in { defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", VReg_64>; @@ -1276,6 +1276,17 @@ class DSLoadTrPat <DS_Pseudo inst, ValueType vt, SDPatternOperator node> : GCNPa (inst $ptr, Offset:$offset, (i1 0)) >; +let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus in { +let OtherPredicates = [HasTransposeLoadF4F6Insts] in { + def : DSLoadTrPat <DS_LOAD_TR4_B64, v2i32, int_amdgcn_ds_load_tr4_b64>; + def : DSLoadTrPat <DS_LOAD_TR6_B96, v3i32, int_amdgcn_ds_load_tr6_b96>; +} // End OtherPredicates = [HasTransposeLoadF4F6Insts] + + def : DSLoadTrPat <DS_LOAD_TR8_B64, v2i32, int_amdgcn_ds_load_tr8_b64>; + foreach vt = [v8i16, v8f16, v8bf16] in + def : DSLoadTrPat <DS_LOAD_TR16_B128, vt, int_amdgcn_ds_load_tr16_b128>; +} // End WaveSizePredicate = isWave32, SubtargetPredicate = isGFX1250Plus + let SubtargetPredicate = HasGFX950Insts in { def : DSLoadTrPat <DS_READ_B64_TR_B4, v2i32, int_amdgcn_ds_read_tr4_b64>; def : DSLoadTrPat <DS_READ_B64_TR_B8, v2i32, int_amdgcn_ds_read_tr8_b64>; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index c4db88b6e5105..dc6dbcef1f033 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1848,6 +1848,11 @@ let WaveSizePredicate = isWave64, OtherPredicates = [isGFX12PlusNot12_50] in { defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, vt>; } +let WaveSizePredicate = isWave32, OtherPredicates = [HasTransposeLoadF4F6Insts] in { + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR4_B64, int_amdgcn_global_load_tr4_b64, v2i32>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR6_B96, int_amdgcn_global_load_tr6_b96, v3i32>; +} + let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8d7dcf8c4a064..bb1de58e04fbc 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1444,6 +1444,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MOVolatile; return true; } + case Intrinsic::amdgcn_ds_load_tr6_b96: + case Intrinsic::amdgcn_ds_load_tr4_b64: + case Intrinsic::amdgcn_ds_load_tr8_b64: + case Intrinsic::amdgcn_ds_load_tr16_b128: + case Intrinsic::amdgcn_global_load_tr6_b96: + case Intrinsic::amdgcn_global_load_tr4_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: case Intrinsic::amdgcn_ds_read_tr4_b64: @@ -1548,6 +1554,10 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: + case Intrinsic::amdgcn_ds_load_tr8_b64: + case Intrinsic::amdgcn_ds_load_tr16_b128: + case Intrinsic::amdgcn_ds_load_tr4_b64: + case Intrinsic::amdgcn_ds_load_tr6_b96: case Intrinsic::amdgcn_ds_read_tr4_b64: case Intrinsic::amdgcn_ds_read_tr6_b96: case Intrinsic::amdgcn_ds_read_tr8_b64: @@ -1562,6 +1572,8 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: + case Intrinsic::amdgcn_global_load_tr4_b64: + case Intrinsic::amdgcn_global_load_tr6_b96: Ptr = II->getArgOperand(0); break; case Intrinsic::amdgcn_load_to_lds: diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 1028cc9ebb342..bd7464577b7db 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -261,6 +261,70 @@ bb: ret void } +; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32(ptr addrspace(1) %addr) +define amdgpu_kernel void @global_load_tr4_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr4.b64.v2i32(ptr addrspace(1) %addr) + store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32(ptr addrspace(1) %addr) +define amdgpu_kernel void @global_load_tr6_b96_v3i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <3 x i32> @llvm.amdgcn.global.load.tr6.b96.v3i32(ptr addrspace(1) %addr) + store <3 x i32> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) %addr) +define amdgpu_kernel void @ds_load_tr8_b64_v2i32(ptr addrspace(3) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <2 x i32> @llvm.amdgcn.ds.load.tr8.b64.v2i32(ptr addrspace(3) %addr) + store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8 + r... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/146289 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits