https://github.com/changpeng created https://github.com/llvm/llvm-project/pull/86313
Rename the intrinsics to be close to the instruction mnemonic names: Use global_load_tr_b64 and global_load_tr_b128 instead of global_load_tr. This patch also removes f16/bf16 versions of builtins/intrinsics. To simplify the design, we should avoid enumerating all possible types in implementing builtins. We can always use bitcast. >From 79fd7cf6eee74d4485a215e47ddd8349b126f2f4 Mon Sep 17 00:00:00 2001 From: Changpeng Fang <changpeng.f...@amd.com> Date: Fri, 22 Mar 2024 10:06:02 -0700 Subject: [PATCH] AMDGPU: Rename intrinsics and remove f16/bf16 versions for load transpose Rename the intrinsics to be close to the instruction mnemonic names: Use global_load_tr_b64 and global_load_tr_b128 instead of global_load_tr. This patch also removes f16/bf16 versions of builtins/intrinsics. To simplify the design, we should avoid enumerating all possible types in implementing builtins. We can always use bitcast. --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 5 - clang/lib/CodeGen/CGBuiltin.cpp | 28 +--- ...uiltins-amdgcn-global-load-tr-gfx11-err.cl | 16 +- ...ins-amdgcn-global-load-tr-gfx12-w32-err.cl | 6 +- ...ins-amdgcn-global-load-tr-gfx12-w64-err.cl | 6 +- .../builtins-amdgcn-global-load-tr-w32.cl | 26 +--- .../builtins-amdgcn-global-load-tr-w64.cl | 26 +--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 15 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 3 +- .../Target/AMDGPU/AMDGPUSearchableTables.td | 3 +- llvm/lib/Target/AMDGPU/FLATInstructions.td | 12 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +- .../UniformityAnalysis/AMDGPU/intrinsics.ll | 70 ++------- .../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll | 146 ++++-------------- .../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll | 146 ++++-------------- 15 files changed, 104 insertions(+), 410 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 4153b316c22b1d..c660582cc98e66 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -434,13 +434,8 @@ TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8bf16, "V8yV8y*1", "nc", "gfx12-insts,wavefrontsize32") - TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "gfx12-insts,wavefrontsize64") //===----------------------------------------------------------------------===// // WMMA builtins. 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 2eaceeba617700..e476234b1379ab 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18533,51 +18533,35 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, } case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32: - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16: - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: { + Intrinsic::ID IID; llvm::Type *ArgTy; switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32: ArgTy = llvm::Type::getInt32Ty(getLLVMContext()); + IID = Intrinsic::amdgcn_global_load_tr_b64; break; case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32: ArgTy = llvm::FixedVectorType::get( llvm::Type::getInt32Ty(getLLVMContext()), 2); - break; - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16: - ArgTy = llvm::FixedVectorType::get( - llvm::Type::getBFloatTy(getLLVMContext()), 4); - break; - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: - ArgTy = llvm::FixedVectorType::get( - llvm::Type::getHalfTy(getLLVMContext()), 4); + IID = Intrinsic::amdgcn_global_load_tr_b64; break; case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: ArgTy = llvm::FixedVectorType::get( llvm::Type::getInt16Ty(getLLVMContext()), 4); - break; - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: - ArgTy = llvm::FixedVectorType::get( - llvm::Type::getBFloatTy(getLLVMContext()), 8); - break; - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: - ArgTy = llvm::FixedVectorType::get( - llvm::Type::getHalfTy(getLLVMContext()), 8); + IID = Intrinsic::amdgcn_global_load_tr_b128; 
break; case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: ArgTy = llvm::FixedVectorType::get( llvm::Type::getInt16Ty(getLLVMContext()), 8); + IID = Intrinsic::amdgcn_global_load_tr_b128; break; } llvm::Value *Addr = EmitScalarExpr(E->getArg(0)); - llvm::Function *F = - CGM.getIntrinsic(Intrinsic::amdgcn_global_load_tr, {ArgTy}); + llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy}); return Builder.CreateCall(F, {Addr}); } case AMDGPU::BI__builtin_amdgcn_get_fpenv: { diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl index 4363769b864571..1e78ab28348682 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl @@ -4,24 +4,14 @@ // REQUIRES: amdgpu-registered-target typedef int v2i __attribute__((ext_vector_type(2))); -typedef half v8h __attribute__((ext_vector_type(8))); typedef short v8s __attribute__((ext_vector_type(8))); -typedef __bf16 v8bf16 __attribute__((ext_vector_type(8))); - -typedef half v4h __attribute__((ext_vector_type(4))); typedef short v4s __attribute__((ext_vector_type(4))); -typedef __bf16 v4bf16 __attribute__((ext_vector_type(4))); -void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8bf16* v8bf16_inptr, - global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4bf16* v4bf16_inptr) +void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global int* int_inptr, global v4s* v4s_inptr) { v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}} v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}} - v8h out_3 = 
__builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}} - v8bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}} - int out_5 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}} - v4s out_6 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}} - v4h out_7 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}} - v4bf16 o8 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}} + int out_3 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}} + v4s out_4 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}} } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl index 208f92fc5d44f3..1acc4cd7adc960 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl @@ -3,14 +3,10 @@ // REQUIRES: amdgpu-registered-target -typedef half v4h __attribute__((ext_vector_type(4))); typedef short v4s __attribute__((ext_vector_type(4))); -typedef 
__bf16 v4bf16 __attribute__((ext_vector_type(4))); -void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4bf16* v4bf16_inptr) +void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr) { int out_1 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}} v4s out_2 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}} - v4h out_3 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}} - v4bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}} } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl index 199146a9715da6..96b0e4c3993ab6 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl @@ -4,14 +4,10 @@ // REQUIRES: amdgpu-registered-target typedef int v2i __attribute__((ext_vector_type(2))); -typedef half v8h __attribute__((ext_vector_type(8))); typedef short v8s __attribute__((ext_vector_type(8))); -typedef __bf16 v8bf16 __attribute__((ext_vector_type(8))); -void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8bf16* v8bf16_inptr) +void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr) { v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature 
gfx12-insts,wavefrontsize32}} v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}} - v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}} - v8bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}} } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl index 0035b16b902b68..126d7d6fb7b053 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl @@ -3,13 +3,11 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 typedef int v2i __attribute__((ext_vector_type(2))); -typedef half v8h __attribute__((ext_vector_type(8))); typedef short v8s __attribute__((ext_vector_type(8))); -typedef __bf16 v8bf16 __attribute__((ext_vector_type(8))); // CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32( // CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]]) // CHECK-GFX1200-NEXT: ret <2 x i32> [[TMP0]] // v2i test_amdgcn_global_load_tr_b64_v2i32(global v2i* inptr) @@ -19,30 +17,10 @@ v2i test_amdgcn_global_load_tr_b64_v2i32(global v2i* inptr) // CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8i16( // CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call 
<8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1) [[INPTR:%.*]]) // CHECK-GFX1200-NEXT: ret <8 x i16> [[TMP0]] // v8s test_amdgcn_global_load_tr_b128_v8i16(global v8s* inptr) { return __builtin_amdgcn_global_load_tr_b128_v8i16(inptr); } - -// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8f16( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) [[INPTR:%.*]]) -// CHECK-GFX1200-NEXT: ret <8 x half> [[TMP0]] -// -v8h test_amdgcn_global_load_tr_b128_v8f16(global v8h* inptr) -{ - return __builtin_amdgcn_global_load_tr_b128_v8f16(inptr); -} - -// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8bf16( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) [[INPTR:%.*]]) -// CHECK-GFX1200-NEXT: ret <8 x bfloat> [[TMP0]] -// -v8bf16 test_amdgcn_global_load_tr_b128_v8bf16(global v8bf16* inptr) -{ - return __builtin_amdgcn_global_load_tr_b128_v8bf16(inptr); -} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl index 6c025bb5a55a36..7c70ccf73ad385 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl @@ -2,13 +2,11 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 -typedef half v4h __attribute__((ext_vector_type(4))); typedef short v4s __attribute__((ext_vector_type(4))); -typedef __bf16 v4bf16 __attribute__((ext_vector_type(4))); // CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32( // 
CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1) [[INPTR:%.*]]) // CHECK-GFX1200-NEXT: ret i32 [[TMP0]] // int test_amdgcn_global_load_tr_b64_i32(global int* inptr) @@ -18,30 +16,10 @@ int test_amdgcn_global_load_tr_b64_i32(global int* inptr) // CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4i16( // CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1) [[INPTR:%.*]]) // CHECK-GFX1200-NEXT: ret <4 x i16> [[TMP0]] // v4s test_amdgcn_global_load_tr_b128_v4i16(global v4s* inptr) { return __builtin_amdgcn_global_load_tr_b128_v4i16(inptr); } - -// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4f16( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) [[INPTR:%.*]]) -// CHECK-GFX1200-NEXT: ret <4 x half> [[TMP0]] -// -v4h test_amdgcn_global_load_tr_b128_v4f16(global v4h* inptr) -{ - return __builtin_amdgcn_global_load_tr_b128_v4f16(inptr); -} - -// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4bf16( -// CHECK-GFX1200-NEXT: entry: -// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) [[INPTR:%.*]]) -// CHECK-GFX1200-NEXT: ret <4 x bfloat> [[TMP0]] -// -v4bf16 test_amdgcn_global_load_tr_b128_v4bf16(global v4bf16* inptr) -{ - return __builtin_amdgcn_global_load_tr_b128_v4bf16(inptr); -} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index fff03dee20a18b..bda3b066b77636 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2769,17 +2769,14 @@ class AMDGPULoadIntrinsic<LLVMType ptr_ty>: >; // Wave32 -// <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) -> global_load_tr_b64 -// <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) -> global_load_tr_b128 -// <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1)) -> global_load_tr_b128 -// <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1)) -> global_load_tr_b128 +// <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1)) -> global_load_tr_b64 +// <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1)) -> global_load_tr_b128 // Wave64 -// i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1)) -> global_load_tr_b64 -// <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1)) -> global_load_tr_b128 -// <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1)) -> global_load_tr_b128 -// <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1)) -> global_load_tr_b128 +// i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1)) -> global_load_tr_b64 +// <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1)) -> global_load_tr_b128 -def int_amdgcn_global_load_tr : AMDGPULoadIntrinsic<global_ptr_ty>; +def int_amdgcn_global_load_tr_b64 : AMDGPULoadIntrinsic<global_ptr_ty>; +def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>; // i32 @llvm.amdgcn.wave.id() def int_amdgcn_wave_id : diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 0037825ce08938..a42e95f140ce99 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4889,7 +4889,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: - case 
Intrinsic::amdgcn_global_load_tr: + case Intrinsic::amdgcn_global_load_tr_b64: + case Intrinsic::amdgcn_global_load_tr_b128: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 8eb46a9801482c..410dc83d45c57f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -365,7 +365,8 @@ def : SourceOfDivergence<intr>; foreach intr = AMDGPUWMMAIntrinsicsGFX12 in def : SourceOfDivergence<intr>; -def : SourceOfDivergence<int_amdgcn_global_load_tr>; +def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>; +def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>; // The dummy boolean output is divergent from the IR's perspective, // but the mask results are uniform. These produce a divergent and diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index db1f8c1872652b..d017ec4a741510 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1593,16 +1593,12 @@ let OtherPredicates = [isGFX12Plus] in { defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ORDERED_ADD_B64", "int_amdgcn_global_atomic_ordered_add_b64", i64, i64, /* isIntr */ 1>; let WaveSizePredicate = isWave32 in { - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr, v2i32>; - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8i16>; - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8f16>; - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8bf16>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr_b64, v2i32>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8i16>; } let WaveSizePredicate = 
isWave64 in { - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr, i32>; - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4i16>; - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4f16>; - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4bf16>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr_b64, i32>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4i16>; } } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 7f0cff72c18661..4b08c28b05cef9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1355,7 +1355,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MOVolatile; return true; } - case Intrinsic::amdgcn_global_load_tr: { + case Intrinsic::amdgcn_global_load_tr_b64: + case Intrinsic::amdgcn_global_load_tr_b128: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); @@ -1462,7 +1463,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_global_atomic_fmin: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: - case Intrinsic::amdgcn_global_load_tr: + case Intrinsic::amdgcn_global_load_tr_b64: + case Intrinsic::amdgcn_global_load_tr_b128: Ptr = II->getArgOperand(0); break; case Intrinsic::amdgcn_global_load_lds: diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 333ec6dcaf488a..26c85e83b53adc 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -197,78 +197,38 @@ bb: ret void } -; CHECK: DIVERGENT: %tmp0 = call <2 x i32> 
@llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep) +; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) %addr) define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) { bb: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep) + %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) %addr) store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8 ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) %gep) +; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1) %addr) define amdgpu_kernel void @global_load_tr_b128_v8i16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { bb: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %tmp0 = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) %gep) + %tmp0 = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1) %addr) store <8 x i16> %tmp0, ptr addrspace(1) %out, align 16 ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) %gep) -define amdgpu_kernel void @global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { -bb: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) %gep) - store <8 x half> %tmp0, ptr addrspace(1) %out, align 16 - ret void -} - -; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) %gep) -define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { -bb: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) %gep) - store <8 x bfloat> %tmp0, ptr addrspace(1) 
%out, align 16 - ret void -} - -; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) %gep) +; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1) %addr) define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) { bb: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %tmp0 = call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) %gep) + %tmp0 = call i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1) %addr) store i32 %tmp0, ptr addrspace(1) %out, align 4 ret void } -; CHECK: DIVERGENT: %tmp0 = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) %gep) -define amdgpu_kernel void @global_load_tr_b128_v4i16_(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +; CHECK: DIVERGENT: %tmp0 = call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1) %addr) +define amdgpu_kernel void @global_load_tr_b128_v4i16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { bb: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %tmp0 = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) %gep) + %tmp0 = call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1) %addr) store <4 x i16> %tmp0, ptr addrspace(1) %out, align 8 ret void } -; CHECK: DIVERGENT: %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) %gep) -define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { -bb: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) %gep) - store <4 x half> %tmp0, ptr addrspace(1) %out, align 8 - ret void -} - -; CHECK: DIVERGENT: %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) %gep) -define amdgpu_kernel void @global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { -bb: - %gep = getelementptr i64, ptr addrspace(1) %addr, 
i32 4 - %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) %gep) - store <4 x bfloat> %tmp0, ptr addrspace(1) %out, align 8 - ret void -} - declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1 declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #1 @@ -296,14 +256,10 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32>, <4 x i32 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16) -declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) -declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) -declare <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1)) -declare <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1)) -declare i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1)) -declare <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1)) -declare <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1)) -declare <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1)) +declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1)) +declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1)) +declare i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1)) +declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1)) attributes #0 = { nounwind convergent } attributes #1 = { nounwind readnone convergent } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll index b4415c12926ac3..f6197e0770213c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll @@ -1,132 +1,44 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-SDAG-W32 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-GISEL-W32 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s -declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32.p1(ptr addrspace(1)) -declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16.p1(ptr addrspace(1)) -declare <8 x half> @llvm.amdgcn.global.load.tr.v8f16.p1(ptr addrspace(1)) -declare <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16.p1(ptr addrspace(1)) +declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1)) +declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1)) define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W32-LABEL: global_load_tr_b64: -; GFX12-SDAG-W32: ; %bb.0: ; %entry -; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 -; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-W32-NEXT: s_nop 0 -; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W32-NEXT: s_endpgm -; -; GFX12-GISEL-W32-LABEL: global_load_tr_b64: -; GFX12-GISEL-W32: ; %bb.0: ; %entry -; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-W32-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 -; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-W32-NEXT: s_nop 0 -; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W32-NEXT: s_endpgm +; GFX12-LABEL: global_load_tr_b64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32.p1(ptr addrspace(1) %gep) + %val = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1) %gep) store <2 x i32> %val, ptr addrspace(1) %use ret void } -define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W32-LABEL: global_load_tr_b128_i16: -; GFX12-SDAG-W32: ; %bb.0: ; %entry -; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 -; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX12-SDAG-W32-NEXT: s_nop 0 -; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W32-NEXT: s_endpgm -; -; GFX12-GISEL-W32-LABEL: global_load_tr_b128_i16: -; GFX12-GISEL-W32: ; %bb.0: ; %entry -; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 -; 
GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX12-GISEL-W32-NEXT: s_nop 0 -; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W32-NEXT: s_endpgm +define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b128: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16.p1(ptr addrspace(1) %gep) + %val = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1) %gep) store <8 x i16> %val, ptr addrspace(1) %use ret void } - -define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W32-LABEL: global_load_tr_b128_half: -; GFX12-SDAG-W32: ; %bb.0: ; %entry -; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 -; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX12-SDAG-W32-NEXT: s_nop 0 -; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W32-NEXT: s_endpgm -; -; GFX12-GISEL-W32-LABEL: global_load_tr_b128_half: -; GFX12-GISEL-W32: ; %bb.0: ; %entry -; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 
-; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX12-GISEL-W32-NEXT: s_nop 0 -; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W32-NEXT: s_endpgm -entry: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16.p1(ptr addrspace(1) %gep) - store <8 x half> %val, ptr addrspace(1) %use - ret void -} - -define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W32-LABEL: global_load_tr_b128_bfloat: -; GFX12-SDAG-W32: ; %bb.0: ; %entry -; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 -; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX12-SDAG-W32-NEXT: s_nop 0 -; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W32-NEXT: s_endpgm -; -; GFX12-GISEL-W32-LABEL: global_load_tr_b128_bfloat: -; GFX12-GISEL-W32: ; %bb.0: ; %entry -; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 -; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX12-GISEL-W32-NEXT: s_nop 0 -; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W32-NEXT: s_endpgm -entry: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16.p1(ptr addrspace(1) %gep) - store <8 x bfloat> %val, ptr addrspace(1) %use - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll index 
7ad1416789de79..a2dc3662fcc485 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll @@ -1,132 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-SDAG-W64 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-GISEL-W64 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s -declare i32 @llvm.amdgcn.global.load.tr.i32.p1(ptr addrspace(1)) -declare <4 x i16> @llvm.amdgcn.global.load.tr.v4i16.p1(ptr addrspace(1)) -declare <4 x half> @llvm.amdgcn.global.load.tr.v4f16.p1(ptr addrspace(1)) -declare <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16.p1(ptr addrspace(1)) +declare i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1)) +declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1)) define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W64-LABEL: global_load_tr_b64: -; GFX12-SDAG-W64: ; %bb.0: ; %entry -; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 -; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-SDAG-W64-NEXT: s_nop 0 -; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W64-NEXT: s_endpgm -; -; 
GFX12-GISEL-W64-LABEL: global_load_tr_b64: -; GFX12-GISEL-W64: ; %bb.0: ; %entry -; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 -; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-GISEL-W64-NEXT: s_nop 0 -; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W64-NEXT: s_endpgm +; GFX12-LABEL: global_load_tr_b64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call i32 @llvm.amdgcn.global.load.tr.i32.p1(ptr addrspace(1) %gep) + %val = call i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1) %gep) store i32 %val, ptr addrspace(1) %use ret void } -define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W64-LABEL: global_load_tr_b128_i16: -; GFX12-SDAG-W64: ; %bb.0: ; %entry -; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 -; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-W64-NEXT: s_nop 0 -; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W64-NEXT: s_endpgm -; -; GFX12-GISEL-W64-LABEL: global_load_tr_b128_i16: -; GFX12-GISEL-W64: ; %bb.0: ; %entry -; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; 
GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 -; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-W64-NEXT: s_nop 0 -; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W64-NEXT: s_endpgm +define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b128: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16.p1(ptr addrspace(1) %gep) + %val = call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1) %gep) store <4 x i16> %val, ptr addrspace(1) %use ret void } - -define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W64-LABEL: global_load_tr_b128_half: -; GFX12-SDAG-W64: ; %bb.0: ; %entry -; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 -; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-W64-NEXT: s_nop 0 -; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W64-NEXT: s_endpgm -; -; GFX12-GISEL-W64-LABEL: global_load_tr_b128_half: -; GFX12-GISEL-W64: ; %bb.0: ; %entry -; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; 
GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 -; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-W64-NEXT: s_nop 0 -; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W64-NEXT: s_endpgm -entry: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16.p1(ptr addrspace(1) %gep) - store <4 x half> %val, ptr addrspace(1) %use - ret void -} - -define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W64-LABEL: global_load_tr_b128_bfloat: -; GFX12-SDAG-W64: ; %bb.0: ; %entry -; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 -; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-W64-NEXT: s_nop 0 -; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W64-NEXT: s_endpgm -; -; GFX12-GISEL-W64-LABEL: global_load_tr_b128_bfloat: -; GFX12-GISEL-W64: ; %bb.0: ; %entry -; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 -; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-W64-NEXT: s_nop 0 -; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W64-NEXT: s_endpgm -entry: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16.p1(ptr addrspace(1) %gep) - store <4 x bfloat> %val, ptr addrspace(1) %use - 
ret void -} _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits