https://github.com/changpeng created https://github.com/llvm/llvm-project/pull/99551
None >From 31601c539553d7af0efd94722eabf4627f8a387c Mon Sep 17 00:00:00 2001 From: Changpeng Fang <changpeng.f...@amd.com> Date: Thu, 18 Jul 2024 11:03:24 -0700 Subject: [PATCH 1/2] AMDGPU: Add back half and bfloat support for global_load_tr16 pats half and bfloat are common types for 16-bit elements. The support of them was original there and dropped due to some reasons. This work adds the support of the float types back. --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 ++ clang/lib/CodeGen/CGBuiltin.cpp | 10 +++- ...uiltins-amdgcn-global-load-tr-gfx11-err.cl | 15 ++++-- ...ins-amdgcn-global-load-tr-gfx12-w32-err.cl | 6 ++- ...ins-amdgcn-global-load-tr-gfx12-w64-err.cl | 6 ++- .../builtins-amdgcn-global-load-tr-w32.cl | 22 +++++++++ .../builtins-amdgcn-global-load-tr-w64.cl | 22 +++++++++ llvm/lib/Target/AMDGPU/FLATInstructions.td | 4 ++ .../UniformityAnalysis/AMDGPU/intrinsics.ll | 36 ++++++++++++++ .../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll | 48 +++++++++++++++++-- .../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll | 48 +++++++++++++++++-- 11 files changed, 207 insertions(+), 14 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 56bba448e12a4..e62315eea277a 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -445,8 +445,12 @@ TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8bf16, "V8yV8y*1", "nc", "gfx12-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "gfx12-insts,wavefrontsize64") //===----------------------------------------------------------------------===// // WMMA builtins. diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 67027f8aa93f3..2ad62d6ee0bb2 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18725,7 +18725,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: - case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: { + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: { Intrinsic::ID IID; switch (BuiltinID) { @@ -18734,7 +18738,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, IID = Intrinsic::amdgcn_global_load_tr_b64; break; case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16: case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: IID = Intrinsic::amdgcn_global_load_tr_b128; break; } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl index 1fcb1d721ad72..8242ae6a98c40 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl @@ -5,13 +5,22 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef short v8s __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef __bf16 v8y __attribute__((ext_vector_type(8))); typedef short v4s __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef __bf16 v4y __attribute__((ext_vector_type(4))); -void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global int* int_inptr, global v4s* v4s_inptr) +void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8y* v8y_inptr, + global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4y* v4y_inptr) { v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}} v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}} + v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}} + v8y out_4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8y_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}} - int out_3 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}} - v4s out_4 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}} + int out_5 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}} + v4s out_6 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}} + v4h out_7 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}} + v4y out_8 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4y_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}} } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl index 7a36881c051b1..6f7a93ef897ac 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl @@ -4,9 +4,13 @@ // REQUIRES: amdgpu-registered-target typedef short v4s __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef __bf16 v4y __attribute__((ext_vector_type(4))); -void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr) +void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4y* v4y_inptr) { int out_1 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}} v4s out_2 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}} + v4h out_3 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}} + v4y out_4 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4y_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}} } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl index 9155ee6e61822..b7323f1b41c2a 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl @@ -5,9 +5,13 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef short v8s __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef __bf16 v8y __attribute__((ext_vector_type(8))); -void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr) +void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8y* v8y_inptr) { v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}} v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}} + v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}} + v8y out_4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8y_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}} } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl index ce8b2c2c7c5ba..186fc4eacfaaf 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl @@ -4,6 +4,8 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef short v8s __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef __bf16 v8y __attribute__((ext_vector_type(8))); // CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32( // CHECK-GFX1200-NEXT: entry: @@ -24,3 +26,23 @@ v8s test_amdgcn_global_load_tr_b128_v8i16(global v8s* inptr) { return __builtin_amdgcn_global_load_tr_b128_v8i16(inptr); } + +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8f16( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: ret <8 x half> [[TMP0]] +// +v8h test_amdgcn_global_load_tr_b128_v8f16(global v8h* inptr) +{ + return __builtin_amdgcn_global_load_tr_b128_v8f16(inptr); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8bf16( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: ret <8 x bfloat> [[TMP0]] +// +v8y test_amdgcn_global_load_tr_b128_v8bf16(global v8y* inptr) +{ + return __builtin_amdgcn_global_load_tr_b128_v8bf16(inptr); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl index b0eed07627f41..b6627f1c8114d 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl @@ -3,6 +3,8 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 typedef short v4s __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef __bf16 v4y __attribute__((ext_vector_type(4))); // CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32( // CHECK-GFX1200-NEXT: entry: @@ -23,3 +25,23 @@ v4s test_amdgcn_global_load_tr_b128_v4i16(global v4s* inptr) { return __builtin_amdgcn_global_load_tr_b128_v4i16(inptr); } + +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4f16( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: ret <4 x half> [[TMP0]] +// +v4h test_amdgcn_global_load_tr_b128_v4f16(global v4h* inptr) +{ + return __builtin_amdgcn_global_load_tr_b128_v4f16(inptr); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4bf16( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: ret <4 x bfloat> [[TMP0]] +// +v4y test_amdgcn_global_load_tr_b128_v4bf16(global v4y* inptr) +{ + return __builtin_amdgcn_global_load_tr_b128_v4bf16(inptr); +} diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 16dc019ede810..56cc6ffa19096 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1590,10 +1590,14 @@ let OtherPredicates = [isGFX12Plus] in { let WaveSizePredicate = isWave32 in { defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr_b64, v2i32>; defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8i16>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8f16>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8bf16>; } let WaveSizePredicate = isWave64 in { defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr_b64, i32>; defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4i16>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4f16>; + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4bf16>; } } diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 680c998a4b39f..b215fc2c2ae74 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -213,6 +213,22 @@ bb: ret void } +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) %addr) +define amdgpu_kernel void @global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) %addr) + store <8 x half> %tmp0, ptr addrspace(1) %out, align 16 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) %addr) +define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1) %addr) + store <8 x bfloat> %tmp0, ptr addrspace(1) %out, align 16 + ret void +} + ; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1) %addr) define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) { bb: @@ -229,6 +245,22 @@ bb: ret void } +; CHECK: DIVERGENT: %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) %addr) +define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) %addr) + store <4 x half> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1) %addr) +define amdgpu_kernel void @global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1) %addr) + store <4 x bfloat> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1 declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1 @@ -258,8 +290,12 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32>, <4 x i32 declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1)) declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1)) +declare <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1)) +declare <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16(ptr addrspace(1)) declare i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1)) declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1)) +declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1)) +declare <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1)) attributes #0 = { nounwind convergent } attributes #1 = { nounwind readnone convergent } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll index 291c249e4b738..0e659b758cd0f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll @@ -4,9 +4,11 @@ declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1)) declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1)) +declare <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1)) +declare <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1)) -define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-LABEL: global_load_tr_b64: +define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b64_v2i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -24,8 +26,8 @@ entry: ret void } -define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-LABEL: global_load_tr_b128: +define amdgpu_kernel void @global_load_tr_b128_v8i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b128_v8i16: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v4, 0 @@ -42,3 +44,41 @@ entry: store <8 x i16> %val, ptr addrspace(1) %use ret void } + +define amdgpu_kernel void @global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b128_v8f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16.p1(ptr addrspace(1) %gep) + store <8 x half> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b128_v8bf16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <8 x bfloat> @llvm.amdgcn.global.load.tr.b128.v8bf16.p1(ptr addrspace(1) %gep) + store <8 x bfloat> %val, ptr addrspace(1) %use + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll index 12742f4f7127b..d941830e8dafc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll @@ -4,9 +4,11 @@ declare i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1)) declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1)) +declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16.p1(ptr addrspace(1)) +declare <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16.p1(ptr addrspace(1)) -define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-LABEL: global_load_tr_b64: +define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b64_i32: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -24,8 +26,8 @@ entry: ret void } -define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-LABEL: global_load_tr_b128: +define amdgpu_kernel void @global_load_tr_b128_v4i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b128_v4i16: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 @@ -42,3 +44,41 @@ entry: store <4 x i16> %val, ptr addrspace(1) %use ret void } + +define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b128_v4f16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16.p1(ptr addrspace(1) %gep) + store <4 x half> %val, ptr addrspace(1) %use + ret void +} + +define amdgpu_kernel void @global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b128_v4bf16: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %val = call <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16.p1(ptr addrspace(1) %gep) + store <4 x bfloat> %val, ptr addrspace(1) %use + ret void +} >From d4e877d58ca9bf43ffbab59ec665f74bb43160fc Mon Sep 17 00:00:00 2001 From: Changpeng Fang <changpeng.f...@amd.com> Date: Thu, 18 Jul 2024 11:56:06 -0700 Subject: [PATCH 2/2] AMDGPU: Loop over the types for global_load_tr16 pats (NFC) --- llvm/lib/Target/AMDGPU/FLATInstructions.td | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 56cc6ffa19096..88c6039473338 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1589,15 +1589,13 @@ let OtherPredicates = [isGFX12Plus] in { let WaveSizePredicate = isWave32 in { defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr_b64, v2i32>; - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8i16>; - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8f16>; - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8bf16>; + foreach vt = [v8i16, v8f16, v8bf16] in + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, vt>; } let WaveSizePredicate = isWave64 in { defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr_b64, i32>; - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4i16>; - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4f16>; - defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, v4bf16>; + foreach vt = [v4i16, v4f16, v4bf16] in + defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr_b128, vt>; } } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits