https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/80503
>From b07f5866aa8acf881fbdb15450ecda4dfc8a68e8 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <stanislav.mekhanos...@amd.com>
Date: Fri, 2 Feb 2024 14:28:00 -0800
Subject: [PATCH 1/2] [AMDGPU] Fixed byte_sel of v_cvt_f32_bf8/v_cvt_f32_fp8

Opsel bits are swapped. Actual byte select table:

Byte  OPSEL
0     0
1     2
2     1
3     3
---
 llvm/lib/Target/AMDGPU/VOP1Instructions.td | 6 ++----
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll | 4 ++--
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll | 8 ++++----
 3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 920c220fb2c65..58b67b21e274b 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -668,10 +668,8 @@ class Cvt_F32_F8_Pat_OpSel<SDPatternOperator node, bits<2> index,
 VOP1_Pseudo inst_e32, VOP3_Pseudo inst_e64> : GCNPat<
 (f32 (node i32:$src, index)),
 !if (index,
- (inst_e64 !if(index{0},
- !if(index{1}, !or(SRCMODS.OP_SEL_0, SRCMODS.OP_SEL_1),
- SRCMODS.OP_SEL_0),
- !if(index{1}, SRCMODS.OP_SEL_1, 0)),
+ (inst_e64 !or(!if(index{0}, SRCMODS.OP_SEL_1, 0),
+ !if(index{1}, SRCMODS.OP_SEL_0, 0)),
 $src, 0),
 (inst_e32 $src))
 >;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
index f49fec60892cd..e21d61036375a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
@@ -16,7 +16,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) {
 ; GFX12: ; %bb.0:
 ; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0]
+; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1]
 ; GFX12-NEXT: ; return to shader part epilog
 %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
 %ret = tail call
float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 1)
@@ -28,7 +28,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) {
 ; GFX12: ; %bb.0:
 ; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1]
+; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0]
 ; GFX12-NEXT: ; return to shader part epilog
 %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
 %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 2)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
index 17b1fcf865e94..f915fa8e6cd1c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -45,7 +45,7 @@ define float @test_cvt_f32_bf8_byte1(i32 %a) {
 ; GFX12-NEXT: s_wait_samplecnt 0x0
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0]
+; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1]
 ; GFX12-NEXT: s_setpc_b64 s[30:31]
 %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1)
 ret float %ret
@@ -65,7 +65,7 @@ define float @test_cvt_f32_bf8_byte2(i32 %a) {
 ; GFX12-NEXT: s_wait_samplecnt 0x0
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1]
+; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0]
 ; GFX12-NEXT: s_setpc_b64 s[30:31]
 %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2)
 ret float %ret
@@ -125,7 +125,7 @@ define float @test_cvt_f32_fp8_byte1(i32 %a) {
 ; GFX12-NEXT: s_wait_samplecnt 0x0
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,0]
+; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[0,1]
 ; GFX12-NEXT: s_setpc_b64 s[30:31]
 %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
 ret float %ret
@@ -145,7
+145,7 @@ define float @test_cvt_f32_fp8_byte2(i32 %a) {
 ; GFX12-NEXT: s_wait_samplecnt 0x0
 ; GFX12-NEXT: s_wait_bvhcnt 0x0
 ; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[0,1]
+; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,0]
 ; GFX12-NEXT: s_setpc_b64 s[30:31]
 %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 2)
 ret float %ret

>From 5f211ec3068988ab397d7234e2fc5a61e074bee8 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <stanislav.mekhanos...@amd.com>
Date: Fri, 2 Feb 2024 14:35:59 -0800
Subject: [PATCH 2/2] [AMDGPU] GlobalISel for f8 conversions

---
 llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 8 ++++++++
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll | 6 ++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 09fac963d222d..5323e4fc58de8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4497,6 +4497,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
 case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
 case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
 case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
+ case Intrinsic::amdgcn_cvt_f32_fp8:
+ case Intrinsic::amdgcn_cvt_f32_bf8:
+ case Intrinsic::amdgcn_cvt_pk_f32_fp8:
+ case Intrinsic::amdgcn_cvt_pk_f32_bf8:
+ case Intrinsic::amdgcn_cvt_pk_fp8_f32:
+ case Intrinsic::amdgcn_cvt_pk_bf8_f32:
+ case Intrinsic::amdgcn_cvt_sr_fp8_f32:
+ case Intrinsic::amdgcn_cvt_sr_bf8_f32:
 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
index f915fa8e6cd1c..fc4b663b85a61 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@
-1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s

 declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32)
 declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32)
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits