arsenm (https://github.com/arsenm) created pull request https://github.com/llvm/llvm-project/pull/117592
Co-authored-by: Pravin Jagtap <pravin.jag...@amd.com> >From 3ba5c37284ce7df02470662c790cc5280e0a62a2 Mon Sep 17 00:00:00 2001 From: Pravin Jagtap <pravin.jag...@amd.com> Date: Mon, 8 Apr 2024 04:56:56 -0400 Subject: [PATCH] AMDGPU: Support v_cvt_scalef32_pk32_{bf|f}6_{bf|fp}16 for gfx950 Co-authored-by: Pravin Jagtap <pravin.jag...@amd.com> --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 + clang/test/CodeGenOpenCL/amdgpu-features.cl | 2 +- .../CodeGenOpenCL/builtins-amdgcn-gfx950.cl | 43 ++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 9 + llvm/lib/Target/AMDGPU/AMDGPU.td | 17 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 4 + llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 + .../Disassembler/AMDGPUDisassembler.cpp | 1 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 7 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 1 + llvm/lib/Target/AMDGPU/VOP3Instructions.td | 14 + llvm/lib/TargetParser/TargetParser.cpp | 1 + .../AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll | 474 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx950_asm_features.s | 16 + llvm/test/MC/AMDGPU/gfx950_err.s | 48 ++ .../Disassembler/AMDGPU/gfx950_dasm_vop3.txt | 12 + 16 files changed, 653 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index a42ad56ce4f998..e09dc0e1107a82 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -559,6 +559,10 @@ TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64, "V4fiV2iV4fs", TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_prng_b32, "UiUi", "nc", "prng-inst") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_fp6_f16, "V6UiV32hf", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_bf6_f16, "V6UiV32hf", "nc", 
"f16bf16-to-fp6bf6-cvt-scale-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16, "V6UiV32yf", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts") +TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16, "V6UiV32yf", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts") #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index f9e07fbc6b0480..56013dad9b6651 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -89,7 +89,7 @@ // GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" // GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" // GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX950: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" // GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" // GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl index 49f85982faf5a5..779aadd96f3f41 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl @@ -4,6 +4,9 @@ typedef unsigned int uint; 
typedef unsigned int __attribute__((ext_vector_type(2))) uint2; +typedef unsigned int __attribute__((ext_vector_type(6))) uint6; +typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32; +typedef half __attribute__((ext_vector_type(32))) half32; // CHECK-LABEL: @test_prng_b32( // CHECK-NEXT: entry: @@ -106,3 +109,43 @@ void test_permlane32_swap(global uint2* out, uint old, uint src) { *out = __builtin_amdgcn_permlane32_swap(old, src, true, false); *out = __builtin_amdgcn_permlane32_swap(old, src, false, true); } + +// CHECK-LABEL: @test_cvt_scalef32_pk( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT6_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRCBF32_ADDR:%.*]] = alloca <32 x bfloat>, align 64, addrspace(5) +// CHECK-NEXT: [[SRCH32_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5) +// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[OUT6:%.*]], ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: store <32 x bfloat> [[SRCBF32:%.*]], ptr addrspace(5) [[SRCBF32_ADDR]], align 64 +// CHECK-NEXT: store <32 x half> [[SRCH32:%.*]], ptr addrspace(5) [[SRCH32_ADDR]], align 64 +// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> [[TMP0]], float [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: store <6 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 32 +// CHECK-NEXT: [[TMP4:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = call <6 x i32> 
@llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> [[TMP4]], float [[TMP5]]) +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: store <6 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 32 +// CHECK-NEXT: [[TMP8:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64 +// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> [[TMP8]], float [[TMP9]]) +// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: store <6 x i32> [[TMP10]], ptr addrspace(1) [[TMP11]], align 32 +// CHECK-NEXT: [[TMP12:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64 +// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> [[TMP12]], float [[TMP13]]) +// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8 +// CHECK-NEXT: store <6 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 32 +// CHECK-NEXT: ret void +// +void test_cvt_scalef32_pk(global uint6 *out6, bfloat32 srcbf32, half32 srch32, float scale) +{ + *out6 = __builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16(srcbf32, scale); + *out6 = __builtin_amdgcn_cvt_scalef32_pk32_bf6_f16(srch32, scale); + *out6 = __builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16(srcbf32, scale); + *out6 = __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16(srch32, scale); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index b46fe668ea7afd..9c88f11c5bb335 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -594,6 +594,15 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic; def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic; def 
int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic; +class AMDGPUCvtScaleF32Intrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic< + [DstTy], [Src0Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable] +>, ClangBuiltin<"__builtin_amdgcn_"#name>; + +def int_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_fp6_f16">; +def int_amdgcn_cvt_scalef32_pk32_bf6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_bf6_f16">; +def int_amdgcn_cvt_scalef32_pk32_fp6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_fp6_bf16">; +def int_amdgcn_cvt_scalef32_pk32_bf6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_bf6_bf16">; + def int_amdgcn_prng_b32 : DefaultAttrsIntrinsic< [llvm_i32_ty], [llvm_i32_ty], [IntrNoMem] >, ClangBuiltin<"__builtin_amdgcn_prng_b32">; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 5efcbf9338ddb9..702c9fc25bc9e1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -408,11 +408,23 @@ def FeatureFP6BF6ConversionScaleInsts : SubtargetFeature<"fp6bf6-cvt-scale-insts "Has fp6 and bf6 conversion scale instructions" >; +def FeatureF16BF16ToFP6BF6ConversionScaleInsts : SubtargetFeature<"f16bf16-to-fp6bf6-cvt-scale-insts", + "HasF16BF16ToFP6BF6ConversionScaleInsts", + "true", + "Has f16bf16 to fp6bf6 conversion scale instructions" +>; + def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts", "GFX950Insts", "true", "Additional instructions for GFX950+", - [FeaturePermlane16Swap, FeaturePermlane32Swap, FeatureFP8ConversionScaleInsts, FeatureBF8ConversionScaleInsts, FeatureFP4ConversionScaleInsts, FeatureFP6BF6ConversionScaleInsts] + [FeaturePermlane16Swap, + FeaturePermlane32Swap, + FeatureFP8ConversionScaleInsts, + FeatureBF8ConversionScaleInsts, + FeatureFP4ConversionScaleInsts, + 
FeatureFP6BF6ConversionScaleInsts, + FeatureF16BF16ToFP6BF6ConversionScaleInsts] >; def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts", @@ -2446,6 +2458,9 @@ def HasFP4ConversionScaleInsts : Predicate<"Subtarget->hasFP4ConversionScaleInst def HasFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasFP6BF6ConversionScaleInsts()">, AssemblerPredicate<(all_of FeatureFP6BF6ConversionScaleInsts)>; +def HasF16BF16ToFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasF16BF16ToFP6BF6ConversionScaleInsts()">, + AssemblerPredicate<(all_of FeatureF16BF16ToFP6BF6ConversionScaleInsts)>; + def HasGDS : Predicate<"Subtarget->hasGDS()">; def HasGWS : Predicate<"Subtarget->hasGWS()">; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index b3a6f0fd09ea02..ff882486d12665 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4542,6 +4542,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_pk_bf8_f32: case Intrinsic::amdgcn_cvt_sr_fp8_f32: case Intrinsic::amdgcn_cvt_sr_bf8_f32: + case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16: + case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16: + case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16: case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: case Intrinsic::amdgcn_wmma_f16_16x16x16_f16: case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 1a09f55dfdb28a..742f4e6e80f1a9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -54,6 +54,7 @@ class AMDGPUSubtarget { bool HasBF8ConversionScaleInsts = false; bool HasFP4ConversionScaleInsts = false; bool HasFP6BF6ConversionScaleInsts = false; + bool HasF16BF16ToFP6BF6ConversionScaleInsts = false; bool 
EnableRealTrue16Insts = false; bool HasBF16ConversionInsts = false; bool HasMadMixInsts = false; @@ -187,6 +188,8 @@ class AMDGPUSubtarget { bool hasFP6BF6ConversionScaleInsts() const { return HasFP6BF6ConversionScaleInsts; } + bool hasF16BF16ToFP6BF6ConversionScaleInsts() const { return HasF16BF16ToFP6BF6ConversionScaleInsts; } + bool hasMadMacF32Insts() const { return HasMadMacF32Insts || !isGCN(); } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index fa5f86b0788cc2..983a10027b20f4 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -262,6 +262,7 @@ DECODE_OPERAND_REG_8(VGPR_32_Lo128) DECODE_OPERAND_REG_8(VReg_64) DECODE_OPERAND_REG_8(VReg_96) DECODE_OPERAND_REG_8(VReg_128) +DECODE_OPERAND_REG_8(VReg_192) DECODE_OPERAND_REG_8(VReg_256) DECODE_OPERAND_REG_8(VReg_288) DECODE_OPERAND_REG_8(VReg_352) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index ea36347423c57c..324d4e0e3376f6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1699,6 +1699,7 @@ class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> { RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VOPDstOperand<VReg_1024>, !eq(VT.Size, 512) : VOPDstOperand<VReg_512>, !eq(VT.Size, 256) : VOPDstOperand<VReg_256>, + !eq(VT.Size, 192) : VOPDstOperand<VReg_192>, !eq(VT.Size, 128) : VOPDstOperand<VReg_128>, !eq(VT.Size, 64) : VOPDstOperand<VReg_64>, !eq(VT.Size, 32) : VOPDstOperand<VGPR_32>, @@ -1754,7 +1755,8 @@ class getSOPSrcForVT<ValueType VT> { // Returns the vreg register class to use for source operand given VT class getVregSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 1> { RegisterOperand ret = - !cond(!eq(VT.Size, 192) : RegisterOperand<VReg_192>, + !cond(!eq(VT.Size, 512) : RegisterOperand<VReg_512>, + 
!eq(VT.Size, 192) : RegisterOperand<VReg_192>, !eq(VT.Size, 128) : RegisterOperand<VReg_128>, !eq(VT.Size, 96) : RegisterOperand<VReg_96>, !eq(VT.Size, 64) : RegisterOperand<VReg_64>, @@ -1788,6 +1790,7 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> { !eq(VT, v2i16) : VSrc_v2b16, !eq(VT, v4f16) : AVSrc_64, !eq(VT, v4bf16) : AVSrc_64, + !eq(VT.Size, 512) : VRegSrc_512, !eq(VT.Size, 192) : VRegSrc_192, !eq(VT.Size, 128) : VRegSrc_128, !eq(VT.Size, 96) : VRegSrc_96, @@ -2835,6 +2838,8 @@ def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>; def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>; def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>; def VOP_V32BF16_V6I32_F32 : VOPProfile <[v32bf16, v6i32, f32, untyped]>; +def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>; +def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>; def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 11ca4df6e9f445..51fdd4211a5cf6 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1251,6 +1251,7 @@ def VRegSrc_96 : SrcReg9<VReg_96, "OPW96">; def VRegSrc_128: SrcReg9<VReg_128, "OPW128">; def VRegSrc_192: SrcReg9<VReg_192, "OPW192">; def VRegSrc_256: SrcReg9<VReg_256, "OPW256">; +def VRegSrc_512: SrcReg9<VReg_512, "OPW512">; def VRegOrLdsSrc_32 : SrcReg9<VRegOrLds_32, "OPW32">; // True 16 Operands diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 554aff7082010a..764a2275205665 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -972,6 +972,13 @@ let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 defm V_CVT_SCALEF32_PK32_BF16_BF6 : 
VOP3Inst<"v_cvt_scalef32_pk32_bf16_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32BF16_V6I32_F32>>; } +let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in { + defm V_CVT_SCALEF32_PK32_FP6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_f16>; + defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>, int_amdgcn_cvt_scalef32_pk32_bf6_f16>; + defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_pk32_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_bf16>; + defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_pk32_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_bf6_bf16>; +} + let SubtargetPredicate = isGFX10Plus in { let isCommutable = 1, isReMaterializable = 1 in { defm V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; @@ -1924,3 +1931,10 @@ defm V_CVT_SCALEF32_PK32_BF16_FP6 : VOP3_Real_gfx9<0x261, "v_cvt_scalef32_pk32_b defm V_CVT_SCALEF32_PK32_F16_BF6 : VOP3_Real_gfx9<0x262, "v_cvt_scalef32_pk32_f16_bf6">; defm V_CVT_SCALEF32_PK32_BF16_BF6 : VOP3_Real_gfx9<0x263, "v_cvt_scalef32_pk32_bf16_bf6">; } + +let OtherPredicates = [HasF16BF16ToFP6BF6ConversionScaleInsts] in { +defm V_CVT_SCALEF32_PK32_FP6_F16 : VOP3_Real_gfx9<0x258, "v_cvt_scalef32_pk32_fp6_f16">; +defm V_CVT_SCALEF32_PK32_FP6_BF16 : VOP3_Real_gfx9<0x259, "v_cvt_scalef32_pk32_fp6_bf16">; +defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3_Real_gfx9<0x25a, "v_cvt_scalef32_pk32_bf6_f16">; +defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25b, "v_cvt_scalef32_pk32_bf6_bf16">; +} diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index c60c5a0fc2bb78..6da48da608cc14 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ 
b/llvm/lib/TargetParser/TargetParser.cpp @@ -470,6 +470,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["gws"] = true; break; case GK_GFX950: + Features["f16bf16-to-fp6bf6-cvt-scale-insts"] = true; Features["prng-inst"] = true; Features["permlane16-swap"] = true; Features["permlane32-swap"] = true; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll new file mode 100644 index 00000000000000..4153bc8f43563b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll @@ -0,0 +1,474 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-GISEL %s + +declare <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float %scale) +declare <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> %src, float %scale) +declare <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float %scale) +declare <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> %src, float %scale) + +define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_vv(<32 x bfloat> %src, float %scale, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_bf16_vv: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v17 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[0:5], v[0:15], v16 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_vv: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v17 +; 
GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v18 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v3, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v4, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v5, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v6, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v7, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v8, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v9, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v10, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE 
src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v11, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v12, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v15, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[0:5], v[0:15], v16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[20:21], v[0:3], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[20:21], v[4:5], off offset:16 +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float %scale) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_sl(<32 x bfloat> inreg %src, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_bf16_sl: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15 +; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[2:7], v[2:17], s0 +; GFX950-SDAG-NEXT: 
global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_sl: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_lshr_b32 s16, s0, 16 +; GFX950-GISEL-NEXT: s_lshr_b32 s17, s1, 16 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s16, 16 +; GFX950-GISEL-NEXT: s_and_b32 s0, s0, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s18, s2, 16 +; GFX950-GISEL-NEXT: s_or_b32 s0, s16, s0 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s17, 16 +; GFX950-GISEL-NEXT: s_and_b32 s1, s1, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s19, s3, 16 +; GFX950-GISEL-NEXT: s_or_b32 s1, s16, s1 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s18, 16 +; GFX950-GISEL-NEXT: s_and_b32 s2, s2, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s20, s4, 16 +; GFX950-GISEL-NEXT: s_or_b32 s2, s16, s2 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s19, 16 +; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s21, s5, 16 +; GFX950-GISEL-NEXT: s_or_b32 s3, s16, s3 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s20, 16 +; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s22, s6, 16 +; GFX950-GISEL-NEXT: s_or_b32 s4, s16, s4 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s21, 16 +; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s23, s7, 16 +; GFX950-GISEL-NEXT: s_or_b32 s5, s16, s5 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s22, 16 +; GFX950-GISEL-NEXT: s_and_b32 s6, s6, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s24, s8, 16 +; GFX950-GISEL-NEXT: s_or_b32 s6, s16, s6 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s23, 16 +; GFX950-GISEL-NEXT: s_and_b32 s7, s7, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s25, s9, 16 +; GFX950-GISEL-NEXT: s_or_b32 s7, s16, s7 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s24, 16 +; GFX950-GISEL-NEXT: s_and_b32 s8, s8, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s26, s10, 16 +; GFX950-GISEL-NEXT: s_or_b32 s8, s16, s8 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s25, 16 +; 
GFX950-GISEL-NEXT: s_and_b32 s9, s9, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s27, s11, 16 +; GFX950-GISEL-NEXT: s_or_b32 s9, s16, s9 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s26, 16 +; GFX950-GISEL-NEXT: s_and_b32 s10, s10, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s28, s12, 16 +; GFX950-GISEL-NEXT: s_or_b32 s10, s16, s10 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s27, 16 +; GFX950-GISEL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s29, s13, 16 +; GFX950-GISEL-NEXT: s_or_b32 s11, s16, s11 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s28, 16 +; GFX950-GISEL-NEXT: s_and_b32 s12, s12, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s30, s14, 16 +; GFX950-GISEL-NEXT: s_or_b32 s12, s16, s12 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s29, 16 +; GFX950-GISEL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s31, s15, 16 +; GFX950-GISEL-NEXT: s_or_b32 s13, s16, s13 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s30, 16 +; GFX950-GISEL-NEXT: s_and_b32 s14, s14, 0xffff +; GFX950-GISEL-NEXT: s_or_b32 s14, s16, s14 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s31, 16 +; GFX950-GISEL-NEXT: s_and_b32 s15, s15, 0xffff +; GFX950-GISEL-NEXT: s_or_b32 s15, s16, s15 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[2:7], v[2:17], v18 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float 100.0) + store <6 x i32> %cvt, ptr 
addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk32_bf6_f16_vv(<32 x half> %src, float %scale, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f16_vv: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v17 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[0:5], v[0:15], v16 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f16_vv: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v17 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v18 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[0:5], v[0:15], v16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[20:21], v[0:3], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[20:21], v[4:5], off offset:16 +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> %src, float %scale) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk32_bf6_f16_sl(<32 x half> inreg %src, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f16_sl: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-SDAG-NEXT: 
v_mov_b32_e32 v17, s15 +; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[2:7], v[2:17], s0 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f16_sl: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[2:7], v[2:17], v18 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> %src, float 100.0) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_vv(<32 x bfloat> %src, float %scale, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_bf16_vv: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v17 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[0:5], v[0:15], v16 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_bf16_vv: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v17 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v18 +; GFX950-GISEL-NEXT: 
v_lshrrev_b32_e32 v17, 16, v0 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v2 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v4 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v7 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v27, 16, v8 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v28, 16, v9 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v29, 16, v10 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v11 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v3, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v4, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v5, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v6, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v7, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v8, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v9, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v10, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v11, v30 
dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v12, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: v_mov_b32_sdwa v15, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[0:5], v[0:15], v16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[20:21], v[0:3], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[20:21], v[4:5], off offset:16 +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float %scale) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_sl(<32 x bfloat> inreg %src, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_bf16_sl: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15 +; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[2:7], v[2:17], s0 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; 
GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_bf16_sl: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_lshr_b32 s16, s0, 16 +; GFX950-GISEL-NEXT: s_lshr_b32 s17, s1, 16 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s16, 16 +; GFX950-GISEL-NEXT: s_and_b32 s0, s0, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s18, s2, 16 +; GFX950-GISEL-NEXT: s_or_b32 s0, s16, s0 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s17, 16 +; GFX950-GISEL-NEXT: s_and_b32 s1, s1, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s19, s3, 16 +; GFX950-GISEL-NEXT: s_or_b32 s1, s16, s1 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s18, 16 +; GFX950-GISEL-NEXT: s_and_b32 s2, s2, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s20, s4, 16 +; GFX950-GISEL-NEXT: s_or_b32 s2, s16, s2 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s19, 16 +; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s21, s5, 16 +; GFX950-GISEL-NEXT: s_or_b32 s3, s16, s3 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s20, 16 +; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s22, s6, 16 +; GFX950-GISEL-NEXT: s_or_b32 s4, s16, s4 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s21, 16 +; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s23, s7, 16 +; GFX950-GISEL-NEXT: s_or_b32 s5, s16, s5 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s22, 16 +; GFX950-GISEL-NEXT: s_and_b32 s6, s6, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s24, s8, 16 +; GFX950-GISEL-NEXT: s_or_b32 s6, s16, s6 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s23, 16 +; GFX950-GISEL-NEXT: s_and_b32 s7, s7, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s25, s9, 16 +; GFX950-GISEL-NEXT: s_or_b32 s7, s16, s7 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s24, 16 +; GFX950-GISEL-NEXT: s_and_b32 s8, s8, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s26, s10, 16 +; GFX950-GISEL-NEXT: s_or_b32 s8, s16, s8 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s25, 16 +; GFX950-GISEL-NEXT: s_and_b32 s9, s9, 0xffff +; 
GFX950-GISEL-NEXT: s_lshr_b32 s27, s11, 16 +; GFX950-GISEL-NEXT: s_or_b32 s9, s16, s9 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s26, 16 +; GFX950-GISEL-NEXT: s_and_b32 s10, s10, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s28, s12, 16 +; GFX950-GISEL-NEXT: s_or_b32 s10, s16, s10 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s27, 16 +; GFX950-GISEL-NEXT: s_and_b32 s11, s11, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s29, s13, 16 +; GFX950-GISEL-NEXT: s_or_b32 s11, s16, s11 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s28, 16 +; GFX950-GISEL-NEXT: s_and_b32 s12, s12, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s30, s14, 16 +; GFX950-GISEL-NEXT: s_or_b32 s12, s16, s12 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s29, 16 +; GFX950-GISEL-NEXT: s_and_b32 s13, s13, 0xffff +; GFX950-GISEL-NEXT: s_lshr_b32 s31, s15, 16 +; GFX950-GISEL-NEXT: s_or_b32 s13, s16, s13 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s30, 16 +; GFX950-GISEL-NEXT: s_and_b32 s14, s14, 0xffff +; GFX950-GISEL-NEXT: s_or_b32 s14, s16, s14 +; GFX950-GISEL-NEXT: s_lshl_b32 s16, s31, 16 +; GFX950-GISEL-NEXT: s_and_b32 s15, s15, 0xffff +; GFX950-GISEL-NEXT: s_or_b32 s15, s16, s15 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[2:7], v[2:17], v18 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float 100.0) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + 
+define amdgpu_ps void @test_scalef32_pk32_fp6_f16_vv(<32 x half> %src, float %scale, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f16_vv: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v18 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v17 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[0:5], v[0:15], v16 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f16_vv: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v20, v17 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v21, v18 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[0:5], v[0:15], v16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[20:21], v[0:3], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[20:21], v[4:5], off offset:16 +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> %src, float %scale) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk32_fp6_f16_sl(<32 x half> inreg %src, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f16_sl: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15 +; GFX950-SDAG-NEXT: s_mov_b32 
s0, 0x42c80000 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[2:7], v[2:17], s0 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f16_sl: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[2:7], v[2:17], v18 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> %src, float 100.0) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_features.s b/llvm/test/MC/AMDGPU/gfx950_asm_features.s index 271ad4d62c3a43..1aef267537aa55 100644 --- a/llvm/test/MC/AMDGPU/gfx950_asm_features.s +++ b/llvm/test/MC/AMDGPU/gfx950_asm_features.s @@ -913,3 +913,19 @@ v_cvt_scalef32_pk32_bf16_fp6 v[10:25], v[20:25], v8 // NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: // GFX950: v_cvt_scalef32_pk32_f16_fp6 v[10:25], v[20:25], v8 ; encoding: [0x0a,0x00,0x60,0xd2,0x14,0x11,0x02,0x00] v_cvt_scalef32_pk32_f16_fp6 v[10:25], v[20:25], v8 + +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: +// GFX950: v_cvt_scalef32_pk32_bf6_bf16 v[20:25], v[10:25], v8 ; encoding: [0x14,0x00,0x5b,0xd2,0x0a,0x11,0x02,0x00] +v_cvt_scalef32_pk32_bf6_bf16 v[20:25], v[10:25], v8 + +// NOT-GFX950: 
:[[@LINE+2]]:{{[0-9]+}}: error: +// GFX950: v_cvt_scalef32_pk32_bf6_f16 v[20:25], v[10:25], v8 ; encoding: [0x14,0x00,0x5a,0xd2,0x0a,0x11,0x02,0x00] +v_cvt_scalef32_pk32_bf6_f16 v[20:25], v[10:25], v8 + +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: +// GFX950: v_cvt_scalef32_pk32_fp6_bf16 v[20:25], v[10:25], v8 ; encoding: [0x14,0x00,0x59,0xd2,0x0a,0x11,0x02,0x00] +v_cvt_scalef32_pk32_fp6_bf16 v[20:25], v[10:25], v8 + +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: +// GFX950: v_cvt_scalef32_pk32_fp6_f16 v[20:25], v[10:25], v8 ; encoding: [0x14,0x00,0x58,0xd2,0x0a,0x11,0x02,0x00] +v_cvt_scalef32_pk32_fp6_f16 v[20:25], v[10:25], v8 diff --git a/llvm/test/MC/AMDGPU/gfx950_err.s b/llvm/test/MC/AMDGPU/gfx950_err.s index cb5b69be744781..f81a240701d949 100644 --- a/llvm/test/MC/AMDGPU/gfx950_err.s +++ b/llvm/test/MC/AMDGPU/gfx950_err.s @@ -197,3 +197,51 @@ v_cvt_scalef32_pk32_f16_fp6 v[10:25], v[20:25], v8 div:2 // GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand v_cvt_scalef32_pk32_f16_fp6 v[10:25], v[20:25], v8 clamp div:2 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_pk32_fp6_f16 v[20:25], v[10:25], v8 clamp + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand +v_cvt_scalef32_pk32_fp6_f16 v[20:25], v[10:25], v8 mul:2 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand +v_cvt_scalef32_pk32_fp6_f16 v[20:25], v[10:25], v8 div:2 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand +v_cvt_scalef32_pk32_fp6_f16 v[20:25], v[10:25], v8 clamp div:2 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_pk32_bf6_f16 v[20:25], v[10:25], v8 clamp + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand +v_cvt_scalef32_pk32_bf6_f16 v[20:25], v[10:25], v8 mul:2 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand +v_cvt_scalef32_pk32_bf6_f16 v[20:25], v[10:25], v8 div:2 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a 
valid operand +v_cvt_scalef32_pk32_bf6_f16 v[20:25], v[10:25], v8 clamp div:2 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_pk32_fp6_bf16 v[20:25], v[10:25], v8 clamp + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand +v_cvt_scalef32_pk32_fp6_bf16 v[20:25], v[10:25], v8 mul:2 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand +v_cvt_scalef32_pk32_fp6_bf16 v[20:25], v[10:25], v8 div:2 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand +v_cvt_scalef32_pk32_fp6_bf16 v[20:25], v[10:25], v8 clamp div:2 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_pk32_bf6_bf16 v[20:25], v[10:25], v8 clamp + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand +v_cvt_scalef32_pk32_bf6_bf16 v[20:25], v[10:25], v8 mul:2 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand +v_cvt_scalef32_pk32_bf6_bf16 v[20:25], v[10:25], v8 div:2 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand +v_cvt_scalef32_pk32_bf6_bf16 v[20:25], v[10:25], v8 clamp div:2 \ No newline at end of file diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt index b897681aae0054..927366b9a410bc 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt @@ -629,3 +629,15 @@ # GFX950: v_cvt_scalef32_pk32_f16_fp6 v[10:25], v[20:25], v8 ; encoding: [0x0a,0x00,0x60,0xd2,0x14,0x11,0x02,0x00] 0x0a,0x00,0x60,0xd2,0x14,0x11,0x02,0x00 + +# GFX950: v_cvt_scalef32_pk32_bf6_bf16 v[20:25], v[10:25], v8 ; encoding: [0x14,0x00,0x5b,0xd2,0x0a,0x11,0x02,0x00] +0x14,0x00,0x5b,0xd2,0x0a,0x11,0x02,0x00 + +# GFX950: v_cvt_scalef32_pk32_bf6_f16 v[20:25], v[10:25], v8 ; encoding: [0x14,0x00,0x5a,0xd2,0x0a,0x11,0x02,0x00] +0x14,0x00,0x5a,0xd2,0x0a,0x11,0x02,0x00 + +# GFX950: v_cvt_scalef32_pk32_fp6_bf16 v[20:25], v[10:25], v8 ; encoding:
[0x14,0x00,0x59,0xd2,0x0a,0x11,0x02,0x00] +0x14,0x00,0x59,0xd2,0x0a,0x11,0x02,0x00 + +# GFX950: v_cvt_scalef32_pk32_fp6_f16 v[20:25], v[10:25], v8 ; encoding: [0x14,0x00,0x58,0xd2,0x0a,0x11,0x02,0x00] +0x14,0x00,0x58,0xd2,0x0a,0x11,0x02,0x00 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits