https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/117596
>From 2e0074c4bb6f973aeb201ef52772800b2bb1a810 Mon Sep 17 00:00:00 2001 From: Sirish Pande <sirish.pa...@amd.com> Date: Tue, 13 Feb 2024 10:54:51 -0600 Subject: [PATCH] AMDGPU: Add support for v_ashr_pk_i8/u8_i32 instructions for gfx950 This patch adds assembly and builtin support for v_ashr_pk_i8/u8_i32 instructions. Co-authored-by: Sirish Pande <sirish.pa...@amd.com> --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 3 + clang/test/CodeGenOpenCL/amdgpu-features.cl | 2 +- .../CodeGenOpenCL/builtins-amdgcn-gfx950.cl | 46 ++++++++++++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 10 +++ llvm/lib/Target/AMDGPU/AMDGPU.td | 10 +++ .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1 + llvm/lib/Target/AMDGPU/VOP3Instructions.td | 8 +++ llvm/lib/Target/AMDGPU/VOPInstructions.td | 1 + llvm/lib/TargetParser/TargetParser.cpp | 1 + llvm/test/MC/AMDGPU/gfx950_asm_vop3.s | 72 +++++++++++++++++++ .../Disassembler/AMDGPU/gfx950_dasm_vop3.txt | 36 ++++++++++ 13 files changed, 194 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index dacbf5aa902f60..fd449697e91216 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -467,6 +467,9 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr6_b96_v3i32, "V3iV3i*3", "nc", "gfx950 TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr8_b64_v2i32, "V2iV2i*3", "nc", "gfx950-insts") TARGET_BUILTIN(__builtin_amdgcn_ds_read_tr16_b64_v4i16, "V4sV4s*3", "nc", "gfx950-insts") +TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_i8_i32, "UsUiUiUi", "nc", "ashr-pk-insts") +TARGET_BUILTIN(__builtin_amdgcn_ashr_pk_u8_i32, "UsUiUiUi", "nc", "ashr-pk-insts") + TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32, "V6UiV16fV16ff", "nc", "gfx950-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32, "V6UiV16fV16ff", "nc", "gfx950-insts") diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 56013dad9b6651..db7fd76ec91189 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -89,7 +89,7 @@ // GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" // GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" // GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX950: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" +// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" // GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" // GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32" diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl index 6f3c81b26be0b8..34eae17827ff7c 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl @@ -169,3 +169,49 @@ void test_cvt_scalef32_pk(global uint6 *out6, bfloat32 srcbf32, half32 srch32, f *out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(src0f32, src1f32, scale); *out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(src0f32, src1f32, scale); } + +// CHECK-LABEL: @test_ashr_pk_i8_i32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4 +// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4 +// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.ashr.pk.i8.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: ret void +// +void test_ashr_pk_i8_i32(global int* out, uint src0, uint src1, uint src2) { + *out = __builtin_amdgcn_ashr_pk_i8_i32(src0, src1, src2); +} + +// CHECK-LABEL: @test_ashr_pk_u8_i32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4 +// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4 +// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.ashr.pk.u8.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: ret void +// +void test_ashr_pk_u8_i32(global int* out, uint src0, uint src1, uint src2) { + *out = __builtin_amdgcn_ashr_pk_u8_i32(src0, src1, src2); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 72298dc298a6fc..b8e07cc799b42e 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3199,6 +3199,16 @@ def int_amdgcn_permlane32_swap : [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrNoCallback, IntrNoFree]>; +// llvm.amdgcn.ashr_pk_i8_i32 int vdst, int src0, int src1 int src2 +def int_amdgcn_ashr_pk_i8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_i8_i32">, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable]>; + +// llvm.amdgcn.ashr_pk_u8_i32 int vdst, int src0, int src1 int src2 +def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable]>; + //===----------------------------------------------------------------------===// // Special Intrinsics for backend internal use only. No frontend // should emit calls to these. diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index a47206f6bf6730..e4e427ef43339b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -414,12 +414,19 @@ def FeatureF16BF16ToFP6BF6ConversionScaleInsts : SubtargetFeature<"f16bf16-to-fp "Has f16bf16 to fp6bf6 conversion scale instructions" >; +def FeatureAshrPkInsts : SubtargetFeature<"ashr-pk-insts", + "HasAshrPkInsts", + "true", + "Has Arithmetic Shift Pack instructions" +>; + def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts", "GFX950Insts", "true", "Additional instructions for GFX950+", [FeaturePermlane16Swap, FeaturePermlane32Swap, + FeatureAshrPkInsts, FeatureFP8ConversionScaleInsts, FeatureBF8ConversionScaleInsts, FeatureFP4ConversionScaleInsts, @@ -2474,6 +2481,9 @@ def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">; def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">, AssemblerPredicate<(all_of FeatureXF32Insts)>; +def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">, + AssemblerPredicate<(all_of FeatureAshrPkInsts)>; + // Include AMDGPU TD files include "SISchedule.td" include "GCNProcessors.td" diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 2fe9e3dd9b18ed..7651c84eb0ae65 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4546,6 +4546,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_f16: case Intrinsic::amdgcn_cvt_scalef32_pk32_fp6_bf16: case Intrinsic::amdgcn_cvt_scalef32_pk32_bf6_bf16: + case Intrinsic::amdgcn_ashr_pk_i8_i32: + case Intrinsic::amdgcn_ashr_pk_u8_i32: case Intrinsic::amdgcn_cvt_scalef32_2xpk16_fp6_f32: case Intrinsic::amdgcn_cvt_scalef32_2xpk16_bf6_f32: case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16: diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index c9cbc546720759..390849dd2e0564 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -245,8 +245,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasForceStoreSC0SC1 = false; bool HasRequiredExportPriority = false; bool HasVmemWriteVgprInOrder = false; + bool HasAshrPkInsts = false; bool HasMinimum3Maximum3F32 = false; bool HasMinimum3Maximum3F16 = false; + bool RequiresCOV6 = false; // Dummy feature to use for assembler in tablegen. @@ -1326,6 +1328,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasPermlane16Swap() const { return HasPermlane16Swap; } bool hasPermlane32Swap() const { return HasPermlane32Swap; } + bool hasAshrPkInsts() const { return HasAshrPkInsts; } bool hasMinimum3Maximum3F32() const { return HasMinimum3Maximum3F32; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 10af6480f6dfb6..f9cb6bb8d297a9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2856,6 +2856,7 @@ def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>; def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>; def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>; +def VOP_I16_I32_I32_I32 : VOPProfile <[i16, i32, i32, i32]>; def VOP_F32_V2F16_V2F16_F32 : VOPProfile <[f32, v2f16, v2f16, f32]>; def VOP_I32_V2I16_V2I16_I32 : VOPProfile <[i32, v2i16, v2i16, i32]>; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 66e3b355418885..11700a4c34f9f3 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1189,6 +1189,11 @@ let SubtargetPredicate = HasPseudoScalarTrans in { def : PseudoScalarPatF16<any_amdgcn_sqrt, V_S_SQRT_F16_e64>; } +let SubtargetPredicate = HasAshrPkInsts, isReMaterializable = 1 in { + defm V_ASHR_PK_I8_I32 : VOP3Inst<"v_ashr_pk_i8_i32", VOP3_Profile<VOP_I16_I32_I32_I32, VOP3_OPSEL_ONLY>, int_amdgcn_ashr_pk_i8_i32>; + defm V_ASHR_PK_U8_I32 : VOP3Inst<"v_ashr_pk_u8_i32", VOP3_Profile<VOP_I16_I32_I32_I32, VOP3_OPSEL_ONLY>, int_amdgcn_ashr_pk_u8_i32>; +} // End SubtargetPredicate = HasAshrPkInsts, isReMaterializable = 1 + //===----------------------------------------------------------------------===// // Integer Clamp Patterns //===----------------------------------------------------------------------===// @@ -1978,5 +1983,8 @@ defm V_CVT_SCALEF32_PK32_BF6_F16 : VOP3_Real_gfx9<0x25a, "v_cvt_scalef32_pk32_b defm V_CVT_SCALEF32_PK32_BF6_BF16 : VOP3_Real_gfx9<0x25b, "v_cvt_scalef32_pk32_bf6_bf16">; } +defm V_ASHR_PK_I8_I32 : VOP3OpSel_Real_gfx9 <0x265>; +defm V_ASHR_PK_U8_I32 : VOP3OpSel_Real_gfx9 <0x266>; + defm V_CVT_SCALEF32_2XPK16_FP6_F32 : VOP3_Real_gfx9<0x252, "v_cvt_scalef32_2xpk16_fp6_f32">; defm V_CVT_SCALEF32_2XPK16_BF6_F32 : VOP3_Real_gfx9<0x253, "v_cvt_scalef32_2xpk16_bf6_f32">; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 34c7989b9d0b86..0e19696a32f86f 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1455,6 +1455,7 @@ def VOP3_CLAMP : VOP3Features<1, 0, 0, 0>; def VOP3_OPSEL : VOP3Features<1, 1, 0, 0>; def VOP3_PACKED : VOP3Features<1, 1, 1, 0>; def VOP3_MAI : VOP3Features<0, 0, 0, 1>; +def VOP3_OPSEL_ONLY : VOP3Features<0, 1, 0, 0>; // Packed is misleading, but it enables the appropriate op_sel // modifiers. diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 6da48da608cc14..23532b9214a892 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -474,6 +474,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["prng-inst"] = true; Features["permlane16-swap"] = true; Features["permlane32-swap"] = true; + Features["ashr-pk-insts"] = true; Features["gfx950-insts"] = true; [[fallthrough]]; case GK_GFX942: diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx950_asm_vop3.s index da5566c1049649..5f5e5057117059 100644 --- a/llvm/test/MC/AMDGPU/gfx950_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx950_asm_vop3.s @@ -74,3 +74,75 @@ v_bitop3_b16 v5, v1, v2, s3 bitop3:161 // GFX940-ERR: error: instruction not supported on this GPU // GFX950: v_bitop3_b16 v5, v1, v2, s3 bitop3:0xa1 ; encoding: [0x05,0x04,0x33,0xd2,0x01,0x05,0x0e,0x30] // GFX12-ERR: error: instruction not supported on this GPU + +v_ashr_pk_i8_i32 v2, s4, v7, v8 +// GFX906-ERR: error: instruction not supported on this GPU +// GFX940-ERR: error: instruction not supported on this GPU +// GFX950: v_ashr_pk_i8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0e,0x22,0x04] +// GFX12-ERR: error: instruction not supported on this GPU + +v_ashr_pk_i8_i32 v2, v4, 0, 1 +// GFX906-ERR: error: instruction not supported on this GPU +// GFX940-ERR: error: instruction not supported on this GPU +// GFX950: v_ashr_pk_i8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x01,0x05,0x02] +// GFX12-ERR: error: instruction not supported on this GPU + +v_ashr_pk_i8_i32 v2, v4, 3, s2 +// GFX906-ERR: error: instruction not supported on this GPU +// GFX940-ERR: error: instruction not supported on this GPU +// GFX950: v_ashr_pk_i8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x07,0x09,0x00] +// GFX12-ERR: error: instruction not supported on this GPU + +v_ashr_pk_i8_i32 v2, s4, 4, v2 +// GFX906-ERR: error: instruction not supported on this GPU +// GFX940-ERR: error: instruction not supported on this GPU +// GFX950: v_ashr_pk_i8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x08,0x09,0x04] +// GFX12-ERR: error: instruction not supported on this GPU + +v_ashr_pk_i8_i32 v2, v4, v7, 0.5 +// GFX906-ERR: error: instruction not supported on this GPU +// GFX940-ERR: error: instruction not supported on this GPU +// GFX950: v_ashr_pk_i8_i32 v2, v4, v7, 0.5 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0f,0xc2,0x03] +// GFX12-ERR: error: instruction not supported on this GPU + +v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] +// GFX906-ERR: error: instruction not supported on this GPU +// GFX940-ERR: error: instruction not supported on this GPU +// GFX950: v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x65,0xd2,0x02,0x07,0x12,0x04] +// GFX12-ERR: error: instruction not supported on this GPU + +v_ashr_pk_u8_i32 v2, s4, v7, v8 +// GFX906-ERR: error: instruction not supported on this GPU +// GFX940-ERR: error: instruction not supported on this GPU +// GFX950: v_ashr_pk_u8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0e,0x22,0x04] +// GFX12-ERR: error: instruction not supported on this GPU + +v_ashr_pk_u8_i32 v2, v4, 0, 1 +// GFX906-ERR: error: instruction not supported on this GPU +// GFX940-ERR: error: instruction not supported on this GPU +// GFX950: v_ashr_pk_u8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x01,0x05,0x02] +// GFX12-ERR: error: instruction not supported on this GPU + +v_ashr_pk_u8_i32 v2, v4, 3, s2 +// GFX906-ERR: error: instruction not supported on this GPU +// GFX940-ERR: error: instruction not supported on this GPU +// GFX950: v_ashr_pk_u8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x07,0x09,0x00] +// GFX12-ERR: error: instruction not supported on this GPU + +v_ashr_pk_u8_i32 v2, s4, 4, v2 +// GFX906-ERR: error: instruction not supported on this GPU +// GFX940-ERR: error: instruction not supported on this GPU +// GFX950: v_ashr_pk_u8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x08,0x09,0x04] +// GFX12-ERR: error: instruction not supported on this GPU + +v_ashr_pk_u8_i32 v2, v4, v7, -2.0 +// GFX906-ERR: error: instruction not supported on this GPU +// GFX940-ERR: error: instruction not supported on this GPU +// GFX950: v_ashr_pk_u8_i32 v2, v4, v7, -2.0 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0f,0xd6,0x03] +// GFX12-ERR: error: instruction not supported on this GPU + +v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] +// GFX906-ERR: error: instruction not supported on this GPU +// GFX940-ERR: error: instruction not supported on this GPU +// GFX950: v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04] +// GFX12-ERR: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt index 5ae970005a3b86..ccc55352413777 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt @@ -744,6 +744,42 @@ # GFX950: v_cvt_scalef32_pk_fp4_bf16 v1, -|s2|, v3 ; encoding: [0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20] 0x01,0x01,0x4d,0xd2,0x02,0x06,0x02,0x20 +# GFX950: v_ashr_pk_i8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x08,0x09,0x04] +0x02,0x00,0x65,0xd2,0x04,0x08,0x09,0x04 + +# GFX950: v_ashr_pk_i8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0e,0x22,0x04] +0x02,0x00,0x65,0xd2,0x04,0x0e,0x22,0x04 + +# GFX950: v_ashr_pk_i8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x01,0x05,0x02] +0x02,0x00,0x65,0xd2,0x04,0x01,0x05,0x02 + +# GFX950: v_ashr_pk_i8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x07,0x09,0x00] +0x02,0x00,0x65,0xd2,0x04,0x07,0x09,0x00 + +# GFX950: v_ashr_pk_i8_i32 v2, v4, v7, 0.5 ; encoding: [0x02,0x00,0x65,0xd2,0x04,0x0f,0xc2,0x03] +0x02,0x00,0x65,0xd2,0x04,0x0f,0xc2,0x03 + +# GFX950: v_ashr_pk_i8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x65,0xd2,0x02,0x07,0x12,0x04] +0x01,0x40,0x65,0xd2,0x02,0x07,0x12,0x04 + +# GFX950: v_ashr_pk_u8_i32 v2, s4, 4, v2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x08,0x09,0x04] +0x02,0x00,0x66,0xd2,0x04,0x08,0x09,0x04 + +# GFX950: v_ashr_pk_u8_i32 v2, s4, v7, v8 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0e,0x22,0x04] +0x02,0x00,0x66,0xd2,0x04,0x0e,0x22,0x04 + +# GFX950: v_ashr_pk_u8_i32 v2, v4, 0, 1 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x01,0x05,0x02] +0x02,0x00,0x66,0xd2,0x04,0x01,0x05,0x02 + +# GFX950: v_ashr_pk_u8_i32 v2, v4, 3, s2 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x07,0x09,0x00] +0x02,0x00,0x66,0xd2,0x04,0x07,0x09,0x00 + +# GFX950: v_ashr_pk_u8_i32 v2, v4, v7, -2.0 ; encoding: [0x02,0x00,0x66,0xd2,0x04,0x0f,0xd6,0x03] +0x02,0x00,0x66,0xd2,0x04,0x0f,0xd6,0x03 + +# GFX950: v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04] +0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04 + # GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04] 0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits