Author: Momchil Velikov
Date: 2025-01-27T17:32:47Z
New Revision: 99bd2e3f123baf9a14acc9b31ee0f557288118a6
URL: https://github.com/llvm/llvm-project/commit/99bd2e3f123baf9a14acc9b31ee0f557288118a6
DIFF: https://github.com/llvm/llvm-project/commit/99bd2e3f123baf9a14acc9b31ee0f557288118a6.diff

LOG: [AArch64] Add Neon FP8 conversion intrinsics (#123612)

The patch adds the following intrinsics:

bfloat16x8_t vcvt1_bf16_mf8_fpm(mfloat8x8_t vn, fpm_t fpm)
bfloat16x8_t vcvt1_low_bf16_mf8_fpm(mfloat8x16_t vn, fpm_t fpm)
bfloat16x8_t vcvt2_bf16_mf8_fpm(mfloat8x8_t vn, fpm_t fpm)
bfloat16x8_t vcvt2_low_bf16_mf8_fpm(mfloat8x16_t vn, fpm_t fpm)
bfloat16x8_t vcvt1_high_bf16_mf8_fpm(mfloat8x16_t vn, fpm_t fpm)
bfloat16x8_t vcvt2_high_bf16_mf8_fpm(mfloat8x16_t vn, fpm_t fpm)
float16x8_t vcvt1_f16_mf8_fpm(mfloat8x8_t vn, fpm_t fpm)
float16x8_t vcvt1_low_f16_mf8_fpm(mfloat8x16_t vn, fpm_t fpm)
float16x8_t vcvt2_f16_mf8_fpm(mfloat8x8_t vn, fpm_t fpm)
float16x8_t vcvt2_low_f16_mf8_fpm(mfloat8x16_t vn, fpm_t fpm)
float16x8_t vcvt1_high_f16_mf8_fpm(mfloat8x16_t vn, fpm_t fpm)
float16x8_t vcvt2_high_f16_mf8_fpm(mfloat8x16_t vn, fpm_t fpm)
mfloat8x8_t vcvt_mf8_f32_fpm(float32x4_t vn, float32x4_t vm, fpm_t fpm)
mfloat8x16_t vcvt_high_mf8_f32_fpm(mfloat8x8_t vd, float32x4_t vn, float32x4_t vm, fpm_t fpm)
mfloat8x8_t vcvt_mf8_f16_fpm(float16x4_t vn, float16x4_t vm, fpm_t fpm)
mfloat8x16_t vcvtq_mf8_f16_fpm(float16x8_t vn, float16x8_t vm, fpm_t fpm)

Co-Authored-By: Caroline Concatto <caroline.conca...@arm.com>

Added: 
    clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c
    clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c
    llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll

Modified: 
    clang/include/clang/Basic/arm_neon.td
    clang/include/clang/Basic/arm_neon_incl.td
    clang/lib/CodeGen/CGBuiltin.cpp
    clang/lib/CodeGen/CodeGenFunction.h
    clang/utils/TableGen/NeonEmitter.cpp
    llvm/include/llvm/IR/IntrinsicsAArch64.td
    llvm/lib/Target/AArch64/AArch64InstrFormats.td
    llvm/lib/Target/AArch64/AArch64InstrInfo.td

Removed: 
    

################################################################################
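Before the diff itself, a minimal usage sketch may help orient the reader. Only the intrinsic signatures above come from the patch; the wrapper function names are invented for illustration, and producing the fpm_t mode word is assumed to happen elsewhere (e.g. via the ACLE fpm helper routines):

    #include <arm_neon.h>

    /* Widen the low half of an FP8 vector to bfloat16; the fpm operand
       supplies the FP8 mode, which the compiler writes to FPMR before
       the conversion. */
    bfloat16x8_t widen_low(mfloat8x16_t v, fpm_t fpm) {
      return vcvt1_low_bf16_mf8_fpm(v, fpm);
    }

    /* Narrow two single-precision vectors into one 8-lane FP8 vector. */
    mfloat8x8_t narrow_f32(float32x4_t a, float32x4_t b, fpm_t fpm) {
      return vcvt_mf8_f32_fpm(a, b, fpm);
    }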
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index ddc5391eb3fa23..9a6a77640ef5d3 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2119,6 +2119,28 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in {
   }
 }
 
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
+  def VBF1CVT_BF16_MF8 : VInst<"vcvt1_bf16_mf8_fpm", "(QB).V", "m">;
+  def VBF1CVT_LOW_BF16_MF8 : VInst<"vcvt1_low_bf16_mf8_fpm", "B.V", "Hm">;
+  def VBF2CVTL_BF16_MF8 : VInst<"vcvt2_bf16_mf8_fpm", "(QB).V", "m">;
+  def VBF2CVTL_LOW_BF16_MF8 : VInst<"vcvt2_low_bf16_mf8_fpm", "B.V", "Hm">;
+  def VBF1CVTL2_HIGH_BF16_MF8 : VInst<"vcvt1_high_bf16_mf8_fpm", "B.V", "Hm">;
+  def VBF2CVTL2_HIGH_BF16_MF8 : VInst<"vcvt2_high_bf16_mf8_fpm", "B.V", "Hm">;
+}
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
+  def VF1CVT_F16_MF8 : VInst<"vcvt1_f16_mf8_fpm", "(>QF).V", "m">;
+  def VF1CVT_LOW_F16_MF8 : VInst<"vcvt1_low_f16_mf8_fpm", "(>F).V", "Hm">;
+  def VF2CVTL_F16_MF8 : VInst<"vcvt2_f16_mf8_fpm", "(>QF).V", "m">;
+  def VF2CVTL_LOW_F16_MF8 : VInst<"vcvt2_low_f16_mf8_fpm", "(>F).V", "Hm">;
+  def VF1CVTL2_HIGH_F16_MF8 : VInst<"vcvt1_high_f16_mf8_fpm", "(>F).V", "Hm">;
+  def VF2CVTL2_HIGH_F16_MF8 : VInst<"vcvt2_high_f16_mf8_fpm", "(>F).V", "Hm">;
+
+  def VCVTN_LOW_F8_F32 : VInst<"vcvt_mf8_f32_fpm", ".(>>QF)(>>QF)V", "m">;
+  def VCVTN_HIGH_F8_F32 : VInst<"vcvt_high_mf8_f32_fpm", ".(q)(>>F)(>>F)V", "Hm">;
+  def VCVTN_F8_F16 : VInst<"vcvt_mf8_f16_fpm", ".(>F)(>F)V", "mQm">;
+}
+
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
   def FAMIN : WInst<"vamin", "...", "fhQdQfQh">;
   def FAMAX : WInst<"vamax", "...", "fhQdQfQh">;

diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td
index fd800e5a6278e4..91a2bf3020b9a3 100644
--- a/clang/include/clang/Basic/arm_neon_incl.td
+++ b/clang/include/clang/Basic/arm_neon_incl.td
@@ -243,6 +243,7 @@ def OP_UNAVAILABLE : Operation {
 // B: change to BFloat16
 // P: change to polynomial category.
 // p: change polynomial to equivalent integer category. Otherwise nop.
+// V: change to fpm_t
 //
 // >: double element width (vector size unchanged).
 // <: half element width (vector size unchanged).
@@ -301,6 +302,7 @@ class Inst <string n, string p, string t, Operation o, list<ImmCheck> ch = []>{
 class SInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
 class IInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
 class WInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
+class VInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
 
 // The following instruction classes are implemented via operators
 // instead of builtins. As such these declarations are only used for
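As a reading aid (an editorial decoding under the modifier key above, not text from the patch): for VBF1CVT_BF16_MF8 the prototype string "(QB).V" with base type "m" (mfloat8x8_t) gives a return type modified by Q and B — the 128-bit BFloat16 vector bfloat16x8_t — a first argument "." left as the base type, and a trailing "V" argument mapped to the new fpm_t, which matches the declared signature:

    bfloat16x8_t vcvt1_bf16_mf8_fpm(mfloat8x8_t vn, fpm_t fpm)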
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 5162ac503b8ebd..0a06ce028a9160 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6759,12 +6759,36 @@ Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
   return Builder.CreateCall(F, Ops, name);
 }
 
+Value *CodeGenFunction::EmitFP8NeonCall(Function *F,
+                                        SmallVectorImpl<Value *> &Ops,
+                                        Value *FPM, const char *name) {
+  Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), FPM);
+  return EmitNeonCall(F, Ops, name);
+}
+
 Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
                                             bool neg) {
   int SV = cast<ConstantInt>(V)->getSExtValue();
   return ConstantInt::get(Ty, neg ? -SV : SV);
 }
 
+Value *CodeGenFunction::EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
+                                           llvm::Type *Ty1, bool Extract,
+                                           SmallVectorImpl<llvm::Value *> &Ops,
+                                           const CallExpr *E,
+                                           const char *name) {
+  llvm::Type *Tys[] = {Ty0, Ty1};
+  if (Extract) {
+    // Op[0] is mfloat8x16_t, but the intrinsic converts only the lower part of
+    // the vector.
+    Tys[1] = llvm::FixedVectorType::get(Int8Ty, 8);
+    Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
+  }
+  llvm::Value *FPM =
+      EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
+  return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
+}
+
 // Right-shift a vector by a constant.
 Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift,
                                           llvm::Type *Ty, bool usgn,
@@ -12736,6 +12760,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     return V;
 
   unsigned Int;
+  bool ExtractLow = false;
   switch (BuiltinID) {
   default: return nullptr;
   case NEON::BI__builtin_neon_vbsl_v:
@@ -13950,7 +13975,59 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
   }
-
+  case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
+    ExtractLow = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1,
+                              llvm::FixedVectorType::get(BFloatTy, 8),
+                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1");
+  case NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm:
+    ExtractLow = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2,
+                              llvm::FixedVectorType::get(BFloatTy, 8),
+                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2");
+  case NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm:
+    ExtractLow = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1,
+                              llvm::FixedVectorType::get(HalfTy, 8),
+                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1");
+  case NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm:
+    ExtractLow = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2,
+                              llvm::FixedVectorType::get(HalfTy, 8),
+                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2");
+  case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
+                              llvm::FixedVectorType::get(Int8Ty, 8),
+                              Ops[0]->getType(), false, Ops, E, "vfcvtn");
+  case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
+                              llvm::FixedVectorType::get(Int8Ty, 8),
+                              llvm::FixedVectorType::get(HalfTy, 4), false, Ops,
+                              E, "vfcvtn");
+  case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
+                              llvm::FixedVectorType::get(Int8Ty, 16),
+                              llvm::FixedVectorType::get(HalfTy, 8), false, Ops,
+                              E, "vfcvtn");
+  case NEON::BI__builtin_neon_vcvt_high_mf8_f32_fpm: {
+    llvm::Type *Ty = llvm::FixedVectorType::get(Int8Ty, 16);
+    Ops[0] = Builder.CreateInsertVector(Ty, PoisonValue::get(Ty), Ops[0],
+                                        Builder.getInt64(0));
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn2, Ty,
+                              Ops[1]->getType(), false, Ops, E, "vfcvtn2");
+  }
   case NEON::BI__builtin_neon_vamin_f16:
   case NEON::BI__builtin_neon_vaminq_f16:
  case NEON::BI__builtin_neon_vamin_f32:
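The net effect of EmitFP8NeonCvtCall for one of the "low" variants — extract the low half, write FPMR, then call the conversion intrinsic — is exactly the IR shape the autogenerated tests below check for. As a sketch (the function name is invented; the types and intrinsic names are taken from those tests):

    define <8 x bfloat> @sketch_low_variant(<16 x i8> %op, i64 %fpm) {
      ; only the low 8 FP8 lanes are converted
      %lo = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> %op, i64 0)
      ; the FP8 mode word is written to FPMR before the conversion
      call void @llvm.aarch64.set.fpmr(i64 %fpm)
      %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> %lo)
      ret <8 x bfloat> %res
    }

    declare <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8>, i64)
    declare void @llvm.aarch64.set.fpmr(i64)
    declare <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8>)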
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index bdd6e3bd55a7fd..a5416ab91c8d61 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4692,6 +4692,13 @@ class CodeGenFunction : public CodeGenTypeCache {
                             SmallVectorImpl<llvm::Value*> &O,
                             const char *name,
                             unsigned shift = 0, bool rightshift = false);
+  llvm::Value *EmitFP8NeonCall(llvm::Function *F,
+                               SmallVectorImpl<llvm::Value *> &O,
+                               llvm::Value *FPM, const char *name);
+  llvm::Value *EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
+                                  llvm::Type *Ty1, bool Extract,
+                                  SmallVectorImpl<llvm::Value *> &Ops,
+                                  const CallExpr *E, const char *name);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx,
                              const llvm::ElementCount &Count);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx);

diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c
new file mode 100644
index 00000000000000..4305b840f2a05b
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c
@@ -0,0 +1,316 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix CHECK-CXX
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -S -O3 -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_bf16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z23test_vcvt1_bf16_mf8_fpm13__Mfloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT1_I]]
+//
+bfloat16x8_t test_vcvt1_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+  return vcvt1_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_low_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z27test_vcvt1_low_bf16_mf8_fpm14__Mfloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT1_I]]
+//
+bfloat16x8_t test_vcvt1_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt1_low_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_bf16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z23test_vcvt2_bf16_mf8_fpm13__Mfloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT2_I]]
+//
+bfloat16x8_t test_vcvt2_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+  return vcvt2_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_low_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z27test_vcvt2_low_bf16_mf8_fpm14__Mfloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT2_I]]
+//
+bfloat16x8_t test_vcvt2_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt2_low_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_high_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z28test_vcvt1_high_bf16_mf8_fpm14__Mfloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT1_I]]
+//
+bfloat16x8_t test_vcvt1_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt1_high_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_high_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z28test_vcvt2_high_bf16_mf8_fpm14__Mfloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT2_I]]
+//
+bfloat16x8_t test_vcvt2_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt2_high_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_f16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x half> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z22test_vcvt1_f16_mf8_fpm13__Mfloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT1_I]]
+//
+float16x8_t test_vcvt1_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+  return vcvt1_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_low_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT: ret <8 x half> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z26test_vcvt1_low_f16_mf8_fpm14__Mfloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT1_I]]
+//
+float16x8_t test_vcvt1_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt1_low_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_f16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x half> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z22test_vcvt2_f16_mf8_fpm13__Mfloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT2_I]]
+//
+float16x8_t test_vcvt2_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+  return vcvt2_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_low_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT: ret <8 x half> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z26test_vcvt2_low_f16_mf8_fpm14__Mfloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT2_I]]
+//
+float16x8_t test_vcvt2_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt2_low_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_high_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x half> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z27test_vcvt1_high_f16_mf8_fpm14__Mfloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT1_I]]
+//
+float16x8_t test_vcvt1_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt1_high_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_high_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x half> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z27test_vcvt2_high_f16_mf8_fpm14__Mfloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT2_I]]
+//
+float16x8_t test_vcvt2_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt2_high_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcvt_mf8_f32_fpm(
+// CHECK-SAME: <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VFCVTN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-NEXT: ret <8 x i8> [[VFCVTN_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z21test_vcvt_mf8_f32_fpm13__Float32x4_tS_m(
+// CHECK-CXX-SAME: <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VFCVTN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-CXX-NEXT: ret <8 x i8> [[VFCVTN_I]]
+//
+mfloat8x8_t test_vcvt_mf8_f32_fpm(float32x4_t vn, float32x4_t vm, fpm_t fpm) {
+  return vcvt_mf8_f32_fpm(vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcvt_high_mf8_f32_fpm(
+// CHECK-SAME: <8 x i8> [[VD:%.*]], <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VD]], i64 0)
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-NEXT: ret <16 x i8> [[VFCVTN2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z26test_vcvt_high_mf8_f32_fpm13__Mfloat8x8_t13__Float32x4_tS0_m(
+// CHECK-CXX-SAME: <8 x i8> [[VD:%.*]], <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VD]], i64 0)
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-CXX-NEXT: ret <16 x i8> [[VFCVTN2_I]]
+//
+mfloat8x16_t test_vcvt_high_mf8_f32_fpm(mfloat8x8_t vd, float32x4_t vn,
+                                        float32x4_t vm, fpm_t fpm) {
+  return vcvt_high_mf8_f32_fpm(vd, vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcvt_mf8_f16_fpm(
+// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <4 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VN]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[VM]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VFCVTN2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]])
+// CHECK-NEXT: ret <8 x i8> [[VFCVTN2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z21test_vcvt_mf8_f16_fpm13__Float16x4_tS_m(
+// CHECK-CXX-SAME: <4 x half> noundef [[VN:%.*]], <4 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VN]] to <8 x i8>
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[VM]] to <8 x i8>
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]])
+// CHECK-CXX-NEXT: ret <8 x i8> [[VFCVTN2_I]]
+//
+mfloat8x8_t test_vcvt_mf8_f16_fpm(float16x4_t vn, float16x4_t vm, fpm_t fpm) {
+  return vcvt_mf8_f16_fpm(vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcvtq_mf8_f16_fpm(
+// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VN]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[VM]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]])
+// CHECK-NEXT: ret <16 x i8> [[VFCVTN2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z22test_vcvtq_mf8_f16_fpm13__Float16x8_tS_m(
+// CHECK-CXX-SAME: <8 x half> noundef [[VN:%.*]], <8 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VN]] to <16 x i8>
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[VM]] to <16 x i8>
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]])
+// CHECK-CXX-NEXT: ret <16 x i8> [[VFCVTN2_I]]
+//
+mfloat8x16_t test_vcvtq_mf8_f16_fpm(float16x8_t vn, float16x8_t vm, fpm_t fpm) {
+  return vcvtq_mf8_f16_fpm(vn, vm, fpm);
+}
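The Sema test that follows deliberately enables +neon, +bf16 and +faminmax but not +fp8, so every call is expected to trigger the "requires target feature" diagnostic added for these builtins.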
diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c
new file mode 100644
index 00000000000000..2c7004c7968a46
--- /dev/null
+++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c
@@ -0,0 +1,43 @@
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +faminmax -emit-llvm -verify %s -o /dev/null
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+void test_features(float16x4_t vd4, float16x8_t vd8, float32x4_t va4,
+                   mfloat8x8_t v8, mfloat8x16_t v16, fpm_t fpm) {
+  (void) vcvt1_bf16_mf8_fpm(v8, fpm);
+  // expected-error@-1 {{'vcvt1_bf16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt1_low_bf16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt1_low_bf16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_bf16_mf8_fpm(v8, fpm);
+  // expected-error@-1 {{'vcvt2_bf16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_low_bf16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt2_low_bf16_mf8_fpm' requires target feature 'fp8'}}
+
+  (void) vcvt1_high_bf16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt1_high_bf16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_high_bf16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt2_high_bf16_mf8_fpm' requires target feature 'fp8'}}
+
+  (void) vcvt1_f16_mf8_fpm(v8, fpm);
+  // expected-error@-1 {{'vcvt1_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt1_low_f16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt1_low_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_f16_mf8_fpm(v8, fpm);
+  // expected-error@-1 {{'vcvt2_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_low_f16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt2_low_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt1_high_f16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt1_high_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_high_f16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt2_high_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt_mf8_f32_fpm(va4, va4, fpm);
+  // expected-error@-1 {{'vcvt_mf8_f32_fpm' requires target feature 'fp8'}}
+  (void) vcvt_high_mf8_f32_fpm(v8, va4, va4, fpm);
+  // expected-error@-1 {{'vcvt_high_mf8_f32_fpm' requires target feature 'fp8'}}
+  (void) vcvt_mf8_f16_fpm(vd4, vd4, fpm);
+  // expected-error@-1 {{'vcvt_mf8_f16_fpm' requires target feature 'fp8'}}
+  (void) vcvtq_mf8_f16_fpm(vd8, vd8, fpm);
+  // expected-error@-1 {{'vcvtq_mf8_f16_fpm' requires target feature 'fp8'}}
+}

diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index 7299a49252f0d2..11f33ca17fda8e 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -74,6 +74,7 @@ enum ClassKind {
   ClassI, // generic integer instruction, e.g., "i8" suffix
   ClassS, // signed/unsigned/poly, e.g., "s8", "u8" or "p8" suffix
   ClassW, // width-specific instruction, e.g., "8" suffix
+  ClassV, // void-suffix instruction, no suffix
   ClassB, // bitcast arguments with enum argument to specify type
   ClassL, // Logical instructions which are op instructions
           // but we need to not emit any suffix for in our
@@ -144,7 +145,7 @@ class Type {
 private:
   TypeSpec TS;
 
-  enum TypeKind { Void, Float, SInt, UInt, Poly, BFloat16, MFloat8 };
+  enum TypeKind { Void, Float, SInt, UInt, Poly, BFloat16, MFloat8, FPM };
   TypeKind Kind;
   bool Immediate, Constant, Pointer;
   // ScalarForMangling and NoManglingQ are really not suited to live here as
@@ -198,6 +199,7 @@ class Type {
   bool isVoid() const { return Kind == Void; }
   bool isBFloat16() const { return Kind == BFloat16; }
   bool isMFloat8() const { return Kind == MFloat8; }
+  bool isFPM() const { return Kind == FPM; }
   unsigned getNumElements() const { return Bitwidth / ElementBitwidth; }
   unsigned getSizeInBits() const { return Bitwidth; }
   unsigned getElementSizeInBits() const { return ElementBitwidth; }
@@ -600,6 +602,7 @@ class NeonEmitter {
     const Record *SI = R.getClass("SInst");
     const Record *II = R.getClass("IInst");
     const Record *WI = R.getClass("WInst");
+    const Record *VI = R.getClass("VInst");
    const Record *SOpI = R.getClass("SOpInst");
    const Record *IOpI = R.getClass("IOpInst");
    const Record *WOpI = R.getClass("WOpInst");
@@ -609,6 +612,7 @@ class NeonEmitter {
     ClassMap[SI] = ClassS;
     ClassMap[II] = ClassI;
     ClassMap[WI] = ClassW;
+    ClassMap[VI] = ClassV;
     ClassMap[SOpI] = ClassS;
     ClassMap[IOpI] = ClassI;
     ClassMap[WOpI] = ClassW;
@@ -641,6 +645,9 @@ std::string Type::str() const {
   if (isVoid())
     return "void";
 
+  if (isFPM())
+    return "fpm_t";
+
   std::string S;
 
   if (isInteger() && !isSigned())
@@ -699,6 +706,8 @@ std::string Type::builtin_str() const {
   } else if (isMFloat8()) {
     assert(ElementBitwidth == 8 && "MFloat8 can only be 8 bits");
     S += "m";
+  } else if (isFPM()) {
+    S += "UWi";
   } else
     switch (ElementBitwidth) {
     case 16: S += "h"; break;
@@ -925,6 +934,13 @@ void Type::applyModifiers(StringRef Mods) {
     case 'P':
       Kind = Poly;
       break;
+    case 'V':
+      Kind = FPM;
+      Bitwidth = ElementBitwidth = 64;
+      NumVectors = 0;
+      Immediate = Constant = Pointer = false;
+      ScalarForMangling = NoManglingQ = true;
+      break;
     case '>':
       assert(ElementBitwidth < 128);
       ElementBitwidth *= 2;
@@ -1000,6 +1016,9 @@ std::string Intrinsic::getInstTypeCode(Type T, ClassKind CK) const {
   if (CK == ClassB && TargetGuard == "neon")
     return "";
 
+  if (this->CK == ClassV)
+    return "";
+
   if (T.isBFloat16())
     return "bf16";
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b31a65d9bcc02a..31c9546376c820 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -993,6 +993,28 @@
   def int_aarch64_st64b: Intrinsic<[], !listconcat([llvm_ptr_ty], data512)>;
   def int_aarch64_st64bv: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], data512)>;
   def int_aarch64_st64bv0: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], data512)>;
+
+  //
+  // Neon FP8 intrinsics
+  //
+
+  // Conversions
+  class AdvSIMD_FP8_1VectorArg_Long_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty],
+                            [IntrReadMem, IntrInaccessibleMemOnly]>;
+
+  def int_aarch64_neon_fp8_cvtl1 : AdvSIMD_FP8_1VectorArg_Long_Intrinsic;
+  def int_aarch64_neon_fp8_cvtl2 : AdvSIMD_FP8_1VectorArg_Long_Intrinsic;
+
+  def int_aarch64_neon_fp8_fcvtn
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [llvm_anyvector_ty,
+                             LLVMMatchType<1>],
+                            [IntrReadMem, IntrInaccessibleMemOnly]>;
+  def int_aarch64_neon_fp8_fcvtn2
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [LLVMMatchType<0>,
+                             llvm_anyvector_ty,
+                             LLVMMatchType<1>],
+                            [IntrReadMem, IntrInaccessibleMemOnly]>;
 }
 
 def llvm_nxv1i1_ty : LLVMType<nxv1i1>;
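The [IntrReadMem, IntrInaccessibleMemOnly] attributes appear to model the conversions' dependence on the FPMR/FPCR state written by @llvm.aarch64.set.fpmr, keeping the mode write and the conversion ordered — an inference from how the intrinsics are used, not something the patch states. Note also that the accumulating fcvtn2 form takes the destination vector as its first operand (LLVMMatchType<0>), whose low half flows through to the result; a sketch with an invented function name:

    define <16 x i8> @sketch_narrow_high(<16 x i8> %vd, <4 x float> %vn, <4 x float> %vm) {
      ; %vd supplies the low 8 bytes; the converted FP8 values land in the high half
      %r = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> %vd, <4 x float> %vn, <4 x float> %vm)
      ret <16 x i8> %r
    }

    declare <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8>, <4 x float>, <4 x float>)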
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 6a3a9492e031c6..67b43664548457 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -6559,17 +6559,30 @@ class BaseSIMDThreeVectors<bit Q, bit U, bits<2> size, bits<4> op,
 
 // FCVTN (FP16 to FP8)
-multiclass SIMDThreeSameSizeVectorCvt<string asm> {
-  def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b01, 0b1110, V64, V64, asm, ".8b",".4h">;
-  def v16f8 : BaseSIMDThreeVectors<0b1, 0b0, 0b01, 0b1110, V128, V128, asm, ".16b", ".8h">;
+multiclass SIMD_FP8_CVTN_F16<string asm, SDPatternOperator Op> {
+  let Uses = [FPMR, FPCR], mayLoad = 1 in {
+    def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b01, 0b1110, V64, V64, asm, ".8b",".4h">;
+    def v16f8 : BaseSIMDThreeVectors<0b1, 0b0, 0b01, 0b1110, V128, V128, asm, ".16b", ".8h">;
+  }
+  def : Pat<(v8i8 (Op (v4f16 V64:$Rn), (v4f16 V64:$Rm))),
+            (!cast<Instruction>(NAME # v8f8) V64:$Rn, V64:$Rm)>;
+  def : Pat<(v16i8 (Op (v8f16 V128:$Rn), (v8f16 V128:$Rm))),
+            (!cast<Instruction>(NAME # v16f8) V128:$Rn, V128:$Rm)>;
 }
-// TODO : Create v16f8 value type
 
 // FCVTN, FCVTN2 (FP32 to FP8)
-multiclass SIMDThreeVectorCvt<string asm> {
-  def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b00, 0b1110, V64, V128, asm, ".8b", ".4s">;
-  def 2v16f8 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1110, asm#2, ".16b", ".4s",
-                                          V128, v16i8, v4f32, null_frag>;
+multiclass SIMD_FP8_CVTN_F32<string asm, SDPatternOperator Op> {
+  let Uses = [FPMR, FPCR], mayLoad = 1 in {
+    def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b00, 0b1110, V64, V128, asm, ".8b", ".4s">;
+    def 2v16f8 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1110, asm#2, ".16b", ".4s",
+                                            V128, v16i8, v4f32, null_frag>;
+  }
+
+  def : Pat<(v8i8 (Op (v4f32 V128:$Rn), (v4f32 V128:$Rm))),
+            (!cast<Instruction>(NAME # v8f8) V128:$Rn, V128:$Rm)>;
+
+  def : Pat<(v16i8 (!cast<SDPatternOperator>(Op # 2) (v16i8 V128:$_Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm))),
+            (!cast<Instruction>(NAME # 2v16f8) V128:$_Rd, V128:$Rn, V128:$Rm)>;
 }
 
 // TODO: Create a new Value Type v8f8 and v16f8
@@ -7033,11 +7046,18 @@ multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm,
 
 //----------------------------------------------------------------------------
 // FP8 Advanced SIMD two-register miscellaneous
 //----------------------------------------------------------------------------
-multiclass SIMDMixedTwoVectorFP8<bits<2>sz, string asm> {
-  def v8f16 : BaseSIMDMixedTwoVector<0b0, 0b1, sz, 0b10111, V64, V128,
-                                     asm, ".8h", ".8b", []>;
-  def 2v8f16 : BaseSIMDMixedTwoVector<0b1, 0b1, sz, 0b10111, V128, V128,
-                                      asm#2, ".8h", ".16b", []>;
+multiclass SIMD_FP8_CVTL<bits<2>sz, string asm, ValueType dty, SDPatternOperator Op> {
+  let Uses=[FPMR, FPCR], mayLoad = 1 in {
+    def NAME : BaseSIMDMixedTwoVector<0b0, 0b1, sz, 0b10111, V64, V128,
+                                      asm, ".8h", ".8b", []>;
+    def NAME#2 : BaseSIMDMixedTwoVector<0b1, 0b1, sz, 0b10111, V128, V128,
+                                        asm#2, ".8h", ".16b", []>;
+  }
+  def : Pat<(dty (Op (v8i8 V64:$Rn))),
+            (!cast<Instruction>(NAME) V64:$Rn)>;
+
+  def : Pat<(dty (Op (v16i8 V128:$Rn))),
+            (!cast<Instruction>(NAME#2) V128:$Rn)>;
 }
 
 class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2,

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 9d0bd44544134c..881af6eb951177 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -10324,13 +10324,13 @@ let Predicates = [HasD128] in {
 // 2023 Architecture Extensions:
 //===----------------------------===//
 
-let Uses = [FPMR, FPCR], Predicates = [HasFP8] in {
-  defm F1CVTL : SIMDMixedTwoVectorFP8<0b00, "f1cvtl">;
-  defm F2CVTL : SIMDMixedTwoVectorFP8<0b01, "f2cvtl">;
-  defm BF1CVTL : SIMDMixedTwoVectorFP8<0b10, "bf1cvtl">;
-  defm BF2CVTL : SIMDMixedTwoVectorFP8<0b11, "bf2cvtl">;
-  defm FCVTN_F16_F8 : SIMDThreeSameSizeVectorCvt<"fcvtn">;
-  defm FCVTN_F32_F8 : SIMDThreeVectorCvt<"fcvtn">;
+let Predicates = [HasFP8] in {
+  defm F1CVTL : SIMD_FP8_CVTL<0b00, "f1cvtl", v8f16, int_aarch64_neon_fp8_cvtl1>;
+  defm F2CVTL : SIMD_FP8_CVTL<0b01, "f2cvtl", v8f16, int_aarch64_neon_fp8_cvtl2>;
+  defm BF1CVTL : SIMD_FP8_CVTL<0b10, "bf1cvtl", v8bf16, int_aarch64_neon_fp8_cvtl1>;
+  defm BF2CVTL : SIMD_FP8_CVTL<0b11, "bf2cvtl", v8bf16, int_aarch64_neon_fp8_cvtl2>;
+  defm FCVTN_F16 : SIMD_FP8_CVTN_F16<"fcvtn", int_aarch64_neon_fp8_fcvtn>;
+  defm FCVTN_F32 : SIMD_FP8_CVTN_F32<"fcvtn", int_aarch64_neon_fp8_fcvtn>;
   defm FSCALE : SIMDThreeVectorFscale<0b1, 0b1, 0b111, "fscale", int_aarch64_neon_fp8_fscale>;
 } // End let Predicates = [HasFP8]
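Selection is driven purely by the overloaded operand type: the same int_aarch64_neon_fp8_cvtl1, for instance, maps to f1cvtl/bf1cvtl for a v8i8 source and to f1cvtl2/bf1cvtl2 for a v16i8 source, which the llc test below exercises directly.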
%res +} + +define <8 x bfloat> @test_vbfcvtl2_high(<16 x i8> %vn) { +; CHECK-LABEL: test_vbfcvtl2_high: +; CHECK: // %bb.0: +; CHECK-NEXT: bf2cvtl2 v0.8h, v0.16b +; CHECK-NEXT: ret + %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> %vn) + ret <8 x bfloat> %res +} + + +define <8 x half> @test_vfcvtl1_low(<8 x i8> %vn) { +; CHECK-LABEL: test_vfcvtl1_low: +; CHECK: // %bb.0: +; CHECK-NEXT: f1cvtl v0.8h, v0.8b +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> %vn) + ret <8 x half> %res +} + +define <8 x half> @test_vfcvtl1_high(<16 x i8> %vn) { +; CHECK-LABEL: test_vfcvtl1_high: +; CHECK: // %bb.0: +; CHECK-NEXT: f1cvtl2 v0.8h, v0.16b +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> %vn) + ret <8 x half> %res +} + +define <8 x half> @test_vfcvtl2_low(<8 x i8> %vn) { +; CHECK-LABEL: test_vfcvtl2_low: +; CHECK: // %bb.0: +; CHECK-NEXT: f2cvtl v0.8h, v0.8b +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> %vn) + ret <8 x half> %res +} + +define <8 x half> @test_vfcvtl2_high(<16 x i8> %vn) { +; CHECK-LABEL: test_vfcvtl2_high: +; CHECK: // %bb.0: +; CHECK-NEXT: f2cvtl2 v0.8h, v0.16b +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> %vn) + ret <8 x half> %res +} + +define <8 x i8> @test_vcvtn_low_f8_f32(<4 x float> %vn, <4 x float> %vm) { +; CHECK-LABEL: test_vcvtn_low_f8_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtn v0.8b, v0.4s, v1.4s +; CHECK-NEXT: ret + %res = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> %vn, <4 x float> %vm) + ret <8 x i8> %res +} + +define <16 x i8> @test_vcvtn_high_f8_f32(<16 x i8> %vd, <4 x float> %vn, <4 x float> %vm) { +; CHECK-LABEL: test_vcvtn_high_f8_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtn2 v0.16b, v1.4s, v2.4s +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> %vd, <4 x float> %vn, <4 x float> %vm) + ret <16 x i8> %res +} + + +define <8 x i8> @test_vcvtn_f8_f16(<4 x half> %vn, <4 x half> %vm) { +; CHECK-LABEL: test_vcvtn_f8_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtn v0.8b, v0.4h, v1.4h +; CHECK-NEXT: ret + %res = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> %vn, <4 x half> %vm) + ret <8 x i8> %res +} + +define <16 x i8> @test_vcvtn2_f8_f16(<8 x half> %vn, <8 x half> %vm) { +; CHECK-LABEL: test_vcvtn2_f8_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtn v0.16b, v0.8h, v1.8h +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> %vn, <8 x half> %vm) + ret <16 x i8> %res +} _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits