https://github.com/SpencerAbson updated https://github.com/llvm/llvm-project/pull/116959
>From 296492155525985942e1a0fc56b6f0db34e8a7a4 Mon Sep 17 00:00:00 2001 From: Spencer Abson <spencer.ab...@arm.com> Date: Wed, 20 Nov 2024 10:57:49 +0000 Subject: [PATCH 1/7] [AArch64] Add intrinsics for F1CVTL/F2CVTL and BF1CVTL/BF2CVTL --- clang/include/clang/Basic/TargetBuiltins.h | 1 + clang/include/clang/Basic/arm_sme.td | 7 ++ clang/include/clang/Basic/arm_sve_sme_incl.td | 2 + clang/lib/CodeGen/CGBuiltin.cpp | 4 + .../fp8-intrinsics/acle_sme2_fp8_cvt.c | 81 +++++++++++++++++++ clang/utils/TableGen/SveEmitter.cpp | 6 ++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 17 ++++ .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 34 ++++++++ llvm/lib/Target/AArch64/SMEInstrFormats.td | 2 +- .../AArch64/sme2-fp8-intrinsics-cvt.ll | 48 +++++++++++ 10 files changed, 201 insertions(+), 1 deletion(-) create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c create mode 100644 llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h index 89ebf5758a5b55..a14fd2c4b224d8 100644 --- a/clang/include/clang/Basic/TargetBuiltins.h +++ b/clang/include/clang/Basic/TargetBuiltins.h @@ -336,6 +336,7 @@ namespace clang { bool isTupleSet() const { return Flags & IsTupleSet; } bool isReadZA() const { return Flags & IsReadZA; } bool isWriteZA() const { return Flags & IsWriteZA; } + bool setsFPMR() const { return Flags & SetsFPMR; } bool isReductionQV() const { return Flags & IsReductionQV; } uint64_t getBits() const { return Flags; } bool isFlagSet(uint64_t Flag) const { return Flags & Flag; } diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 0f689e82bdb742..8e7e4395411c6c 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -824,4 +824,11 @@ let SMETargetGuard = "sme-lutv2" in { def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2.u", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>; } +let SMETargetGuard = "sme2,fp8" in { + // Convert from half-precision/BFloat16 to deinterleaved FP8 multi-vector + def SVF1CVTL : Inst<"svcvtl1_f16[_mf8]_x2_fpm", "2~n", "h", MergeNone, "aarch64_sme_fp8_f1cvtl_x2", [IsStreaming, IsOverloadNone, SetsFPMR], []>; + def SVF1CVTL_BF : Inst<"svcvtl1_bf16[_mf8]_x2_fpm", "2~n", "b", MergeNone, "aarch64_sme_fp8_bf1cvtl_x2", [IsStreaming, IsOverloadNone, SetsFPMR], []>; + def SVF2CVTL : Inst<"svcvtl2_f16[_mf8]_x2_fpm", "2~n", "h", MergeNone, "aarch64_sme_fp8_f2cvtl_x2", [IsStreaming, IsOverloadNone, SetsFPMR], []>; + def SVF2CVTL_BF : Inst<"svcvtl2_bf16[_mf8]_x2_fpm", "2~n", "b", MergeNone, "aarch64_sme_fp8_bf2cvtl_x2", [IsStreaming, IsOverloadNone, SetsFPMR], []>; +} } // let SVETargetGuard = InvalidMode diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index 50911fb63e818e..7fdf732e506a2e 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -103,6 +103,7 @@ include "arm_immcheck_incl.td" // M: svfloat32_t // N: svfloat64_t // $: svbfloat16_t +// ~: svmfloat8_t // J: Prefetch type (sv_prfop) @@ -235,6 +236,7 @@ def IsInOutZA : FlagType<0x200000000000>; def IsInZT0 : FlagType<0x400000000000>; def IsOutZT0 : FlagType<0x800000000000>; def IsInOutZT0 : FlagType<0x1000000000000>; +def SetsFPMR : FlagType<0x2000000000000>; defvar InvalidMode = ""; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 0916e14f182ddd..568ba0ade6422f 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -11182,6 +11182,10 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, BuiltinID == SME::BI__builtin_sme_svstr_za) return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic); + // Emit set FPMR for intrinsics that require it + if (TypeFlags.setsFPMR()) + Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), + Ops.pop_back_val()); // Handle builtins which require their multi-vector operands to be swapped swapCommutativeSMEOperands(BuiltinID, Ops); diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c new file mode 100644 index 00000000000000..da2a505a897996 --- /dev/null +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c @@ -0,0 +1,81 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include <arm_sme.h> + +#ifdef SME_OVERLOADED_FORMS +#define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_cvt1l_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f1cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]]) +// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z17test_cvt1l_f16_x2u13__SVMfloat8_tm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f1cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]] +// +svfloat16x2_t test_cvt1l_f16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming { + return SME_ACLE_FUNC(svcvtl1_f16,_mf8,_x2_fpm)(zn, fpmr); +} + +// CHECK-LABEL: @test_cvt2l_f16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f2cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]]) +// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z17test_cvt2l_f16_x2u13__SVMfloat8_tm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f2cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]] +// +svfloat16x2_t test_cvt2l_f16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming { + return SME_ACLE_FUNC(svcvtl2_f16,_mf8,_x2_fpm)(zn, fpmr); +} + +// CHECK-LABEL: @test_cvt1l_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf1cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]]) +// CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z18test_cvt1l_bf16_x2u13__SVMfloat8_tm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf1cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]] +// +svbfloat16x2_t test_cvt1l_bf16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming { + return SME_ACLE_FUNC(svcvtl1_bf16,_mf8,_x2_fpm)(zn, fpmr); +} + +// CHECK-LABEL: @test_cvt2l_bf16_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf2cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]]) +// CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]] +// +// CPP-CHECK-LABEL: @_Z18test_cvt2l_bf16_x2u13__SVMfloat8_tm( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf2cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]] +// +svbfloat16x2_t test_cvt2l_bf16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming { + return SME_ACLE_FUNC(svcvtl2_bf16,_mf8,_x2_fpm)(zn, fpmr); +} diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index e8883488f32356..ab1bb419e10ab8 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -926,6 +926,12 @@ void SVEType::applyModifier(char Mod) { Float = false; BFloat = false; break; + case '~': + Float = false; + BFloat = false; + MFloat = true; + ElementBitwidth = 8; + break; case '.': llvm_unreachable(". is never a type in itself"); break; diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 6a09a8647096f9..cd766f0dda7209 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3813,6 +3813,23 @@ let TargetPrefix = "aarch64" in { LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>], [IntrNoMem]>; + class SME2_FP8_CVT_X2_Single_Intrinsic + : DefaultAttrsIntrinsic<[llvm_nxv8f16_ty, llvm_nxv8f16_ty], + [llvm_nxv16i8_ty], + [IntrReadMem, IntrInaccessibleMemOnly]>; + + class SME2_FP8_CVT_X2_Single_BF16_Intrinsic + : DefaultAttrsIntrinsic<[llvm_nxv8bf16_ty, llvm_nxv8bf16_ty], + [llvm_nxv16i8_ty], + [IntrReadMem, IntrInaccessibleMemOnly]>; + // + // CVT from half-precision/BFloat16 to delinterleaved FP8 multi-vectors + // + def int_aarch64_sme_fp8_f1cvtl_x2 : SME2_FP8_CVT_X2_Single_Intrinsic; + def int_aarch64_sme_fp8_f2cvtl_x2 : SME2_FP8_CVT_X2_Single_Intrinsic; + + def int_aarch64_sme_fp8_bf1cvtl_x2 : SME2_FP8_CVT_X2_Single_BF16_Intrinsic; + def int_aarch64_sme_fp8_bf2cvtl_x2 : SME2_FP8_CVT_X2_Single_BF16_Intrinsic; } // SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2 diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 10dad7675f4eaf..6512efa698c5ca 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -383,6 +383,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { void SelectPExtPair(SDNode *N, unsigned Opc); void SelectWhilePair(SDNode *N, unsigned Opc); void SelectCVTIntrinsic(SDNode *N, unsigned NumVecs, unsigned Opcode); + void SelectCVTIntrinsicFP8(SDNode *N, unsigned NumVecs, unsigned Opcode); void SelectClamp(SDNode *N, unsigned NumVecs, unsigned Opcode); void SelectUnaryMultiIntrinsic(SDNode *N, unsigned NumOutVecs, bool IsTupleInput, unsigned Opc); @@ -1866,6 +1867,27 @@ void AArch64DAGToDAGISel::SelectCVTIntrinsic(SDNode *N, unsigned NumVecs, CurDAG->RemoveDeadNode(N); } +void AArch64DAGToDAGISel::SelectCVTIntrinsicFP8(SDNode *N, unsigned NumVecs, + unsigned Opcode) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SmallVector<SDValue, 4> Ops(N->op_begin() + 2, N->op_end()); + Ops.push_back(/*Chain*/ N->getOperand(0)); + + SDNode *Instruction = + CurDAG->getMachineNode(Opcode, DL, {MVT::Untyped, MVT::Other}, Ops); + SDValue SuperReg = SDValue(Instruction, 0); + + for (unsigned i = 0; i < NumVecs; ++i) + ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg( + AArch64::zsub0 + i, DL, VT, SuperReg)); + + // Copy chain + unsigned ChainIdx = NumVecs; + ReplaceUses(SDValue(N, ChainIdx), SDValue(Instruction, 1)); + CurDAG->RemoveDeadNode(N); +} + void AArch64DAGToDAGISel::SelectDestructiveMultiIntrinsic(SDNode *N, unsigned NumVecs, bool IsZmMulti, @@ -5547,6 +5569,18 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z); return; } + case Intrinsic::aarch64_sme_fp8_bf1cvtl_x2: + SelectCVTIntrinsicFP8(Node, 2, AArch64::BF1CVTL_2ZZ_BtoH); + return; + case Intrinsic::aarch64_sme_fp8_f1cvtl_x2: + SelectCVTIntrinsicFP8(Node, 2, AArch64::F1CVTL_2ZZ_BtoH); + return; + case Intrinsic::aarch64_sme_fp8_bf2cvtl_x2: + SelectCVTIntrinsicFP8(Node, 2, AArch64::BF2CVTL_2ZZ_BtoH); + return; + case Intrinsic::aarch64_sme_fp8_f2cvtl_x2: + SelectCVTIntrinsicFP8(Node, 2, AArch64::F2CVTL_2ZZ_BtoH); + return; } } break; case ISD::INTRINSIC_WO_CHAIN: { diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 8c256b5818ee88..776472e72af05a 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -2412,7 +2412,7 @@ multiclass sme2p1_fp_cvt_vector_vg2_single<string mnemonic, bit l> { // SME2 multi-vec FP8 up convert two registers multiclass sme2p1_fp8_cvt_vector_vg2_single<string mnemonic, bits<2> opc, bit L> { - def _NAME : sme2_cvt_unpk_vector_vg2<opc, 0b110, L, ZZ_h_mul_r, ZPR8, mnemonic>{ + def NAME : sme2_cvt_unpk_vector_vg2<opc, 0b110, L, ZZ_h_mul_r, ZPR8, mnemonic>{ let Uses = [FPMR, FPCR]; } } diff --git a/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll new file mode 100644 index 00000000000000..a6102988d1df17 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+fp8 -verify-machineinstrs -force-streaming < %s | FileCheck %s + +; F1CVTL / F2CVTL + +define { <vscale x 8 x half>, <vscale x 8 x half> } @f1cvtl(<vscale x 16 x i8> %zm) { +; CHECK-LABEL: f1cvtl: +; CHECK: // %bb.0: +; CHECK-NEXT: f1cvtl { z0.h, z1.h }, z0.b +; CHECK-NEXT: ret + %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f1cvtl.x2(<vscale x 16 x i8> %zm) + ret { <vscale x 8 x half>, <vscale x 8 x half> } %res +} + +define { <vscale x 8 x half>, <vscale x 8 x half> } @f2cvtl(<vscale x 16 x i8> %zm) { +; CHECK-LABEL: f2cvtl: +; CHECK: // %bb.0: +; CHECK-NEXT: f2cvtl { z0.h, z1.h }, z0.b +; CHECK-NEXT: ret + %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f2cvtl.x2(<vscale x 16 x i8> %zm) + ret { <vscale x 8 x half>, <vscale x 8 x half> } %res +} + +; BF1CVTL / BF2CVTL + +define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @bf1cvtl(<vscale x 16 x i8> %zm) { +; CHECK-LABEL: bf1cvtl: +; CHECK: // %bb.0: +; CHECK-NEXT: bf1cvtl { z0.h, z1.h }, z0.b +; CHECK-NEXT: ret + %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf1cvtl.x2(<vscale x 16 x i8> %zm) + ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res +} + +define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @bf2cvtl( <vscale x 16 x i8> %zm) { +; CHECK-LABEL: bf2cvtl: +; CHECK: // %bb.0: +; CHECK-NEXT: bf2cvtl { z0.h, z1.h }, z0.b +; CHECK-NEXT: ret + %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf2cvtl.x2(<vscale x 16 x i8> %zm) + ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res +} + + +declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f1cvtl.x2(<vscale x 16 x i8>) +declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f2cvtl.x2(<vscale x 16 x i8>) +declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf1cvtl.x2(<vscale x 16 x i8>) +declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf2cvtl.x2(<vscale x 16 x i8>) \ No newline at end of file >From ec7a1cbf4df94bcadda42cfa8cfbd43b8df0f10d Mon Sep 17 00:00:00 2001 From: Spencer Abson <spencer.ab...@arm.com> Date: Wed, 20 Nov 2024 11:55:03 +0000 Subject: [PATCH 2/7] [NFC] Fixup format --- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 6512efa698c5ca..b51b3bf5038e5d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -5570,16 +5570,16 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; } case Intrinsic::aarch64_sme_fp8_bf1cvtl_x2: - SelectCVTIntrinsicFP8(Node, 2, AArch64::BF1CVTL_2ZZ_BtoH); + SelectCVTIntrinsicFP8(Node, 2, AArch64::BF1CVTL_2ZZ_BtoH); return; case Intrinsic::aarch64_sme_fp8_f1cvtl_x2: - SelectCVTIntrinsicFP8(Node, 2, AArch64::F1CVTL_2ZZ_BtoH); + SelectCVTIntrinsicFP8(Node, 2, AArch64::F1CVTL_2ZZ_BtoH); return; case Intrinsic::aarch64_sme_fp8_bf2cvtl_x2: - SelectCVTIntrinsicFP8(Node, 2, AArch64::BF2CVTL_2ZZ_BtoH); + SelectCVTIntrinsicFP8(Node, 2, AArch64::BF2CVTL_2ZZ_BtoH); return; case Intrinsic::aarch64_sme_fp8_f2cvtl_x2: - SelectCVTIntrinsicFP8(Node, 2, AArch64::F2CVTL_2ZZ_BtoH); + SelectCVTIntrinsicFP8(Node, 2, AArch64::F2CVTL_2ZZ_BtoH); return; } } break; >From 20f5b01b57e177a00de52e22e27b64c0e8fa2f55 Mon Sep 17 00:00:00 2001 From: Spencer Abson <spencer.ab...@arm.com> Date: Wed, 20 Nov 2024 14:05:36 +0000 Subject: [PATCH 3/7] [NFC] Remove unnecessary declarations from test --- llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll index a6102988d1df17..b6f1fd45af010b 100644 --- a/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll +++ b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll @@ -40,9 +40,3 @@ define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @bf2cvtl( <vscale x 16 x %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf2cvtl.x2(<vscale x 16 x i8> %zm) ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res } - - -declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f1cvtl.x2(<vscale x 16 x i8>) -declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f2cvtl.x2(<vscale x 16 x i8>) -declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf1cvtl.x2(<vscale x 16 x i8>) -declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf2cvtl.x2(<vscale x 16 x i8>) \ No newline at end of file >From 2f1ed17d158105bbf7f497a158c314ad6fd76cc5 Mon Sep 17 00:00:00 2001 From: Spencer Abson <spencer.ab...@arm.com> Date: Thu, 21 Nov 2024 10:34:46 +0000 Subject: [PATCH 4/7] Prefer polymorphic intrinsic and move definition to arm_sve.td --- clang/include/clang/Basic/arm_sme.td | 7 --- clang/include/clang/Basic/arm_sve.td | 10 ++-- clang/lib/CodeGen/CGBuiltin.cpp | 8 +-- .../fp8-intrinsics/acle_sme2_fp8_cvt.c | 60 +++++++++---------- llvm/include/llvm/IR/IntrinsicsAArch64.td | 14 +---- .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 20 +++---- .../AArch64/sme2-fp8-intrinsics-cvt.ll | 8 +-- 7 files changed, 57 insertions(+), 70 deletions(-) diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 8e7e4395411c6c..0f689e82bdb742 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -824,11 +824,4 @@ let SMETargetGuard = "sme-lutv2" in { def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2.u", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>; } -let SMETargetGuard = "sme2,fp8" in { - // Convert from half-precision/BFloat16 to deinterleaved FP8 multi-vector - def SVF1CVTL : Inst<"svcvtl1_f16[_mf8]_x2_fpm", "2~n", "h", MergeNone, "aarch64_sme_fp8_f1cvtl_x2", [IsStreaming, IsOverloadNone, SetsFPMR], []>; - def SVF1CVTL_BF : Inst<"svcvtl1_bf16[_mf8]_x2_fpm", "2~n", "b", MergeNone, "aarch64_sme_fp8_bf1cvtl_x2", [IsStreaming, IsOverloadNone, SetsFPMR], []>; - def SVF2CVTL : Inst<"svcvtl2_f16[_mf8]_x2_fpm", "2~n", "h", MergeNone, "aarch64_sme_fp8_f2cvtl_x2", [IsStreaming, IsOverloadNone, SetsFPMR], []>; - def SVF2CVTL_BF : Inst<"svcvtl2_bf16[_mf8]_x2_fpm", "2~n", "b", MergeNone, "aarch64_sme_fp8_bf2cvtl_x2", [IsStreaming, IsOverloadNone, SetsFPMR], []>; -} } // let SVETargetGuard = InvalidMode diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index d492fae4145b92..c81aaee71eda0d 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2422,14 +2422,16 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in { def SVUUNPK_X4 : SInst<"svunpk_{d}[_{3}_x4]", "42.h", "UsUiUl", MergeNone, "aarch64_sve_uunpk_x4", [IsStreaming], []>; } -// -// Multi-vector scaling -// -let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,fp8" in { +let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,fp8" in { + // Multi-vector scaling def FSCALE_SINGLE_X2 : Inst<"svscale[_single_{d}_x2]", "22x", "fhd", MergeNone, "aarch64_sme_fp8_scale_single_x2", [IsStreaming],[]>; def FSCALE_SINGLE_X4 : Inst<"svscale[_single_{d}_x4]", "44x", "fhd", MergeNone, "aarch64_sme_fp8_scale_single_x4", [IsStreaming],[]>; def FSCALE_X2 : Inst<"svscale[_{d}_x2]", "222.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x2", [IsStreaming],[]>; def FSCALE_X4 : Inst<"svscale[_{d}_x4]", "444.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x4", [IsStreaming],[]>; + + // Convert from half-precision/BFloat16 to deinterleaved FP8 multi-vector + def SVF1CVTL : Inst<"svcvtl1_{d}[_mf8]_x2_fpm", "2~n", "bh", MergeNone, "aarch64_sve_fp8_cvtl1_x2", [IsStreaming, SetsFPMR], []>; + def SVF2CVTL : Inst<"svcvtl2_{d}[_mf8]_x2_fpm", "2~n", "bh", MergeNone, "aarch64_sve_fp8_cvtl2_x2", [IsStreaming, SetsFPMR], []>; } let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 568ba0ade6422f..27e5453628a5a9 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -10811,6 +10811,10 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, else if (TypeFlags.isUndef()) return UndefValue::get(Ty); else if (Builtin->LLVMIntrinsic != 0) { + // Emit set FPMR for intrinsics that require it + if (TypeFlags.setsFPMR()) + Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), + Ops.pop_back_val()); if (TypeFlags.getMergeType() == SVETypeFlags::MergeZeroExp) InsertExplicitZeroOperand(Builder, Ty, Ops); @@ -11182,10 +11186,6 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, BuiltinID == SME::BI__builtin_sme_svstr_za) return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic); - // Emit set FPMR for intrinsics that require it - if (TypeFlags.setsFPMR()) - Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), - Ops.pop_back_val()); // Handle builtins which require their multi-vector operands to be swapped swapCommutativeSMEOperands(BuiltinID, Ops); diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c index da2a505a897996..4a1fc81a05b953 100644 --- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c @@ -4,78 +4,78 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -#include <arm_sme.h> +#include <arm_sve.h> -#ifdef SME_OVERLOADED_FORMS -#define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#ifdef SVE_OVERLOADED_FORMS +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 #else -#define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 #endif -// CHECK-LABEL: @test_cvt1l_f16_x2( +// CHECK-LABEL: @test_cvtl1_f16_x2( // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) -// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f1cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8f16(<vscale x 16 x i8> [[ZN:%.*]]) // CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]] // -// CPP-CHECK-LABEL: @_Z17test_cvt1l_f16_x2u13__SVMfloat8_tm( +// CPP-CHECK-LABEL: @_Z17test_cvtl1_f16_x2u13__SVMfloat8_tm( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f1cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8f16(<vscale x 16 x i8> [[ZN:%.*]]) // CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]] // -svfloat16x2_t test_cvt1l_f16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming { - return SME_ACLE_FUNC(svcvtl1_f16,_mf8,_x2_fpm)(zn, fpmr); +svfloat16x2_t test_cvtl1_f16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming { + return SVE_ACLE_FUNC(svcvtl1_f16,_mf8,_x2_fpm)(zn, fpmr); } -// CHECK-LABEL: @test_cvt2l_f16_x2( +// CHECK-LABEL: @test_cvtl2_f16_x2( // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) -// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f2cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.fp8.cvtl2.x2.nxv8f16(<vscale x 16 x i8> [[ZN:%.*]]) // CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]] // -// CPP-CHECK-LABEL: @_Z17test_cvt2l_f16_x2u13__SVMfloat8_tm( +// CPP-CHECK-LABEL: @_Z17test_cvtl2_f16_x2u13__SVMfloat8_tm( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f2cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.fp8.cvtl2.x2.nxv8f16(<vscale x 16 x i8> [[ZN:%.*]]) // CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]] // -svfloat16x2_t test_cvt2l_f16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming { - return SME_ACLE_FUNC(svcvtl2_f16,_mf8,_x2_fpm)(zn, fpmr); +svfloat16x2_t test_cvtl2_f16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming { + return SVE_ACLE_FUNC(svcvtl2_f16,_mf8,_x2_fpm)(zn, fpmr); } -// CHECK-LABEL: @test_cvt1l_bf16_x2( +// CHECK-LABEL: @test_cvtl1_bf16_x2( // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) -// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf1cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8bf16(<vscale x 16 x i8> [[ZN:%.*]]) // CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]] // -// CPP-CHECK-LABEL: @_Z18test_cvt1l_bf16_x2u13__SVMfloat8_tm( +// CPP-CHECK-LABEL: @_Z18test_cvtl1_bf16_x2u13__SVMfloat8_tm( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf1cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8bf16(<vscale x 16 x i8> [[ZN:%.*]]) // CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]] // -svbfloat16x2_t test_cvt1l_bf16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming { - return SME_ACLE_FUNC(svcvtl1_bf16,_mf8,_x2_fpm)(zn, fpmr); +svbfloat16x2_t test_cvtl1_bf16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming { + return SVE_ACLE_FUNC(svcvtl1_bf16,_mf8,_x2_fpm)(zn, fpmr); } -// CHECK-LABEL: @test_cvt2l_bf16_x2( +// CHECK-LABEL: @test_cvtl2_bf16_x2( // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) -// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf2cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fp8.cvtl2.x2.nxv8bf16(<vscale x 16 x i8> [[ZN:%.*]]) // CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]] // -// CPP-CHECK-LABEL: @_Z18test_cvt2l_bf16_x2u13__SVMfloat8_tm( +// CPP-CHECK-LABEL: @_Z18test_cvtl2_bf16_x2u13__SVMfloat8_tm( // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]]) -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf2cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]]) +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fp8.cvtl2.x2.nxv8bf16(<vscale x 16 x i8> [[ZN:%.*]]) // CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]] // -svbfloat16x2_t test_cvt2l_bf16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming { - return SME_ACLE_FUNC(svcvtl2_bf16,_mf8,_x2_fpm)(zn, fpmr); +svbfloat16x2_t test_cvtl2_bf16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming { + return SVE_ACLE_FUNC(svcvtl2_bf16,_mf8,_x2_fpm)(zn, fpmr); } diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index cd766f0dda7209..f09114b57431f8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3814,22 +3814,14 @@ let TargetPrefix = "aarch64" in { [IntrNoMem]>; class SME2_FP8_CVT_X2_Single_Intrinsic - : DefaultAttrsIntrinsic<[llvm_nxv8f16_ty, llvm_nxv8f16_ty], - [llvm_nxv16i8_ty], - [IntrReadMem, IntrInaccessibleMemOnly]>; - - class SME2_FP8_CVT_X2_Single_BF16_Intrinsic - : DefaultAttrsIntrinsic<[llvm_nxv8bf16_ty, llvm_nxv8bf16_ty], + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_nxv16i8_ty], [IntrReadMem, IntrInaccessibleMemOnly]>; // // CVT from half-precision/BFloat16 to delinterleaved FP8 multi-vectors // - def int_aarch64_sme_fp8_f1cvtl_x2 : SME2_FP8_CVT_X2_Single_Intrinsic; - def int_aarch64_sme_fp8_f2cvtl_x2 : SME2_FP8_CVT_X2_Single_Intrinsic; - - def int_aarch64_sme_fp8_bf1cvtl_x2 : SME2_FP8_CVT_X2_Single_BF16_Intrinsic; - def int_aarch64_sme_fp8_bf2cvtl_x2 : SME2_FP8_CVT_X2_Single_BF16_Intrinsic; + def int_aarch64_sve_fp8_cvtl1_x2 : SME2_FP8_CVT_X2_Single_Intrinsic; + def int_aarch64_sve_fp8_cvtl2_x2 : SME2_FP8_CVT_X2_Single_Intrinsic; } // SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2 diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index b51b3bf5038e5d..f9a4d65eac0922 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -5569,17 +5569,17 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z); return; } - case Intrinsic::aarch64_sme_fp8_bf1cvtl_x2: - SelectCVTIntrinsicFP8(Node, 2, AArch64::BF1CVTL_2ZZ_BtoH); - return; - case Intrinsic::aarch64_sme_fp8_f1cvtl_x2: - SelectCVTIntrinsicFP8(Node, 2, AArch64::F1CVTL_2ZZ_BtoH); - return; - case Intrinsic::aarch64_sme_fp8_bf2cvtl_x2: - SelectCVTIntrinsicFP8(Node, 2, AArch64::BF2CVTL_2ZZ_BtoH); + case Intrinsic::aarch64_sve_fp8_cvtl1_x2: + if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::FP>( + Node->getValueType(0), + {AArch64::BF1CVTL_2ZZ_BtoH, AArch64::F1CVTL_2ZZ_BtoH})) + SelectCVTIntrinsicFP8(Node, 2, Opc); return; - case Intrinsic::aarch64_sme_fp8_f2cvtl_x2: - SelectCVTIntrinsicFP8(Node, 2, AArch64::F2CVTL_2ZZ_BtoH); + case Intrinsic::aarch64_sve_fp8_cvtl2_x2: + if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::FP>( + Node->getValueType(0), + {AArch64::BF2CVTL_2ZZ_BtoH, AArch64::F2CVTL_2ZZ_BtoH})) + SelectCVTIntrinsicFP8(Node, 2, Opc); return; } } break; diff --git a/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll index b6f1fd45af010b..076a3ad34eac3c 100644 --- a/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll +++ b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll @@ -8,7 +8,7 @@ define { <vscale x 8 x half>, <vscale x 8 x half> } @f1cvtl(<vscale x 16 x i8> % ; CHECK: // %bb.0: ; CHECK-NEXT: f1cvtl { z0.h, z1.h }, z0.b ; CHECK-NEXT: ret - %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f1cvtl.x2(<vscale x 16 x i8> %zm) + %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8f16(<vscale x 16 x i8> %zm) ret { <vscale x 8 x half>, <vscale x 8 x half> } %res } @@ -17,7 +17,7 @@ define { <vscale x 8 x half>, <vscale x 8 x half> } @f2cvtl(<vscale x 16 x i8> % ; CHECK: // %bb.0: ; CHECK-NEXT: f2cvtl { z0.h, z1.h }, z0.b ; CHECK-NEXT: ret - %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f2cvtl.x2(<vscale x 16 x i8> %zm) + %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.fp8.cvtl2.x2.nxvbf16(<vscale x 16 x i8> %zm) ret { <vscale x 8 x half>, <vscale x 8 x half> } %res } @@ -28,7 +28,7 @@ define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @bf1cvtl(<vscale x 16 x ; CHECK: // %bb.0: ; CHECK-NEXT: bf1cvtl { z0.h, z1.h }, z0.b ; CHECK-NEXT: ret - %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf1cvtl.x2(<vscale x 16 x i8> %zm) + %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8bf16(<vscale x 16 x i8> %zm) ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res } @@ -37,6 +37,6 @@ define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @bf2cvtl( <vscale x 16 x ; CHECK: // %bb.0: ; CHECK-NEXT: bf2cvtl { z0.h, z1.h }, z0.b ; CHECK-NEXT: ret - %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf2cvtl.x2(<vscale x 16 x i8> %zm) + %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fp8.cvtl2.x2.nxv8bf16(<vscale x 16 x i8> %zm) ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res } >From 460b69be442b7c3d181662f1a3ca95038a34e854 Mon Sep 17 00:00:00 2001 From: Spencer Abson <spencer.ab...@arm.com> Date: Sun, 24 Nov 2024 10:19:54 +0000 Subject: [PATCH 5/7] [NFC] Fix comments --- clang/include/clang/Basic/arm_sve.td | 2 +- llvm/include/llvm/IR/IntrinsicsAArch64.td | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index c81aaee71eda0d..6899a3387cc8c4 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2429,7 +2429,7 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,fp8" in { def FSCALE_X2 : Inst<"svscale[_{d}_x2]", "222.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x2", [IsStreaming],[]>; def FSCALE_X4 : Inst<"svscale[_{d}_x4]", "444.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x4", [IsStreaming],[]>; - // Convert from half-precision/BFloat16 to deinterleaved FP8 multi-vector + // Convert from FP8 to deinterleaved half-precision/BFloat16 multi-vector def SVF1CVTL : Inst<"svcvtl1_{d}[_mf8]_x2_fpm", "2~n", "bh", MergeNone, "aarch64_sve_fp8_cvtl1_x2", [IsStreaming, SetsFPMR], []>; def SVF2CVTL : Inst<"svcvtl2_{d}[_mf8]_x2_fpm", "2~n", "bh", MergeNone, "aarch64_sve_fp8_cvtl2_x2", [IsStreaming, SetsFPMR], []>; } diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index f09114b57431f8..a91616b9556828 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3818,7 +3818,7 @@ let TargetPrefix = "aarch64" in { [llvm_nxv16i8_ty], [IntrReadMem, IntrInaccessibleMemOnly]>; // - // CVT from half-precision/BFloat16 to delinterleaved FP8 multi-vectors + // CVT from FP8 to deinterleaved half-precision/BFloat16 multi-vector // def int_aarch64_sve_fp8_cvtl1_x2 : SME2_FP8_CVT_X2_Single_Intrinsic; def int_aarch64_sve_fp8_cvtl2_x2 : SME2_FP8_CVT_X2_Single_Intrinsic; >From 49481a89cc0d488333041eb7489760f4de4e00eb Mon Sep 17 00:00:00 2001 From: Spencer Abson <spencer.ab...@arm.com> Date: Tue, 26 Nov 2024 17:26:57 +0000 Subject: [PATCH 6/7] [NFC] Add target guard tests --- .../aarch64-fp8-intrinsics/acle_sme2_fp8_cvt.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_cvt.c diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_cvt.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_cvt.c new file mode 100644 index 00000000000000..4c0c5fd9d2ec11 --- /dev/null +++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_cvt.c @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -verify -emit-llvm-only %s + +// REQUIRES: aarch64-registered-target + +#include <arm_sve.h> + + +void test_features_sme2_fp8(svmfloat8_t zn, uint64_t fpmr) __arm_streaming { + // expected-error@+1 {{'svcvtl1_f16_mf8_x2_fpm' needs target feature sme,sme2,fp8}} + svcvtl1_f16_mf8_x2_fpm(zn, fpmr); + // expected-error@+1 {{'svcvtl2_f16_mf8_x2_fpm' needs target feature sme,sme2,fp8}} + svcvtl2_f16_mf8_x2_fpm(zn, fpmr); + // expected-error@+1 {{'svcvtl1_bf16_mf8_x2_fpm' needs target feature sme,sme2,fp8}} + svcvtl1_bf16_mf8_x2_fpm(zn, fpmr); + // expected-error@+1 {{'svcvtl2_bf16_mf8_x2_fpm' needs target feature sme,sme2,fp8}} + svcvtl2_bf16_mf8_x2_fpm(zn, fpmr); +} \ No newline at end of file >From 0627133f1a0556a6e8797fa1802377be6a689269 Mon Sep 17 00:00:00 2001 From: Spencer Abson <spencer.ab...@arm.com> Date: Thu, 28 Nov 2024 10:15:33 +0000 Subject: [PATCH 7/7] Use fpm_t instead of uint64_t for fpmr --- clang/include/clang/Basic/arm_sve.td | 4 ++-- clang/include/clang/Basic/arm_sve_sme_incl.td | 1 + .../AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c | 8 ++++---- .../Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_cvt.c | 2 +- clang/utils/TableGen/SveEmitter.cpp | 11 +++++++++-- 5 files changed, 17 insertions(+), 9 deletions(-) diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 6899a3387cc8c4..2dec17aae2af55 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2430,8 +2430,8 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,fp8" in { def FSCALE_X4 : Inst<"svscale[_{d}_x4]", "444.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x4", [IsStreaming],[]>; // Convert from FP8 to deinterleaved half-precision/BFloat16 multi-vector - def SVF1CVTL : Inst<"svcvtl1_{d}[_mf8]_x2_fpm", "2~n", "bh", MergeNone, "aarch64_sve_fp8_cvtl1_x2", [IsStreaming, SetsFPMR], []>; - def SVF2CVTL : Inst<"svcvtl2_{d}[_mf8]_x2_fpm", "2~n", "bh", MergeNone, "aarch64_sve_fp8_cvtl2_x2", [IsStreaming, SetsFPMR], []>; + def SVF1CVTL : Inst<"svcvtl1_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl1_x2", [IsStreaming, SetsFPMR], []>; + def SVF2CVTL : Inst<"svcvtl2_{d}[_mf8]_x2_fpm", "2~>", "bh", MergeNone, "aarch64_sve_fp8_cvtl2_x2", [IsStreaming, SetsFPMR], []>; } let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in { diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index 7fdf732e506a2e..de10be7bdce0db 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -94,6 +94,7 @@ include "arm_immcheck_incl.td" // l: int64_t // m: uint32_t // n: uint64_t +// >: fpm_t // [: svuint8_t // t: svint32_t diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c index 4a1fc81a05b953..5ba76671ff5d5b 100644 --- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c @@ -28,7 +28,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8f16(<vscale x 16 x i8> [[ZN:%.*]]) // CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]] // -svfloat16x2_t test_cvtl1_f16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming { +svfloat16x2_t test_cvtl1_f16_x2(svmfloat8_t zn, fpm_t fpmr) __arm_streaming { return SVE_ACLE_FUNC(svcvtl1_f16,_mf8,_x2_fpm)(zn, fpmr); } @@ -44,7 +44,7 @@ svfloat16x2_t test_cvtl1_f16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.fp8.cvtl2.x2.nxv8f16(<vscale x 16 x i8> [[ZN:%.*]]) // CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]] // -svfloat16x2_t test_cvtl2_f16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming { +svfloat16x2_t test_cvtl2_f16_x2(svmfloat8_t zn, fpm_t fpmr) __arm_streaming { return SVE_ACLE_FUNC(svcvtl2_f16,_mf8,_x2_fpm)(zn, fpmr); } @@ -60,7 +60,7 @@ svfloat16x2_t test_cvtl2_f16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8bf16(<vscale x 16 x i8> [[ZN:%.*]]) // CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]] // -svbfloat16x2_t test_cvtl1_bf16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming { +svbfloat16x2_t test_cvtl1_bf16_x2(svmfloat8_t zn, fpm_t fpmr) __arm_streaming { return SVE_ACLE_FUNC(svcvtl1_bf16,_mf8,_x2_fpm)(zn, fpmr); } @@ -76,6 +76,6 @@ svbfloat16x2_t test_cvtl1_bf16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streamin // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fp8.cvtl2.x2.nxv8bf16(<vscale x 16 x i8> [[ZN:%.*]]) // CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]] // -svbfloat16x2_t test_cvtl2_bf16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming { +svbfloat16x2_t test_cvtl2_bf16_x2(svmfloat8_t zn, fpm_t fpmr) __arm_streaming { return SVE_ACLE_FUNC(svcvtl2_bf16,_mf8,_x2_fpm)(zn, fpmr); } diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_cvt.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_cvt.c index 4c0c5fd9d2ec11..09a80c9dff03ea 100644 --- a/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_cvt.c +++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_sme2_fp8_cvt.c @@ -5,7 +5,7 @@ #include <arm_sve.h> -void test_features_sme2_fp8(svmfloat8_t zn, uint64_t fpmr) __arm_streaming { +void test_features_sme2_fp8(svmfloat8_t zn, fpm_t fpmr) __arm_streaming { // expected-error@+1 {{'svcvtl1_f16_mf8_x2_fpm' needs target feature sme,sme2,fp8}} svcvtl1_f16_mf8_x2_fpm(zn, fpmr); // expected-error@+1 {{'svcvtl2_f16_mf8_x2_fpm' needs target feature sme,sme2,fp8}} diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index ab1bb419e10ab8..e9fa01ea98dced 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -52,7 +52,7 @@ namespace { class SVEType { bool Float, Signed, Immediate, Void, Constant, Pointer, BFloat, MFloat; bool DefaultType, IsScalable, Predicate, PredicatePattern, PrefetchOp, - Svcount; + Svcount, Fpm; unsigned Bitwidth, ElementBitwidth, NumVectors; public: @@ -62,7 +62,7 @@ class SVEType { : Float(false), Signed(true), Immediate(false), Void(false), Constant(false), Pointer(false), BFloat(false), MFloat(false), DefaultType(false), IsScalable(true), Predicate(false), - PredicatePattern(false), PrefetchOp(false), Svcount(false), + PredicatePattern(false), PrefetchOp(false), Svcount(false), Fpm(false), Bitwidth(128), ElementBitwidth(~0U), NumVectors(NumVectors) { if (!TS.empty()) applyTypespec(TS); @@ -101,6 +101,7 @@ class SVEType { bool isPrefetchOp() const { return PrefetchOp; } bool isSvcount() const { return Svcount; } bool isConstant() const { return Constant; } + bool isFpm() const { return Fpm; } unsigned getElementSizeInBits() const { return ElementBitwidth; } unsigned getNumVectors() const { return NumVectors; } @@ -497,6 +498,9 @@ std::string SVEType::str() const { if (isPrefetchOp()) return "enum svprfop"; + if (isFpm()) + return "fpm_t"; + std::string S; if (Void) S += "void"; @@ -752,6 +756,9 @@ void SVEType::applyModifier(char Mod) { ElementBitwidth = Bitwidth = 32; NumVectors = 0; break; + case '>': + Fpm = true; + [[fallthrough]]; case 'n': Predicate = false; Svcount = false; _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits