<[email protected]> writes:
> The AArch64 FEAT_FP8 extension introduces instructions for conversion
> and scaling.
>
> This patch introduces the following intrinsics:
> 1. vcvt{1|2}{_high|_low}_{bf16|f16}_mf8_fpm.
> 2. vcvt{q}_mf8_f16_fpm.
> 3. vcvt{_high}_mf8_f32_fpm.
> 4. vscale{q}_{f16|f32} and vscaleq_f64.
>
> We introduced two aarch64_builtin_signatures enum variants, unary and
> ternary, and added support for these variants in the functions
> aarch64_fntype and aarch64_expand_pragma_builtin.
>
> We added new simd_types for integers (s32, s32q, and s64q) and for
> floating-point values (f8 and f8q).
>
> Because we added support for fp8 intrinsics here, we removed the checks
> in acle/fp8.c that tested that the __ARM_FEATURE_FP8 macro is not
> defined.
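For readers following along, this is the user-facing shape of the new
intrinsics, distilled from the new tests at the end of the patch (the
function names here are illustrative, and the snippet assumes
-march=armv9-a+fp8):

#include <arm_neon.h>

/* FP8 -> bf16 widening convert; the fpm_t value is moved into FPMR
   before the conversion: msr fpmr, x0; bf1cvtl v0.8h, v0.8b.  */
bfloat16x8_t
cvt1 (mfloat8x8_t a, fpm_t fpm)
{
  return vcvt1_bf16_mf8_fpm (a, fpm);
}

/* Two f16 vectors narrowed into one FP8 vector:
   msr fpmr, x0; fcvtn v0.8b, v0.4h, v1.4h.  */
mfloat8x8_t
cvtn (float16x4_t a, float16x4_t b, fpm_t fpm)
{
  return vcvt_mf8_f16_fpm (a, b, fpm);
}

/* Element-wise scale by 2^b[i]; FSCALE does not read FPMR.  */
float32x4_t
scale (float32x4_t a, int32x4_t b)
{
  return vscaleq_f32 (a, b);
}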
Since Saurabh is currently on holiday, I've done a review in the form
of a patch. The main changes are:
* Rebase on top of the committed FEAT_LUT work.
* Add USES_FPMR to the existing flags, rather than treating it as
a separate boolean.
* Automatically add the fpmr argument to the type signature, based
on USES_FPMR.
* Represent the highpart operations using a combination of generic
RTL and the corresponding lowpart operation (see the example after
this list). This should allow more optimisation, though it's
difficult to test without later patches.
* Use a generic "insn" int attribute for mnemonics, rather than
individual per-instruction attributes.
* Use "0" constraints for inputs that are tied to outputs.
* Add tests that __ARM_FEATURE_FP8 is defined.
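To make the highpart point concrete, here is an example distilled from
the new tests (function names illustrative): both forms compile to a
single instruction, but the _high form is represented internally as the
lowpart unspec applied to a vec_select of the top 64 bits, rather than
as a separate "high" unspec.

#include <arm_neon.h>

/* Lowpart form: f1cvtl v0.8h, v0.8b.  */
float16x8_t
low (mfloat8x16_t a, fpm_t fpm)
{
  return vcvt1_low_f16_mf8_fpm (a, fpm);
}

/* Highpart form: f1cvtl2 v0.8h, v0.16b.  The vec_select
   representation gives the generic optimisers more to work with.  */
float16x8_t
high (mfloat8x16_t a, fpm_t fpm)
{
  return vcvt1_high_f16_mf8_fpm (a, fpm);
}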
Tested on aarch64-linux-gnu. I'll commit in about 24 hours if there
are no comments before then, but please let me know if you'd like
more time.
Thanks,
Richard
gcc/ChangeLog:
* config/aarch64/aarch64-builtins.cc
(FLAG_USES_FPMR, FLAG_FP8): New flags.
(ENTRY): Extend to support ternary operations.
(aarch64_builtin_signatures): New unary and ternary variants.
(struct aarch64_pragma_builtins_data): Extend types to 4 elements.
(aarch64_fntype): Handle new signatures.
(aarch64_get_low_unspec): New function.
(aarch64_convert_to_v64): New function, split out from...
(aarch64_expand_pragma_builtin): ...here. Handle new signatures.
* config/aarch64/aarch64-c.cc
(aarch64_update_cpp_builtins): Define or undefine __ARM_FEATURE_FP8.
* config/aarch64/aarch64-simd-pragma-builtins.def: Define new fp8
intrinsics.
(ENTRY_BINARY, ENTRY_BINARY_LANE): Update for new ENTRY interface.
(ENTRY_UNARY, ENTRY_TERNARY, ENTRY_UNARY_VQ_BHF): New macros.
(ENTRY_BINARY_VHSDF_SIGNED): Likewise.
* config/aarch64/aarch64-simd.md
(@aarch64_<insn><mode>): New pattern.
(@aarch64_<insn><mode>_high): Likewise.
(@aarch64_<insn><mode>_high_be): Likewise.
(@aarch64_<insn><mode>_high_le): Likewise.
* config/aarch64/iterators.md (V4SF_ONLY, VQ_BHF, VCVTFPM): New mode
iterators.
(UNSPEC_FCVTN_FP8, UNSPEC_FCVTN2_FP8, UNSPEC_F1CVTL_FP8)
(UNSPEC_F1CVTL2_FP8, UNSPEC_F2CVTL_FP8, UNSPEC_F2CVTL2_FP8)
(UNSPEC_FSCALE): New unspecs.
(VPACKB, VPACKBtype): New mode attributes.
(b): Add support for V[48][BH]F.
(FPM_UNARY_UNS, FPM_BINARY_UNS, FSCALE_UNS): New int iterators.
(insn): New int attribute.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/acle/fp8.c: Remove checks that the fp8 feature
macro is not defined and...
* gcc.target/aarch64/pragma_cpp_predefs_4.c: ...test that it does here.
* gcc.target/aarch64/simd/scale_fpm.c: New test.
* gcc.target/aarch64/simd/vcvt_fpm.c: New test.
Co-authored-by: Richard Sandiford <[email protected]>
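As a concrete illustration of the extended ENTRY interface (parameter
names below are illustrative): because the FP8 flags include
USES_FPMR, aarch64_fntype appends a trailing uint64_t (fpm_t)
argument automatically, so the entry

  ENTRY_TERNARY (vcvt_high_mf8_f32_fpm, f8q, f8, f32q, f32q,
                 UNSPEC_FCVTN2_FP8, FP8)

yields the user-visible prototype

  mfloat8x16_t vcvt_high_mf8_f32_fpm (mfloat8x8_t low, float32x4_t a,
                                      float32x4_t b, fpm_t fpm);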
---
gcc/config/aarch64/aarch64-builtins.cc | 128 ++++++++++--
gcc/config/aarch64/aarch64-c.cc | 2 +
.../aarch64/aarch64-simd-pragma-builtins.def | 47 ++++-
gcc/config/aarch64/aarch64-simd.md | 73 +++++++
gcc/config/aarch64/iterators.md | 37 +++-
gcc/testsuite/gcc.target/aarch64/acle/fp8.c | 10 -
.../gcc.target/aarch64/pragma_cpp_predefs_4.c | 10 +
.../gcc.target/aarch64/simd/scale_fpm.c | 60 ++++++
.../gcc.target/aarch64/simd/vcvt_fpm.c | 197 ++++++++++++++++++
9 files changed, 536 insertions(+), 28 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index f528592a17d..39a85699e51 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -198,10 +198,11 @@ const unsigned int FLAG_RAISE_FP_EXCEPTIONS = 1U << 1;
const unsigned int FLAG_READ_MEMORY = 1U << 2;
const unsigned int FLAG_PREFETCH_MEMORY = 1U << 3;
const unsigned int FLAG_WRITE_MEMORY = 1U << 4;
+const unsigned int FLAG_USES_FPMR = 1U << 5;
/* Indicates that READ_FPCR and RAISE_FP_EXCEPTIONS should be set for
floating-point modes but not for integer modes. */
-const unsigned int FLAG_AUTO_FP = 1U << 5;
+const unsigned int FLAG_AUTO_FP = 1U << 6;
const unsigned int FLAG_QUIET = 0;
const unsigned int FLAG_DEFAULT = FLAG_AUTO_FP;
@@ -210,6 +211,7 @@ const unsigned int FLAG_ALL = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS
| FLAG_READ_MEMORY | FLAG_PREFETCH_MEMORY | FLAG_WRITE_MEMORY;
const unsigned int FLAG_STORE = FLAG_WRITE_MEMORY;
const unsigned int FLAG_LOAD = FLAG_READ_MEMORY;
+const unsigned int FLAG_FP8 = FLAG_FP | FLAG_USES_FPMR;
typedef struct
{
@@ -783,7 +785,7 @@ typedef struct
AARCH64_SIMD_BUILTIN_##T##_##N##A,
#undef ENTRY
-#define ENTRY(N, S, T0, T1, T2, U, F) \
+#define ENTRY(N, S, T0, T1, T2, T3, U, F) \
AARCH64_##N,
enum aarch64_builtins
@@ -1604,6 +1606,8 @@ enum class aarch64_builtin_signatures
{
binary,
binary_lane,
+ ternary,
+ unary,
};
namespace {
@@ -1618,6 +1622,8 @@ struct simd_type {
};
namespace simd_types {
+ constexpr simd_type f8 { V8QImode, qualifier_modal_float };
+ constexpr simd_type f8q { V16QImode, qualifier_modal_float };
constexpr simd_type p8 { V8QImode, qualifier_poly };
constexpr simd_type p8q { V16QImode, qualifier_poly };
constexpr simd_type s8 { V8QImode, qualifier_none };
@@ -1644,7 +1650,11 @@ namespace simd_types {
constexpr simd_type f32 { V2SFmode, qualifier_none };
constexpr simd_type f32q { V4SFmode, qualifier_none };
+ constexpr simd_type s32 { V2SImode, qualifier_none };
+ constexpr simd_type s32q { V4SImode, qualifier_none };
+
constexpr simd_type f64q { V2DFmode, qualifier_none };
+ constexpr simd_type s64q { V2DImode, qualifier_none };
constexpr simd_type none { VOIDmode, qualifier_none };
}
@@ -1652,10 +1662,10 @@ namespace simd_types {
}
#undef ENTRY
-#define ENTRY(N, S, T0, T1, T2, U, F) \
+#define ENTRY(N, S, T0, T1, T2, T3, U, F) \
{#N, aarch64_builtin_signatures::S, simd_types::T0, simd_types::T1, \
- simd_types::T2, U, aarch64_required_extensions::REQUIRED_EXTENSIONS, \
- FLAG_##F},
+ simd_types::T2, simd_types::T3, U, \
+ aarch64_required_extensions::REQUIRED_EXTENSIONS, FLAG_##F},
/* Initialize pragma builtins. */
@@ -1663,7 +1673,7 @@ struct aarch64_pragma_builtins_data
{
const char *name;
aarch64_builtin_signatures signature;
- simd_type types[3];
+ simd_type types[4];
int unspec;
aarch64_required_extensions required_extensions;
unsigned int flags;
@@ -1687,6 +1697,17 @@ aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data)
for (int i = 1; i <= 2; ++i)
arg_types.quick_push (builtin_data.types[i].type ());
break;
+
+ case aarch64_builtin_signatures::ternary:
+ return_type = builtin_data.types[0].type ();
+ for (int i = 1; i <= 3; ++i)
+ arg_types.quick_push (builtin_data.types[i].type ());
+ break;
+
+ case aarch64_builtin_signatures::unary:
+ return_type = builtin_data.types[0].type ();
+ arg_types.quick_push (builtin_data.types[1].type ());
+ break;
}
switch (builtin_data.signature)
{
@@ -1697,6 +1718,8 @@ aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data)
default:
break;
}
+ if (builtin_data.flags & FLAG_USES_FPMR)
+ arg_types.quick_push (uint64_type_node);
return build_function_type_array (return_type, arg_types.length (),
arg_types.address ());
}
@@ -3538,6 +3561,36 @@ aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx target)
return ops[0].value;
}
+/* If OP is a 128-bit vector, convert it to the equivalent 64-bit vector.
+ Do nothing otherwise. */
+static void
+aarch64_convert_to_v64 (expand_operand *op)
+{
+ if (known_eq (GET_MODE_BITSIZE (op->mode), 128u))
+ {
+ op->mode = aarch64_v64_mode (GET_MODE_INNER (op->mode)).require ();
+ op->value = gen_lowpart (op->mode, op->value);
+ }
+}
+
+/* UNSPEC is a high unspec, indicated by "2" in mnemonics and "_high" in
+ intrinsic names. Return the equivalent low unspec. */
+static int
+aarch64_get_low_unspec (int unspec)
+{
+ switch (unspec)
+ {
+ case UNSPEC_FCVTN2_FP8:
+ return UNSPEC_FCVTN_FP8;
+ case UNSPEC_F1CVTL2_FP8:
+ return UNSPEC_F1CVTL_FP8;
+ case UNSPEC_F2CVTL2_FP8:
+ return UNSPEC_F2CVTL_FP8;
+ default:
+ gcc_unreachable ();
+ }
+}
+
/* Expand CALL_EXPR EXP, given that it is a call to the function described
by BUILTIN_DATA, and return the function's return value. Put the result
in TARGET if convenient. */
@@ -3557,14 +3610,28 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
TYPE_MODE (TREE_TYPE (arg)));
}
- /* LUTI2 treats the first argument as a vector of 4 elements. The forms
- with 128-bit inputs are only provided as a convenience; the upper halves
- don't actually matter. */
- if (builtin_data.unspec == UNSPEC_LUTI2
- && known_eq (GET_MODE_BITSIZE (ops[1].mode), 128u))
+ if (builtin_data.flags & FLAG_USES_FPMR)
+ {
+ auto fpm_input = ops.pop ().value;
+ auto fpmr = gen_rtx_REG (DImode, FPM_REGNUM);
+ emit_move_insn (fpmr, fpm_input);
+ }
+
+ switch (builtin_data.unspec)
{
- ops[1].mode = aarch64_v64_mode (GET_MODE_INNER (ops[1].mode)).require ();
- ops[1].value = gen_lowpart (ops[1].mode, ops[1].value);
+ case UNSPEC_F1CVTL_FP8:
+ case UNSPEC_F2CVTL_FP8:
+ /* Convert _low forms (which take 128-bit vectors) to the base
+ 64-bit forms. */
+ aarch64_convert_to_v64 (&ops[1]);
+ break;
+
+ case UNSPEC_LUTI2:
+ /* LUTI2 treats the first argument as a vector of 4 elements. The forms
+ with 128-bit inputs are only provided as a convenience; the upper
+ halves don't actually matter. */
+ aarch64_convert_to_v64 (&ops[1]);
+ break;
}
insn_code icode;
@@ -3572,10 +3639,41 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
{
case UNSPEC_FAMAX:
case UNSPEC_FAMIN:
- icode = code_for_aarch64 (builtin_data.unspec,
- builtin_data.types[0].mode);
+ case UNSPEC_F1CVTL_FP8:
+ case UNSPEC_F2CVTL_FP8:
+ case UNSPEC_FSCALE:
+ icode = code_for_aarch64 (builtin_data.unspec, ops[0].mode);
+ break;
+
+ case UNSPEC_F1CVTL2_FP8:
+ case UNSPEC_F2CVTL2_FP8:
+ {
+ /* Add a high-part selector for the vec_merge. */
+ auto src_mode = ops.last ().mode;
+ auto nunits = GET_MODE_NUNITS (src_mode).to_constant ();
+ rtx par = aarch64_simd_vect_par_cnst_half (src_mode, nunits, true);
+ create_fixed_operand (ops.safe_push ({}), par);
+
+ auto unspec = aarch64_get_low_unspec (builtin_data.unspec);
+ icode = code_for_aarch64_high (unspec, ops[0].mode);
+ break;
+ }
+
+ case UNSPEC_FCVTN_FP8:
+ icode = code_for_aarch64 (builtin_data.unspec, ops[1].mode);
break;
+ case UNSPEC_FCVTN2_FP8:
+ {
+ auto unspec = aarch64_get_low_unspec (builtin_data.unspec);
+ auto mode = ops.last ().mode;
+ if (BYTES_BIG_ENDIAN)
+ icode = code_for_aarch64_high_be (unspec, mode);
+ else
+ icode = code_for_aarch64_high_le (unspec, mode);
+ break;
+ }
+
case UNSPEC_LUTI2:
case UNSPEC_LUTI4:
create_integer_operand (ops.safe_push ({}),
diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
index dba103a7fb1..ae255889f5e 100644
--- a/gcc/config/aarch64/aarch64-c.cc
+++ b/gcc/config/aarch64/aarch64-c.cc
@@ -268,6 +268,8 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
aarch64_def_or_undef (TARGET_SVE_BF16,
"__ARM_FEATURE_SVE_BF16", pfile);
+ aarch64_def_or_undef (TARGET_FP8, "__ARM_FEATURE_FP8", pfile);
+
aarch64_def_or_undef (TARGET_LS64,
"__ARM_FEATURE_LS64", pfile);
aarch64_def_or_undef (TARGET_RCPC, "__ARM_FEATURE_RCPC", pfile);
diff --git a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
index bc9a63b968a..6221652b38f 100644
--- a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
@@ -20,11 +20,19 @@
#undef ENTRY_BINARY
#define ENTRY_BINARY(N, T0, T1, T2, U, F) \
- ENTRY (N, binary, T0, T1, T2, U, F)
+ ENTRY (N, binary, T0, T1, T2, none, U, F)
#undef ENTRY_BINARY_LANE
#define ENTRY_BINARY_LANE(N, T0, T1, T2, U, F) \
- ENTRY (N, binary_lane, T0, T1, T2, U, F)
+ ENTRY (N, binary_lane, T0, T1, T2, none, U, F)
+
+#undef ENTRY_TERNARY
+#define ENTRY_TERNARY(N, T0, T1, T2, T3, U, F) \
+ ENTRY (N, ternary, T0, T1, T2, T3, U, F)
+
+#undef ENTRY_UNARY
+#define ENTRY_UNARY(N, T0, T1, U, F) \
+ ENTRY (N, unary, T0, T1, none, none, U, F)
#undef ENTRY_BINARY_VHSDF
#define ENTRY_BINARY_VHSDF(NAME, UNSPEC, FLAGS) \
@@ -34,6 +42,14 @@
ENTRY_BINARY (NAME##q_f32, f32q, f32q, f32q, UNSPEC, FLAGS) \
ENTRY_BINARY (NAME##q_f64, f64q, f64q, f64q, UNSPEC, FLAGS)
+#undef ENTRY_BINARY_VHSDF_SIGNED
+#define ENTRY_BINARY_VHSDF_SIGNED(NAME, UNSPEC, FLAGS) \
+ ENTRY_BINARY (NAME##_f16, f16, f16, s16, UNSPEC, FLAGS) \
+ ENTRY_BINARY (NAME##q_f16, f16q, f16q, s16q, UNSPEC, FLAGS) \
+ ENTRY_BINARY (NAME##_f32, f32, f32, s32, UNSPEC, FLAGS) \
+ ENTRY_BINARY (NAME##q_f32, f32q, f32q, s32q, UNSPEC, FLAGS) \
+ ENTRY_BINARY (NAME##q_f64, f64q, f64q, s64q, UNSPEC, FLAGS)
+
#undef ENTRY_TERNARY_VLUT8
#define ENTRY_TERNARY_VLUT8(T) \
ENTRY_BINARY_LANE (vluti2_lane_##T##8, T##8q, T##8, u8, \
@@ -64,6 +80,11 @@
ENTRY_BINARY_LANE (vluti4q_laneq_##T##16_x2, T##16q, T##16qx2, u8q, \
UNSPEC_LUTI4, QUIET)
+#undef ENTRY_UNARY_VQ_BHF
+#define ENTRY_UNARY_VQ_BHF(N, T1, UNSPEC, FLAGS) \
+ ENTRY_UNARY (N##_bf16_mf8_fpm, bf16q, T1, UNSPEC, FLAGS) \
+ ENTRY_UNARY (N##_f16_mf8_fpm, f16q, T1, UNSPEC, FLAGS)
+
// faminmax
#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FAMINMAX)
ENTRY_BINARY_VHSDF (vamax, UNSPEC_FAMAX, FP)
@@ -82,3 +103,25 @@ ENTRY_TERNARY_VLUT16 (p)
ENTRY_TERNARY_VLUT16 (s)
ENTRY_TERNARY_VLUT16 (u)
#undef REQUIRED_EXTENSIONS
+
+// fpm conversion
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8)
+ENTRY_UNARY_VQ_BHF (vcvt1, f8, UNSPEC_F1CVTL_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt1_high, f8q, UNSPEC_F1CVTL2_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt1_low, f8q, UNSPEC_F1CVTL_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt2, f8, UNSPEC_F2CVTL_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt2_high, f8q, UNSPEC_F2CVTL2_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt2_low, f8q, UNSPEC_F2CVTL_FP8, FP8)
+
+ENTRY_BINARY (vcvt_mf8_f16_fpm, f8, f16, f16, UNSPEC_FCVTN_FP8, FP8)
+ENTRY_BINARY (vcvtq_mf8_f16_fpm, f8q, f16q, f16q, UNSPEC_FCVTN_FP8, FP8)
+ENTRY_BINARY (vcvt_mf8_f32_fpm, f8, f32q, f32q, UNSPEC_FCVTN_FP8, FP8)
+
+ENTRY_TERNARY (vcvt_high_mf8_f32_fpm, f8q, f8, f32q, f32q,
+ UNSPEC_FCVTN2_FP8, FP8)
+#undef REQUIRED_EXTENSIONS
+
+// fpm scaling
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8)
+ENTRY_BINARY_VHSDF_SIGNED (vscale, UNSPEC_FSCALE, FP)
+#undef REQUIRED_EXTENSIONS
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 05cbd38372d..f38bad72781 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -10024,3 +10024,76 @@ (define_insn "@aarch64_lut<VLUTx2:mode><VB:mode>"
"TARGET_LUT && INTVAL (operands[4]) == 4"
"luti%4\t%0.8h, {%S1.8h, %T1.8h}, %2[%3]"
)
+
+;; fpm unary instructions (low part).
+(define_insn "@aarch64_<insn><mode>"
+ [(set (match_operand:VQ_BHF 0 "register_operand" "=w")
+ (unspec:VQ_BHF
+ [(match_operand:V8QI 1 "register_operand" "w")
+ (reg:DI FPM_REGNUM)]
+ FPM_UNARY_UNS))]
+ "TARGET_FP8"
+ "<b><insn>\t%0.<Vtype>, %1.8b"
+)
+
+;; fpm unary instructions (high part).
+(define_insn "@aarch64_<insn><mode>_high"
+ [(set (match_operand:VQ_BHF 0 "register_operand" "=w")
+ (unspec:VQ_BHF
+ [(vec_select:V8QI
+ (match_operand:V16QI 1 "register_operand" "w")
+ (match_operand:V16QI 2 "vect_par_cnst_hi_half"))
+ (reg:DI FPM_REGNUM)]
+ FPM_UNARY_UNS))]
+ "TARGET_FP8"
+ "<b><insn>2\t%0.<Vtype>, %1.16b"
+)
+
+;; fpm binary instructions.
+(define_insn "@aarch64_<insn><mode>"
+ [(set (match_operand:<VPACKB> 0 "register_operand" "=w")
+ (unspec:<VPACKB>
+ [(match_operand:VCVTFPM 1 "register_operand" "w")
+ (match_operand:VCVTFPM 2 "register_operand" "w")
+ (reg:DI FPM_REGNUM)]
+ FPM_BINARY_UNS))]
+ "TARGET_FP8"
+ "<insn>\t%0.<VPACKBtype>, %1.<Vtype>, %2.<Vtype>"
+)
+
+;; fpm binary instructions & merge with low.
+(define_insn "@aarch64_<insn><mode>_high_le"
+ [(set (match_operand:V16QI 0 "register_operand" "=w")
+ (vec_concat:V16QI
+ (match_operand:V8QI 1 "register_operand" "0")
+ (unspec:V8QI
+ [(match_operand:V4SF_ONLY 2 "register_operand" "w")
+ (match_operand:V4SF_ONLY 3 "register_operand" "w")
+ (reg:DI FPM_REGNUM)]
+ FPM_BINARY_UNS)))]
+ "TARGET_FP8 && !BYTES_BIG_ENDIAN"
+ "<insn>2\t%1.16b, %2.<V4SF_ONLY:Vtype>, %3.<V4SF_ONLY:Vtype>";
+)
+
+(define_insn "@aarch64_<insn><mode>_high_be"
+ [(set (match_operand:V16QI 0 "register_operand" "=w")
+ (vec_concat:V16QI
+ (unspec:V8QI
+ [(match_operand:V4SF_ONLY 2 "register_operand" "w")
+ (match_operand:V4SF_ONLY 3 "register_operand" "w")
+ (reg:DI FPM_REGNUM)]
+ FPM_BINARY_UNS)
+ (match_operand:V8QI 1 "register_operand" "0")))]
+ "TARGET_FP8 && BYTES_BIG_ENDIAN"
+ "<insn>2\t%1.16b, %2.<V4SF_ONLY:Vtype>, %3.<V4SF_ONLY:Vtype>";
+)
+
+;; fscale instructions
+(define_insn "@aarch64_<insn><mode>"
+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
+ (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
+ (match_operand:<FCVT_TARGET> 2 "register_operand" "w")]
+ FSCALE_UNS))]
+ "TARGET_FP8"
+ "<insn>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+)
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 90725c7faeb..7b426aae7a8 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -41,6 +41,7 @@ (define_mode_iterator SHORT [QI HI])
;; Iterators for single modes, for "@" patterns.
(define_mode_iterator SI_ONLY [SI])
(define_mode_iterator DI_ONLY [DI])
+(define_mode_iterator V4SF_ONLY [V4SF])
;; Iterator for all integer modes (up to 64-bit)
(define_mode_iterator ALLI [QI HI SI DI])
@@ -181,6 +182,9 @@ (define_mode_iterator VSFDF [V2SF V4SF V2DF DF SF])
;; Advanced SIMD single Float modes.
(define_mode_iterator VDQSF [V2SF V4SF])
+;; Quad vector float modes with half/bfloat elements.
+(define_mode_iterator VQ_BHF [V8HF V8BF])
+
;; Quad vector Float modes with half/single elements.
(define_mode_iterator VQ_HSF [V8HF V4SF])
@@ -430,6 +434,9 @@ (define_mode_iterator VMULD [V4HI V8HI V2SI V4SI
(define_mode_iterator VLUT [V8QI V16QI V4HI V4HF V4BF])
(define_mode_iterator VLUTx2 [V2x8HI V2x8HF V2x8BF])
+;; Modes available for Advanced SIMD FP8 conversion operations.
+(define_mode_iterator VCVTFPM [V4HF V8HF V4SF])
+
;; Iterators for single modes, for "@" patterns.
(define_mode_iterator VNx16QI_ONLY [VNx16QI])
(define_mode_iterator VNx16SI_ONLY [VNx16SI])
@@ -715,6 +722,12 @@ (define_c_enum "unspec"
UNSPEC_ASHIFT_SIGNED ; Used in aarch-simd.md.
UNSPEC_ASHIFT_UNSIGNED ; Used in aarch64-simd.md.
UNSPEC_ABS ; Used in aarch64-simd.md.
+ UNSPEC_FCVTN_FP8 ; Used in aarch64-simd.md.
+ UNSPEC_FCVTN2_FP8 ; Used in aarch64-builtins.cc.
+ UNSPEC_F1CVTL_FP8 ; Used in aarch64-simd.md.
+ UNSPEC_F1CVTL2_FP8 ; Used in aarch64-builtins.cc.
+ UNSPEC_F2CVTL_FP8 ; Used in aarch64-simd.md.
+ UNSPEC_F2CVTL2_FP8 ; Used in aarch64-builtins.cc.
UNSPEC_FMAX ; Used in aarch64-simd.md.
UNSPEC_FMAXNMV ; Used in aarch64-simd.md.
UNSPEC_FMAXV ; Used in aarch64-simd.md.
@@ -723,6 +736,7 @@ (define_c_enum "unspec"
UNSPEC_FMINV ; Used in aarch64-simd.md.
UNSPEC_FADDV ; Used in aarch64-simd.md.
UNSPEC_FNEG ; Used in aarch64-simd.md.
+ UNSPEC_FSCALE ; Used in aarch64-simd.md.
UNSPEC_ADDV ; Used in aarch64-simd.md.
UNSPEC_SMAXV ; Used in aarch64-simd.md.
UNSPEC_SMINV ; Used in aarch64-simd.md.
@@ -1790,6 +1804,11 @@ (define_mode_attr Vntype [(V8HI "8b") (V4SI "4h")
(define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
(V2DI "4s")])
+;; The result of FCVTN on two vectors of the given mode. The result has
+;; twice as many QI elements as the input.
+(define_mode_attr VPACKB [(V4HF "V8QI") (V8HF "V16QI") (V4SF "V8QI")])
+(define_mode_attr VPACKBtype [(V4HF "8b") (V8HF "16b") (V4SF "8b")])
+
;; Widened modes of vector modes.
(define_mode_attr VWIDE [(V8QI "V8HI") (V4HI "V4SI")
(V2SI "V2DI") (V16QI "V8HI")
@@ -2547,7 +2566,8 @@ (define_mode_attr vec_or_offset [(V8QI "vec") (V16QI "vec") (V4HI "vec")
(V8HI "vec") (V2SI "vec") (V4SI "vec")
(V2DI "vec") (DI "offset")])
-(define_mode_attr b [(VNx8BF "b") (VNx8HF "") (VNx4SF "") (VNx2DF "")
+(define_mode_attr b [(V4BF "b") (V4HF "") (V8BF "b") (V8HF "")
+ (VNx8BF "b") (VNx8HF "") (VNx4SF "") (VNx2DF "")
(VNx16BF "b") (VNx16HF "") (VNx8SF "") (VNx4DF "")
(VNx32BF "b") (VNx32HF "") (VNx16SF "") (VNx8DF "")])
@@ -3794,10 +3814,25 @@ (define_int_iterator SVE2_FP8_TERNARY_LANE_VNX4SF
UNSPEC_FMLALLTB_FP8
UNSPEC_FMLALLTT_FP8])
+;; Iterators for fpm instructions
+
+(define_int_iterator FPM_UNARY_UNS [UNSPEC_F1CVTL_FP8 UNSPEC_F2CVTL_FP8])
+
+(define_int_iterator FPM_BINARY_UNS [UNSPEC_FCVTN_FP8])
+
+(define_int_iterator FSCALE_UNS [UNSPEC_FSCALE])
+
;; -------------------------------------------------------------------
;; Int Iterators Attributes.
;; -------------------------------------------------------------------
+;; The AArch64 insn mnemonic associated with an unspec.
+(define_int_attr insn
+ [(UNSPEC_F1CVTL_FP8 "f1cvtl")
+ (UNSPEC_F2CVTL_FP8 "f2cvtl")
+ (UNSPEC_FCVTN_FP8 "fcvtn")
+ (UNSPEC_FSCALE "fscale")])
+
;; The optab associated with an operation. Note that for ANDF, IORF
;; and XORF, the optab pattern is not actually defined; we just use this
;; name for consistency with the integer patterns.
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
index afb44f83f60..635a7eaf4a2 100644
--- a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
+++ b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
@@ -5,19 +5,9 @@
#include <arm_acle.h>
-#ifdef __ARM_FEATURE_FP8
-#error "__ARM_FEATURE_FP8 feature macro defined."
-#endif
-
#pragma GCC push_options
#pragma GCC target("arch=armv9.4-a+fp8")
-/* We do not define __ARM_FEATURE_FP8 until all
- relevant features have been added. */
-#ifdef __ARM_FEATURE_FP8
-#error "__ARM_FEATURE_FP8 feature macro defined."
-#endif
-
/*
**test_write_fpmr_sysreg_asm_64:
** msr fpmr, x0
diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c
index 37bd844f581..e5a19aaefb6 100644
--- a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c
@@ -263,3 +263,13 @@
#ifdef __ARM_FEATURE_GCS
#error Foo
#endif
+
+#pragma GCC target "arch=armv9-a"
+#ifdef __ARM_FEATURE_FP8
+#error Foo
+#endif
+
+#pragma GCC target "arch=armv9-a+fp8"
+#ifndef __ARM_FEATURE_FP8
+#error Foo
+#endif
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c b/gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c
new file mode 100644
index 00000000000..d95a861fcfd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+** test_vscale_f16:
+** fscale v0.4h, v0.4h, v1.4h
+** ret
+*/
+float16x4_t
+test_vscale_f16 (float16x4_t a, int16x4_t b)
+{
+ return vscale_f16 (a, b);
+}
+
+/*
+** test_vscaleq_f16:
+** fscale v0.8h, v0.8h, v1.8h
+** ret
+*/
+float16x8_t
+test_vscaleq_f16 (float16x8_t a, int16x8_t b)
+{
+ return vscaleq_f16 (a, b);
+}
+
+/*
+** test_vscale_f32:
+** fscale v0.2s, v0.2s, v1.2s
+** ret
+*/
+float32x2_t
+test_vscale_f32 (float32x2_t a, int32x2_t b)
+{
+ return vscale_f32 (a, b);
+}
+
+/*
+** test_vscaleq_f32:
+** fscale v0.4s, v0.4s, v1.4s
+** ret
+*/
+float32x4_t
+test_vscaleq_f32 (float32x4_t a, int32x4_t b)
+{
+ return vscaleq_f32 (a, b);
+}
+
+/*
+** test_vscaleq_f64:
+** fscale v0.2d, v0.2d, v1.2d
+** ret
+*/
+float64x2_t
+test_vscaleq_f64 (float64x2_t a, int64x2_t b)
+{
+ return vscaleq_f64 (a, b);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c b/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c
new file mode 100644
index 00000000000..39076684345
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c
@@ -0,0 +1,197 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+** test_vcvt1_bf16:
+** msr fpmr, x0
+** bf1cvtl v0.8h, v0.8b
+** ret
+*/
+bfloat16x8_t
+test_vcvt1_bf16 (mfloat8x8_t a, fpm_t b)
+{
+ return vcvt1_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_high_vcvt1_bf16:
+** msr fpmr, x0
+** bf1cvtl2 v0.8h, v0.16b
+** ret
+*/
+bfloat16x8_t
+test_high_vcvt1_bf16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt1_high_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_low_vcvt1_bf16:
+** msr fpmr, x0
+** bf1cvtl v0.8h, v0.8b
+** ret
+*/
+bfloat16x8_t
+test_low_vcvt1_bf16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt1_low_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_vcvt1_f16:
+** msr fpmr, x0
+** f1cvtl v0.8h, v0.8b
+** ret
+*/
+float16x8_t
+test_vcvt1_f16 (mfloat8x8_t a, fpm_t b)
+{
+ return vcvt1_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_high_vcvt1_f16:
+** msr fpmr, x0
+** f1cvtl2 v0.8h, v0.16b
+** ret
+*/
+float16x8_t
+test_high_vcvt1_f16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt1_high_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_low_vcvt1_f16:
+** msr fpmr, x0
+** f1cvtl v0.8h, v0.8b
+** ret
+*/
+float16x8_t
+test_low_vcvt1_f16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt1_low_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_vcvt2_bf16:
+** msr fpmr, x0
+** bf2cvtl v0.8h, v0.8b
+** ret
+*/
+bfloat16x8_t
+test_vcvt2_bf16 (mfloat8x8_t a, fpm_t b)
+{
+ return vcvt2_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_high_vcvt2_bf16:
+** msr fpmr, x0
+** bf2cvtl2 v0.8h, v0.16b
+** ret
+*/
+bfloat16x8_t
+test_high_vcvt2_bf16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt2_high_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_low_vcvt2_bf16:
+** msr fpmr, x0
+** bf2cvtl v0.8h, v0.8b
+** ret
+*/
+bfloat16x8_t
+test_low_vcvt2_bf16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt2_low_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_vcvt2_f16:
+** msr fpmr, x0
+** f2cvtl v0.8h, v0.8b
+** ret
+*/
+float16x8_t
+test_vcvt2_f16 (mfloat8x8_t a, fpm_t b)
+{
+ return vcvt2_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_high_vcvt2_f16:
+** msr fpmr, x0
+** f2cvtl2 v0.8h, v0.16b
+** ret
+*/
+float16x8_t
+test_high_vcvt2_f16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt2_high_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_low_vcvt2_f16:
+** msr fpmr, x0
+** f2cvtl v0.8h, v0.8b
+** ret
+*/
+float16x8_t
+test_low_vcvt2_f16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt2_low_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_vcvt_f16:
+** msr fpmr, x0
+** fcvtn v0.8b, v0.4h, v1.4h
+** ret
+*/
+mfloat8x8_t
+test_vcvt_f16 (float16x4_t a, float16x4_t b, fpm_t c)
+{
+ return vcvt_mf8_f16_fpm(a, b, c);
+}
+
+/*
+** test_vcvtq_f16:
+** msr fpmr, x0
+** fcvtn v0.16b, v0.8h, v1.8h
+** ret
+*/
+mfloat8x16_t
+test_vcvtq_f16 (float16x8_t a, float16x8_t b, fpm_t c)
+{
+ return vcvtq_mf8_f16_fpm(a, b, c);
+}
+
+/*
+** test_vcvt_f32:
+** msr fpmr, x0
+** fcvtn v0.8b, v0.4s, v1.4s
+** ret
+*/
+mfloat8x8_t
+test_vcvt_f32 (float32x4_t a, float32x4_t b, fpm_t c)
+{
+ return vcvt_mf8_f32_fpm(a, b, c);
+}
+
+/*
+** test_vcvt_high_f32:
+** msr fpmr, x0
+** fcvtn2 v0.16b, v1.4s, v2.4s
+** ret
+*/
+mfloat8x16_t
+test_vcvt_high_f32 (mfloat8x8_t a, float32x4_t b, float32x4_t c, fpm_t d)
+{
+ return vcvt_high_mf8_f32_fpm(a, b, c, d);
+}
--
2.25.1