On 21/11/2024 14:33, Richard Sandiford wrote:
> Claudio Bantaloukas <claudio.bantalou...@arm.com> writes:
>> [...]
>> @@ -4004,6 +4008,44 @@ SHAPE (ternary_bfloat_lane)
>>  typedef ternary_bfloat_lane_base<2> ternary_bfloat_lanex2_def;
>>  SHAPE (ternary_bfloat_lanex2)
>>
>> +/* sv<t0>_t svfoo[_t0](sv<t0>_t, svmfloat8_t, svmfloat8_t, uint64_t)
>> +
>> +   where the final argument is an integer constant expression in the range
>> +   [0, 15]. */
>> +struct ternary_mfloat8_lane_def
>> +  : public ternary_resize2_lane_base<8, TYPE_mfloat, TYPE_mfloat>
>> +{
>> +  void
>> +  build (function_builder &b, const function_group_info &group) const override
>> +  {
>> +    gcc_assert (group.fpm_mode == FPM_set);
>> +    b.add_overloaded_functions (group, MODE_none);
>> +    build_all (b, "v0,v0,vM,vM,su64", group, MODE_none);
>> +  }
>> +
>> +  bool
>> +  check (function_checker &c) const override
>> +  {
>> +    return c.require_immediate_lane_index (3, 2, 1);
>> +  }
>> +
>> +  tree
>> +  resolve (function_resolver &r) const override
>> +  {
>> +    type_suffix_index type;
>> +    if (!r.check_num_arguments (5)
>> +        || (type = r.infer_vector_type (0)) == NUM_TYPE_SUFFIXES
>> +        || !r.require_vector_type (1, VECTOR_TYPE_svmfloat8_t)
>> +        || !r.require_vector_type (2, VECTOR_TYPE_svmfloat8_t)
>> +        || !r.require_integer_immediate (3)
>> +        || !r.require_scalar_type (4, "int64_t"))
>
> uint64_t
Done, although I wonder if "fpm_t, aka uint64_t" would be better.
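(For illustration, a sketch of how the resolver presumably ends up after that change; only the type-name string passed to require_scalar_type differs from the quoted patch.)

  tree
  resolve (function_resolver &r) const override
  {
    type_suffix_index type;
    if (!r.check_num_arguments (5)
        || (type = r.infer_vector_type (0)) == NUM_TYPE_SUFFIXES
        || !r.require_vector_type (1, VECTOR_TYPE_svmfloat8_t)
        || !r.require_vector_type (2, VECTOR_TYPE_svmfloat8_t)
        || !r.require_integer_immediate (3)
        /* The final argument is the fpm_t value, so diagnose it as uint64_t.  */
        || !r.require_scalar_type (4, "uint64_t"))
      return error_mark_node;

    return r.resolve_to (r.mode_suffix_id, type, TYPE_SUFFIX_mf8, GROUP_none);
  }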
>> +      return error_mark_node;
>> +
>> +    return r.resolve_to (r.mode_suffix_id, type, TYPE_SUFFIX_mf8, GROUP_none);
>> +  }
>> +};
>> +SHAPE (ternary_mfloat8_lane)
>> +
>>  /* sv<t0>_t svfoo[_t0](sv<t0>_t, svbfloatt16_t, svbfloat16_t)
>>     sv<t0>_t svfoo[_n_t0](sv<t0>_t, svbfloat16_t, bfloat16_t). */
>>  struct ternary_bfloat_opt_n_def
>> @@ -4019,6 +4061,46 @@ struct ternary_bfloat_opt_n_def
>>  };
>>  SHAPE (ternary_bfloat_opt_n)
>>
>> +/* sv<t0>_t svfoo[_t0](sv<t0>_t, svmfloatt8_t, svmfloat8_t)
>> +   sv<t0>_t svfoo[_n_t0](sv<t0>_t, svmfloat8_t, bfloat8_t). */
>> +struct ternary_mfloat8_opt_n_def
>> +  : public ternary_resize2_opt_n_base<8, TYPE_mfloat, TYPE_mfloat>
>> +{
>> +  void
>> +  build (function_builder &b, const function_group_info &group) const override
>> +  {
>> +    gcc_assert (group.fpm_mode == FPM_set);
>> +    b.add_overloaded_functions (group, MODE_none);
>> +    build_all (b, "v0,v0,vM,vM", group, MODE_none);
>> +    build_all (b, "v0,v0,vM,sM", group, MODE_n);
>> +  }
>> +
>> +  tree
>> +  resolve (function_resolver &r) const override
>> +  {
>> +    type_suffix_index type;
>> +    if (!r.check_num_arguments (4)
>> +        || (type = r.infer_vector_type (0)) == NUM_TYPE_SUFFIXES
>> +        || !r.require_vector_type (1, VECTOR_TYPE_svmfloat8_t)
>> +        || !r.require_scalar_type (3, "int64_t"))
>> +      return error_mark_node;
>> +
>> +    tree scalar_form
>> +      = r.lookup_form (MODE_n, type, TYPE_SUFFIX_mf8, GROUP_none);
>> +    if (r.scalar_argument_p (2))
>> +      {
>> +        if (scalar_form)
>> +          return scalar_form;
>> +        return error_mark_node;
>
> It looks like this would return error_mark_node without reporting an
> error first.
>
>> +      }
>> +    if (scalar_form && !r.require_vector_or_scalar_type (2))
>> +      return error_mark_node;
>> +
>> +    return r.resolve_to (r.mode_suffix_id, type, TYPE_SUFFIX_mf8, GROUP_none);
>> +  }
>
> In this context (unlike finish_opt_n_resolution) we know that there is
> a bijection between the vector and scalar forms.  So I think we can
> just add require_vector_or_scalar_type to the initial checks:
>
>     if (!r.check_num_arguments (4)
>         || (type = r.infer_vector_type (0)) == NUM_TYPE_SUFFIXES
>         || !r.require_vector_type (1, VECTOR_TYPE_svmfloat8_t)
>         || !r.require_vector_or_scalar_type (2)
>         || !r.require_scalar_type (3, "int64_t"))
>       return error_mark_node;
>
>     auto mode = r.mode_suffix_id;
>     if (r.scalar_argument_p (2))
>       mode = MODE_n;
>     else if (!r.require_vector_type (2, VECTOR_TYPE_svmfloat8_t))
>       return error_mark_node;
>
>     return r.resolve_to (mode, type, TYPE_SUFFIX_mf8, GROUP_none);
>
> (untested).
Done, all tests pass.
>> [...]
>> +;; -------------------------------------------------------------------------
>> +;; ---- [FP] Mfloat8 Multiply-and-accumulate operations
>> +;; -------------------------------------------------------------------------
>> +;; Includes:
>> +;; - FMLALB (vectors, FP8 to FP16)
>> +;; - FMLALT (vectors, FP8 to FP16)
>> +;; - FMLALB (indexed, FP8 to FP16)
>> +;; - FMLALT (indexed, FP8 to FP16)
>> +;; - FMLALLBB (vectors)
>> +;; - FMLALLBB (indexed)
>> +;; - FMLALLBT (vectors)
>> +;; - FMLALLBT (indexed)
>> +;; - FMLALLTB (vectors)
>> +;; - FMLALLTB (indexed)
>> +;; - FMLALLTT (vectors)
>> +;; - FMLALLTT (indexed)
>> +;; -------------------------------------------------------------------------
>> +
>> +(define_insn "@aarch64_sve_add_<sve2_fp8_fma_op><mode>"
>> +  [(set (match_operand:SVE_FULL_HSF 0 "register_operand")
>> +        (unspec:SVE_FULL_HSF
>> +          [(match_operand:SVE_FULL_HSF 1 "register_operand")
>> +           (match_operand:VNx16QI 2 "register_operand")
>> +           (match_operand:VNx16QI 3 "register_operand")
>> +           (reg:DI FPM_REGNUM)]
>> +          SVE2_FP8_TERNARY))]
>> +  "TARGET_SSVE_FP8FMA"
>> +  {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
>> +     [ w , 0 , w , w ; * ] <sve2_fp8_fma_op>\t%0.<Vetype>, %2.b, %3.b
>> +     [ ?&w , w , w , w ; yes ] movprfx\t%0, %1\;<sve2_fp8_fma_op>\t%0.<Vetype>, %2.b, %3.b
>> +  }
>> +)
>> +
>> +(define_insn "@aarch64_sve_add_lane_<sve2_fp8_fma_op><mode>"
>> +  [(set (match_operand:SVE_FULL_HSF 0 "register_operand")
>> +        (unspec:SVE_FULL_HSF
>> +          [(match_operand:SVE_FULL_HSF 1 "register_operand")
>> +           (match_operand:VNx16QI 2 "register_operand")
>> +           (match_operand:VNx16QI 3 "register_operand")
>> +           (match_operand:SI 4 "const_int_operand")
>> +           (reg:DI FPM_REGNUM)]
>> +          SVE2_FP8_TERNARY_LANE))]
>> +  "TARGET_SSVE_FP8FMA"
>> +  {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
>> +     [ w , 0 , w , y ; * ] <sve2_fp8_fma_op>\t%0.<Vetype>, %2.b, %3.b[%4]
>> +     [ ?&w , w , w , y ; yes ] movprfx\t%0, %1\;<sve2_fp8_fma_op>\t%0.<Vetype>, %2.b, %3.b[%4]
>> +  }
>> +)
>> +
>
> It goes against my instincts to ask for more cut-&-paste, but: I think
> we should split the operator list into HF-only and SF-only, rather than
> define invalid combinations.
>
> [ Hope I didn't suggest the opposite earlier -- always a risk,
>   unfortunately. :( ]
Done. I'm new to this, so I had assumed invalid combinations were best caught
by the shapes.
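(For readers following along, the split could look roughly like the sketch
below: one pattern restricted to the FP16 destination mode whose unspec
iterator contains only the FMLALB/FMLALT forms, and a second pattern
restricted to the FP32 mode for the FMLALL* forms.  All iterator and
attribute names here -- SVE2_FP8_TERNARY_HF, SVE2_FP8_TERNARY_SF,
sve2_fp8_fma_op_hf, sve2_fp8_fma_op_sf -- are illustrative, and the
single-mode iterators VNx8HF_ONLY/VNx4SF_ONLY are assumed to exist; the
committed patch may well use different names.)

  ;; Sketch only: HF-only variant, with a hypothetical iterator
  ;; SVE2_FP8_TERNARY_HF covering just the FMLALB/FMLALT unspecs.
  (define_insn "@aarch64_sve_add_<sve2_fp8_fma_op_hf><mode>"
    [(set (match_operand:VNx8HF_ONLY 0 "register_operand")
          (unspec:VNx8HF_ONLY
            [(match_operand:VNx8HF_ONLY 1 "register_operand")
             (match_operand:VNx16QI 2 "register_operand")
             (match_operand:VNx16QI 3 "register_operand")
             (reg:DI FPM_REGNUM)]
            SVE2_FP8_TERNARY_HF))]
    "TARGET_SSVE_FP8FMA"
    {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
       [ w , 0 , w , w ; * ] <sve2_fp8_fma_op_hf>\t%0.h, %2.b, %3.b
       [ ?&w , w , w , w ; yes ] movprfx\t%0, %1\;<sve2_fp8_fma_op_hf>\t%0.h, %2.b, %3.b
    }
  )

  ;; Sketch only: SF-only variant, with a hypothetical SVE2_FP8_TERNARY_SF
  ;; iterator covering the FMLALLBB/FMLALLBT/FMLALLTB/FMLALLTT unspecs.
  (define_insn "@aarch64_sve_add_<sve2_fp8_fma_op_sf><mode>"
    [(set (match_operand:VNx4SF_ONLY 0 "register_operand")
          (unspec:VNx4SF_ONLY
            [(match_operand:VNx4SF_ONLY 1 "register_operand")
             (match_operand:VNx16QI 2 "register_operand")
             (match_operand:VNx16QI 3 "register_operand")
             (reg:DI FPM_REGNUM)]
            SVE2_FP8_TERNARY_SF))]
    "TARGET_SSVE_FP8FMA"
    {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
       [ w , 0 , w , w ; * ] <sve2_fp8_fma_op_sf>\t%0.s, %2.b, %3.b
       [ ?&w , w , w , w ; yes ] movprfx\t%0, %1\;<sve2_fp8_fma_op_sf>\t%0.s, %2.b, %3.b
    }
  )

The _lane patterns would presumably be split the same way.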
>> [...]
>> +/* SVE2 versions of fp8 multiply-accumulate instructions are enabled through +ssve-fp8fma. */
>> +#define TARGET_SSVE_FP8FMA ((\
>> +    (TARGET_SVE2 && TARGET_FP8FMA) || TARGET_STREAMING) \
>> +    && (AARCH64_HAVE_ISA(SSVE_FP8FMA) || TARGET_NON_STREAMING))
>
> Formatting nits, sorry, but: long line for the comment, and missing
> space in the final line.  Also, the comment doesn't cover the
> non-streaming case.
Done
> Maybe:
>
>   /* SVE2 versions of fp8 multiply-accumulate instructions are enabled for
>      non-streaming mode by +fp8fma and for streaming mode by +ssve-fp8fma.  */
>   #define TARGET_SSVE_FP8FMA \
>     (((TARGET_SVE2 && TARGET_FP8FMA) || TARGET_STREAMING) \
>      && (AARCH64_HAVE_ISA (SSVE_FP8FMA) || TARGET_NON_STREAMING))
>
>> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>> index 93e096bc9d5..119f636dc16 100644
>> --- a/gcc/doc/invoke.texi
>> +++ b/gcc/doc/invoke.texi
>> @@ -21824,6 +21824,10 @@ Enable support for Armv8.9-a/9.4-a translation hardening extension.
>>  Enable the RCpc3 (Release Consistency) extension.
>>  @item fp8
>>  Enable the fp8 (8-bit floating point) extension.
>> +@item fp8fma
>> +Enable the fp8 (8-bit floating point) multiply accumulate extension.
>> +@item ssve-fp8fma
>> +Enable the fp8 (8-bit floating point) multiply accumulate extension streaming mode.
>
> Maybe "in streaming mode"?  Also: the usual 80-character line limit
> applies here too, where possible.
Done
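(Applied, the @table entries would presumably read roughly as follows, with
the second entry re-wrapped to stay under 80 columns; sketch only.)

  @item fp8fma
  Enable the fp8 (8-bit floating point) multiply accumulate extension.
  @item ssve-fp8fma
  Enable the fp8 (8-bit floating point) multiply accumulate extension in
  streaming mode.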
>> [...]
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/mlalb_lane_mf8.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/mlalb_lane_mf8.c
>> new file mode 100644
>> index 00000000000..5b43f4d6611
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/mlalb_lane_mf8.c
>> @@ -0,0 +1,88 @@
>> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
>> +/* { dg-additional-options "-march=armv8.5-a+sve2+fp8fma" } */
>> +/* { dg-require-effective-target aarch64_asm_fp8fma_ok } */
>> +/* { dg-require-effective-target aarch64_asm_ssve-fp8fma_ok } */
>> +/* { dg-skip-if "" { *-*-* } { "-DSTREAMING_COMPATIBLE" } { "" } } */
>> +
>> +#include "test_sve_acle.h"
>
> Following on from the comment on patch 3, the corresponding change here
> would probably be:
>
>   /* { dg-do assemble { target aarch64_asm_ssve-fp8fma_ok } } */
>   /* { dg-do compile { target { ! aarch64_asm_ssve-fp8fma_ok } } } */
>   /* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
>
>   #include "test_sve_acle.h"
>
>   #pragma GCC target "+fp8fma"
>   #ifdef STREAMING_COMPATIBLE
>   #pragma GCC target "+ssve-fp8fma"
>   #endif
>
> (which assumes that +ssve-fp8fma is good for +fp8fma too).
Done for all tests.
>> +/*
>> +** mlalb_lane_0_f16_tied1:
>> +**	msr	fpmr, x0
>> +**	fmlalb	z0\.h, z4\.b, z5\.b\[0\]
>> +**	ret
>> +*/
>> +TEST_DUAL_Z (mlalb_lane_0_f16_tied1, svfloat16_t, svmfloat8_t,
>> +	     z0 = svmlalb_lane_f16_mf8_fpm (z0, z4, z5, 0, fpm0),
>> +	     z0 = svmlalb_lane_fpm (z0, z4, z5, 0, fpm0))
>> +
>> +/*
>> +** mlalb_lane_0_f16_tied2:
>> +**	msr	fpmr, x0
>> +**	mov	(z[0-9]+)\.d, z0\.d
>> +**	movprfx	z0, z4
>> +**	fmlalb	z0\.h, \1\.b, z1\.b\[0\]
>> +**	ret
>> +*/
>> +TEST_DUAL_Z_REV (mlalb_lane_0_f16_tied2, svfloat16_t, svmfloat8_t,
>> +		 z0_res = svmlalb_lane_f16_mf8_fpm (z4, z0, z1, 0, fpm0),
>> +		 z0_res = svmlalb_lane_fpm (z4, z0, z1, 0, fpm0))
>> +
>> +/*
>> +** mlalb_lane_0_f16_tied3:
>> +**	msr	fpmr, x0
>> +**	mov	(z[0-9]+)\.d, z0\.d
>> +**	movprfx	z0, z4
>> +**	fmlalb	z0\.h, z1\.b, \1\.b\[0\]
>> +**	ret
>> +*/
>> +TEST_DUAL_Z_REV (mlalb_lane_0_f16_tied3, svfloat16_t, svmfloat8_t,
>> +		 z0_res = svmlalb_lane_f16_mf8_fpm (z4, z1, z0, 0, fpm0),
>> +		 z0_res = svmlalb_lane_fpm (z4, z1, z0, 0, fpm0))
>> +
>> +/*
>> +** mlalb_lane_0_f16_untied:
>> +**	msr	fpmr, x0
>> +**	movprfx	z0, z1
>> +**	fmlalb	z0\.h, z4\.b, z5\.b\[0\]
>> +**	ret
>> +*/
>> +TEST_DUAL_Z (mlalb_lane_0_f16_untied, svfloat16_t, svmfloat8_t,
>> +	     z0 = svmlalb_lane_f16_mf8_fpm (z1, z4, z5, 0, fpm0),
>> +	     z0 = svmlalb_lane_fpm (z1, z4, z5, 0, fpm0))
>> +
>> +/*
>> +** mlalb_lane_1_f16:
>> +**	msr	fpmr, x0
>> +**	fmlalb	z0\.h, z4\.b, z5\.b\[1\]
>> +**	ret
>> +*/
>> +TEST_DUAL_Z (mlalb_lane_1_f16, svfloat16_t, svmfloat8_t,
>> +	     z0 = svmlalb_lane_f16_mf8_fpm (z0, z4, z5, 1, fpm0),
>> +	     z0 = svmlalb_lane_fpm (z0, z4, z5, 1, fpm0))
>> +
>> +/*
>> +** mlalb_lane_z8_f16:
>> +**	...
>> +**	msr	fpmr, x0
>> +**	mov	(z[0-7])\.d, z8\.d
>> +**	fmlalb	z0\.h, z1\.b, \1\.b\[1\]
>> +**	ldr	d8, \[sp\], 32
>> +**	ret
>> +*/
>> +TEST_DUAL_LANE_REG (mlalb_lane_z8_f16, svfloat16_t, svmfloat8_t, z8,
>> +		    z0 = svmlalb_lane_f16_mf8_fpm (z0, z1, z8, 1, fpm0),
>> +		    z0 = svmlalb_lane_fpm (z0, z1, z8, 1, fpm0))
>> +
>> +/*
>> +** mlalb_lane_z16_f16:
>> +**	...
>> +**	msr	fpmr, x0
>> +**	mov	(z[0-7])\.d, z16\.d
>> +**	fmlalb	z0\.h, z1\.b, \1\.b\[1\]
>> +**	...
>> +**	ret
>> +*/
>> +TEST_DUAL_LANE_REG (mlalb_lane_z16_f16, svfloat16_t, svmfloat8_t, z16,
>> +		    z0 = svmlalb_lane_f16_mf8_fpm (z0, z1, z16, 1, fpm0),
>> +		    z0 = svmlalb_lane_fpm (z0, z1, z16, 1, fpm0))
>
> It would be good to have a test for the upper limit of the index range,
> like for the _f32 tests.  Same for svmlalt_lane.
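(For illustration, an explicit upper-limit test for the _f16 form could look
something like the sketch below; the test name and exact asm body follow the
pattern of the existing lane tests but are mine, not necessarily what was
committed.)

  /*
  ** mlalb_lane_15_f16:
  **	msr	fpmr, x0
  **	fmlalb	z0\.h, z4\.b, z5\.b\[15\]
  **	ret
  */
  TEST_DUAL_Z (mlalb_lane_15_f16, svfloat16_t, svmfloat8_t,
	       z0 = svmlalb_lane_f16_mf8_fpm (z0, z4, z5, 15, fpm0),
	       z0 = svmlalb_lane_fpm (z0, z4, z5, 15, fpm0))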
Done, mlalt_lane_z16_f16 already covers the fourth argument being 15.

Apologies for the long time to reply.
C.
> Looks good to me otherwise, thanks,
> Richard