Re: [PATCH v2 2/3] aarch64: Add support for fp8dot2 and fp8dot4

Richard Sandiford Mon, 09 Dec 2024 05:08:07 -0800

<[email protected]> writes:
> The AArch64 FEAT_FP8DOT2 and FEAT_FP8DOT4 extensions introduce
> instructions for dot product of vectors.
>
> This patch introduces the following intrinsics:
> 1. vdot{q}_{f16|f32}_mf8_fpm.
> 2. vdot{q}_lane{q}_{f16|f32}_mf8_fpm.
>
> It introduces two flags: fp8dot2 and fp8dot4.
>
> We had to add space for another type in aarch64_pragma_builtins_data
> struct. The macros were updated to reflect that.
>
> We added a new aarch64_builtin_signature variant, quaternary, and added
> support for it in the functions aarch64_fntype and
> aarch64_expand_pragma_builtin.
>
> We added a new namespace, function_checker, to implement range checks
> for functions defined using the new pragma approach. The old intrinsic
> range checks will continue to work. All the new AdvSIMD intrinsics we
> define that need lane checks should be using the function in this
> namespace to implement the checks.


As explained in the reply to 1/3, this review is in the form of a patch.
The changes are along the same lines as for 1/3, along with:

* Move the mode-based target requirements to the mode iterator.

* Add a require_immediate_lane_index helper for checking lane indices.

* Add tests for the lane index checks.

Tested on aarch64-linux-gnu.  I'll commit in about 24 hours or so
if there are no comments before then, but please let me know if you'd
like more time.

Thanks,
Richard


gcc/ChangeLog:

        * config/aarch64/aarch64-builtins.cc
        (aarch64_builtin_signatures): Add ternary_lane.
        (aarch64_fntype): Handle ternary_lane.
        (aarch64_pragma_builtins_checker::require_immediate_lane_index): New
        function.
        (aarch64_pragma_builtins_checker::check): Handle the new intrinsics.
        (aarch64_expand_pragma_builtin): Likewise.
        * config/aarch64/aarch64-c.cc
        (aarch64_update_cpp_builtins): Define __ARM_FEATURE_FP8DOT2 and
        __ARM_FEATURE_FP8DOT4.
        * config/aarch64/aarch64-simd-pragma-builtins.def: Define vdot
        and vdot_lane intrinsics.
        * config/aarch64/aarch64-simd.md
        (@aarch64_<insn><mode>): New pattern.
        (@aarch64_<insn>_lane<VDQ_HSF_FDOT:mode><VB:mode>): Likewise.
        * config/aarch64/iterators.md (VDQ_HSF_FDOT): New mode iterator.
        (UNSPEC_FDOT_FP8, UNSPEC_FDOT_LANE_FP8): New unspecs.
        (insn): Handle them.
        (VNARROWB, Vnbtype, Vnbsubtype): New mode attributes.
        (FPM_FDOT, FPM_FDOT_LANE): New int iterators.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/pragma_cpp_predefs_4.c: Test fp8dot2 and fp8dot4.
        * gcc.target/aarch64/simd/vdot2_fpm.c: New test.
        * gcc.target/aarch64/simd/vdot4_fpm.c: New test.
        * gcc.target/aarch64/simd/vdot_lane_indices_1.c: New test.

Co-authored-by: Richard Sandiford <[email protected]>
---
 gcc/config/aarch64/aarch64-builtins.cc        |  29 ++++
 gcc/config/aarch64/aarch64-c.cc               |   4 +
 .../aarch64/aarch64-simd-pragma-builtins.def  |  29 ++++
 gcc/config/aarch64/aarch64-simd.md            |  27 ++++
 gcc/config/aarch64/iterators.md               |  24 ++++
 .../gcc.target/aarch64/pragma_cpp_predefs_4.c |  22 +++
 .../gcc.target/aarch64/simd/vdot2_fpm.c       | 125 ++++++++++++++++++
 .../gcc.target/aarch64/simd/vdot4_fpm.c       | 125 ++++++++++++++++++
 .../aarch64/simd/vdot_lane_indices_1.c        |  45 +++++++
 9 files changed, 430 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpm.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpm.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vdot_lane_indices_1.c

diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index 39a85699e51..756f730b2bf 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -1607,6 +1607,7 @@ enum class aarch64_builtin_signatures
   binary,
   binary_lane,
   ternary,
+  ternary_lane,
   unary,
 };
 
@@ -1699,6 +1700,7 @@ aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data)
       break;
 
     case aarch64_builtin_signatures::ternary:
+    case aarch64_builtin_signatures::ternary_lane:
       return_type = builtin_data.types[0].type ();
       for (int i = 1; i <= 3; ++i)
        arg_types.quick_push (builtin_data.types[i].type ());
@@ -1712,6 +1714,7 @@ aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data)
   switch (builtin_data.signature)
     {
     case aarch64_builtin_signatures::binary_lane:
+    case aarch64_builtin_signatures::ternary_lane:
       arg_types.quick_push (integer_type_node);
       break;
 
@@ -2577,6 +2580,7 @@ struct aarch64_pragma_builtins_checker
 
   bool require_immediate_range (unsigned int, HOST_WIDE_INT,
                                HOST_WIDE_INT);
+  bool require_immediate_lane_index (unsigned int, unsigned int, unsigned int);
 
   bool check ();
 
@@ -2624,6 +2628,22 @@ require_immediate_range (unsigned int argno, HOST_WIDE_INT min,
   return true;
 }
 
+/* Require argument LANE_ARGNO to be an immediate lane index into vector
+   argument VEC_ARGNO, given that each index selects enough data to fill
+   one element of argument ELT_ARGNO.  Return true if the argument
+   is valid.  */
+bool
+aarch64_pragma_builtins_checker::
+require_immediate_lane_index (unsigned int lane_argno, unsigned vec_argno,
+                             unsigned int elt_argno)
+{
+  auto vec_mode = TYPE_MODE (TREE_TYPE (args[vec_argno]));
+  auto elt_mode = TYPE_MODE (TREE_TYPE (args[elt_argno]));
+  auto nunits = exact_div (GET_MODE_SIZE (vec_mode),
+                          GET_MODE_UNIT_SIZE (elt_mode)).to_constant ();
+  return require_immediate_range (lane_argno, 0, nunits - 1);
+}
+
 /* Check the arguments to the intrinsic call and return true if they
    are valid.  */
 bool
@@ -2631,6 +2651,9 @@ aarch64_pragma_builtins_checker::check ()
 {
   switch (builtin_data.unspec)
     {
+    case UNSPEC_FDOT_LANE_FP8:
+      return require_immediate_lane_index (nargs - 2, nargs - 3, 0);
+
     case UNSPEC_LUTI2:
     case UNSPEC_LUTI4:
       {
@@ -3641,6 +3664,7 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
     case UNSPEC_FAMIN:
     case UNSPEC_F1CVTL_FP8:
     case UNSPEC_F2CVTL_FP8:
+    case UNSPEC_FDOT_FP8:
     case UNSPEC_FSCALE:
       icode = code_for_aarch64 (builtin_data.unspec, ops[0].mode);
       break;
@@ -3674,6 +3698,11 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
        break;
       }
 
+    case UNSPEC_FDOT_LANE_FP8:
+      icode = code_for_aarch64_lane (builtin_data.unspec,
+                                    ops[0].mode, ops[3].mode);
+      break;
+
     case UNSPEC_LUTI2:
     case UNSPEC_LUTI4:
       create_integer_operand (ops.safe_push ({}),
diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
index ae255889f5e..b0e3235e669 100644
--- a/gcc/config/aarch64/aarch64-c.cc
+++ b/gcc/config/aarch64/aarch64-c.cc
@@ -270,6 +270,10 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
 
   aarch64_def_or_undef (TARGET_FP8, "__ARM_FEATURE_FP8", pfile);
 
+  aarch64_def_or_undef (TARGET_FP8DOT2, "__ARM_FEATURE_FP8DOT2", pfile);
+
+  aarch64_def_or_undef (TARGET_FP8DOT4, "__ARM_FEATURE_FP8DOT4", pfile);
+
   aarch64_def_or_undef (TARGET_LS64,
                        "__ARM_FEATURE_LS64", pfile);
   aarch64_def_or_undef (TARGET_RCPC, "__ARM_FEATURE_RCPC", pfile);
diff --git a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
index 6221652b38f..19277860b8c 100644
--- a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
@@ -30,6 +30,10 @@
 #define ENTRY_TERNARY(N, T0, T1, T2, T3, U, F) \
   ENTRY (N, ternary, T0, T1, T2, T3, U, F)
 
+#undef ENTRY_TERNARY_LANE
+#define ENTRY_TERNARY_LANE(N, T0, T1, T2, T3, U, F)    \
+  ENTRY (N, ternary_lane, T0, T1, T2, T3, U, F)
+
 #undef ENTRY_UNARY
 #define ENTRY_UNARY(N, T0, T1, U, F)   \
   ENTRY (N, unary, T0, T1, none, none, U, F)
@@ -85,6 +89,21 @@
   ENTRY_UNARY (N##_bf16_mf8_fpm, bf16q, T1, UNSPEC, FLAGS)     \
   ENTRY_UNARY (N##_f16_mf8_fpm, f16q, T1, UNSPEC, FLAGS)
 
+#undef ENTRY_VDOT_FPM
+#define ENTRY_VDOT_FPM(T)                                              \
+  ENTRY_TERNARY (vdot_##T##_mf8_fpm, T, T, f8, f8,                     \
+                UNSPEC_FDOT_FP8, FP8)                                  \
+  ENTRY_TERNARY (vdotq_##T##_mf8_fpm, T##q, T##q, f8q, f8q,            \
+                UNSPEC_FDOT_FP8, FP8)                                  \
+  ENTRY_TERNARY_LANE (vdot_lane_##T##_mf8_fpm, T, T, f8, f8,           \
+                     UNSPEC_FDOT_LANE_FP8, FP8)                        \
+  ENTRY_TERNARY_LANE (vdot_laneq_##T##_mf8_fpm, T, T, f8, f8q,         \
+                     UNSPEC_FDOT_LANE_FP8, FP8)                        \
+  ENTRY_TERNARY_LANE (vdotq_lane_##T##_mf8_fpm, T##q, T##q, f8q, f8,   \
+                     UNSPEC_FDOT_LANE_FP8, FP8)                        \
+  ENTRY_TERNARY_LANE (vdotq_laneq_##T##_mf8_fpm, T##q, T##q, f8q, f8q, \
+                     UNSPEC_FDOT_LANE_FP8, FP8)
+
 // faminmax
 #define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FAMINMAX)
 ENTRY_BINARY_VHSDF (vamax, UNSPEC_FAMAX, FP)
@@ -125,3 +144,13 @@ ENTRY_TERNARY (vcvt_high_mf8_f32_fpm, f8q, f8, f32q, f32q,
 #define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8)
 ENTRY_BINARY_VHSDF_SIGNED (vscale, UNSPEC_FSCALE, FP)
 #undef REQUIRED_EXTENSIONS
+
+// fpm dot2 product
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8DOT2)
+ENTRY_VDOT_FPM (f16)
+#undef REQUIRED_EXTENSIONS
+
+// fpm dot4 product
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8DOT4)
+ENTRY_VDOT_FPM (f32)
+#undef REQUIRED_EXTENSIONS
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index f38bad72781..ddcef6381e2 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -10097,3 +10097,30 @@ (define_insn "@aarch64_<insn><mode>"
   "TARGET_FP8"
   "<insn>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
 )
+
+;; fpm vdot instructions.  The target requirements are enforced by
+;; VDQ_HSF_FDOT.
+(define_insn "@aarch64_<insn><mode>"
+  [(set (match_operand:VDQ_HSF_FDOT 0 "register_operand" "=w")
+       (unspec:VDQ_HSF_FDOT
+        [(match_operand:VDQ_HSF_FDOT 1 "register_operand" "0")
+         (match_operand:<VNARROWB> 2 "register_operand" "w")
+         (match_operand:<VNARROWB> 3 "register_operand" "w")
+         (reg:DI FPM_REGNUM)]
+        FPM_FDOT))]
+  ""
+  "<insn>\t%1.<Vtype>, %2.<Vnbtype>, %3.<Vnbtype>"
+)
+
+(define_insn "@aarch64_<insn>_lane<VDQ_HSF_FDOT:mode><VB:mode>"
+  [(set (match_operand:VDQ_HSF_FDOT 0 "register_operand" "=w")
+       (unspec:VDQ_HSF_FDOT
+        [(match_operand:VDQ_HSF_FDOT 1 "register_operand" "0")
+         (match_operand:<VDQ_HSF_FDOT:VNARROWB> 2 "register_operand" "w")
+         (match_operand:VB 3 "register_operand" "w")
+         (match_operand 4 "const_int_operand")
+         (reg:DI FPM_REGNUM)]
+        FPM_FDOT_LANE))]
+  ""
+  "<insn>\t%1.<VDQ_HSF_FDOT:Vtype>, %2.<VDQ_HSF_FDOT:Vnbtype>, %3.<VDQ_HSF_FDOT:Vnbsubtype>[%4]"
+)
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 7b426aae7a8..296b1a7c559 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -188,6 +188,11 @@ (define_mode_iterator VQ_BHF [V8HF V8BF])
 ;; Quad vector Float modes with half/single elements.
 (define_mode_iterator VQ_HSF [V8HF V4SF])
 
+(define_mode_iterator VDQ_HSF_FDOT [(V4HF "TARGET_FP8DOT2")
+                                   (V8HF "TARGET_FP8DOT2")
+                                   (V2SF "TARGET_FP8DOT4")
+                                   (V4SF "TARGET_FP8DOT4")])
+
 ;; Modes suitable to use as the return type of a vcond expression.
 (define_mode_iterator VDQF_COND [V2SF V2SI V4SF V4SI V2DF V2DI])
 
@@ -728,6 +733,8 @@ (define_c_enum "unspec"
     UNSPEC_F1CVTL2_FP8 ; Used in aarch64-builtins.cc.
     UNSPEC_F2CVTL_FP8  ; Used in aarch64-simd.md.
     UNSPEC_F2CVTL2_FP8 ; Used in aarch64-builtins.cc.
+    UNSPEC_FDOT_FP8    ; Used in aarch64-simd.md.
+    UNSPEC_FDOT_LANE_FP8 ; Used in aarch64-simd.md.
     UNSPEC_FMAX                ; Used in aarch64-simd.md.
     UNSPEC_FMAXNMV     ; Used in aarch64-simd.md.
     UNSPEC_FMAXV       ; Used in aarch64-simd.md.
@@ -1809,6 +1816,18 @@ (define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
 (define_mode_attr VPACKB [(V4HF "V8QI") (V8HF "V16QI") (V4SF "V8QI")])
 (define_mode_attr VPACKBtype [(V4HF "8b") (V8HF "16b") (V4SF "8b")])
 
+;; Modes narrowed all the way to bytes.
+(define_mode_attr VNARROWB [(V4HF "V8QI") (V8HF "V16QI")
+                           (V2SF "V8QI") (V4SF "V16QI")])
+
+;; Register suffix for modes narrowed to bytes.
+(define_mode_attr Vnbtype [(V4HF "8b") (V8HF "16b")
+                          (V2SF "8b") (V4SF "16b")])
+
+;; Register suffix representing one group of byte elements per wider element.
+(define_mode_attr Vnbsubtype [(V4HF "2b") (V8HF "2b")
+                             (V2SF "4b") (V4SF "4b")])
+
 ;; Widened modes of vector modes.
 (define_mode_attr VWIDE [(V8QI  "V8HI")  (V4HI  "V4SI")
                         (V2SI  "V2DI")  (V16QI "V8HI")
@@ -3822,6 +3841,9 @@ (define_int_iterator FPM_BINARY_UNS [UNSPEC_FCVTN_FP8])
 
 (define_int_iterator FSCALE_UNS [UNSPEC_FSCALE])
 
+(define_int_iterator FPM_FDOT [UNSPEC_FDOT_FP8])
+(define_int_iterator FPM_FDOT_LANE [UNSPEC_FDOT_LANE_FP8])
+
 ;; -------------------------------------------------------------------
 ;; Int Iterators Attributes.
 ;; -------------------------------------------------------------------
@@ -3831,6 +3853,8 @@ (define_int_attr insn
   [(UNSPEC_F1CVTL_FP8 "f1cvtl")
    (UNSPEC_F2CVTL_FP8 "f2cvtl")
    (UNSPEC_FCVTN_FP8 "fcvtn")
+   (UNSPEC_FDOT_FP8 "fdot")
+   (UNSPEC_FDOT_LANE_FP8 "fdot")
    (UNSPEC_FSCALE "fscale")])
 
 ;; The optab associated with an operation.  Note that for ANDF, IORF
diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c
index e5a19aaefb6..fb3dc139f1f 100644
--- a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c
@@ -273,3 +273,25 @@
 #ifndef __ARM_FEATURE_FP8
 #error Foo
 #endif
+
+#pragma GCC target "arch=armv9-a+fp8dot4"
+#ifndef __ARM_FEATURE_FP8
+#error Foo
+#endif
+#ifndef __ARM_FEATURE_FP8DOT4
+#error Foo
+#endif
+#ifdef __ARM_FEATURE_FP8DOT2
+#error Foo
+#endif
+
+#pragma GCC target "arch=armv9-a+fp8dot2"
+#ifndef __ARM_FEATURE_FP8
+#error Foo
+#endif
+#ifndef __ARM_FEATURE_FP8DOT4
+#error Foo
+#endif
+#ifndef __ARM_FEATURE_FP8DOT2
+#error Foo
+#endif
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpm.c b/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpm.c
new file mode 100644
index 00000000000..5fe139106c6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vdot2_fpm.c
@@ -0,0 +1,125 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8dot2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+** test_vdot_f16_fpm:
+**     msr     fpmr, x0
+**     fdot    v0.4h, v1.8b, v2.8b
+**     ret
+*/
+float16x4_t
+test_vdot_f16_fpm (float16x4_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdot_f16_mf8_fpm (a, b, c, d);
+}
+
+/*
+** test_vdotq_f16_fpm:
+**     msr     fpmr, x0
+**     fdot    v0.8h, v1.16b, v2.16b
+**     ret
+*/
+float16x8_t
+test_vdotq_f16_fpm (float16x8_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
+{
+  return vdotq_f16_mf8_fpm (a, b, c, d);
+}
+
+/*
+** test_vdot_lane_f16_fpm_0:
+**     msr     fpmr, x0
+**     fdot    v0.4h, v1.8b, v2.2b\[0\]
+**     ret
+*/
+float16x4_t
+test_vdot_lane_f16_fpm_0 (float16x4_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdot_lane_f16_mf8_fpm (a, b, c, 0, d);
+}
+
+/*
+** test_vdot_lane_f16_fpm_3:
+**     msr     fpmr, x0
+**     fdot    v0.4h, v1.8b, v2.2b\[3\]
+**     ret
+*/
+float16x4_t
+test_vdot_lane_f16_fpm_3 (float16x4_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdot_lane_f16_mf8_fpm (a, b, c, 3, d);
+}
+
+/*
+** test_vdot_laneq_f16_fpm_0:
+**     msr     fpmr, x0
+**     fdot    v0.4h, v1.8b, v2.2b\[0\]
+**     ret
+*/
+float16x4_t
+test_vdot_laneq_f16_fpm_0 (float16x4_t a, mfloat8x8_t b, mfloat8x16_t c, fpm_t 
d)
+{
+  return vdot_laneq_f16_mf8_fpm (a, b, c, 0, d);
+}
+
+/*
+** test_vdot_laneq_f16_fpm_7:
+**     msr     fpmr, x0
+**     fdot    v0.4h, v1.8b, v2.2b\[7\]
+**     ret
+*/
+float16x4_t
+test_vdot_laneq_f16_fpm_7 (float16x4_t a, mfloat8x8_t b, mfloat8x16_t c, fpm_t 
d)
+{
+  return vdot_laneq_f16_mf8_fpm (a, b, c, 7, d);
+}
+
+/*
+** test_vdotq_lane_f16_fpm_0:
+**     msr     fpmr, x0
+**     fdot    v0.8h, v1.16b, v2.2b\[0\]
+**     ret
+*/
+float16x8_t
+test_vdotq_lane_f16_fpm_0 (float16x8_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t 
d)
+{
+  return vdotq_lane_f16_mf8_fpm (a, b, c, 0, d);
+}
+
+/*
+** test_vdotq_lane_f16_fpm_3:
+**     msr     fpmr, x0
+**     fdot    v0.8h, v1.16b, v2.2b\[3\]
+**     ret
+*/
+float16x8_t
+test_vdotq_lane_f16_fpm_3 (float16x8_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t 
d)
+{
+  return vdotq_lane_f16_mf8_fpm (a, b, c, 3, d);
+}
+
+/*
+** test_vdotq_laneq_f16_fpm_0:
+**     msr     fpmr, x0
+**     fdot    v0.8h, v1.16b, v2.2b\[0\]
+**     ret
+*/
+float16x8_t
+test_vdotq_laneq_f16_fpm_0 (float16x8_t a, mfloat8x16_t b, mfloat8x16_t c, 
fpm_t d)
+{
+  return vdotq_laneq_f16_mf8_fpm (a, b, c, 0, d);
+}
+
+/*
+** test_vdotq_laneq_f16_fpm_7:
+**     msr     fpmr, x0
+**     fdot    v0.8h, v1.16b, v2.2b\[7\]
+**     ret
+*/
+float16x8_t
+test_vdotq_laneq_f16_fpm_7 (float16x8_t a, mfloat8x16_t b, mfloat8x16_t c, 
fpm_t d)
+{
+  return vdotq_laneq_f16_mf8_fpm (a, b, c, 7, d);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpm.c b/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpm.c
new file mode 100644
index 00000000000..e47a737e8b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vdot4_fpm.c
@@ -0,0 +1,125 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8dot4" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+** test_vdot_f32_fpm:
+**     msr     fpmr, x0
+**     fdot    v0.2s, v1.8b, v2.8b
+**     ret
+*/
+float32x2_t
+test_vdot_f32_fpm (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdot_f32_mf8_fpm (a, b, c, d);
+}
+
+/*
+** test_vdotq_f32_fpm:
+**     msr     fpmr, x0
+**     fdot    v0.4s, v1.16b, v2.16b
+**     ret
+*/
+float32x4_t
+test_vdotq_f32_fpm (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, fpm_t d)
+{
+  return vdotq_f32_mf8_fpm (a, b, c, d);
+}
+
+/*
+** test_vdot_lane_f32_fpm_0:
+**     msr     fpmr, x0
+**     fdot    v0.2s, v1.8b, v2.4b\[0\]
+**     ret
+*/
+float32x2_t
+test_vdot_lane_f32_fpm_0 (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdot_lane_f32_mf8_fpm (a, b, c, 0, d);
+}
+
+/*
+** test_vdot_lane_f32_fpm_1:
+**     msr     fpmr, x0
+**     fdot    v0.2s, v1.8b, v2.4b\[1\]
+**     ret
+*/
+float32x2_t
+test_vdot_lane_f32_fpm_1 (float32x2_t a, mfloat8x8_t b, mfloat8x8_t c, fpm_t d)
+{
+  return vdot_lane_f32_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdot_laneq_f32_fpm_0:
+**     msr     fpmr, x0
+**     fdot    v0.2s, v1.8b, v2.4b\[0\]
+**     ret
+*/
+float32x2_t
+test_vdot_laneq_f32_fpm_0 (float32x2_t a, mfloat8x8_t b, mfloat8x16_t c, fpm_t 
d)
+{
+  return vdot_laneq_f32_mf8_fpm (a, b, c, 0, d);
+}
+
+/*
+** test_vdot_laneq_f32_fpm_3:
+**     msr     fpmr, x0
+**     fdot    v0.2s, v1.8b, v2.4b\[3\]
+**     ret
+*/
+float32x2_t
+test_vdot_laneq_f32_fpm_3 (float32x2_t a, mfloat8x8_t b, mfloat8x16_t c, fpm_t 
d)
+{
+  return vdot_laneq_f32_mf8_fpm (a, b, c, 3, d);
+}
+
+/*
+** test_vdotq_lane_f32_fpm_0:
+**     msr     fpmr, x0
+**     fdot    v0.4s, v1.16b, v2.4b\[0\]
+**     ret
+*/
+float32x4_t
+test_vdotq_lane_f32_fpm_0 (float32x4_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t 
d)
+{
+  return vdotq_lane_f32_mf8_fpm (a, b, c, 0, d);
+}
+
+/*
+** test_vdotq_lane_f32_fpm_1:
+**     msr     fpmr, x0
+**     fdot    v0.4s, v1.16b, v2.4b\[1\]
+**     ret
+*/
+float32x4_t
+test_vdotq_lane_f32_fpm_1 (float32x4_t a, mfloat8x16_t b, mfloat8x8_t c, fpm_t 
d)
+{
+  return vdotq_lane_f32_mf8_fpm (a, b, c, 1, d);
+}
+
+/*
+** test_vdotq_laneq_f32_fpm_0:
+**     msr     fpmr, x0
+**     fdot    v0.4s, v1.16b, v2.4b\[0\]
+**     ret
+*/
+float32x4_t
+test_vdotq_laneq_f32_fpm_0 (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, 
fpm_t d)
+{
+  return vdotq_laneq_f32_mf8_fpm (a, b, c, 0, d);
+}
+
+/*
+** test_vdotq_laneq_f32_fpm_3:
+**     msr     fpmr, x0
+**     fdot    v0.4s, v1.16b, v2.4b\[3\]
+**     ret
+*/
+float32x4_t
+test_vdotq_laneq_f32_fpm_3 (float32x4_t a, mfloat8x16_t b, mfloat8x16_t c, 
fpm_t d)
+{
+  return vdotq_laneq_f32_mf8_fpm (a, b, c, 3, d);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vdot_lane_indices_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vdot_lane_indices_1.c
new file mode 100644
index 00000000000..7585cff2646
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vdot_lane_indices_1.c
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+
+#include "arm_neon.h"
+
+#pragma GCC target "+fp8dot4+fp8dot2"
+
+void
+test(float16x4_t f16, float16x8_t f16q, float32x2_t f32,
+     float32x4_t f32q, mfloat8x8_t mf8, mfloat8x16_t mf8q, int x,
+     fpm_t fpm)
+{
+  vdot_lane_f16_mf8_fpm (f16, mf8, mf8, x, fpm); /* { dg-error {argument 4 of 
'vdot_lane_f16_mf8_fpm' must be an integer constant expression} } */
+  vdot_laneq_f16_mf8_fpm (f16, mf8, mf8q, x, fpm); /* { dg-error {argument 4 
of 'vdot_laneq_f16_mf8_fpm' must be an integer constant expression} } */
+  vdotq_lane_f16_mf8_fpm (f16q, mf8q, mf8, x, fpm); /* { dg-error {argument 4 
of 'vdotq_lane_f16_mf8_fpm' must be an integer constant expression} } */
+  vdotq_laneq_f16_mf8_fpm (f16q, mf8q, mf8q, x, fpm); /* { dg-error {argument 
4 of 'vdotq_laneq_f16_mf8_fpm' must be an integer constant expression} } */
+
+  vdot_lane_f32_mf8_fpm (f32, mf8, mf8, x, fpm); /* { dg-error {argument 4 of 
'vdot_lane_f32_mf8_fpm' must be an integer constant expression} } */
+  vdot_laneq_f32_mf8_fpm (f32, mf8, mf8q, x, fpm); /* { dg-error {argument 4 
of 'vdot_laneq_f32_mf8_fpm' must be an integer constant expression} } */
+  vdotq_lane_f32_mf8_fpm (f32q, mf8q, mf8, x, fpm); /* { dg-error {argument 4 
of 'vdotq_lane_f32_mf8_fpm' must be an integer constant expression} } */
+  vdotq_laneq_f32_mf8_fpm (f32q, mf8q, mf8q, x, fpm); /* { dg-error {argument 
4 of 'vdotq_laneq_f32_mf8_fpm' must be an integer constant expression} } */
+
+  vdot_lane_f16_mf8_fpm (f16, mf8, mf8, -1, fpm); /* { dg-error { passing -1 
to argument 4 of 'vdot_lane_f16_mf8_fpm', which expects a value in the range 
\[0, 3\]} } */
+  vdot_lane_f16_mf8_fpm (f16, mf8, mf8, 4, fpm); /* { dg-error { passing 4 to 
argument 4 of 'vdot_lane_f16_mf8_fpm', which expects a value in the range \[0, 
3\]} } */
+
+  vdot_laneq_f16_mf8_fpm (f16, mf8, mf8q, -1, fpm); /* { dg-error { passing -1 
to argument 4 of 'vdot_laneq_f16_mf8_fpm', which expects a value in the range 
\[0, 7\]} } */
+  vdot_laneq_f16_mf8_fpm (f16, mf8, mf8q, 8, fpm); /* { dg-error { passing 8 
to argument 4 of 'vdot_laneq_f16_mf8_fpm', which expects a value in the range 
\[0, 7\]} } */
+
+  vdotq_lane_f16_mf8_fpm (f16q, mf8q, mf8, -1, fpm); /* { dg-error { passing 
-1 to argument 4 of 'vdotq_lane_f16_mf8_fpm', which expects a value in the 
range \[0, 3\]} } */
+  vdotq_lane_f16_mf8_fpm (f16q, mf8q, mf8, 4, fpm); /* { dg-error { passing 4 
to argument 4 of 'vdotq_lane_f16_mf8_fpm', which expects a value in the range 
\[0, 3\]} } */
+
+  vdotq_laneq_f16_mf8_fpm (f16q, mf8q, mf8q, -1, fpm); /* { dg-error { passing 
-1 to argument 4 of 'vdotq_laneq_f16_mf8_fpm', which expects a value in the 
range \[0, 7\]} } */
+  vdotq_laneq_f16_mf8_fpm (f16q, mf8q, mf8q, 8, fpm); /* { dg-error { passing 
8 to argument 4 of 'vdotq_laneq_f16_mf8_fpm', which expects a value in the 
range \[0, 7\]} } */
+
+  vdot_lane_f32_mf8_fpm (f32, mf8, mf8, -1, fpm); /* { dg-error { passing -1 
to argument 4 of 'vdot_lane_f32_mf8_fpm', which expects a value in the range 
\[0, 1\]} } */
+  vdot_lane_f32_mf8_fpm (f32, mf8, mf8, 2, fpm); /* { dg-error { passing 2 to 
argument 4 of 'vdot_lane_f32_mf8_fpm', which expects a value in the range \[0, 
1\]} } */
+
+  vdot_laneq_f32_mf8_fpm (f32, mf8, mf8q, -1, fpm); /* { dg-error { passing -1 
to argument 4 of 'vdot_laneq_f32_mf8_fpm', which expects a value in the range 
\[0, 3\]} } */
+  vdot_laneq_f32_mf8_fpm (f32, mf8, mf8q, 4, fpm); /* { dg-error { passing 4 
to argument 4 of 'vdot_laneq_f32_mf8_fpm', which expects a value in the range 
\[0, 3\]} } */
+
+  vdotq_lane_f32_mf8_fpm (f32q, mf8q, mf8, -1, fpm); /* { dg-error { passing 
-1 to argument 4 of 'vdotq_lane_f32_mf8_fpm', which expects a value in the 
range \[0, 1\]} } */
+  vdotq_lane_f32_mf8_fpm (f32q, mf8q, mf8, 2, fpm); /* { dg-error { passing 2 
to argument 4 of 'vdotq_lane_f32_mf8_fpm', which expects a value in the range 
\[0, 1\]} } */
+
+  vdotq_laneq_f32_mf8_fpm (f32q, mf8q, mf8q, -1, fpm); /* { dg-error { passing 
-1 to argument 4 of 'vdotq_laneq_f32_mf8_fpm', which expects a value in the 
range \[0, 3\]} } */
+  vdotq_laneq_f32_mf8_fpm (f32q, mf8q, mf8q, 4, fpm); /* { dg-error { passing 
4 to argument 4 of 'vdotq_laneq_f32_mf8_fpm', which expects a value in the 
range \[0, 3\]} } */
+}
-- 
2.25.1

Re: [PATCH v2 2/3] aarch64: Add support for fp8dot2 and fp8dot4

Reply via email to