Hi all,
This patch adds the ARMv8.6 Extension ACLE intrinsics for dot product
operations (vector/by element) to AArch64.
These are:
usdot (vector), <us/su>dot (by element).
The functions are optional from ARMv8.2-a onwards, enabled with
-march=armv8.2-a+i8mm, and are then enabled by default from ARMv8.6-a.
The functions are declared in arm_neon.h, RTL patterns are defined to
generate assembler and tests are added to verify them and perform
adequate checks.
Regression testing on aarch64-none-elf passed successfully.
This patch depends on:
https://gcc.gnu.org/ml/gcc-patches/2019-11/msg02415.html
for AArch64 CLI updates, and on:
https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00857.html
for the testsuite effective_target update.
Ok for trunk?
Cheers,
Stam
ACLE documents are at https://developer.arm.com/docs/101028/latest
ISA documents are at https://developer.arm.com/docs/ddi0596/latest
PS. I don't have commit rights, so if someone could commit on my behalf,
that would be great :)
gcc/ChangeLog:
2019-11-28 Stam Markianos-Wright <[email protected]>
* config/aarch64/aarch64-builtins.c (enum aarch64_type_qualifiers): Add
qualifier_lane_quadtup_index.  (TYPES_TERNOP_SSUS,
TYPES_QUADOPSSUS_LANE_QUADTUP, TYPES_QUADOPSSSU_LANE_QUADTUP): New.
(aarch64_simd_expand_args): Add case SIMD_ARG_LANE_QUADTUP_INDEX.
(aarch64_simd_expand_builtin): Add qualifier_lane_quadtup_index.
* config/aarch64/aarch64-simd-builtins.def (usdot, usdot_lane,
usdot_laneq, sudot_lane, sudot_laneq): New.
* config/aarch64/aarch64-simd.md (aarch64_usdot): New.
(aarch64_<sur>dot_lane): New.
(aarch64_<sur>dot_laneq): New.
* config/aarch64/arm_neon.h (vusdot_s32): New.
(vusdotq_s32): New.
(vusdot_lane_s32): New.
(vusdot_laneq_s32): New.
(vusdotq_lane_s32): New.
(vusdotq_laneq_s32): New.
(vsudot_lane_s32): New.
(vsudot_laneq_s32): New.
(vsudotq_lane_s32): New.
(vsudotq_laneq_s32): New.
* config/aarch64/iterators.md (DOTPROD_I8MM): New iterator.
(UNSPEC_USDOT, UNSPEC_SUDOT): New unspecs.
gcc/testsuite/ChangeLog:
2019-11-28 Stam Markianos-Wright <[email protected]>
* gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c: New test.
* gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c: New test.
* gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-3.c: New test.
* gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-4.c: New test.
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index c35a1b1f0299ce5af8ca1a3df0209614f7bd0f25..6bd26889f2f26a9f82dd6d40f50125eaeee41740 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -107,6 +107,9 @@ enum aarch64_type_qualifiers
/* Lane indices selected in pairs. - must be in range, and flipped for
bigendian. */
qualifier_lane_pair_index = 0x800,
+ /* Lane indices selected in quadtuplets. - must be in range, and flipped for
+ bigendian. */
+ qualifier_lane_quadtup_index = 0x1000,
};
typedef struct
@@ -173,6 +176,10 @@ aarch64_types_ternopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_unsigned, qualifier_unsigned,
qualifier_unsigned, qualifier_immediate };
#define TYPES_TERNOPUI (aarch64_types_ternopu_imm_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_ternop_ssus_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_none };
+#define TYPES_TERNOP_SSUS (aarch64_types_ternop_ssus_qualifiers)
static enum aarch64_type_qualifiers
@@ -191,6 +198,19 @@ aarch64_types_quadopu_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
qualifier_unsigned, qualifier_lane_index };
#define TYPES_QUADOPU_LANE (aarch64_types_quadopu_lane_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_quadopssus_lane_quadtup_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_none, qualifier_none, qualifier_unsigned,
+ qualifier_none, qualifier_lane_quadtup_index };
+#define TYPES_QUADOPSSUS_LANE_QUADTUP \
+ (aarch64_types_quadopssus_lane_quadtup_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_quadopsssu_lane_quadtup_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_none, qualifier_none, qualifier_none,
+ qualifier_unsigned, qualifier_lane_quadtup_index };
+#define TYPES_QUADOPSSSU_LANE_QUADTUP \
+ (aarch64_types_quadopsssu_lane_quadtup_qualifiers)
+
static enum aarch64_type_qualifiers
aarch64_types_quadopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned,
@@ -1260,6 +1280,7 @@ typedef enum
SIMD_ARG_LANE_INDEX,
SIMD_ARG_STRUCT_LOAD_STORE_LANE_INDEX,
SIMD_ARG_LANE_PAIR_INDEX,
+ SIMD_ARG_LANE_QUADTUP_INDEX,
SIMD_ARG_STOP
} builtin_simd_arg;
@@ -1349,9 +1370,25 @@ aarch64_simd_expand_args (rtx target, int icode, int have_retval,
op[opc] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane),
SImode);
}
- /* Fall through - if the lane index isn't a constant then
- the next case will error. */
- /* FALLTHRU */
+ /* If the lane index isn't a constant then error out. */
+ goto constant_arg;
+ case SIMD_ARG_LANE_QUADTUP_INDEX:
+ /* Must be a previous operand into which this is an index and
+ index is restricted to nunits / 4. */
+ gcc_assert (opc > 0);
+ if (CONST_INT_P (op[opc]))
+ {
+ machine_mode vmode = insn_data[icode].operand[opc - 1].mode;
+ unsigned int nunits
+ = GET_MODE_NUNITS (vmode).to_constant ();
+ aarch64_simd_lane_bounds (op[opc], 0, nunits / 4, exp);
+ /* Keep to GCC-vector-extension lane indices in the RTL. */
+ int lane = INTVAL (op[opc]);
+ op[opc] = gen_int_mode (ENDIAN_LANE_N (nunits / 4, lane),
+ SImode);
+ }
+ /* If the lane index isn't a constant then error out. */
+ goto constant_arg;
case SIMD_ARG_CONSTANT:
constant_arg:
if (!(*insn_data[icode].operand[opc].predicate)
@@ -1464,6 +1501,8 @@ aarch64_simd_expand_builtin (int fcode, tree exp, rtx target)
args[k] = SIMD_ARG_LANE_INDEX;
else if (d->qualifiers[qualifiers_k] & qualifier_lane_pair_index)
args[k] = SIMD_ARG_LANE_PAIR_INDEX;
+ else if (d->qualifiers[qualifiers_k] & qualifier_lane_quadtup_index)
+ args[k] = SIMD_ARG_LANE_QUADTUP_INDEX;
else if (d->qualifiers[qualifiers_k] & qualifier_struct_load_store_lane_index)
args[k] = SIMD_ARG_STRUCT_LOAD_STORE_LANE_INDEX;
else if (d->qualifiers[qualifiers_k] & qualifier_immediate)
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index f4ca35a59704c761fe2ac2b6d401fff7c8aba80d..651aab0f80fba5a40b5e3fa149f503acb6a48702 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -212,10 +212,15 @@
/* Implemented by aarch64_<sur><dotprod>{_lane}{q}<dot_mode>. */
BUILTIN_VB (TERNOP, sdot, 0)
BUILTIN_VB (TERNOPU, udot, 0)
+ BUILTIN_VB (TERNOP_SSUS, usdot, 0)
BUILTIN_VB (QUADOP_LANE, sdot_lane, 0)
BUILTIN_VB (QUADOPU_LANE, udot_lane, 0)
BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0)
BUILTIN_VB (QUADOPU_LANE, udot_laneq, 0)
+ BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_lane, 0)
+ BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_laneq, 0)
+ BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_lane, 0)
+ BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_laneq, 0)
/* Implemented by aarch64_fcadd<rot><mode>. */
BUILTIN_VHSDF (BINOP, fcadd90, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ad4676bc167f08951e693916c7ef796e3501762a..514fe97c84b595f3bcd08ff3fecb14178a6c221b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -506,6 +506,19 @@
[(set_attr "type" "neon_dot<q>")]
)
+;; These instructions map to the __builtins for the armv8.6a I8MM usdot
+;; (vector) Dot Product operation.
+(define_insn "aarch64_usdot<vsi2qi>"
+ [(set (match_operand:VS 0 "register_operand" "=w")
+ (plus:VS (match_operand:VS 1 "register_operand" "0")
+ (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+ (match_operand:<VSI2QI> 3 "register_operand" "w")]
+ UNSPEC_USDOT)))]
+ "TARGET_SIMD && TARGET_I8MM"
+ "usdot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>"
+ [(set_attr "type" "neon_dot<q>")]
+)
+
;; These expands map to the Dot Product optab the vectorizer checks for.
;; The auto-vectorizer expects a dot product builtin that also does an
;; accumulation into the provided register.
@@ -573,6 +586,44 @@
[(set_attr "type" "neon_dot<q>")]
)
+;; These instructions map to the __builtins for the armv8.6a I8MM usdot, sudot
+;; (by element) Dot Product operations.
+(define_insn "aarch64_<sur>dot_lane<vsi2qi>"
+ [(set (match_operand:VS 0 "register_operand" "=w")
+ (plus:VS (match_operand:VS 1 "register_operand" "0")
+ (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+ (match_operand:V8QI 3 "register_operand" "<h_con>")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ DOTPROD_I8MM)))]
+ "TARGET_SIMD && TARGET_I8MM"
+ {
+ int nunits = GET_MODE_NUNITS (V8QImode).to_constant ();
+ int lane = INTVAL (operands[4]);
+ operands[4]
+ = gen_int_mode (ENDIAN_LANE_N (nunits / 4, lane), SImode);
+ return "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.4b[%4]";
+ }
+ [(set_attr "type" "neon_dot<q>")]
+)
+
+(define_insn "aarch64_<sur>dot_laneq<vsi2qi>"
+ [(set (match_operand:VS 0 "register_operand" "=w")
+ (plus:VS (match_operand:VS 1 "register_operand" "0")
+ (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+ (match_operand:V16QI 3 "register_operand" "<h_con>")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ DOTPROD_I8MM)))]
+ "TARGET_SIMD && TARGET_I8MM"
+ {
+ int nunits = GET_MODE_NUNITS (V16QImode).to_constant ();
+ int lane = INTVAL (operands[4]);
+ operands[4]
+ = gen_int_mode (ENDIAN_LANE_N (nunits / 4, lane), SImode);
+ return "<sur>dot\\t%0.<Vtype>, %2.<Vdottype>, %3.4b[%4]";
+ }
+ [(set_attr "type" "neon_dot<q>")]
+)
+
(define_expand "copysign<mode>3"
[(match_operand:VHSDF 0 "register_operand")
(match_operand:VHSDF 1 "register_operand")
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 8b861601a48b2150aa5768d717c61e0d1416747f..95b92dff69343e2b6c74174b39f3cd9d9838ddab 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34606,6 +34606,89 @@ vrnd64xq_f64 (float64x2_t __a)
#pragma GCC pop_options
+/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics. */
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+i8mm")
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b)
+{
+ return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
+{
+ return __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b, const int __index)
+{
+ return __builtin_aarch64_usdot_lanev8qi_ssuss (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdot_laneq_s32 \
+ (int32x2_t __r, uint8x8_t __a, int8x16_t __b, const int __index)
+{
+ return __builtin_aarch64_usdot_laneqv8qi_ssuss (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdotq_lane_s32 \
+ (int32x4_t __r, uint8x16_t __a, int8x8_t __b, const int __index)
+{
+ return __builtin_aarch64_usdot_lanev16qi_ssuss (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdotq_laneq_s32 \
+ (int32x4_t __r, uint8x16_t __a, int8x16_t __b, const int __index)
+{
+ return __builtin_aarch64_usdot_laneqv16qi_ssuss (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudot_lane_s32 (int32x2_t __r, int8x8_t __a, uint8x8_t __b, const int __index)
+{
+ return __builtin_aarch64_sudot_lanev8qi_sssus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudot_laneq_s32 \
+ (int32x2_t __r, int8x8_t __a, uint8x16_t __b, const int __index)
+{
+ return __builtin_aarch64_sudot_laneqv8qi_sssus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudotq_lane_s32 \
+ (int32x4_t __r, int8x16_t __a, uint8x8_t __b, const int __index)
+{
+ return __builtin_aarch64_sudot_lanev16qi_sssus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudotq_laneq_s32 \
+ (int32x4_t __r, int8x16_t __a, uint8x16_t __b, const int __index)
+{
+ return __builtin_aarch64_sudot_laneqv16qi_sssus (__r, __a, __b, __index);
+}
+
+#pragma GCC pop_options
+
#undef __aarch64_vget_lane_any
#undef __aarch64_vdup_lane_any
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 83a0d156e84baf7dde8f9e46eeeca4edfa1f9037..731fbf61ff9500c4fcb09290dd8ea05b336b1435 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -650,6 +650,8 @@
UNSPEC_UMULHS ; Used in aarch64-sve2.md.
UNSPEC_UMULHRS ; Used in aarch64-sve2.md.
UNSPEC_ASRD ; Used in aarch64-sve.md.
+ UNSPEC_USDOT ; Used in aarch64-simd.md.
+ UNSPEC_SUDOT ; Used in aarch64-simd.md.
])
;; ------------------------------------------------------------------
@@ -1853,6 +1855,8 @@
(define_int_iterator DOTPROD [UNSPEC_SDOT UNSPEC_UDOT])
+(define_int_iterator DOTPROD_I8MM [UNSPEC_USDOT UNSPEC_SUDOT])
+
(define_int_iterator ADDSUBHN [UNSPEC_ADDHN UNSPEC_RADDHN
UNSPEC_SUBHN UNSPEC_RSUBHN])
@@ -2292,6 +2296,7 @@
(UNSPEC_URSHL "ur") (UNSPEC_SRSHL "sr")
(UNSPEC_UQRSHL "u") (UNSPEC_SQRSHL "s")
(UNSPEC_SDOT "s") (UNSPEC_UDOT "u")
+ (UNSPEC_USDOT "us") (UNSPEC_SUDOT "su")
])
(define_int_attr r [(UNSPEC_SQDMULH "") (UNSPEC_SQRDMULH "r")
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..a064a7ccc929c3756a009ebfe783142ae1194f15
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c
@@ -0,0 +1,75 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
+/* { dg-add-options arm_v8_2a_i8mm } */
+/* { dg-additional-options "--save-temps" } */
+
+#include <arm_neon.h>
+
+
+/* Unsigned-Signed Dot Product instructions. */
+
+int32x2_t ufoo (int32x2_t r, uint8x8_t x, int8x8_t y)
+{
+ return vusdot_s32 (r, x, y);
+}
+
+int32x4_t ufooq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+ return vusdotq_s32 (r, x, y);
+}
+
+int32x2_t ufoo_lane (int32x2_t r, uint8x8_t x, int8x8_t y)
+{
+ return vusdot_lane_s32 (r, x, y, 0);
+}
+
+int32x2_t ufoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y)
+{
+ return vusdot_laneq_s32 (r, x, y, 2);
+}
+
+int32x4_t ufooq_lane (int32x4_t r, uint8x16_t x, int8x8_t y)
+{
+ return vusdotq_lane_s32 (r, x, y, 1);
+}
+
+int32x4_t ufooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+ return vusdotq_laneq_s32 (r, x, y, 3);
+}
+
+
+/* Signed-Unsigned Dot Product instructions. */
+
+int32x2_t sfoo_lane (int32x2_t r, int8x8_t x, uint8x8_t y)
+{
+ return vsudot_lane_s32 (r, x, y, 0);
+}
+
+int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y)
+{
+ return vsudot_laneq_s32 (r, x, y, 2);
+}
+
+int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y)
+{
+ return vsudotq_lane_s32 (r, x, y, 1);
+}
+
+int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y)
+{
+ return vsudotq_laneq_s32 (r, x, y, 3);
+}
+
+
+/* { dg-final { scan-assembler {usdot\tv[0-9]+.2s, v[0-9]+.8b, v[0-9]+.8b} } } */
+/* { dg-final { scan-assembler {usdot\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
+
+/* { dg-final { scan-assembler {usdot\tv[0-9]+\.2s, v[0-9]+\.8b, v[0-9]+\.4b\[0\]} } } */
+/* { dg-final { scan-assembler {usdot\tv[0-9]+\.2s, v[0-9]+\.8b, v[0-9]+\.4b\[2\]} } } */
+/* { dg-final { scan-assembler {usdot\tv[0-9]+\.4s, v[0-9]+\.16b, v[0-9]+\.4b\[1\]} } } */
+/* { dg-final { scan-assembler {usdot\tv[0-9]+\.4s, v[0-9]+\.16b, v[0-9]+\.4b\[3\]} } } */
+/* { dg-final { scan-assembler {sudot\tv[0-9]+\.2s, v[0-9]+\.8b, v[0-9]+\.4b\[0\]} } } */
+/* { dg-final { scan-assembler {sudot\tv[0-9]+\.2s, v[0-9]+\.8b, v[0-9]+\.4b\[2\]} } } */
+/* { dg-final { scan-assembler {sudot\tv[0-9]+\.4s, v[0-9]+\.16b, v[0-9]+\.4b\[1\]} } } */
+/* { dg-final { scan-assembler {sudot\tv[0-9]+\.4s, v[0-9]+\.16b, v[0-9]+\.4b\[3\]} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..54ac4ef5b9cd054d27b75cdc420b34aab89e7b73
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c
@@ -0,0 +1,76 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
+/* { dg-add-options arm_v8_2a_i8mm } */
+/* { dg-additional-options "-mbig-endian --save-temps" } */
+
+
+#include <arm_neon.h>
+
+
+/* Unsigned-Signed Dot Product instructions. */
+
+int32x2_t ufoo (int32x2_t r, uint8x8_t x, int8x8_t y)
+{
+ return vusdot_s32 (r, x, y);
+}
+
+int32x4_t ufooq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+ return vusdotq_s32 (r, x, y);
+}
+
+int32x2_t ufoo_lane (int32x2_t r, uint8x8_t x, int8x8_t y)
+{
+ return vusdot_lane_s32 (r, x, y, 0);
+}
+
+int32x2_t ufoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y)
+{
+ return vusdot_laneq_s32 (r, x, y, 2);
+}
+
+int32x4_t ufooq_lane (int32x4_t r, uint8x16_t x, int8x8_t y)
+{
+ return vusdotq_lane_s32 (r, x, y, 1);
+}
+
+int32x4_t ufooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+ return vusdotq_laneq_s32 (r, x, y, 3);
+}
+
+
+/* Signed-Unsigned Dot Product instructions. */
+
+int32x2_t sfoo_lane (int32x2_t r, int8x8_t x, uint8x8_t y)
+{
+ return vsudot_lane_s32 (r, x, y, 0);
+}
+
+int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y)
+{
+ return vsudot_laneq_s32 (r, x, y, 2);
+}
+
+int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y)
+{
+ return vsudotq_lane_s32 (r, x, y, 1);
+}
+
+int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y)
+{
+ return vsudotq_laneq_s32 (r, x, y, 3);
+}
+
+
+/* { dg-final { scan-assembler {usdot\tv[0-9]+.2s, v[0-9]+.8b, v[0-9]+.8b} } } */
+/* { dg-final { scan-assembler {usdot\tv[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b} } } */
+
+/* { dg-final { scan-assembler {usdot\tv[0-9]+\.2s, v[0-9]+\.8b, v[0-9]+\.4b\[0\]} } } */
+/* { dg-final { scan-assembler {usdot\tv[0-9]+\.2s, v[0-9]+\.8b, v[0-9]+\.4b\[2\]} } } */
+/* { dg-final { scan-assembler {usdot\tv[0-9]+\.4s, v[0-9]+\.16b, v[0-9]+\.4b\[1\]} } } */
+/* { dg-final { scan-assembler {usdot\tv[0-9]+\.4s, v[0-9]+\.16b, v[0-9]+\.4b\[3\]} } } */
+/* { dg-final { scan-assembler {sudot\tv[0-9]+\.2s, v[0-9]+\.8b, v[0-9]+\.4b\[0\]} } } */
+/* { dg-final { scan-assembler {sudot\tv[0-9]+\.2s, v[0-9]+\.8b, v[0-9]+\.4b\[2\]} } } */
+/* { dg-final { scan-assembler {sudot\tv[0-9]+\.4s, v[0-9]+\.16b, v[0-9]+\.4b\[1\]} } } */
+/* { dg-final { scan-assembler {sudot\tv[0-9]+\.4s, v[0-9]+\.16b, v[0-9]+\.4b\[3\]} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..18ecabef8dc6b99872d71c8e412b6f4b4809e901
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-3.c
@@ -0,0 +1,31 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
+/* { dg-add-options arm_v8_2a_i8mm } */
+/* { dg-additional-options "--save-temps" } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+
+#include <arm_neon.h>
+
+int32x2_t ufoo_lane (int32x2_t r, uint8x8_t x, int8x8_t y)
+{
+ /* { dg-error "lane -1 out of range 0 - 1" "" { target *-*-* } 0 } */
+ return vusdot_lane_s32 (r, x, y, -1);
+}
+
+int32x2_t ufoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y)
+{
+ /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
+ return vusdot_laneq_s32 (r, x, y, -1);
+}
+
+int32x4_t ufooq_lane (int32x4_t r, uint8x16_t x, int8x8_t y)
+{
+ /* { dg-error "lane 2 out of range 0 - 1" "" { target *-*-* } 0 } */
+ return vusdotq_lane_s32 (r, x, y, 2);
+}
+
+int32x4_t ufooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+ /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
+ return vusdotq_laneq_s32 (r, x, y, 4);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-4.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-4.c
new file mode 100644
index 0000000000000000000000000000000000000000..66c87d48694bad9624b491aec4cd1a38b75fbb95
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-4.c
@@ -0,0 +1,31 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
+/* { dg-add-options arm_v8_2a_i8mm } */
+/* { dg-additional-options "--save-temps" } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+
+#include <arm_neon.h>
+
+int32x2_t sfoo_lane (int32x2_t r, int8x8_t x, uint8x8_t y)
+{
+ /* { dg-error "lane -1 out of range 0 - 1" "" { target *-*-* } 0 } */
+ return vsudot_lane_s32 (r, x, y, -1);
+}
+
+int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y)
+{
+ /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
+ return vsudot_laneq_s32 (r, x, y, -1);
+}
+
+int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y)
+{
+ /* { dg-error "lane 2 out of range 0 - 1" "" { target *-*-* } 0 } */
+ return vsudotq_lane_s32 (r, x, y, 2);
+}
+
+int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y)
+{
+ /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
+ return vsudotq_laneq_s32 (r, x, y, 4);
+}