On 12/20/19 2:13 PM, Richard Sandiford wrote:
> Stam Markianos-Wright <[email protected]> writes:
>> diff --git a/gcc/config/aarch64/aarch64-simd.md
>> b/gcc/config/aarch64/aarch64-simd.md
>> index
>> ad4676bc167f08951e693916c7ef796e3501762a..eba71f004ef67af654f9c512b720aa6cfdd1d7fc
>> 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -506,6 +506,19 @@
>> [(set_attr "type" "neon_dot<q>")]
>> )
>>
>> +;; These instructions map to the __builtins for the armv8.6a I8MM usdot
>> +;; (vector) Dot Product operation.
>> +(define_insn "aarch64_usdot<vsi2qi>"
>> + [(set (match_operand:VS 0 "register_operand" "=w")
>> + (plus:VS (match_operand:VS 1 "register_operand" "0")
>> + (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
>> + (match_operand:<VSI2QI> 3 "register_operand" "w")]
>> + UNSPEC_USDOT)))]
>> + "TARGET_SIMD && TARGET_I8MM"
>> + "usdot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>"
>> + [(set_attr "type" "neon_dot<q>")]
>> +)
>> +
>> ;; These expands map to the Dot Product optab the vectorizer checks for.
>> ;; The auto-vectorizer expects a dot product builtin that also does an
>> ;; accumulation into the provided register.
>
> Sorry for not raising it last time, but this should just be "TARGET_I8MM".
> TARGET_SIMD is always true when TARGET_I8MM is.
Oh no worries! Thank you so much for the detailed feedback, every time :D
Fixed.
>
>> @@ -573,6 +586,25 @@
>> [(set_attr "type" "neon_dot<q>")]
>> )
>>
>> +;; These instructions map to the __builtins for the armv8.6a I8MM usdot,
>> sudot
>> +;; (by element) Dot Product operations.
>> +(define_insn "aarch64_<DOTPROD_I8MM:sur>dot_lane<VB:isquadop><VS:vsi2qi>"
>> + [(set (match_operand:VS 0 "register_operand" "=w")
>> + (plus:VS (match_operand:VS 1 "register_operand" "0")
>> + (unspec:VS [(match_operand:<VS:VSI2QI> 2 "register_operand" "w")
>> + (match_operand:VB 3 "register_operand" "w")
>> + (match_operand:SI 4 "immediate_operand" "i")]
>> + DOTPROD_I8MM)))]
>> + "TARGET_SIMD && TARGET_I8MM"
>> + {
>> + int nunits = GET_MODE_NUNITS (<VB:MODE>mode).to_constant ();
>> + int lane = INTVAL (operands[4]);
>> + operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 4, lane), SImode);
>> + return "<DOTPROD_I8MM:sur>dot\\t%0.<VS:Vtype>, %2.<VS:Vdottype>,
>> %3.4b[%4]";
>> + }
>> + [(set_attr "type" "neon_dot<VS:q>")]
>> +)
>> +
>> (define_expand "copysign<mode>3"
>> [(match_operand:VHSDF 0 "register_operand")
>> (match_operand:VHSDF 1 "register_operand")
>
> Same here. Another thing I should have noticed last time is that the
> canonical order for (plus ...) is to have the more complicated expression
> first. Operand 1 and the (unspec ...) should therefore be the other
> way around in the expression above. (Having operand 1 "later" than
> operands 2, 3 and 4 is OK.)
Done.
>
>> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
>> index
>> 8b861601a48b2150aa5768d717c61e0d1416747f..95b92dff69343e2b6c74174b39f3cd9d9838ddab
>> 100644
>> --- a/gcc/config/aarch64/arm_neon.h
>> +++ b/gcc/config/aarch64/arm_neon.h
>> @@ -34606,6 +34606,89 @@ vrnd64xq_f64 (float64x2_t __a)
>>
>> #pragma GCC pop_options
>>
>> +/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics. */
>> +
>> +#pragma GCC push_options
>> +#pragma GCC target ("arch=armv8.2-a+i8mm")
>> +
>> +__extension__ extern __inline int32x2_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b)
>> +{
>> + return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b);
>> +}
>> +
>> +__extension__ extern __inline int32x4_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
>> +{
>> + return __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b);
>> +}
>> +
>> +__extension__ extern __inline int32x2_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b, const int
>> __index)
>> +{
>> + return __builtin_aarch64_usdot_lanev8qi_ssuss (__r, __a, __b, __index);
>> +}
>> +
>> +__extension__ extern __inline int32x2_t
>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>> +vusdot_laneq_s32 \
>> + (int32x2_t __r, uint8x8_t __a, int8x16_t __b, const int __index)
>
> Stray backslash. It's probably easier to split the line after "__b,"
> instead of before "(". Same for later function.
Done
>
>> diff --git
>> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c
>> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c
>> new file mode 100755
>> index
>> 0000000000000000000000000000000000000000..6a4ff054589b736c224bb2fabdcfa48439a8a420
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c
>> @@ -0,0 +1,133 @@
>> +/* { dg-do assemble { target { aarch64*-*-* } } } */
>> +/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
>> +/* { dg-add-options arm_v8_2a_i8mm } */
>> +/* { dg-additional-options "--save-temps" } */
>> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/* Unsigned-Signed Dot Product instructions. */
>> +
>> +/*
>> +**ufoo:
>> +** ...
>> +** usdot\tv[0-9]+.2s, v[0-9]+.8b, v[0-9]+.8b
>
> Can just use a literal tab instead of "\t". Later tests check for
> "\." rather than ".", so might as well do that here too.
>
Done
>> +** ...
>> +** ret
>> +*/
>> +int32x2_t ufoo (int32x2_t r, uint8x8_t x, int8x8_t y)
>> +{
>> + return vusdot_s32 (r, x, y);
>> +}
>> +
>
> If we're using check-function-bodies anyway, it might be slightly more
> robust to compile at -O and check for the exact RA. E.g.:
>
> /*
> **ufoo:
> ** usdot v0\.2s, (v1\.8b, v2\.8b|v2\.8b, v1\.8b)
> ** ret
> */
>
> Just a suggestion though -- either way is fine.
Done this too and, as per our internal discussion, also added two xx_untied
tests: one for usdot and one for usdot_lane.
That's one xx_untied test for each of the RTL pattern types added in
aarch64-simd.md. Let me know if this is OK!
Also, I found that the way we were using check-function-bodies wasn't
actually checking the assembler output correctly, so I've changed that to:
+/* { dg-final { check-function-bodies "**" "" "" } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
which seems to perform more checks.
Cheers,
Stam
>
> OK with those changes (or without the last one), thanks.
>
> Richard
>
diff --git a/gcc/config/aarch64/aarch64-builtins.c
b/gcc/config/aarch64/aarch64-builtins.c
index
c35a1b1f0299ce5af8ca1a3df0209614f7bd0f25..6bd26889f2f26a9f82dd6d40f50125eaeee41740
100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -107,6 +107,9 @@ enum aarch64_type_qualifiers
/* Lane indices selected in pairs. - must be in range, and flipped for
bigendian. */
qualifier_lane_pair_index = 0x800,
+ /* Lane indices selected in quadtuplets. - must be in range, and flipped for
+ bigendian. */
+ qualifier_lane_quadtup_index = 0x1000,
};
typedef struct
@@ -173,6 +176,10 @@ aarch64_types_ternopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_unsigned, qualifier_unsigned,
qualifier_unsigned, qualifier_immediate };
#define TYPES_TERNOPUI (aarch64_types_ternopu_imm_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_ternop_ssus_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_none, qualifier_none, qualifier_unsigned, qualifier_none };
+#define TYPES_TERNOP_SSUS (aarch64_types_ternop_ssus_qualifiers)
static enum aarch64_type_qualifiers
@@ -191,6 +198,19 @@
aarch64_types_quadopu_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
qualifier_unsigned, qualifier_lane_index };
#define TYPES_QUADOPU_LANE (aarch64_types_quadopu_lane_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_quadopssus_lane_quadtup_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_none, qualifier_none, qualifier_unsigned,
+ qualifier_none, qualifier_lane_quadtup_index };
+#define TYPES_QUADOPSSUS_LANE_QUADTUP \
+ (aarch64_types_quadopssus_lane_quadtup_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_quadopsssu_lane_quadtup_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_none, qualifier_none, qualifier_none,
+ qualifier_unsigned, qualifier_lane_quadtup_index };
+#define TYPES_QUADOPSSSU_LANE_QUADTUP \
+ (aarch64_types_quadopsssu_lane_quadtup_qualifiers)
+
static enum aarch64_type_qualifiers
aarch64_types_quadopu_imm_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_unsigned, qualifier_unsigned, qualifier_unsigned,
@@ -1260,6 +1280,7 @@ typedef enum
SIMD_ARG_LANE_INDEX,
SIMD_ARG_STRUCT_LOAD_STORE_LANE_INDEX,
SIMD_ARG_LANE_PAIR_INDEX,
+ SIMD_ARG_LANE_QUADTUP_INDEX,
SIMD_ARG_STOP
} builtin_simd_arg;
@@ -1349,9 +1370,25 @@ aarch64_simd_expand_args (rtx target, int icode, int
have_retval,
op[opc] = gen_int_mode (ENDIAN_LANE_N (nunits / 2, lane),
SImode);
}
- /* Fall through - if the lane index isn't a constant then
- the next case will error. */
- /* FALLTHRU */
+ /* If the lane index isn't a constant then error out. */
+ goto constant_arg;
+ case SIMD_ARG_LANE_QUADTUP_INDEX:
+ /* Must be a previous operand into which this is an index and
+ index is restricted to nunits / 4. */
+ gcc_assert (opc > 0);
+ if (CONST_INT_P (op[opc]))
+ {
+ machine_mode vmode = insn_data[icode].operand[opc - 1].mode;
+ unsigned int nunits
+ = GET_MODE_NUNITS (vmode).to_constant ();
+ aarch64_simd_lane_bounds (op[opc], 0, nunits / 4, exp);
+ /* Keep to GCC-vector-extension lane indices in the RTL. */
+ int lane = INTVAL (op[opc]);
+ op[opc] = gen_int_mode (ENDIAN_LANE_N (nunits / 4, lane),
+ SImode);
+ }
+ /* If the lane index isn't a constant then error out. */
+ goto constant_arg;
case SIMD_ARG_CONSTANT:
constant_arg:
if (!(*insn_data[icode].operand[opc].predicate)
@@ -1464,6 +1501,8 @@ aarch64_simd_expand_builtin (int fcode, tree exp, rtx
target)
args[k] = SIMD_ARG_LANE_INDEX;
else if (d->qualifiers[qualifiers_k] & qualifier_lane_pair_index)
args[k] = SIMD_ARG_LANE_PAIR_INDEX;
+ else if (d->qualifiers[qualifiers_k] & qualifier_lane_quadtup_index)
+ args[k] = SIMD_ARG_LANE_QUADTUP_INDEX;
else if (d->qualifiers[qualifiers_k] &
qualifier_struct_load_store_lane_index)
args[k] = SIMD_ARG_STRUCT_LOAD_STORE_LANE_INDEX;
else if (d->qualifiers[qualifiers_k] & qualifier_immediate)
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def
b/gcc/config/aarch64/aarch64-simd-builtins.def
index
f4ca35a59704c761fe2ac2b6d401fff7c8aba80d..651aab0f80fba5a40b5e3fa149f503acb6a48702
100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -212,10 +212,15 @@
/* Implemented by aarch64_<sur><dotprod>{_lane}{q}<dot_mode>. */
BUILTIN_VB (TERNOP, sdot, 0)
BUILTIN_VB (TERNOPU, udot, 0)
+ BUILTIN_VB (TERNOP_SSUS, usdot, 0)
BUILTIN_VB (QUADOP_LANE, sdot_lane, 0)
BUILTIN_VB (QUADOPU_LANE, udot_lane, 0)
BUILTIN_VB (QUADOP_LANE, sdot_laneq, 0)
BUILTIN_VB (QUADOPU_LANE, udot_laneq, 0)
+ BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_lane, 0)
+ BUILTIN_VB (QUADOPSSUS_LANE_QUADTUP, usdot_laneq, 0)
+ BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_lane, 0)
+ BUILTIN_VB (QUADOPSSSU_LANE_QUADTUP, sudot_laneq, 0)
/* Implemented by aarch64_fcadd<rot><mode>. */
BUILTIN_VHSDF (BINOP, fcadd90, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md
b/gcc/config/aarch64/aarch64-simd.md
index
ad4676bc167f08951e693916c7ef796e3501762a..627d51acb1a8ce8be29268a067e16a488aff16bb
100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -506,6 +506,20 @@
[(set_attr "type" "neon_dot<q>")]
)
+;; These instructions map to the __builtins for the armv8.6a I8MM usdot
+;; (vector) Dot Product operation.
+(define_insn "aarch64_usdot<vsi2qi>"
+ [(set (match_operand:VS 0 "register_operand" "=w")
+ (plus:VS
+ (unspec:VS [(match_operand:<VSI2QI> 2 "register_operand" "w")
+ (match_operand:<VSI2QI> 3 "register_operand" "w")]
+ UNSPEC_USDOT)
+ (match_operand:VS 1 "register_operand" "0")))]
+ "TARGET_I8MM"
+ "usdot\\t%0.<Vtype>, %2.<Vdottype>, %3.<Vdottype>"
+ [(set_attr "type" "neon_dot<q>")]
+)
+
;; These expands map to the Dot Product optab the vectorizer checks for.
;; The auto-vectorizer expects a dot product builtin that also does an
;; accumulation into the provided register.
@@ -573,6 +587,26 @@
[(set_attr "type" "neon_dot<q>")]
)
+;; These instructions map to the __builtins for the armv8.6a I8MM usdot, sudot
+;; (by element) Dot Product operations.
+(define_insn "aarch64_<DOTPROD_I8MM:sur>dot_lane<VB:isquadop><VS:vsi2qi>"
+ [(set (match_operand:VS 0 "register_operand" "=w")
+ (plus:VS
+ (unspec:VS [(match_operand:<VS:VSI2QI> 2 "register_operand" "w")
+ (match_operand:VB 3 "register_operand" "w")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ DOTPROD_I8MM)
+ (match_operand:VS 1 "register_operand" "0")))]
+ "TARGET_I8MM"
+ {
+ int nunits = GET_MODE_NUNITS (<VB:MODE>mode).to_constant ();
+ int lane = INTVAL (operands[4]);
+ operands[4] = gen_int_mode (ENDIAN_LANE_N (nunits / 4, lane), SImode);
+ return "<DOTPROD_I8MM:sur>dot\\t%0.<VS:Vtype>, %2.<VS:Vdottype>,
%3.4b[%4]";
+ }
+ [(set_attr "type" "neon_dot<VS:q>")]
+)
+
(define_expand "copysign<mode>3"
[(match_operand:VHSDF 0 "register_operand")
(match_operand:VHSDF 1 "register_operand")
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index
8b861601a48b2150aa5768d717c61e0d1416747f..9be3368f20cde023d7f682a580f23c4fcf7aa7f1
100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34606,6 +34606,89 @@ vrnd64xq_f64 (float64x2_t __a)
#pragma GCC pop_options
+/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics. */
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+i8mm")
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b)
+{
+ return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
+{
+ return __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b, const int __index)
+{
+ return __builtin_aarch64_usdot_lanev8qi_ssuss (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a, int8x16_t __b,
+ const int __index)
+{
+ return __builtin_aarch64_usdot_laneqv8qi_ssuss (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdotq_lane_s32 (int32x4_t __r, uint8x16_t __a, int8x8_t __b,
+ const int __index)
+{
+ return __builtin_aarch64_usdot_lanev16qi_ssuss (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b,
+ const int __index)
+{
+ return __builtin_aarch64_usdot_laneqv16qi_ssuss (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudot_lane_s32 (int32x2_t __r, int8x8_t __a, uint8x8_t __b, const int __index)
+{
+ return __builtin_aarch64_sudot_lanev8qi_sssus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a, uint8x16_t __b,
+ const int __index)
+{
+ return __builtin_aarch64_sudot_laneqv8qi_sssus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a, uint8x8_t __b,
+ const int __index)
+{
+ return __builtin_aarch64_sudot_lanev16qi_sssus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a, uint8x16_t __b,
+ const int __index)
+{
+ return __builtin_aarch64_sudot_laneqv16qi_sssus (__r, __a, __b, __index);
+}
+
+#pragma GCC pop_options
+
#undef __aarch64_vget_lane_any
#undef __aarch64_vdup_lane_any
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index
1ca5ed1ef1bc66a4ecb52ee240338f18fd560384..c288de6c3a5bb237318bfcc33924dd0e7788036b
100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -650,6 +650,8 @@
UNSPEC_UMULHS ; Used in aarch64-sve2.md.
UNSPEC_UMULHRS ; Used in aarch64-sve2.md.
UNSPEC_ASRD ; Used in aarch64-sve.md.
+ UNSPEC_USDOT ; Used in aarch64-simd.md.
+ UNSPEC_SUDOT ; Used in aarch64-simd.md.
])
;; ------------------------------------------------------------------
@@ -1299,6 +1301,8 @@
(define_mode_attr f16quad [(V2SF "") (V4SF "q")])
+(define_mode_attr isquadop [(V8QI "") (V16QI "q")])
+
(define_code_attr f16mac [(plus "a") (minus "s")])
;; Map smax to smin and umax to umin.
@@ -1859,6 +1863,8 @@
(define_int_iterator DOTPROD [UNSPEC_SDOT UNSPEC_UDOT])
+(define_int_iterator DOTPROD_I8MM [UNSPEC_USDOT UNSPEC_SUDOT])
+
(define_int_iterator ADDSUBHN [UNSPEC_ADDHN UNSPEC_RADDHN
UNSPEC_SUBHN UNSPEC_RSUBHN])
@@ -2298,6 +2304,7 @@
(UNSPEC_URSHL "ur") (UNSPEC_SRSHL "sr")
(UNSPEC_UQRSHL "u") (UNSPEC_SQRSHL "s")
(UNSPEC_SDOT "s") (UNSPEC_UDOT "u")
+ (UNSPEC_USDOT "us") (UNSPEC_SUDOT "su")
])
(define_int_attr r [(UNSPEC_SQDMULH "") (UNSPEC_SQRDMULH "r")
diff --git
a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c
new file mode 100755
index
0000000000000000000000000000000000000000..68dedc8a031b68430200680ca91fe7b1d3e0fcd9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-1.c
@@ -0,0 +1,136 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
+/* { dg-add-options arm_v8_2a_i8mm } */
+/* { dg-additional-options "-O -save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+
+#include <arm_neon.h>
+
+/* Unsigned-Signed Dot Product instructions. */
+
+/*
+**ufoo:
+** usdot v0\.2s, (v1\.8b, v2\.8b|v2\.8b, v1\.8b)
+** ret
+*/
+int32x2_t ufoo (int32x2_t r, uint8x8_t x, int8x8_t y)
+{
+ return vusdot_s32 (r, x, y);
+}
+
+/*
+**ufooq:
+** usdot v0\.4s, (v1\.16b, v2\.16b|v2\.16b, v1\.16b)
+** ret
+*/
+int32x4_t ufooq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+ return vusdotq_s32 (r, x, y);
+}
+
+/*
+**ufoo_lane:
+** usdot v0\.2s, (v1\.8b, v2\.4b\[0\]|v2\.8b, v1\.4b\[0\])
+** ret
+*/
+int32x2_t ufoo_lane (int32x2_t r, uint8x8_t x, int8x8_t y)
+{
+ return vusdot_lane_s32 (r, x, y, 0);
+}
+
+/*
+**ufoo_laneq:
+** usdot v0\.2s, (v1\.8b, v2\.4b\[2\]|v2\.8b, v1\.4b\[2\])
+** ret
+*/
+int32x2_t ufoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y)
+{
+ return vusdot_laneq_s32 (r, x, y, 2);
+}
+
+/*
+**ufooq_lane:
+** usdot v0\.4s, (v1\.16b, v2\.4b\[1\]|v2\.16b, v1\.4b\[1\])
+** ret
+*/
+int32x4_t ufooq_lane (int32x4_t r, uint8x16_t x, int8x8_t y)
+{
+ return vusdotq_lane_s32 (r, x, y, 1);
+}
+
+/*
+**ufooq_laneq:
+** usdot v0\.4s, (v1\.16b, v2\.4b\[3\]|v2\.16b, v1\.4b\[3\])
+** ret
+*/
+int32x4_t ufooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+ return vusdotq_laneq_s32 (r, x, y, 3);
+}
+
+
+/* Signed-Unsigned Dot Product instructions. */
+
+/*
+**sfoo_lane:
+** sudot v0\.2s, (v1\.8b, v2\.4b\[0\]|v2\.8b, v1\.4b\[0\])
+** ret
+*/
+int32x2_t sfoo_lane (int32x2_t r, int8x8_t x, uint8x8_t y)
+{
+ return vsudot_lane_s32 (r, x, y, 0);
+}
+
+/*
+**sfoo_laneq:
+** sudot v0\.2s, (v1\.8b, v2\.4b\[2\]|v2\.8b, v1\.4b\[2\])
+** ret
+*/
+int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y)
+{
+ return vsudot_laneq_s32 (r, x, y, 2);
+}
+
+/*
+**sfooq_lane:
+** sudot v0\.4s, (v1\.16b, v2\.4b\[1\]|v2\.16b, v1\.4b\[1\])
+** ret
+*/
+int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y)
+{
+ return vsudotq_lane_s32 (r, x, y, 1);
+}
+
+/*
+**sfooq_laneq:
+** sudot v0\.4s, (v1\.16b, v2\.4b\[3\]|v2\.16b, v1\.4b\[3\])
+** ret
+*/
+int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y)
+{
+ return vsudotq_laneq_s32 (r, x, y, 3);
+}
+
+/*
+**ufoo_untied:
+** mov v0\.8b, v1\.8b
+** usdot v0\.2s, (v2\.8b, v3\.8b|v3\.8b, v2\.8b)
+** ret
+*/
+int32x2_t ufoo_untied (int32x2_t unused, int32x2_t r, uint8x8_t x, int8x8_t y)
+{
+ return vusdot_s32 (r, x, y);
+}
+
+/*
+**ufooq_laneq_untied:
+** mov v0\.16b, v1\.16b
+** usdot v0\.4s, (v2\.16b, v3\.4b\[3\]|v3\.16b, v2\.4b\[3\])
+** ret
+*/
+int32x4_t ufooq_laneq_untied (int32x2_t unused, int32x4_t r, uint8x16_t x,
int8x16_t y)
+{
+ return vusdotq_laneq_s32 (r, x, y, 3);
+}
+
diff --git
a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c
new file mode 100755
index
0000000000000000000000000000000000000000..c0adeb82286a3cd86152eac985376e2b38bf1f01
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-2.c
@@ -0,0 +1,137 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
+/* { dg-add-options arm_v8_2a_i8mm } */
+/* { dg-additional-options "-mbig-endian -O -save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+
+#include <arm_neon.h>
+
+/* Unsigned-Signed Dot Product instructions. */
+
+/*
+**ufoo:
+** usdot v0\.2s, (v1\.8b, v2\.8b|v2\.8b, v1\.8b)
+** ret
+*/
+int32x2_t ufoo (int32x2_t r, uint8x8_t x, int8x8_t y)
+{
+ return vusdot_s32 (r, x, y);
+}
+
+/*
+**ufooq:
+** usdot v0\.4s, (v1\.16b, v2\.16b|v2\.16b, v1\.16b)
+** ret
+*/
+int32x4_t ufooq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+ return vusdotq_s32 (r, x, y);
+}
+
+/*
+**ufoo_lane:
+** usdot v0\.2s, (v1\.8b, v2\.4b\[0\]|v2\.8b, v1\.4b\[0\])
+** ret
+*/
+int32x2_t ufoo_lane (int32x2_t r, uint8x8_t x, int8x8_t y)
+{
+ return vusdot_lane_s32 (r, x, y, 0);
+}
+
+/*
+**ufoo_laneq:
+** usdot v0\.2s, (v1\.8b, v2\.4b\[2\]|v2\.8b, v1\.4b\[2\])
+** ret
+*/
+int32x2_t ufoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y)
+{
+ return vusdot_laneq_s32 (r, x, y, 2);
+}
+
+/*
+**ufooq_lane:
+** usdot v0\.4s, (v1\.16b, v2\.4b\[1\]|v2\.16b, v1\.4b\[1\])
+** ret
+*/
+int32x4_t ufooq_lane (int32x4_t r, uint8x16_t x, int8x8_t y)
+{
+ return vusdotq_lane_s32 (r, x, y, 1);
+}
+
+/*
+**ufooq_laneq:
+** usdot v0\.4s, (v1\.16b, v2\.4b\[3\]|v2\.16b, v1\.4b\[3\])
+** ret
+*/
+int32x4_t ufooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+ return vusdotq_laneq_s32 (r, x, y, 3);
+}
+
+
+/* Signed-Unsigned Dot Product instructions. */
+
+/*
+**sfoo_lane:
+** sudot v0\.2s, (v1\.8b, v2\.4b\[0\]|v2\.8b, v1\.4b\[0\])
+** ret
+*/
+int32x2_t sfoo_lane (int32x2_t r, int8x8_t x, uint8x8_t y)
+{
+ return vsudot_lane_s32 (r, x, y, 0);
+}
+
+/*
+**sfoo_laneq:
+** sudot v0\.2s, (v1\.8b, v2\.4b\[2\]|v2\.8b, v1\.4b\[2\])
+** ret
+*/
+int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y)
+{
+ return vsudot_laneq_s32 (r, x, y, 2);
+}
+
+/*
+**sfooq_lane:
+** sudot v0\.4s, (v1\.16b, v2\.4b\[1\]|v2\.16b, v1\.4b\[1\])
+** ret
+*/
+int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y)
+{
+ return vsudotq_lane_s32 (r, x, y, 1);
+}
+
+/*
+**sfooq_laneq:
+** sudot v0\.4s, (v1\.16b, v2\.4b\[3\]|v2\.16b, v1\.4b\[3\])
+** ret
+*/
+int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y)
+{
+ return vsudotq_laneq_s32 (r, x, y, 3);
+}
+
+/*
+**ufoo_untied:
+** mov v0\.8b, v1\.8b
+** usdot v0\.2s, (v2\.8b, v3\.8b|v3\.8b, v2\.8b)
+** ret
+*/
+int32x2_t ufoo_untied (int32x2_t unused, int32x2_t r, uint8x8_t x, int8x8_t y)
+{
+ return vusdot_s32 (r, x, y);
+}
+
+/*
+**ufooq_laneq_untied:
+** mov v0\.16b, v1\.16b
+** usdot v0\.4s, (v2\.16b, v3\.4b\[3\]|v3\.16b, v2\.4b\[3\])
+** ret
+*/
+int32x4_t ufooq_laneq_untied (int32x2_t unused, int32x4_t r, uint8x16_t x,
int8x16_t y)
+{
+ return vusdotq_laneq_s32 (r, x, y, 3);
+}
+
+
diff --git
a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-3.c
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-3.c
new file mode 100755
index
0000000000000000000000000000000000000000..18ecabef8dc6b99872d71c8e412b6f4b4809e901
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-3.c
@@ -0,0 +1,31 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
+/* { dg-add-options arm_v8_2a_i8mm } */
+/* { dg-additional-options "--save-temps" } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+
+#include <arm_neon.h>
+
+int32x2_t ufoo_lane (int32x2_t r, uint8x8_t x, int8x8_t y)
+{
+ /* { dg-error "lane -1 out of range 0 - 1" "" { target *-*-* } 0 } */
+ return vusdot_lane_s32 (r, x, y, -1);
+}
+
+int32x2_t ufoo_laneq (int32x2_t r, uint8x8_t x, int8x16_t y)
+{
+ /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
+ return vusdot_laneq_s32 (r, x, y, -1);
+}
+
+int32x4_t ufooq_lane (int32x4_t r, uint8x16_t x, int8x8_t y)
+{
+ /* { dg-error "lane 2 out of range 0 - 1" "" { target *-*-* } 0 } */
+ return vusdotq_lane_s32 (r, x, y, 2);
+}
+
+int32x4_t ufooq_laneq (int32x4_t r, uint8x16_t x, int8x16_t y)
+{
+ /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
+ return vusdotq_laneq_s32 (r, x, y, 4);
+}
diff --git
a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-4.c
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-4.c
new file mode 100644
index
0000000000000000000000000000000000000000..66c87d48694bad9624b491aec4cd1a38b75fbb95
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vdot-compile-3-4.c
@@ -0,0 +1,31 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_i8mm_ok } */
+/* { dg-add-options arm_v8_2a_i8mm } */
+/* { dg-additional-options "--save-temps" } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+
+#include <arm_neon.h>
+
+int32x2_t sfoo_lane (int32x2_t r, int8x8_t x, uint8x8_t y)
+{
+ /* { dg-error "lane -1 out of range 0 - 1" "" { target *-*-* } 0 } */
+ return vsudot_lane_s32 (r, x, y, -1);
+}
+
+int32x2_t sfoo_laneq (int32x2_t r, int8x8_t x, uint8x16_t y)
+{
+ /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
+ return vsudot_laneq_s32 (r, x, y, -1);
+}
+
+int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, uint8x8_t y)
+{
+ /* { dg-error "lane 2 out of range 0 - 1" "" { target *-*-* } 0 } */
+ return vsudotq_lane_s32 (r, x, y, 2);
+}
+
+int32x4_t sfooq_laneq (int32x4_t r, int8x16_t x, uint8x16_t y)
+{
+ /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
+ return vsudotq_laneq_s32 (r, x, y, 4);
+}