The AArch64 FEAT_LUT extension is optional from Armv9.2-a and mandatory from Armv9.5-a. It introduces instructions for lookup table reads with bit indices.
This patch adds support for AdvSIMD lut intrinsics. The intrinsics for
this extension are implemented as the following builtin functions:

* vluti2{q}_lane{q}_{u8|s8|p8}
* vluti2{q}_lane{q}_{u16|s16|p16|f16|bf16}
* vluti4q_lane{q}_{u8|s8|p8}
* vluti4q_lane{q}_{u16|s16|p16|f16|bf16}_x2

We also introduce a new approach to performing lane checks for AdvSIMD.

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.cc (ENTRY): Add support for
	one more type.
	(enum class aarch64_builtin_signatures): Add an enum variant for
	ternary functions.
	(struct aarch64_pragma_builtins_data): Add support for one more
	type.
	(aarch64_get_number_of_args): Add support for ternary functions.
	(require_integer_constant): New function to check that an
	argument is an integer constant.
	(require_immediate_range): New function to check that a constant
	integer argument fits in a given range.
	(check_simd_lane_bounds): New function to check the validity of
	a lane index argument.
	(aarch64_general_check_builtin_call): Call
	function_checker::check_simd_lane_bounds.
	(aarch64_expand_pragma_builtin): Add support for lut unspecs.
	* config/aarch64/aarch64-option-extensions.def
	(AARCH64_OPT_EXTENSION): Add lut option.
	* config/aarch64/aarch64-simd-pragma-builtins.def
	(ENTRY_BINARY_LANE): New macro, defined in terms of the extended
	ENTRY macro.
	(ENTRY_TERNARY_VLUT8): New macro to declare lut intrinsics.
	(ENTRY_TERNARY_VLUT16): New macro to declare lut intrinsics.
	(REQUIRED_EXTENSIONS): Declare lut intrinsics.
	* config/aarch64/aarch64-simd.md
	(@aarch64_<vluti_uns_op><VLUT:mode><VB:mode>): New instruction
	pattern for the luti2 and luti4 intrinsics.
	(@aarch64_lutx2<VLUT:mode><VB:mode>): New instruction pattern
	for the luti4x2 intrinsics.
	* config/aarch64/aarch64.h (TARGET_LUT): New macro for the lut
	flag.
	* config/aarch64/iterators.md: Add iterators and attributes for
	lut.
	* doc/invoke.texi: Document the lut extension under AArch64
	Options.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/lut-incorrect-range.c: New test.
	* gcc.target/aarch64/simd/lut-no-flag.c: New test.
	* gcc.target/aarch64/simd/lut.c: New test.
---
 gcc/config/aarch64/aarch64-builtins.cc        | 129 ++-
 .../aarch64/aarch64-option-extensions.def     |   2 +
 .../aarch64/aarch64-simd-pragma-builtins.def  |  40 +-
 gcc/config/aarch64/aarch64-simd.md            |  24 +
 gcc/config/aarch64/aarch64.h                  |   4 +
 gcc/config/aarch64/iterators.md               |  55 +-
 gcc/doc/invoke.texi                           |   2 +
 .../aarch64/simd/lut-incorrect-range.c        | 212 +++++
 .../gcc.target/aarch64/simd/lut-no-flag.c     |  10 +
 gcc/testsuite/gcc.target/aarch64/simd/lut.c   | 849 ++++++++++++++++++
 10 files changed, 1304 insertions(+), 23 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/lut-incorrect-range.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/lut-no-flag.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/lut.c
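A quick usage sketch for reviewers, not part of the patch (the demo
function name is mine). Each intrinsic takes the lookup table vector, a
vector of packed indices, and a constant lane that selects which
segment of the index vector is used. For vluti2_lane_u8 the only valid
lanes are 0 and 1, matching the bound that check_simd_lane_bounds
computes below:

#include <arm_neon.h>

/* Illustrative only: perform both possible LUTI2 lookups for a 64-bit
   index vector.  Lanes 0 and 1 are the only valid immediates here:
   4 * 8 (index elements) / 16 (output elements) - 1 == 1.  */
void
demo_vluti2_lane_u8 (uint8x8_t table, uint8x8_t indices,
		     uint8x16_t results[2])
{
  results[0] = vluti2_lane_u8 (table, indices, 0);
  results[1] = vluti2_lane_u8 (table, indices, 1);
}
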
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index f4d719a6b5a..45aeca33e3f 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -780,7 +780,7 @@ typedef struct AARCH64_SIMD_BUILTIN_##T##_##N##A, #undef ENTRY -#define ENTRY(N, S, T0, T1, T2, U) \ +#define ENTRY(N, S, T0, T1, T2, T3, U) \ AARCH64_##N, enum aarch64_builtins @@ -1596,6 +1596,7 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma) enum class aarch64_builtin_signatures { binary, + ternary, }; namespace { @@ -1616,18 +1617,25 @@ namespace simd_types { constexpr simd_type f16 { V4HFmode, qualifier_none }; constexpr simd_type f16q { V8HFmode, qualifier_none }; + constexpr simd_type f16qx2 { V2x8HFmode, qualifier_none }; constexpr simd_type p16 { V4HImode, qualifier_poly }; constexpr simd_type p16q { V8HImode, qualifier_poly }; + constexpr simd_type p16qx2 { V2x8HImode, qualifier_poly }; constexpr simd_type s16 { V4HImode, qualifier_none }; constexpr simd_type s16q { V8HImode, qualifier_none }; + constexpr simd_type s16qx2 { V2x8HImode, qualifier_none }; constexpr simd_type u16 { V4HImode, qualifier_unsigned }; constexpr simd_type u16q { V8HImode, qualifier_unsigned }; + constexpr simd_type u16qx2 { V2x8HImode, qualifier_unsigned }; constexpr simd_type bf16 { V4BFmode, qualifier_none }; constexpr simd_type bf16q { V8BFmode, qualifier_none }; + constexpr simd_type bf16qx2 { V2x8BFmode, qualifier_none }; constexpr simd_type f32 { V2SFmode, qualifier_none }; constexpr simd_type f32q { V4SFmode, qualifier_none }; + constexpr simd_type s32_index { SImode, qualifier_lane_index }; + constexpr simd_type f64q { V2DFmode, qualifier_none }; constexpr simd_type none { VOIDmode, qualifier_none }; @@ -1636,9 +1644,10 @@ namespace simd_types { } #undef ENTRY -#define ENTRY(N, S, T0, T1, T2, U) \ +#define ENTRY(N, S, T0, T1, T2, T3, U) \ {#N, aarch64_builtin_signatures::S, simd_types::T0, simd_types::T1, \ - simd_types::T2, U, aarch64_required_extensions::REQUIRED_EXTENSIONS}, + simd_types::T2, simd_types::T3, U, \ + aarch64_required_extensions::REQUIRED_EXTENSIONS}, /* Initialize pragma builtins. */ @@ -1646,7 +1655,7 @@ struct aarch64_pragma_builtins_data { const char *name; aarch64_builtin_signatures signature; - simd_type types[3]; + simd_type types[4]; int unspec; aarch64_required_extensions required_extensions; }; @@ -1658,11 +1667,18 @@ static aarch64_pragma_builtins_data aarch64_pragma_builtins[] = { static unsigned int aarch64_get_number_of_args (const aarch64_pragma_builtins_data &builtin_data) { - if (builtin_data.signature == aarch64_builtin_signatures::binary) - return 2; - else - // No other signature supported. - gcc_unreachable (); + switch (builtin_data.signature) + { + case aarch64_builtin_signatures::binary: + return 2; + case aarch64_builtin_signatures::ternary: + return 3; + default: + // No other signature supported. 
+      gcc_unreachable ();
+    }
+
+  gcc_unreachable ();
 }
 
 static tree
@@ -2520,6 +2536,78 @@ aarch64_general_required_extensions (unsigned int code)
   return ext::streaming_compatible (0);
 }
 
+namespace function_checker {
+
+void
+require_integer_constant (location_t location, tree arg)
+{
+  if (TREE_CODE (arg) != INTEGER_CST)
+    {
+      error_at (location, "constant integer argument expected");
+      return;
+    }
+}
+
+void
+require_immediate_range (location_t location, tree arg, HOST_WIDE_INT min,
+			 HOST_WIDE_INT max)
+{
+  if (wi::to_widest (arg) < min || wi::to_widest (arg) > max)
+    {
+      error_at (location, "lane out of range %wd - %wd", min, max);
+      return;
+    }
+}
+
+/* Validate the lane index argument to an intrinsic, using the size of
+   the vector being indexed and the instruction, represented by its
+   unspec.  This only works for intrinsics declared using pragmas in
+   aarch64-simd-pragma-builtins.def.  */
+
+void
+check_simd_lane_bounds (location_t location, const aarch64_pragma_builtins_data
+			*builtin_data, tree *args)
+{
+  if (builtin_data == NULL)
+    // Don't check for functions that are not declared in
+    // aarch64-simd-pragma-builtins.def.
+    return;
+
+  auto nargs = aarch64_get_number_of_args (*builtin_data);
+  switch (builtin_data->unspec)
+    {
+    case UNSPEC_LUTI2:
+    case UNSPEC_LUTI4:
+    case UNSPEC_LUTI4x2:
+      {
+	auto index_arg = args[nargs - 1];
+	require_integer_constant (location, index_arg);
+
+	auto vector_to_index_mode = builtin_data->types[nargs - 1].mode;
+	int vector_to_index_nunits
+	  = GET_MODE_NUNITS (vector_to_index_mode).to_constant ();
+	int output_mode_nunits
+	  = GET_MODE_NUNITS (builtin_data->types[0].mode).to_constant ();
+
+	auto low = 0;
+	int high;
+	if (builtin_data->unspec == UNSPEC_LUTI2)
+	  high = (4 * vector_to_index_nunits / output_mode_nunits) - 1;
+	else
+	  high = (2 * vector_to_index_nunits / output_mode_nunits) - 1;
+
+	require_immediate_range (location, index_arg, low, high);
+	break;
+      }
+
+    default:
+      // Don't need to check lanes for any other operator.
+ return; + } +} + +} + bool aarch64_general_check_builtin_call (location_t location, vec<location_t>, unsigned int code, tree fndecl, @@ -2531,6 +2619,9 @@ aarch64_general_check_builtin_call (location_t location, vec<location_t>, if (!aarch64_check_required_extensions (location, decl, required_extensions)) return false; + auto builtin_data = aarch64_get_pragma_builtin (code); + function_checker::check_simd_lane_bounds (location, builtin_data, args); + switch (code) { case AARCH64_RSR: @@ -3427,7 +3518,7 @@ aarch64_expand_pragma_builtin (tree exp, rtx target, { auto nargs = aarch64_get_number_of_args (*builtin_data); - expand_operand ops[3]; + expand_operand ops[4]; create_output_operand (&ops[0], target, builtin_data->types[0].mode); for (unsigned int i = 1; i <= nargs; ++i) create_input_operand (&ops[i], @@ -3444,9 +3535,27 @@ aarch64_expand_pragma_builtin (tree exp, rtx target, expand_insn (icode, nargs + 1, ops); target = ops[0].value; break; + + case UNSPEC_LUTI2: + case UNSPEC_LUTI4: + icode = code_for_aarch64 (builtin_data->unspec, + builtin_data->types[1].mode, + builtin_data->types[2].mode); + expand_insn (icode, nargs + 1, ops); + target = ops[0].value; + break; + + case UNSPEC_LUTI4x2: + icode = code_for_aarch64_lutx2 (builtin_data->types[0].mode, + builtin_data->types[2].mode); + expand_insn (icode, nargs + 1, ops); + target = ops[0].value; + break; + default: gcc_unreachable (); } + return target; } diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def index f4cf6618238..f555de50ea6 100644 --- a/gcc/config/aarch64/aarch64-option-extensions.def +++ b/gcc/config/aarch64/aarch64-option-extensions.def @@ -247,6 +247,8 @@ AARCH64_OPT_EXTENSION("fp8", FP8, (SIMD), (), (), "fp8") AARCH64_OPT_EXTENSION("faminmax", FAMINMAX, (SIMD), (), (), "faminmax") +AARCH64_OPT_EXTENSION("lut", LUT, (SIMD), (), (), "lut") + #undef AARCH64_OPT_FMV_EXTENSION #undef AARCH64_OPT_EXTENSION #undef AARCH64_FMV_FEATURE diff --git a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def index e49db23cbd1..ab6e520f4d7 100644 --- a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def @@ -20,7 +20,11 @@ #undef ENTRY_BINARY #define ENTRY_BINARY(N, T0, T1, T2, U) \ - ENTRY (N, binary, T0, T1, T2, U) + ENTRY (N, binary, T0, T1, T2, none, U) + +#undef ENTRY_BINARY_LANE +#define ENTRY_BINARY_LANE(N, T0, T1, T2, U) \ + ENTRY (N, ternary, T0, T1, T2, s32_index, U) #undef ENTRY_BINARY_VHSDF #define ENTRY_BINARY_VHSDF(NAME, UNSPEC) \ @@ -30,8 +34,42 @@ ENTRY_BINARY (NAME##q_f32, f32q, f32q, f32q, UNSPEC) \ ENTRY_BINARY (NAME##q_f64, f64q, f64q, f64q, UNSPEC) +#undef ENTRY_TERNARY_VLUT8 +#define ENTRY_TERNARY_VLUT8(T) \ + ENTRY_BINARY_LANE (vluti2_lane_##T##8, T##8q, T##8, u8, UNSPEC_LUTI2) \ + ENTRY_BINARY_LANE (vluti2_laneq_##T##8, T##8q, T##8, u8q, UNSPEC_LUTI2) \ + ENTRY_BINARY_LANE (vluti2q_lane_##T##8, T##8q, T##8q, u8, UNSPEC_LUTI2) \ + ENTRY_BINARY_LANE (vluti2q_laneq_##T##8, T##8q, T##8q, u8q, UNSPEC_LUTI2) \ + ENTRY_BINARY_LANE (vluti4q_lane_##T##8, T##8q, T##8q, u8, UNSPEC_LUTI4) \ + ENTRY_BINARY_LANE (vluti4q_laneq_##T##8, T##8q, T##8q, u8q, UNSPEC_LUTI4) + +#undef ENTRY_TERNARY_VLUT16 +#define ENTRY_TERNARY_VLUT16(T) \ + ENTRY_BINARY_LANE (vluti2_lane_##T##16, T##16q, T##16, u8, UNSPEC_LUTI2) \ + ENTRY_BINARY_LANE (vluti2_laneq_##T##16, T##16q, T##16, u8q, UNSPEC_LUTI2) \ + ENTRY_BINARY_LANE (vluti2q_lane_##T##16, T##16q, T##16q, u8, 
UNSPEC_LUTI2) \
+  ENTRY_BINARY_LANE (vluti2q_laneq_##T##16, T##16q, T##16q, u8q, \
+		     UNSPEC_LUTI2) \
+  ENTRY_BINARY_LANE (vluti4q_lane_##T##16_x2, T##16q, T##16qx2, u8, \
+		     UNSPEC_LUTI4x2) \
+  ENTRY_BINARY_LANE (vluti4q_laneq_##T##16_x2, T##16q, T##16qx2, u8q, \
+		     UNSPEC_LUTI4x2)
+
 // faminmax
 #define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FAMINMAX)
 ENTRY_BINARY_VHSDF (vamax, UNSPEC_FAMAX)
 ENTRY_BINARY_VHSDF (vamin, UNSPEC_FAMIN)
 #undef REQUIRED_EXTENSIONS
+
+// lut
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_LUT)
+ENTRY_TERNARY_VLUT8 (p)
+ENTRY_TERNARY_VLUT8 (s)
+ENTRY_TERNARY_VLUT8 (u)
+
+ENTRY_TERNARY_VLUT16 (bf)
+ENTRY_TERNARY_VLUT16 (f)
+ENTRY_TERNARY_VLUT16 (p)
+ENTRY_TERNARY_VLUT16 (s)
+ENTRY_TERNARY_VLUT16 (u)
+#undef REQUIRED_EXTENSIONS
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index cfe95bd4c31..32ec30ad5d1 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -9999,3 +9999,27 @@
   "TARGET_FAMINMAX"
   "<faminmax_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
 )
+
+;; lut
+(define_insn "@aarch64_<vluti_uns_op><VLUT:mode><VB:mode>"
+  [(set (match_operand:<VLUT:VCONQ> 0 "register_operand" "=w")
+	(unspec:<VLUT:VCONQ>
+	  [(match_operand:VLUT 1 "register_operand" "w")
+	   (match_operand:VB 2 "register_operand" "w")
+	   (match_operand:SI 3 "const_int_operand" "n")]
+	  VLUT_UNS))]
+  "TARGET_LUT"
+  "<vluti_uns_op>\t%0<VLUT:Vconqtype>, {%1<VLUT:Vconqtype>}, %2[%3]"
+)
+
+;; lutx2
+(define_insn "@aarch64_lutx2<VLUT:mode><VB:mode>"
+  [(set (match_operand:VLUT 0 "register_operand" "=w")
+	(unspec:VLUT
+	  [(match_operand:<VLUT:velt_vstructx2> 1 "register_operand" "w")
+	   (match_operand:VB 2 "register_operand" "w")
+	   (match_operand:SI 3 "const_int_operand" "n")]
+	  VLUTx2_UNS))]
+  "TARGET_LUT"
+  "<vluti_uns_op>\t%0<VLUT:Vmtype>, {%S1<VLUT:Vmtype>, %T1<VLUT:Vmtype>}, %2[%3]"
+)
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index b063c315fba..a8e91e4cf3f 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -487,6 +487,10 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
 #define TARGET_FAMINMAX AARCH64_HAVE_ISA (FAMINMAX)
 #define TARGET_SVE_FAMINMAX (TARGET_SVE && TARGET_FAMINMAX)
 
+/* Lookup table (LUTI) extension instructions are
+   enabled through +lut.  */
+#define TARGET_LUT AARCH64_HAVE_ISA (LUT)
+
 /* Prefer different predicate registers for the output of a predicated
    operation over re-using an existing input predicate.  */
 #define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 023893d35f3..f3c4f161659 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -426,6 +426,10 @@
 			   (V8HF "TARGET_SIMD_F16INST")
 			   V2SF V4SF])
 
+;; Modes available for Advanced SIMD lut operations.
+(define_mode_iterator VLUT [V8QI V16QI V4HI V8HI V4HF V8HF V4BF V8BF])
+(define_mode_iterator VLUTx2 [V2x8HI V2x8HF V2x8BF])
+
 ;; Iterators for single modes, for "@" patterns.
 (define_mode_iterator VNx16QI_ONLY [VNx16QI])
 (define_mode_iterator VNx16SI_ONLY [VNx16SI])
@@ -1090,6 +1094,9 @@
     UNSPEC_FCVTXN	; Used in aarch64-simd.md.
     UNSPEC_FAMAX	; Used in aarch64-simd.md.
     UNSPEC_FAMIN	; Used in aarch64-simd.md.
+    UNSPEC_LUTI2	; Used in aarch64-simd.md.
+    UNSPEC_LUTI4	; Used in aarch64-simd.md.
+    UNSPEC_LUTI4x2	; Used in aarch64-simd.md.
;; All used in aarch64-sve2.md UNSPEC_ADDQV @@ -1536,6 +1543,12 @@ (QI "8b") (HI "8b") (V4BF "8b") (V8BF "16b")]) +;; Mode to double type mapping. +(define_mode_attr Vconqtype [(V8QI ".16b") (V16QI ".16b") + (V4HI ".8h") (V8HI ".8h") + (V4HF ".8h") (V8HF ".8h") + (V4BF ".8h") (V8BF ".8h")]) + ;; Advanced SIMD vector structure to element modes. (define_mode_attr VSTRUCT_ELT [(V2x8QI "V8QI") (V2x4HI "V4HI") (V2x2SI "V2SI") (V2x1DI "DI") @@ -1562,6 +1575,15 @@ (V4x8HF "V8HF") (V4x4SF "V4SF") (V4x2DF "V2DF") (V4x8BF "V8BF")]) +;; Advanced SIMD element to vector structure x2 modes. +(define_mode_attr velt_vstructx2 [(V8QI "V2x8QI") (V4HI "V2x4HI") + (V2SI "V2x2SI") (V4HF "V2x4HF") + (V2SF "V2x2SF") (V4BF "V2x4BF") + (V16QI "V2x16QI") (V8HI "V2x8HI") + (V4SI "V2x4SI") (V2DI "V2x2DI") + (V8HF "V2x8HF") (V4SF "V2x4SF") + (V2DF "V2x2DF") (V8BF "V2x8BF")]) + ;; Advanced SIMD vector structure to element modes in lower case. (define_mode_attr vstruct_elt [(V2x8QI "v8qi") (V2x4HI "v4hi") (V2x2SI "v2si") (V2x1DI "di") @@ -1666,6 +1688,7 @@ (V2SI "V4SI") (V4SI "V4SI") (DI "V2DI") (V2DI "V2DI") (V4HF "V8HF") (V8HF "V8HF") + (V4BF "V8BF") (V8BF "V8BF") (V2SF "V4SF") (V4SF "V4SF") (V2DF "V2DF") (SI "V4SI") (HI "V8HI") (QI "V16QI") @@ -3146,6 +3169,14 @@ (define_int_iterator LAST [UNSPEC_LASTA UNSPEC_LASTB]) +;; Iterators for fp8 operations + +(define_int_iterator FAMINMAX_UNS [UNSPEC_FAMAX UNSPEC_FAMIN]) + +(define_int_iterator VLUT_UNS [UNSPEC_LUTI2 UNSPEC_LUTI4]) + +(define_int_iterator VLUTx2_UNS [UNSPEC_LUTI4x2]) + (define_int_iterator SVE_INT_UNARY [UNSPEC_REVB UNSPEC_REVH UNSPEC_REVW]) @@ -3949,6 +3980,9 @@ (define_code_attr binqops_op_rev [(ss_plus "sqsub") (ss_minus "sqadd")]) +(define_code_attr faminmax_op + [(smax "famax") (smin "famin")]) + ;; The SVE logical instruction that implements an unspec. (define_int_attr logicalf_op [(UNSPEC_ANDF "and") (UNSPEC_IORF "orr") @@ -4161,6 +4195,15 @@ (define_int_attr frintnzs_op [(UNSPEC_FRINT32Z "frint32z") (UNSPEC_FRINT32X "frint32x") (UNSPEC_FRINT64Z "frint64z") (UNSPEC_FRINT64X "frint64x")]) +(define_int_attr faminmax_cond_uns_op + [(UNSPEC_COND_SMAX "famax") (UNSPEC_COND_SMIN "famin")]) + +(define_int_attr faminmax_uns_op + [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) + +(define_int_attr vluti_uns_op + [(UNSPEC_LUTI2 "luti2") (UNSPEC_LUTI4 "luti4") (UNSPEC_LUTI4x2 "luti4")]) + ;; The condition associated with an UNSPEC_COND_<xx>. (define_int_attr cmp_op [(UNSPEC_COND_CMPEQ_WIDE "eq") (UNSPEC_COND_CMPGE_WIDE "ge") @@ -4719,15 +4762,3 @@ (define_int_attr bits_etype [(8 "b") (16 "h") (32 "s") (64 "d")]) -;; Iterators and attributes for faminmax - -(define_int_iterator FAMINMAX_UNS [UNSPEC_FAMAX UNSPEC_FAMIN]) - -(define_int_attr faminmax_cond_uns_op - [(UNSPEC_COND_SMAX "famax") (UNSPEC_COND_SMIN "famin")]) - -(define_int_attr faminmax_uns_op - [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) - -(define_code_attr faminmax_op - [(smax "famax") (smin "famin")]) diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 346ac1369b8..a42bd1b14db 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -21905,6 +21905,8 @@ Enable the RCpc3 (Release Consistency) extension. Enable the fp8 (8-bit floating point) extension. @item faminmax Enable the Floating Point Absolute Maximum/Minimum extension. +@item lut +Enable the Lookup Table extension. @item sve-b16b16 Enable the SVE non-widening brain floating-point (@code{bf16}) extension. This only has an effect when @code{sve2} or @code{sme2} are also enabled. 
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/lut-incorrect-range.c b/gcc/testsuite/gcc.target/aarch64/simd/lut-incorrect-range.c new file mode 100644 index 00000000000..93fc6d89768 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/lut-incorrect-range.c @@ -0,0 +1,212 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -march=armv9-a+lut" } */ + +#include "arm_neon.h" + +void +test_vluti2_laneu8(uint8x8_t a, uint8x8_t b, uint8x16_t c, uint8x16_t d) +{ + vluti2_lane_u8(a, b, -1); /* { dg-error {lane out of range 0 - 1} } */ + vluti2_lane_u8(a, b, 2); /* { dg-error {lane out of range 0 - 1} } */ + + vluti2_laneq_u8(a, d, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti2_laneq_u8(a, d, 4); /* { dg-error {lane out of range 0 - 3} } */ + + vluti2q_lane_u8(c, b, -1); /* { dg-error {lane out of range 0 - 1} } */ + vluti2q_lane_u8(c, b, 2); /* { dg-error {lane out of range 0 - 1} } */ + + vluti2q_laneq_u8(c, d, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti2q_laneq_u8(c, d, 4); /* { dg-error {lane out of range 0 - 3} } */ +} + +void +test_vluti2_lanes8(int8x8_t a, uint8x8_t b, int8x16_t c, uint8x16_t d) +{ + vluti2_lane_s8(a, b, -1); /* { dg-error {lane out of range 0 - 1} } */ + vluti2_lane_s8(a, b, 2); /* { dg-error {lane out of range 0 - 1} } */ + + vluti2_laneq_s8(a, d, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti2_laneq_s8(a, d, 4); /* { dg-error {lane out of range 0 - 3} } */ + + vluti2q_lane_s8(c, b, -1); /* { dg-error {lane out of range 0 - 1} } */ + vluti2q_lane_s8(c, b, 2); /* { dg-error {lane out of range 0 - 1} } */ + + vluti2q_laneq_s8(c, d, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti2q_laneq_s8(c, d, 4); /* { dg-error {lane out of range 0 - 3} } */ +} + +void +test_vluti2_lanep8(poly8x8_t a, uint8x8_t b, poly8x16_t c, uint8x16_t d) +{ + vluti2_lane_p8(a, b, -1); /* { dg-error {lane out of range 0 - 1} } */ + vluti2_lane_p8(a, b, 2); /* { dg-error {lane out of range 0 - 1} } */ + + vluti2_laneq_p8(a, d, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti2_laneq_p8(a, d, 4); /* { dg-error {lane out of range 0 - 3} } */ + + vluti2q_lane_p8(c, b, -1); /* { dg-error {lane out of range 0 - 1} } */ + vluti2q_lane_p8(c, b, 2); /* { dg-error {lane out of range 0 - 1} } */ + + vluti2q_laneq_p8(c, d, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti2q_laneq_p8(c, d, 4); /* { dg-error {lane out of range 0 - 3} } */ +} + +void +test_vluti2_laneu16(uint16x4_t a, uint8x8_t b, uint16x8_t c, uint8x16_t d) +{ + vluti2_lane_u16(a, b, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti2_lane_u16(a, b, 4); /* { dg-error {lane out of range 0 - 3} } */ + + vluti2_laneq_u16(a, d, -1); /* { dg-error {lane out of range 0 - 7} } */ + vluti2_laneq_u16(a, d, 8); /* { dg-error {lane out of range 0 - 7} } */ + + vluti2q_lane_u16(c, b, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti2q_lane_u16(c, b, 4); /* { dg-error {lane out of range 0 - 3} } */ + + vluti2q_laneq_u16(c, d, -1); /* { dg-error {lane out of range 0 - 7} } */ + vluti2q_laneq_u16(c, d, 8); /* { dg-error {lane out of range 0 - 7} } */ +} + +void +test_vluti2_lanes16(int16x4_t a, uint8x8_t b, int16x8_t c, uint8x16_t d) +{ + vluti2_lane_s16(a, b, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti2_lane_s16(a, b, 4); /* { dg-error {lane out of range 0 - 3} } */ + + vluti2_laneq_s16(a, d, -1); /* { dg-error {lane out of range 0 - 7} } */ + vluti2_laneq_s16(a, d, 8); /* { dg-error {lane out of range 0 - 7} } */ + + vluti2q_lane_s16(c, b, -1); /* { 
dg-error {lane out of range 0 - 3} } */ + vluti2q_lane_s16(c, b, 4); /* { dg-error {lane out of range 0 - 3} } */ + + vluti2q_laneq_s16(c, d, -1); /* { dg-error {lane out of range 0 - 7} } */ + vluti2q_laneq_s16(c, d, 8); /* { dg-error {lane out of range 0 - 7} } */ +} + +void +test_vluti2_lanep16(poly16x4_t a, uint8x8_t b, poly16x8_t c, uint8x16_t d) +{ + vluti2_lane_p16(a, b, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti2_lane_p16(a, b, 4); /* { dg-error {lane out of range 0 - 3} } */ + + vluti2_laneq_p16(a, d, -1); /* { dg-error {lane out of range 0 - 7} } */ + vluti2_laneq_p16(a, d, 8); /* { dg-error {lane out of range 0 - 7} } */ + + vluti2q_lane_p16(c, b, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti2q_lane_p16(c, b, 4); /* { dg-error {lane out of range 0 - 3} } */ + + vluti2q_laneq_p16(c, d, -1); /* { dg-error {lane out of range 0 - 7} } */ + vluti2q_laneq_p16(c, d, 8); /* { dg-error {lane out of range 0 - 7} } */ +} + +void +test_vluti2_lanef16(float16x4_t a, uint8x8_t b, float16x8_t c, uint8x16_t d) +{ + vluti2_lane_f16(a, b, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti2_lane_f16(a, b, 4); /* { dg-error {lane out of range 0 - 3} } */ + + vluti2_laneq_f16(a, d, -1); /* { dg-error {lane out of range 0 - 7} } */ + vluti2_laneq_f16(a, d, 8); /* { dg-error {lane out of range 0 - 7} } */ + + vluti2q_lane_f16(c, b, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti2q_lane_f16(c, b, 4); /* { dg-error {lane out of range 0 - 3} } */ + + vluti2q_laneq_f16(c, d, -1); /* { dg-error {lane out of range 0 - 7} } */ + vluti2q_laneq_f16(c, d, 8); /* { dg-error {lane out of range 0 - 7} } */ +} + +void +test_vluti2_lanebf16(bfloat16x4_t a, uint8x8_t b, bfloat16x8_t c, uint8x16_t d) +{ + vluti2_lane_bf16(a, b, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti2_lane_bf16(a, b, 4); /* { dg-error {lane out of range 0 - 3} } */ + + vluti2_laneq_bf16(a, d, -1); /* { dg-error {lane out of range 0 - 7} } */ + vluti2_laneq_bf16(a, d, 8); /* { dg-error {lane out of range 0 - 7} } */ + + vluti2q_lane_bf16(c, b, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti2q_lane_bf16(c, b, 4); /* { dg-error {lane out of range 0 - 3} } */ + + vluti2q_laneq_bf16(c, d, -1); /* { dg-error {lane out of range 0 - 7} } */ + vluti2q_laneq_bf16(c, d, 8); /* { dg-error {lane out of range 0 - 7} } */ +} + +void +test_vluti4q_laneu8(uint8x16_t a, uint8x8_t b, uint8x16_t d) +{ + vluti4q_lane_u8(a, b, -1); /* { dg-error {lane out of range 0 - 0} } */ + vluti4q_lane_u8(a, b, 1); /* { dg-error {lane out of range 0 - 0} } */ + + vluti4q_laneq_u8(a, d, -1); /* { dg-error {lane out of range 0 - 1} } */ + vluti4q_laneq_u8(a, d, 2); /* { dg-error {lane out of range 0 - 1} } */ +} + +void +test_vluti4q_lanes8(int8x16_t a, uint8x8_t b, uint8x16_t d) +{ + vluti4q_lane_s8(a, b, -1); /* { dg-error {lane out of range 0 - 0} } */ + vluti4q_lane_s8(a, b, 1); /* { dg-error {lane out of range 0 - 0} } */ + + vluti4q_laneq_s8(a, d, -1); /* { dg-error {lane out of range 0 - 1} } */ + vluti4q_laneq_s8(a, d, 2); /* { dg-error {lane out of range 0 - 1} } */ +} + +void +test_vluti4q_lanep8(poly8x16_t a, uint8x8_t b, uint8x16_t d) +{ + vluti4q_lane_p8(a, b, -1); /* { dg-error {lane out of range 0 - 0} } */ + vluti4q_lane_p8(a, b, 1); /* { dg-error {lane out of range 0 - 0} } */ + + vluti4q_laneq_p8(a, d, -1); /* { dg-error {lane out of range 0 - 1} } */ + vluti4q_laneq_p8(a, d, 2); /* { dg-error {lane out of range 0 - 1} } */ +} + +void +test_vluti4q_laneu16_x2(uint16x8x2_t a, uint8x8_t b, uint8x16_t 
d) +{ + vluti4q_lane_u16_x2(a, b, -1); /* { dg-error {lane out of range 0 - 1} } */ + vluti4q_lane_u16_x2(a, b, 2); /* { dg-error {lane out of range 0 - 1} } */ + + vluti4q_laneq_u16_x2(a, d, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti4q_laneq_u16_x2(a, d, 4); /* { dg-error {lane out of range 0 - 3} } */ +} + +void +test_vluti4q_lanes16_x2(int16x8x2_t a, uint8x8_t b, uint8x16_t d) +{ + vluti4q_lane_s16_x2(a, b, -1); /* { dg-error {lane out of range 0 - 1} } */ + vluti4q_lane_s16_x2(a, b, 2); /* { dg-error {lane out of range 0 - 1} } */ + + vluti4q_laneq_s16_x2(a, d, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti4q_laneq_s16_x2(a, d, 4); /* { dg-error {lane out of range 0 - 3} } */ +} + +void +test_vluti4q_lanep16_x2(poly16x8x2_t a, uint8x8_t b, uint8x16_t d) +{ + vluti4q_lane_p16_x2(a, b, -1); /* { dg-error {lane out of range 0 - 1} } */ + vluti4q_lane_p16_x2(a, b, 2); /* { dg-error {lane out of range 0 - 1} } */ + + vluti4q_laneq_p16_x2(a, d, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti4q_laneq_p16_x2(a, d, 4); /* { dg-error {lane out of range 0 - 3} } */ +} + +void +test_vluti4q_lanef16_x2(float16x8x2_t a, uint8x8_t b, uint8x16_t d) +{ + vluti4q_lane_f16_x2(a, b, -1); /* { dg-error {lane out of range 0 - 1} } */ + vluti4q_lane_f16_x2(a, b, 2); /* { dg-error {lane out of range 0 - 1} } */ + + vluti4q_laneq_f16_x2(a, d, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti4q_laneq_f16_x2(a, d, 4); /* { dg-error {lane out of range 0 - 3} } */ +} + +void +test_vluti4q_lanebf16_x2(bfloat16x8x2_t a, uint8x8_t b, uint8x16_t d) +{ + vluti4q_lane_bf16_x2(a, b, -1); /* { dg-error {lane out of range 0 - 1} } */ + vluti4q_lane_bf16_x2(a, b, 2); /* { dg-error {lane out of range 0 - 1} } */ + + vluti4q_laneq_bf16_x2(a, d, -1); /* { dg-error {lane out of range 0 - 3} } */ + vluti4q_laneq_bf16_x2(a, d, 4); /* { dg-error {lane out of range 0 - 3} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/simd/lut-no-flag.c b/gcc/testsuite/gcc.target/aarch64/simd/lut-no-flag.c new file mode 100644 index 00000000000..d180d8f2150 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/lut-no-flag.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=armv9-a" } */ + +#include "arm_neon.h" + +void +test (uint8x8_t a, uint8x8_t b) +{ + vluti2_lane_u8 (a, b, 0); /* { dg-error {ACLE function 'vluti2_lane_u8' requires ISA extension 'lut'} } */ +} diff --git a/gcc/testsuite/gcc.target/aarch64/simd/lut.c b/gcc/testsuite/gcc.target/aarch64/simd/lut.c new file mode 100644 index 00000000000..fc89b215a93 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/lut.c @@ -0,0 +1,849 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -march=armv9-a+lut" } */ +/* { dg-final { check-function-bodies "**" ""} } */ + +#include "arm_neon.h" + +/* +** test_vluti2_laneu8: +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** ... +** ret +*/ +void +test_vluti2_laneu8(uint8x8_t a, uint8x8_t b, uint8x16_t results[]) +{ + results[0] = vluti2_lane_u8(a, b, 0); + results[1] = vluti2_lane_u8(a, b, 1); +} + +/* +** test_vluti2_lanequ8: +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\] +** ... 
+** ret +*/ +void +test_vluti2_lanequ8(uint8x8_t a, uint8x16_t b, uint8x16_t results[]) +{ + results[0] = vluti2_laneq_u8(a, b, 0); + results[1] = vluti2_laneq_u8(a, b, 1); + results[2] = vluti2_laneq_u8(a, b, 2); + results[3] = vluti2_laneq_u8(a, b, 3); +} + +/* +** test_vluti2q_laneu8: +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** ... +** ret +*/ +void +test_vluti2q_laneu8(uint8x16_t a, uint8x8_t b, uint8x16_t results[]) +{ + results[0] = vluti2q_lane_u8(a, b, 0); + results[1] = vluti2q_lane_u8(a, b, 1); +} + +/* +** test_vluti2q_lanequ8: +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\] +** ... +** ret +*/ +void +test_vluti2q_lanequ8(uint8x16_t a, uint8x16_t b, uint8x16_t results[]) +{ + results[0] = vluti2q_laneq_u8(a, b, 0); + results[1] = vluti2q_laneq_u8(a, b, 1); + results[2] = vluti2q_laneq_u8(a, b, 2); + results[3] = vluti2q_laneq_u8(a, b, 3); +} + +/* +** test_vluti2_lanes8: +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** ... +** ret +*/ +void +test_vluti2_lanes8(int8x8_t a, uint8x8_t b, int8x16_t results[]) +{ + results[0] = vluti2_lane_s8(a, b, 0); + results[1] = vluti2_lane_s8(a, b, 1); +} + +/* +** test_vluti2_laneqs8: +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\] +** ... +** ret +*/ +void +test_vluti2_laneqs8(int8x8_t a, uint8x16_t b, int8x16_t results[]) +{ + results[0] = vluti2_laneq_s8(a, b, 0); + results[1] = vluti2_laneq_s8(a, b, 1); + results[2] = vluti2_laneq_s8(a, b, 2); + results[3] = vluti2_laneq_s8(a, b, 3); +} + +/* +** test_vluti2q_lanes8: +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** ... +** ret +*/ +void +test_vluti2q_lanes8(int8x16_t a, uint8x8_t b, int8x16_t results[]) +{ + results[0] = vluti2q_lane_s8(a, b, 0); + results[1] = vluti2q_lane_s8(a, b, 1); +} + +/* +** test_vluti2q_laneqs8: +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\] +** ... +** ret +*/ +void +test_vluti2q_laneqs8(int8x16_t a, uint8x16_t b, int8x16_t results[]) +{ + results[0] = vluti2q_laneq_s8(a, b, 0); + results[1] = vluti2q_laneq_s8(a, b, 1); + results[2] = vluti2q_laneq_s8(a, b, 2); + results[3] = vluti2q_laneq_s8(a, b, 3); +} + +/* +** test_vluti2_lanep8: +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** ... +** ret +*/ +void +test_vluti2_lanep8(poly8x8_t a, uint8x8_t b, poly8x16_t results[]) +{ + results[0] = vluti2_lane_p8(a, b, 0); + results[1] = vluti2_lane_p8(a, b, 1); +} + +/* +** test_vluti2_laneqp8: +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\] +** ... 
+** ret +*/ +void +test_vluti2_laneqp8(poly8x8_t a, uint8x16_t b, poly8x16_t results[]) +{ + results[0] = vluti2_laneq_p8(a, b, 0); + results[1] = vluti2_laneq_p8(a, b, 1); + results[2] = vluti2_laneq_p8(a, b, 2); + results[3] = vluti2_laneq_p8(a, b, 3); +} + +/* +** test_vluti2q_lanep8: +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** ... +** ret +*/ +void +test_vluti2q_lanep8(poly8x16_t a, uint8x8_t b, poly8x16_t results[]) +{ + results[0] = vluti2q_lane_p8(a, b, 0); + results[1] = vluti2q_lane_p8(a, b, 1); +} + +/* +** test_vluti2q_laneqp8: +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\] +** luti2 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\] +** ... +** ret +*/ +void +test_vluti2q_laneqp8(poly8x16_t a, uint8x16_t b, poly8x16_t results[]) +{ + results[0] = vluti2q_laneq_p8(a, b, 0); + results[1] = vluti2q_laneq_p8(a, b, 1); + results[2] = vluti2q_laneq_p8(a, b, 2); + results[3] = vluti2q_laneq_p8(a, b, 3); +} + +/* +** test_vluti2_laneu16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** ... +** ret +*/ +void +test_vluti2_laneu16(uint16x4_t a, uint8x8_t b, uint16x8_t results[]) +{ + results[0] = vluti2_lane_u16(a, b, 0); + results[1] = vluti2_lane_u16(a, b, 1); + results[2] = vluti2_lane_u16(a, b, 2); + results[3] = vluti2_lane_u16(a, b, 3); +} + +/* +** test_vluti2_lanequ16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\] +** ... +** ret +*/ +void +test_vluti2_lanequ16(uint16x4_t a, uint8x16_t b, uint16x8_t results[]) +{ + results[0] = vluti2_laneq_u16(a, b, 0); + results[1] = vluti2_laneq_u16(a, b, 1); + results[2] = vluti2_laneq_u16(a, b, 2); + results[3] = vluti2_laneq_u16(a, b, 3); + results[4] = vluti2_laneq_u16(a, b, 4); + results[5] = vluti2_laneq_u16(a, b, 5); + results[6] = vluti2_laneq_u16(a, b, 6); + results[7] = vluti2_laneq_u16(a, b, 7); +} + +/* +** test_vluti2q_laneu16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** ... +** ret +*/ +void +test_vluti2q_laneu16(uint16x8_t a, uint8x8_t b, uint16x8_t results[]) +{ + results[0] = vluti2q_lane_u16(a, b, 0); + results[1] = vluti2q_lane_u16(a, b, 1); + results[2] = vluti2q_lane_u16(a, b, 2); + results[3] = vluti2q_lane_u16(a, b, 3); +} + +/* +** test_vluti2q_lanequ16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\] +** ... 
+** ret +*/ +void +test_vluti2q_lanequ16(uint16x8_t a, uint8x16_t b, uint16x8_t results[]) +{ + results[0] = vluti2q_laneq_u16(a, b, 0); + results[1] = vluti2q_laneq_u16(a, b, 1); + results[2] = vluti2q_laneq_u16(a, b, 2); + results[3] = vluti2q_laneq_u16(a, b, 3); + results[4] = vluti2q_laneq_u16(a, b, 4); + results[5] = vluti2q_laneq_u16(a, b, 5); + results[6] = vluti2q_laneq_u16(a, b, 6); + results[7] = vluti2q_laneq_u16(a, b, 7); +} + +/* +** test_vluti2_lanes16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** ... +** ret +*/ +void +test_vluti2_lanes16(int16x4_t a, uint8x8_t b, int16x8_t results[]) +{ + results[0] = vluti2_lane_s16(a, b, 0); + results[1] = vluti2_lane_s16(a, b, 1); + results[2] = vluti2_lane_s16(a, b, 2); + results[3] = vluti2_lane_s16(a, b, 3); +} + +/* +** test_vluti2_laneqs16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\] +** ... +** ret +*/ +void +test_vluti2_laneqs16(int16x4_t a, uint8x16_t b, int16x8_t results[]) +{ + results[0] = vluti2_laneq_s16(a, b, 0); + results[1] = vluti2_laneq_s16(a, b, 1); + results[2] = vluti2_laneq_s16(a, b, 2); + results[3] = vluti2_laneq_s16(a, b, 3); + results[4] = vluti2_laneq_s16(a, b, 4); + results[5] = vluti2_laneq_s16(a, b, 5); + results[6] = vluti2_laneq_s16(a, b, 6); + results[7] = vluti2_laneq_s16(a, b, 7); +} + +/* +** test_vluti2q_lanes16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** ... +** ret +*/ +void +test_vluti2q_lanes16(int16x8_t a, uint8x8_t b, int16x8_t results[]) +{ + results[0] = vluti2q_lane_s16(a, b, 0); + results[1] = vluti2q_lane_s16(a, b, 1); + results[2] = vluti2q_lane_s16(a, b, 2); + results[3] = vluti2q_lane_s16(a, b, 3); +} + +/* +** test_vluti2q_laneqs16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\] +** ... +** ret +*/ +void +test_vluti2q_laneqs16(int16x8_t a, uint8x16_t b, int16x8_t results[]) +{ + results[0] = vluti2q_laneq_s16(a, b, 0); + results[1] = vluti2q_laneq_s16(a, b, 1); + results[2] = vluti2q_laneq_s16(a, b, 2); + results[3] = vluti2q_laneq_s16(a, b, 3); + results[4] = vluti2q_laneq_s16(a, b, 4); + results[5] = vluti2q_laneq_s16(a, b, 5); + results[6] = vluti2q_laneq_s16(a, b, 6); + results[7] = vluti2q_laneq_s16(a, b, 7); +} + +/* +** test_vluti2_lanep16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** ... 
+** ret +*/ +void +test_vluti2_lanep16(poly16x4_t a, uint8x8_t b, poly16x8_t results[]) +{ + results[0] = vluti2_lane_p16(a, b, 0); + results[1] = vluti2_lane_p16(a, b, 1); + results[2] = vluti2_lane_p16(a, b, 2); + results[3] = vluti2_lane_p16(a, b, 3); +} + +/* +** test_vluti2_laneqp16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\] +** ... +** ret +*/ +void +test_vluti2_laneqp16(poly16x4_t a, uint8x16_t b, poly16x8_t results[]) +{ + results[0] = vluti2_laneq_p16(a, b, 0); + results[1] = vluti2_laneq_p16(a, b, 1); + results[2] = vluti2_laneq_p16(a, b, 2); + results[3] = vluti2_laneq_p16(a, b, 3); + results[4] = vluti2_laneq_p16(a, b, 4); + results[5] = vluti2_laneq_p16(a, b, 5); + results[6] = vluti2_laneq_p16(a, b, 6); + results[7] = vluti2_laneq_p16(a, b, 7); +} + +/* +** test_vluti2q_lanep16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** ... +** ret +*/ +void +test_vluti2q_lanep16(poly16x8_t a, uint8x8_t b, poly16x8_t results[]) +{ + results[0] = vluti2q_lane_p16(a, b, 0); + results[1] = vluti2q_lane_p16(a, b, 1); + results[2] = vluti2q_lane_p16(a, b, 2); + results[3] = vluti2q_lane_p16(a, b, 3); +} + +/* +** test_vluti2q_laneqp16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\] +** ... +** ret +*/ +void +test_vluti2q_laneqp16(poly16x8_t a, uint8x16_t b, poly16x8_t results[]) +{ + results[0] = vluti2q_laneq_p16(a, b, 0); + results[1] = vluti2q_laneq_p16(a, b, 1); + results[2] = vluti2q_laneq_p16(a, b, 2); + results[3] = vluti2q_laneq_p16(a, b, 3); + results[4] = vluti2q_laneq_p16(a, b, 4); + results[5] = vluti2q_laneq_p16(a, b, 5); + results[6] = vluti2q_laneq_p16(a, b, 6); + results[7] = vluti2q_laneq_p16(a, b, 7); +} + +/* +** test_vluti2_lanef16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** ... +** ret +*/ +void +test_vluti2_lanef16(float16x4_t a, uint8x8_t b, float16x8_t results[]) +{ + results[0] = vluti2_lane_f16(a, b, 0); + results[1] = vluti2_lane_f16(a, b, 1); + results[2] = vluti2_lane_f16(a, b, 2); + results[3] = vluti2_lane_f16(a, b, 3); +} + +/* +** test_vluti2_laneqf16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\] +** ... 
+** ret +*/ +void +test_vluti2_laneqf16(float16x4_t a, uint8x16_t b, float16x8_t results[]) +{ + results[0] = vluti2_laneq_f16(a, b, 0); + results[1] = vluti2_laneq_f16(a, b, 1); + results[2] = vluti2_laneq_f16(a, b, 2); + results[3] = vluti2_laneq_f16(a, b, 3); + results[4] = vluti2_laneq_f16(a, b, 4); + results[5] = vluti2_laneq_f16(a, b, 5); + results[6] = vluti2_laneq_f16(a, b, 6); + results[7] = vluti2_laneq_f16(a, b, 7); +} + +/* +** test_vluti2q_lanef16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** ... +** ret +*/ +void +test_vluti2q_lanef16(float16x8_t a, uint8x8_t b, float16x8_t results[]) +{ + results[0] = vluti2q_lane_f16(a, b, 0); + results[1] = vluti2q_lane_f16(a, b, 1); + results[2] = vluti2q_lane_f16(a, b, 2); + results[3] = vluti2q_lane_f16(a, b, 3); +} + +/* +** test_vluti2q_laneqf16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\] +** ... +** ret +*/ +void +test_vluti2q_laneqf16(float16x8_t a, uint8x16_t b, float16x8_t results[]) +{ + results[0] = vluti2q_laneq_f16(a, b, 0); + results[1] = vluti2q_laneq_f16(a, b, 1); + results[2] = vluti2q_laneq_f16(a, b, 2); + results[3] = vluti2q_laneq_f16(a, b, 3); + results[4] = vluti2q_laneq_f16(a, b, 4); + results[5] = vluti2q_laneq_f16(a, b, 5); + results[6] = vluti2q_laneq_f16(a, b, 6); + results[7] = vluti2q_laneq_f16(a, b, 7); +} + +/* +** test_vluti2_lanebf16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** ... +** ret +*/ +void +test_vluti2_lanebf16(bfloat16x4_t a, uint8x8_t b, bfloat16x8_t results[]) +{ + results[0] = vluti2_lane_bf16(a, b, 0); + results[1] = vluti2_lane_bf16(a, b, 1); + results[2] = vluti2_lane_bf16(a, b, 2); + results[3] = vluti2_lane_bf16(a, b, 3); +} + +/* +** test_vluti2_laneqbf16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\] +** ... +** ret +*/ +void +test_vluti2_laneqbf16(bfloat16x4_t a, uint8x16_t b, bfloat16x8_t results[]) +{ + results[0] = vluti2_laneq_bf16(a, b, 0); + results[1] = vluti2_laneq_bf16(a, b, 1); + results[2] = vluti2_laneq_bf16(a, b, 2); + results[3] = vluti2_laneq_bf16(a, b, 3); + results[4] = vluti2_laneq_bf16(a, b, 4); + results[5] = vluti2_laneq_bf16(a, b, 5); + results[6] = vluti2_laneq_bf16(a, b, 6); + results[7] = vluti2_laneq_bf16(a, b, 7); +} + +/* +** test_vluti2q_lanebf16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** ... 
+** ret +*/ +void +test_vluti2q_lanebf16(bfloat16x8_t a, uint8x8_t b, bfloat16x8_t results[]) +{ + results[0] = vluti2q_lane_bf16(a, b, 0); + results[1] = vluti2q_lane_bf16(a, b, 1); + results[2] = vluti2q_lane_bf16(a, b, 2); + results[3] = vluti2q_lane_bf16(a, b, 3); +} + +/* +** test_vluti2q_laneqbf16: +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\] +** luti2 v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\] +** ... +** ret +*/ +void +test_vluti2q_laneqbf16(bfloat16x8_t a, uint8x16_t b, bfloat16x8_t results[]) +{ + results[0] = vluti2q_laneq_bf16(a, b, 0); + results[1] = vluti2q_laneq_bf16(a, b, 1); + results[2] = vluti2q_laneq_bf16(a, b, 2); + results[3] = vluti2q_laneq_bf16(a, b, 3); + results[4] = vluti2q_laneq_bf16(a, b, 4); + results[5] = vluti2q_laneq_bf16(a, b, 5); + results[6] = vluti2q_laneq_bf16(a, b, 6); + results[7] = vluti2q_laneq_bf16(a, b, 7); +} + +/* +** test_vluti4q_laneu8: +** luti4 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** ... +** ret +*/ +void +test_vluti4q_laneu8(uint8x16_t a, uint8x8_t b, uint8x16_t results[]) +{ + results[0] = vluti4q_lane_u8(a, b, 0); +} + +/* +** test_vluti4q_lanequ8: +** luti4 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti4 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** ... +** ret +*/ +void +test_vluti4q_lanequ8(uint8x16_t a, uint8x16_t b, uint8x16_t results[]) +{ + results[0] = vluti4q_laneq_u8(a, b, 0); + results[1] = vluti4q_laneq_u8(a, b, 1); +} + +/* +** test_vluti4q_lanep8: +** luti4 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** ... +** ret +*/ +void +test_vluti4q_lanep8(poly8x16_t a, uint8x8_t b, poly8x16_t results[]) +{ + results[0] = vluti4q_lane_p8(a, b, 0); +} + +/* +** test_vluti4q_laneqp8: +** luti4 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\] +** luti4 v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\] +** ... +** ret +*/ +void +test_vluti4q_laneqp8(poly8x16_t a, uint8x16_t b, poly8x16_t results[]) +{ + results[0] = vluti4q_laneq_p8(a, b, 0); + results[1] = vluti4q_laneq_p8(a, b, 1); +} + +/* +** test_vluti4q_laneu16_x2: +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\] +** ... +** ret +*/ +void +test_vluti4q_laneu16_x2(uint16x8x2_t a, uint8x8_t b, uint16x8_t results[]) +{ + results[0] = vluti4q_lane_u16_x2(a, b, 0); + results[1] = vluti4q_lane_u16_x2(a, b, 1); +} + +/* +** test_vluti4q_lanequ16_x2: +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\] +** ... +** ret +*/ +void +test_vluti4q_lanequ16_x2(uint16x8x2_t a, uint8x16_t b, uint16x8_t results[]) +{ + results[0] = vluti4q_laneq_u16_x2(a, b, 0); + results[1] = vluti4q_laneq_u16_x2(a, b, 1); + results[2] = vluti4q_laneq_u16_x2(a, b, 2); + results[3] = vluti4q_laneq_u16_x2(a, b, 3); +} + +/* +** test_vluti4q_lanes16_x2: +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\] +** ... 
+** ret +*/ +void +test_vluti4q_lanes16_x2(int16x8x2_t a, uint8x8_t b, int16x8_t results[]) +{ + results[0] = vluti4q_lane_s16_x2(a, b, 0); + results[1] = vluti4q_lane_s16_x2(a, b, 1); +} + +/* +** test_vluti4q_laneqs16_x2: +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\] +** ... +** ret +*/ +void +test_vluti4q_laneqs16_x2(int16x8x2_t a, uint8x16_t b, int16x8_t results[]) +{ + results[0] = vluti4q_laneq_s16_x2(a, b, 0); + results[1] = vluti4q_laneq_s16_x2(a, b, 1); + results[2] = vluti4q_laneq_s16_x2(a, b, 2); + results[3] = vluti4q_laneq_s16_x2(a, b, 3); +} + +/* +** test_vluti4q_lanep16_x2: +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\] +** ... +** ret +*/ +void +test_vluti4q_lanep16_x2(poly16x8x2_t a, uint8x8_t b, poly16x8_t results[]) +{ + results[0] = vluti4q_lane_p16_x2(a, b, 0); + results[1] = vluti4q_lane_p16_x2(a, b, 1); +} + +/* +** test_vluti4q_laneqp16_x2: +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\] +** ... +** ret +*/ +void +test_vluti4q_laneqp16_x2(poly16x8x2_t a, uint8x16_t b, poly16x8_t results[]) +{ + results[0] = vluti4q_laneq_p16_x2(a, b, 0); + results[1] = vluti4q_laneq_p16_x2(a, b, 1); + results[2] = vluti4q_laneq_p16_x2(a, b, 2); + results[3] = vluti4q_laneq_p16_x2(a, b, 3); +} + +/* +** test_vluti4q_lanef16_x2: +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\] +** ... +** ret +*/ +void +test_vluti4q_lanef16_x2(float16x8x2_t a, uint8x8_t b, float16x8_t results[]) +{ + results[0] = vluti4q_lane_f16_x2(a, b, 0); + results[1] = vluti4q_lane_f16_x2(a, b, 1); +} + +/* +** test_vluti4q_laneqf16_x2: +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\] +** ... +** ret +*/ +void +test_vluti4q_laneqf16_x2(float16x8x2_t a, uint8x16_t b, float16x8_t results[]) +{ + results[0] = vluti4q_laneq_f16_x2(a, b, 0); + results[1] = vluti4q_laneq_f16_x2(a, b, 1); + results[2] = vluti4q_laneq_f16_x2(a, b, 2); + results[3] = vluti4q_laneq_f16_x2(a, b, 3); +} + +/* +** test_vluti4q_lanebf16_x2: +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\] +** ... +** ret +*/ +void +test_vluti4q_lanebf16_x2(bfloat16x8x2_t a, uint8x8_t b, bfloat16x8_t results[]) +{ + results[0] = vluti4q_lane_bf16_x2(a, b, 0); + results[1] = vluti4q_lane_bf16_x2(a, b, 1); +} + +/* +** test_vluti4q_laneqbf16_x2: +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\] +** luti4 v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\] +** ... 
+** ret +*/ +void +test_vluti4q_laneqbf16_x2(bfloat16x8x2_t a, uint8x16_t b, bfloat16x8_t results[]) +{ + results[0] = vluti4q_laneq_bf16_x2(a, b, 0); + results[1] = vluti4q_laneq_bf16_x2(a, b, 1); + results[2] = vluti4q_laneq_bf16_x2(a, b, 2); + results[3] = vluti4q_laneq_bf16_x2(a, b, 3); +}
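
Not part of the patch, purely a review aid: the lane bounds enforced by
check_simd_lane_bounds can be cross-checked against the dg-error
strings in lut-incorrect-range.c with a small host program (the helper
name below is mine). For LUTI2 the last valid lane is
4 * index-elements / output-elements - 1, and for LUTI4/LUTI4x2 it is
2 * index-elements / output-elements - 1; equivalently,
(8 / index-bits) * index-elements / output-elements - 1:

#include <assert.h>

/* Recompute the upper lane bound used by check_simd_lane_bounds.
   Each 8-bit element of the index vector packs 8 / bits_per_index
   lookup indices, so the number of selectable segments is the packed
   index count divided by the number of output elements.  */
static int
last_lane (int bits_per_index, int idx_elts, int out_elts)
{
  return (8 / bits_per_index) * idx_elts / out_elts - 1;
}

int
main (void)
{
  assert (last_lane (2, 8, 16) == 1);	/* vluti2_lane_u8: 0 - 1 */
  assert (last_lane (2, 16, 16) == 3);	/* vluti2_laneq_u8: 0 - 3 */
  assert (last_lane (2, 8, 8) == 3);	/* vluti2_lane_u16: 0 - 3 */
  assert (last_lane (2, 16, 8) == 7);	/* vluti2_laneq_u16: 0 - 7 */
  assert (last_lane (4, 8, 16) == 0);	/* vluti4q_lane_u8: 0 - 0 */
  assert (last_lane (4, 16, 16) == 1);	/* vluti4q_laneq_u8: 0 - 1 */
  assert (last_lane (4, 8, 8) == 1);	/* vluti4q_lane_u16_x2: 0 - 1 */
  assert (last_lane (4, 16, 8) == 3);	/* vluti4q_laneq_u16_x2: 0 - 3 */
  return 0;
}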