The AArch64 FEAT_LUT extension is optional from Armv9.2-A and mandatory
from Armv9.5-A. It introduces instructions for lookup table reads using
packed 2-bit (LUTI2) or 4-bit (LUTI4) indices.

This patch adds support for the AdvSIMD lut intrinsics. The intrinsics
for this extension are implemented as the following builtin functions
(a usage sketch follows the list):
* vluti2{q}_lane{q}_{u8|s8|p8}
* vluti2{q}_lane{q}_{u16|s16|p16|f16|bf16}
* vluti4q_lane{q}_{u8|s8|p8}
* vluti4q_lane{q}_{u16|s16|p16|f16|bf16}_x2
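
For illustration, here is a minimal usage sketch mirroring the new
lut.c test (compiled with -O3 -march=armv9-a+lut, as in the tests;
the function names are illustrative and the semantics are summarised
informally, see the Arm ACLE for the full definition):

  #include <arm_neon.h>

  /* LUTI2: read byte elements of TABLE using packed 2-bit indices
     taken from the segment of INDICES selected by the immediate
     lane argument.  */
  uint8x16_t
  lut2_example (uint8x8_t table, uint8x8_t indices)
  {
    return vluti2_lane_u8 (table, indices, 0);
  }

  /* LUTI4: as above, but with packed 4-bit indices.  */
  uint8x16_t
  lut4_example (uint8x16_t table, uint8x8_t indices)
  {
    return vluti4q_lane_u8 (table, indices, 0);
  }

With -O3 each call compiles to a single luti2/luti4 instruction, as
checked by the check-function-bodies assertions in lut.c.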

We also introduce a new approach to checking lane bounds for AdvSIMD
intrinsics: the valid range is derived from the operand modes, scaling
the number of index-vector elements by 4 for luti2 (2-bit indices) or
by 2 for luti4 (4-bit indices), then dividing by the number of result
elements.
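
For example, vluti2_laneq_u16 produces a V8HI result (8 elements) from
a V16QI index vector (16 elements), so the valid lanes are
0 to 4 * 16 / 8 - 1 = 7. Out-of-range constants are rejected at
compile time, as in this sketch matching the new
lut-incorrect-range.c test (the function name is illustrative):

  #include <arm_neon.h>

  void
  bounds_example (uint16x4_t table, uint8x16_t indices)
  {
    vluti2_laneq_u16 (table, indices, 7); /* Accepted.  */
    vluti2_laneq_u16 (table, indices, 8); /* Rejected: "lane out of
                                             range 0 - 7".  */
  }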

gcc/ChangeLog:

        * config/aarch64/aarch64-builtins.cc
        (ENTRY): Add support for one more type.
        (enum class aarch64_builtin_signatures): Add ternary variant.
        (struct aarch64_pragma_builtins_data): Add support for one more
        type.
        (aarch64_get_number_of_args): Add support for ternary functions.
        (require_integer_constant): New function to check that an
        argument is an integer constant.
        (require_immediate_range): New function to check that a
        constant integer argument lies within a given range.
        (check_simd_lane_bounds): New function to check the validity
        of a lane index argument.
        (aarch64_general_check_builtin_call): Call
        function_checker::check_simd_lane_bounds.
        (aarch64_expand_pragma_builtin): Add support for lut unspecs.
        * config/aarch64/aarch64-option-extensions.def
        (AARCH64_OPT_EXTENSION): Add lut option.
        * config/aarch64/aarch64-simd-pragma-builtins.def
        (ENTRY_BINARY): Update for the new ENTRY signature.
        (ENTRY_BINARY_LANE): New macro to declare lane intrinsics.
        (ENTRY_TERNARY_VLUT8): New macro to declare the 8-bit lut
        intrinsics.
        (ENTRY_TERNARY_VLUT16): New macro to declare the 16-bit lut
        intrinsics.
        (REQUIRED_EXTENSIONS): Declare lut intrinsics.
        * config/aarch64/aarch64-simd.md
        (@aarch64_<vluti_uns_op><VLUT:mode><VB:mode>): Instruction
        pattern for luti2 and luti4 intrinsics.
        (@aarch64_lutx2<VLUT:mode><VB:mode>): Instruction pattern for
        luti4x2 intrinsics.
        * config/aarch64/aarch64.h
        (TARGET_LUT): New macro to test for the lut extension.
        * config/aarch64/iterators.md: Add iterators and attributes
        for lut; move the faminmax iterators and attributes earlier
        in the file.
        * doc/invoke.texi: Document extension in AArch64 Options.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/simd/lut-incorrect-range.c: New test.
        * gcc.target/aarch64/simd/lut-no-flag.c: New test.
        * gcc.target/aarch64/simd/lut.c: New test.
---
 gcc/config/aarch64/aarch64-builtins.cc        | 129 ++-
 .../aarch64/aarch64-option-extensions.def     |   2 +
 .../aarch64/aarch64-simd-pragma-builtins.def  |  40 +-
 gcc/config/aarch64/aarch64-simd.md            |  24 +
 gcc/config/aarch64/aarch64.h                  |   4 +
 gcc/config/aarch64/iterators.md               |  55 +-
 gcc/doc/invoke.texi                           |   2 +
 .../aarch64/simd/lut-incorrect-range.c        | 212 +++++
 .../gcc.target/aarch64/simd/lut-no-flag.c     |  10 +
 gcc/testsuite/gcc.target/aarch64/simd/lut.c   | 849 ++++++++++++++++++
 10 files changed, 1304 insertions(+), 23 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/lut-incorrect-range.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/lut-no-flag.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/lut.c

diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index f4d719a6b5a..45aeca33e3f 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -780,7 +780,7 @@ typedef struct
   AARCH64_SIMD_BUILTIN_##T##_##N##A,
 
 #undef ENTRY
-#define ENTRY(N, S, T0, T1, T2, U)		\
+#define ENTRY(N, S, T0, T1, T2, T3, U)		\
   AARCH64_##N,
 
 enum aarch64_builtins
@@ -1596,6 +1596,7 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma)
 enum class aarch64_builtin_signatures
 {
   binary,
+  ternary,
 };
 
 namespace {
@@ -1616,18 +1617,25 @@ namespace simd_types {
 
   constexpr simd_type f16 { V4HFmode, qualifier_none };
   constexpr simd_type f16q { V8HFmode, qualifier_none };
+  constexpr simd_type f16qx2 { V2x8HFmode, qualifier_none };
   constexpr simd_type p16 { V4HImode, qualifier_poly };
   constexpr simd_type p16q { V8HImode, qualifier_poly };
+  constexpr simd_type p16qx2 { V2x8HImode, qualifier_poly };
   constexpr simd_type s16 { V4HImode, qualifier_none };
   constexpr simd_type s16q { V8HImode, qualifier_none };
+  constexpr simd_type s16qx2 { V2x8HImode, qualifier_none };
   constexpr simd_type u16 { V4HImode, qualifier_unsigned };
   constexpr simd_type u16q { V8HImode, qualifier_unsigned };
+  constexpr simd_type u16qx2 { V2x8HImode, qualifier_unsigned };
 
   constexpr simd_type bf16 { V4BFmode, qualifier_none };
   constexpr simd_type bf16q { V8BFmode, qualifier_none };
+  constexpr simd_type bf16qx2 { V2x8BFmode, qualifier_none };
 
   constexpr simd_type f32 { V2SFmode, qualifier_none };
   constexpr simd_type f32q { V4SFmode, qualifier_none };
+  constexpr simd_type s32_index { SImode, qualifier_lane_index };
+
   constexpr simd_type f64q { V2DFmode, qualifier_none };
 
   constexpr simd_type none { VOIDmode, qualifier_none };
@@ -1636,9 +1644,10 @@ namespace simd_types {
 }
 
 #undef ENTRY
-#define ENTRY(N, S, T0, T1, T2, U) \
+#define ENTRY(N, S, T0, T1, T2, T3, U)				      \
   {#N, aarch64_builtin_signatures::S, simd_types::T0, simd_types::T1, \
-      simd_types::T2, U, aarch64_required_extensions::REQUIRED_EXTENSIONS},
+      simd_types::T2, simd_types::T3, U,			      \
+      aarch64_required_extensions::REQUIRED_EXTENSIONS},
 
 /* Initialize pragma builtins.  */
 
@@ -1646,7 +1655,7 @@ struct aarch64_pragma_builtins_data
 {
   const char *name;
   aarch64_builtin_signatures signature;
-  simd_type types[3];
+  simd_type types[4];
   int unspec;
   aarch64_required_extensions required_extensions;
 };
@@ -1658,11 +1667,18 @@ static aarch64_pragma_builtins_data aarch64_pragma_builtins[] = {
 static unsigned int
 aarch64_get_number_of_args (const aarch64_pragma_builtins_data &builtin_data)
 {
-  if (builtin_data.signature == aarch64_builtin_signatures::binary)
-    return 2;
-  else
-    // No other signature supported.
-    gcc_unreachable ();
+  switch (builtin_data.signature)
+    {
+    case aarch64_builtin_signatures::binary:
+      return 2;
+    case aarch64_builtin_signatures::ternary:
+      return 3;
+    default:
+      // No other signature supported.
+      gcc_unreachable ();
+    }
+
+  gcc_unreachable ();
 }
 
 static tree
@@ -2520,6 +2536,78 @@ aarch64_general_required_extensions (unsigned int code)
   return ext::streaming_compatible (0);
 }
 
+namespace function_checker {
+
+void
+require_integer_constant (location_t location, tree arg)
+{
+  if (TREE_CODE (arg) != INTEGER_CST)
+    {
+      error_at (location, "Constant-type integer argument expected");
+      return;
+    }
+}
+
+void
+require_immediate_range (location_t location, tree arg, HOST_WIDE_INT min,
+			 HOST_WIDE_INT max)
+{
+  if (wi::to_widest (arg) < min || wi::to_widest (arg) > max)
+    {
+      error_at (location, "lane out of range %wd - %wd", min, max);
+      return;
+    }
+}
+
+/* Validate the lane index argument, using the operand modes and the
+   instruction (identified by its unspec) to work out the valid range.
+   This only works for intrinsics declared using pragmas in
+   aarch64-simd-pragma-builtins.def.  */
+
+void
+check_simd_lane_bounds (location_t location, const aarch64_pragma_builtins_data
+			*builtin_data, tree *args)
+{
+  if (builtin_data == NULL)
+    // Don't check for functions that are not declared in
+    // aarch64-simd-pragma-builtins.def.
+    return;
+
+  auto nargs = aarch64_get_number_of_args (*builtin_data);
+  switch (builtin_data->unspec)
+    {
+    case UNSPEC_LUTI2:
+    case UNSPEC_LUTI4:
+    case UNSPEC_LUTI4x2:
+      {
+	auto index_arg = args[nargs - 1];
+	require_integer_constant (location, index_arg);
+
+	auto vector_to_index_mode = builtin_data->types[nargs - 1].mode;
+	int vector_to_index_nunits
+	  = GET_MODE_NUNITS (vector_to_index_mode).to_constant ();
+	int output_mode_nunits
+	  = GET_MODE_NUNITS (builtin_data->types[0].mode).to_constant ();
+
+	auto low = 0;
+	int high;
+	if (builtin_data->unspec == UNSPEC_LUTI2)
+	  high = (4 * vector_to_index_nunits / output_mode_nunits) - 1;
+	else
+	  high = (2 * vector_to_index_nunits / output_mode_nunits) - 1;
+
+	require_immediate_range (location, index_arg, low, high);
+	break;
+      }
+
+    default:
+      // Don't need to check lanes for any other operator.
+      return;
+    }
+}
+
+}
+
 bool
 aarch64_general_check_builtin_call (location_t location, vec<location_t>,
 				    unsigned int code, tree fndecl,
@@ -2531,6 +2619,9 @@ aarch64_general_check_builtin_call (location_t location, vec<location_t>,
   if (!aarch64_check_required_extensions (location, decl, required_extensions))
     return false;
 
+  auto builtin_data = aarch64_get_pragma_builtin (code);
+  function_checker::check_simd_lane_bounds (location, builtin_data, args);
+
   switch (code)
     {
     case AARCH64_RSR:
@@ -3427,7 +3518,7 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
 {
   auto nargs = aarch64_get_number_of_args (*builtin_data);
 
-  expand_operand ops[3];
+  expand_operand ops[4];
   create_output_operand (&ops[0], target, builtin_data->types[0].mode);
   for (unsigned int i = 1; i <= nargs; ++i)
     create_input_operand (&ops[i],
@@ -3444,9 +3535,27 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
       expand_insn (icode, nargs + 1, ops);
       target = ops[0].value;
       break;
+
+    case UNSPEC_LUTI2:
+    case UNSPEC_LUTI4:
+      icode = code_for_aarch64 (builtin_data->unspec,
+				builtin_data->types[1].mode,
+				builtin_data->types[2].mode);
+      expand_insn (icode, nargs + 1, ops);
+      target = ops[0].value;
+      break;
+
+    case UNSPEC_LUTI4x2:
+      icode = code_for_aarch64_lutx2 (builtin_data->types[0].mode,
+				      builtin_data->types[2].mode);
+      expand_insn (icode, nargs + 1, ops);
+      target = ops[0].value;
+      break;
+
     default:
       gcc_unreachable ();
     }
+
   return target;
 }
 
diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index f4cf6618238..f555de50ea6 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -247,6 +247,8 @@ AARCH64_OPT_EXTENSION("fp8", FP8, (SIMD), (), (), "fp8")
 
 AARCH64_OPT_EXTENSION("faminmax", FAMINMAX, (SIMD), (), (), "faminmax")
 
+AARCH64_OPT_EXTENSION("lut", LUT, (SIMD), (), (), "lut")
+
 #undef AARCH64_OPT_FMV_EXTENSION
 #undef AARCH64_OPT_EXTENSION
 #undef AARCH64_FMV_FEATURE
diff --git a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
index e49db23cbd1..ab6e520f4d7 100644
--- a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
@@ -20,7 +20,11 @@
 
 #undef ENTRY_BINARY
 #define ENTRY_BINARY(N, T0, T1, T2, U)		\
-  ENTRY (N, binary, T0, T1, T2, U)
+  ENTRY (N, binary, T0, T1, T2, none, U)
+
+#undef ENTRY_BINARY_LANE
+#define ENTRY_BINARY_LANE(N, T0, T1, T2, U)	\
+  ENTRY (N, ternary, T0, T1, T2, s32_index, U)
 
 #undef ENTRY_BINARY_VHSDF
 #define ENTRY_BINARY_VHSDF(NAME, UNSPEC)	       \
@@ -30,8 +34,42 @@
   ENTRY_BINARY (NAME##q_f32, f32q, f32q, f32q, UNSPEC) \
   ENTRY_BINARY (NAME##q_f64, f64q, f64q, f64q, UNSPEC)
 
+#undef ENTRY_TERNARY_VLUT8
+#define ENTRY_TERNARY_VLUT8(T)						\
+  ENTRY_BINARY_LANE (vluti2_lane_##T##8, T##8q, T##8, u8, UNSPEC_LUTI2)	\
+  ENTRY_BINARY_LANE (vluti2_laneq_##T##8, T##8q, T##8, u8q, UNSPEC_LUTI2) \
+  ENTRY_BINARY_LANE (vluti2q_lane_##T##8, T##8q, T##8q, u8, UNSPEC_LUTI2) \
+  ENTRY_BINARY_LANE (vluti2q_laneq_##T##8, T##8q, T##8q, u8q, UNSPEC_LUTI2) \
+  ENTRY_BINARY_LANE (vluti4q_lane_##T##8, T##8q, T##8q, u8, UNSPEC_LUTI4) \
+  ENTRY_BINARY_LANE (vluti4q_laneq_##T##8, T##8q, T##8q, u8q, UNSPEC_LUTI4)
+
+#undef ENTRY_TERNARY_VLUT16
+#define ENTRY_TERNARY_VLUT16(T)						\
+  ENTRY_BINARY_LANE (vluti2_lane_##T##16, T##16q, T##16, u8, UNSPEC_LUTI2) \
+  ENTRY_BINARY_LANE (vluti2_laneq_##T##16, T##16q, T##16, u8q, UNSPEC_LUTI2) \
+  ENTRY_BINARY_LANE (vluti2q_lane_##T##16, T##16q, T##16q, u8, UNSPEC_LUTI2) \
+  ENTRY_BINARY_LANE (vluti2q_laneq_##T##16, T##16q, T##16q, u8q,	\
+		     UNSPEC_LUTI2)					\
+  ENTRY_BINARY_LANE (vluti4q_lane_##T##16_x2, T##16q, T##16qx2, u8,	\
+		     UNSPEC_LUTI4x2)					\
+  ENTRY_BINARY_LANE (vluti4q_laneq_##T##16_x2, T##16q, T##16qx2, u8q,	\
+		     UNSPEC_LUTI4x2)
+
 // faminmax
 #define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FAMINMAX)
 ENTRY_BINARY_VHSDF (vamax, UNSPEC_FAMAX)
 ENTRY_BINARY_VHSDF (vamin, UNSPEC_FAMIN)
 #undef REQUIRED_EXTENSIONS
+
+// lut
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_LUT)
+ENTRY_TERNARY_VLUT8 (p)
+ENTRY_TERNARY_VLUT8 (s)
+ENTRY_TERNARY_VLUT8 (u)
+
+ENTRY_TERNARY_VLUT16 (bf)
+ENTRY_TERNARY_VLUT16 (f)
+ENTRY_TERNARY_VLUT16 (p)
+ENTRY_TERNARY_VLUT16 (s)
+ENTRY_TERNARY_VLUT16 (u)
+#undef REQUIRED_EXTENSIONS
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index cfe95bd4c31..32ec30ad5d1 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -9999,3 +9999,27 @@
   "TARGET_FAMINMAX"
   "<faminmax_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
 )
+
+;; lut
+(define_insn "@aarch64_<vluti_uns_op><VLUT:mode><VB:mode>"
+  [(set (match_operand:<VLUT:VCONQ> 0 "register_operand" "=w")
+        (unspec:<VLUT:VCONQ>
+	 [(match_operand:VLUT 1 "register_operand" "w")
+          (match_operand:VB 2 "register_operand" "w")
+          (match_operand:SI 3 "const_int_operand" "n")]
+          VLUT_UNS))]
+  "TARGET_SIMD"
+  "<vluti_uns_op>\t%0<VLUT:Vconqtype>, {%1<VLUT:Vconqtype>}, %2[%3]"
+)
+
+;; lutx2
+(define_insn "@aarch64_lutx2<VLUT:mode><VB:mode>"
+  [(set (match_operand:VLUT 0 "register_operand" "=w")
+        (unspec:VLUT
+	 [(match_operand:<VLUT:velt_vstructx2> 1 "register_operand" "w")
+          (match_operand:VB 2 "register_operand" "w")
+          (match_operand:SI 3 "const_int_operand" "n")]
+          VLUTx2_UNS))]
+  "TARGET_SIMD"
+  "<vluti_uns_op>\t%0<VLUT:Vmtype>, {%S1<VLUT:Vmtype>, %T1<VLUT:Vmtype>}, %2[%3]"
+)
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index b063c315fba..a8e91e4cf3f 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -487,6 +487,10 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
 #define TARGET_FAMINMAX AARCH64_HAVE_ISA (FAMINMAX)
 #define TARGET_SVE_FAMINMAX (TARGET_SVE && TARGET_FAMINMAX)
 
+/* Lookup table (LUTI) extension instructions are
+   enabled through +lut.  */
+#define TARGET_LUT AARCH64_HAVE_ISA (LUT)
+
 /* Prefer different predicate registers for the output of a predicated
    operation over re-using an existing input predicate.  */
 #define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 023893d35f3..f3c4f161659 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -426,6 +426,10 @@
 			     (V8HF "TARGET_SIMD_F16INST")
 			     V2SF V4SF])
 
+;; Modes available for Advanced SIMD lut operations.
+(define_mode_iterator VLUT [V8QI V16QI V4HI V8HI V4HF V8HF V4BF V8BF])
+(define_mode_iterator VLUTx2 [V2x8HI V2x8HF V2x8BF])
+
 ;; Iterators for single modes, for "@" patterns.
 (define_mode_iterator VNx16QI_ONLY [VNx16QI])
 (define_mode_iterator VNx16SI_ONLY [VNx16SI])
@@ -1090,6 +1094,9 @@
     UNSPEC_FCVTXN	; Used in aarch64-simd.md.
     UNSPEC_FAMAX       ; Used in aarch64-simd.md.
     UNSPEC_FAMIN       ; Used in aarch64-simd.md.
+    UNSPEC_LUTI2	; Used in aarch64-simd.md.
+    UNSPEC_LUTI4	; Used in aarch64-simd.md.
+    UNSPEC_LUTI4x2	; Used in aarch64-simd.md.
 
     ;; All used in aarch64-sve2.md
     UNSPEC_ADDQV
@@ -1536,6 +1543,12 @@
 			  (QI   "8b")  (HI    "8b")
 			  (V4BF "8b")  (V8BF  "16b")])
 
+;; Map a vector mode to the type suffix of its 128-bit container.
+(define_mode_attr Vconqtype [(V8QI ".16b") (V16QI ".16b")
+			     (V4HI ".8h") (V8HI ".8h")
+			     (V4HF ".8h") (V8HF ".8h")
+			     (V4BF ".8h") (V8BF ".8h")])
+
 ;; Advanced SIMD vector structure to element modes.
 (define_mode_attr VSTRUCT_ELT [(V2x8QI "V8QI") (V2x4HI "V4HI")
 			       (V2x2SI "V2SI") (V2x1DI "DI")
@@ -1562,6 +1575,15 @@
 			       (V4x8HF "V8HF") (V4x4SF "V4SF")
 			       (V4x2DF "V2DF") (V4x8BF "V8BF")])
 
+;; Advanced SIMD element to vector structure x2 modes.
+(define_mode_attr velt_vstructx2 [(V8QI "V2x8QI") (V4HI "V2x4HI")
+				  (V2SI "V2x2SI") (V4HF "V2x4HF")
+				  (V2SF "V2x2SF") (V4BF "V2x4BF")
+				  (V16QI "V2x16QI") (V8HI "V2x8HI")
+				  (V4SI "V2x4SI") (V2DI "V2x2DI")
+				  (V8HF "V2x8HF") (V4SF "V2x4SF")
+				  (V2DF "V2x2DF") (V8BF "V2x8BF")])
+
 ;; Advanced SIMD vector structure to element modes in lower case.
 (define_mode_attr vstruct_elt [(V2x8QI "v8qi") (V2x4HI "v4hi")
 			       (V2x2SI "v2si") (V2x1DI "di")
@@ -1666,6 +1688,7 @@
 			 (V2SI "V4SI") (V4SI "V4SI")
 			 (DI   "V2DI") (V2DI "V2DI")
 			 (V4HF "V8HF") (V8HF "V8HF")
+			 (V4BF "V8BF") (V8BF "V8BF")
 			 (V2SF "V4SF") (V4SF "V4SF")
 			 (V2DF "V2DF") (SI   "V4SI")
 			 (HI   "V8HI") (QI   "V16QI")
@@ -3146,6 +3169,14 @@
 
 (define_int_iterator LAST [UNSPEC_LASTA UNSPEC_LASTB])
 
+;; Iterators for faminmax and lut operations
+
+(define_int_iterator FAMINMAX_UNS [UNSPEC_FAMAX UNSPEC_FAMIN])
+
+(define_int_iterator VLUT_UNS [UNSPEC_LUTI2 UNSPEC_LUTI4])
+
+(define_int_iterator VLUTx2_UNS [UNSPEC_LUTI4x2])
+
 (define_int_iterator SVE_INT_UNARY [UNSPEC_REVB
 				    UNSPEC_REVH UNSPEC_REVW])
 
@@ -3949,6 +3980,9 @@
 (define_code_attr binqops_op_rev [(ss_plus "sqsub")
 				  (ss_minus "sqadd")])
 
+(define_code_attr faminmax_op
+  [(smax "famax") (smin "famin")])
+
 ;; The SVE logical instruction that implements an unspec.
 (define_int_attr logicalf_op [(UNSPEC_ANDF "and")
 		 	      (UNSPEC_IORF "orr")
@@ -4161,6 +4195,15 @@
 (define_int_attr frintnzs_op [(UNSPEC_FRINT32Z "frint32z") (UNSPEC_FRINT32X "frint32x")
 			      (UNSPEC_FRINT64Z "frint64z") (UNSPEC_FRINT64X "frint64x")])
 
+(define_int_attr faminmax_cond_uns_op
+  [(UNSPEC_COND_SMAX "famax") (UNSPEC_COND_SMIN "famin")])
+
+(define_int_attr faminmax_uns_op
+  [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")])
+
+(define_int_attr vluti_uns_op
+  [(UNSPEC_LUTI2 "luti2") (UNSPEC_LUTI4 "luti4") (UNSPEC_LUTI4x2 "luti4")])
+
 ;; The condition associated with an UNSPEC_COND_<xx>.
 (define_int_attr cmp_op [(UNSPEC_COND_CMPEQ_WIDE "eq")
 			 (UNSPEC_COND_CMPGE_WIDE "ge")
@@ -4719,15 +4762,3 @@
 
 (define_int_attr bits_etype [(8 "b") (16 "h") (32 "s") (64 "d")])
 
-;; Iterators and attributes for faminmax
-
-(define_int_iterator FAMINMAX_UNS [UNSPEC_FAMAX UNSPEC_FAMIN])
-
-(define_int_attr faminmax_cond_uns_op
-  [(UNSPEC_COND_SMAX "famax") (UNSPEC_COND_SMIN "famin")])
-
-(define_int_attr faminmax_uns_op
-  [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")])
-
-(define_code_attr faminmax_op
-  [(smax "famax") (smin "famin")])
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 346ac1369b8..a42bd1b14db 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -21905,6 +21905,8 @@ Enable the RCpc3 (Release Consistency) extension.
 Enable the fp8 (8-bit floating point) extension.
 @item faminmax
 Enable the Floating Point Absolute Maximum/Minimum extension.
+@item lut
+Enable the Lookup Table extension.
 @item sve-b16b16
 Enable the SVE non-widening brain floating-point (@code{bf16}) extension.
 This only has an effect when @code{sve2} or @code{sme2} are also enabled.
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/lut-incorrect-range.c b/gcc/testsuite/gcc.target/aarch64/simd/lut-incorrect-range.c
new file mode 100644
index 00000000000..93fc6d89768
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/lut-incorrect-range.c
@@ -0,0 +1,212 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+lut" } */
+
+#include "arm_neon.h"
+
+void
+test_vluti2_laneu8(uint8x8_t a, uint8x8_t b, uint8x16_t c, uint8x16_t d)
+{
+  vluti2_lane_u8(a, b, -1); /* { dg-error {lane out of range 0 - 1} } */
+  vluti2_lane_u8(a, b, 2); /* { dg-error {lane out of range 0 - 1} } */
+
+  vluti2_laneq_u8(a, d, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti2_laneq_u8(a, d, 4); /* { dg-error {lane out of range 0 - 3} } */
+
+  vluti2q_lane_u8(c, b, -1); /* { dg-error {lane out of range 0 - 1} } */
+  vluti2q_lane_u8(c, b, 2); /* { dg-error {lane out of range 0 - 1} } */
+
+  vluti2q_laneq_u8(c, d, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti2q_laneq_u8(c, d, 4); /* { dg-error {lane out of range 0 - 3} } */
+}
+
+void
+test_vluti2_lanes8(int8x8_t a, uint8x8_t b, int8x16_t c, uint8x16_t d)
+{
+  vluti2_lane_s8(a, b, -1); /* { dg-error {lane out of range 0 - 1} } */
+  vluti2_lane_s8(a, b, 2); /* { dg-error {lane out of range 0 - 1} } */
+
+  vluti2_laneq_s8(a, d, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti2_laneq_s8(a, d, 4); /* { dg-error {lane out of range 0 - 3} } */
+
+  vluti2q_lane_s8(c, b, -1); /* { dg-error {lane out of range 0 - 1} } */
+  vluti2q_lane_s8(c, b, 2); /* { dg-error {lane out of range 0 - 1} } */
+
+  vluti2q_laneq_s8(c, d, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti2q_laneq_s8(c, d, 4); /* { dg-error {lane out of range 0 - 3} } */
+}
+
+void
+test_vluti2_lanep8(poly8x8_t a, uint8x8_t b, poly8x16_t c, uint8x16_t d)
+{
+  vluti2_lane_p8(a, b, -1); /* { dg-error {lane out of range 0 - 1} } */
+  vluti2_lane_p8(a, b, 2); /* { dg-error {lane out of range 0 - 1} } */
+
+  vluti2_laneq_p8(a, d, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti2_laneq_p8(a, d, 4); /* { dg-error {lane out of range 0 - 3} } */
+
+  vluti2q_lane_p8(c, b, -1); /* { dg-error {lane out of range 0 - 1} } */
+  vluti2q_lane_p8(c, b, 2); /* { dg-error {lane out of range 0 - 1} } */
+
+  vluti2q_laneq_p8(c, d, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti2q_laneq_p8(c, d, 4); /* { dg-error {lane out of range 0 - 3} } */
+}
+
+void
+test_vluti2_laneu16(uint16x4_t a, uint8x8_t b, uint16x8_t c, uint8x16_t d)
+{
+  vluti2_lane_u16(a, b, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti2_lane_u16(a, b, 4); /* { dg-error {lane out of range 0 - 3} } */
+
+  vluti2_laneq_u16(a, d, -1); /* { dg-error {lane out of range 0 - 7} } */
+  vluti2_laneq_u16(a, d, 8); /* { dg-error {lane out of range 0 - 7} } */
+
+  vluti2q_lane_u16(c, b, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti2q_lane_u16(c, b, 4); /* { dg-error {lane out of range 0 - 3} } */
+
+  vluti2q_laneq_u16(c, d, -1); /* { dg-error {lane out of range 0 - 7} } */
+  vluti2q_laneq_u16(c, d, 8); /* { dg-error {lane out of range 0 - 7} } */
+}
+
+void
+test_vluti2_lanes16(int16x4_t a, uint8x8_t b, int16x8_t c, uint8x16_t d)
+{
+  vluti2_lane_s16(a, b, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti2_lane_s16(a, b, 4); /* { dg-error {lane out of range 0 - 3} } */
+
+  vluti2_laneq_s16(a, d, -1); /* { dg-error {lane out of range 0 - 7} } */
+  vluti2_laneq_s16(a, d, 8); /* { dg-error {lane out of range 0 - 7} } */
+
+  vluti2q_lane_s16(c, b, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti2q_lane_s16(c, b, 4); /* { dg-error {lane out of range 0 - 3} } */
+
+  vluti2q_laneq_s16(c, d, -1); /* { dg-error {lane out of range 0 - 7} } */
+  vluti2q_laneq_s16(c, d, 8); /* { dg-error {lane out of range 0 - 7} } */
+}
+
+void
+test_vluti2_lanep16(poly16x4_t a, uint8x8_t b, poly16x8_t c, uint8x16_t d)
+{
+  vluti2_lane_p16(a, b, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti2_lane_p16(a, b, 4); /* { dg-error {lane out of range 0 - 3} } */
+
+  vluti2_laneq_p16(a, d, -1); /* { dg-error {lane out of range 0 - 7} } */
+  vluti2_laneq_p16(a, d, 8); /* { dg-error {lane out of range 0 - 7} } */
+
+  vluti2q_lane_p16(c, b, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti2q_lane_p16(c, b, 4); /* { dg-error {lane out of range 0 - 3} } */
+
+  vluti2q_laneq_p16(c, d, -1); /* { dg-error {lane out of range 0 - 7} } */
+  vluti2q_laneq_p16(c, d, 8); /* { dg-error {lane out of range 0 - 7} } */
+}
+
+void
+test_vluti2_lanef16(float16x4_t a, uint8x8_t b, float16x8_t c, uint8x16_t d)
+{
+  vluti2_lane_f16(a, b, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti2_lane_f16(a, b, 4); /* { dg-error {lane out of range 0 - 3} } */
+
+  vluti2_laneq_f16(a, d, -1); /* { dg-error {lane out of range 0 - 7} } */
+  vluti2_laneq_f16(a, d, 8); /* { dg-error {lane out of range 0 - 7} } */
+
+  vluti2q_lane_f16(c, b, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti2q_lane_f16(c, b, 4); /* { dg-error {lane out of range 0 - 3} } */
+
+  vluti2q_laneq_f16(c, d, -1); /* { dg-error {lane out of range 0 - 7} } */
+  vluti2q_laneq_f16(c, d, 8); /* { dg-error {lane out of range 0 - 7} } */
+}
+
+void
+test_vluti2_lanebf16(bfloat16x4_t a, uint8x8_t b, bfloat16x8_t c, uint8x16_t d)
+{
+  vluti2_lane_bf16(a, b, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti2_lane_bf16(a, b, 4); /* { dg-error {lane out of range 0 - 3} } */
+
+  vluti2_laneq_bf16(a, d, -1); /* { dg-error {lane out of range 0 - 7} } */
+  vluti2_laneq_bf16(a, d, 8); /* { dg-error {lane out of range 0 - 7} } */
+
+  vluti2q_lane_bf16(c, b, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti2q_lane_bf16(c, b, 4); /* { dg-error {lane out of range 0 - 3} } */
+
+  vluti2q_laneq_bf16(c, d, -1); /* { dg-error {lane out of range 0 - 7} } */
+  vluti2q_laneq_bf16(c, d, 8); /* { dg-error {lane out of range 0 - 7} } */
+}
+
+void
+test_vluti4q_laneu8(uint8x16_t a, uint8x8_t b, uint8x16_t d)
+{
+  vluti4q_lane_u8(a, b, -1); /* { dg-error {lane out of range 0 - 0} } */
+  vluti4q_lane_u8(a, b, 1); /* { dg-error {lane out of range 0 - 0} } */
+
+  vluti4q_laneq_u8(a, d, -1); /* { dg-error {lane out of range 0 - 1} } */
+  vluti4q_laneq_u8(a, d, 2); /* { dg-error {lane out of range 0 - 1} } */
+}
+
+void
+test_vluti4q_lanes8(int8x16_t a, uint8x8_t b, uint8x16_t d)
+{
+  vluti4q_lane_s8(a, b, -1); /* { dg-error {lane out of range 0 - 0} } */
+  vluti4q_lane_s8(a, b, 1); /* { dg-error {lane out of range 0 - 0} } */
+
+  vluti4q_laneq_s8(a, d, -1); /* { dg-error {lane out of range 0 - 1} } */
+  vluti4q_laneq_s8(a, d, 2); /* { dg-error {lane out of range 0 - 1} } */
+}
+
+void
+test_vluti4q_lanep8(poly8x16_t a, uint8x8_t b, uint8x16_t d)
+{
+  vluti4q_lane_p8(a, b, -1); /* { dg-error {lane out of range 0 - 0} } */
+  vluti4q_lane_p8(a, b, 1); /* { dg-error {lane out of range 0 - 0} } */
+
+  vluti4q_laneq_p8(a, d, -1); /* { dg-error {lane out of range 0 - 1} } */
+  vluti4q_laneq_p8(a, d, 2); /* { dg-error {lane out of range 0 - 1} } */
+}
+
+void
+test_vluti4q_laneu16_x2(uint16x8x2_t a, uint8x8_t b, uint8x16_t d)
+{
+  vluti4q_lane_u16_x2(a, b, -1); /* { dg-error {lane out of range 0 - 1} } */
+  vluti4q_lane_u16_x2(a, b, 2); /* { dg-error {lane out of range 0 - 1} } */
+
+  vluti4q_laneq_u16_x2(a, d, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti4q_laneq_u16_x2(a, d, 4); /* { dg-error {lane out of range 0 - 3} } */
+}
+
+void
+test_vluti4q_lanes16_x2(int16x8x2_t a, uint8x8_t b, uint8x16_t d)
+{
+  vluti4q_lane_s16_x2(a, b, -1); /* { dg-error {lane out of range 0 - 1} } */
+  vluti4q_lane_s16_x2(a, b, 2); /* { dg-error {lane out of range 0 - 1} } */
+
+  vluti4q_laneq_s16_x2(a, d, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti4q_laneq_s16_x2(a, d, 4); /* { dg-error {lane out of range 0 - 3} } */
+}
+
+void
+test_vluti4q_lanep16_x2(poly16x8x2_t a, uint8x8_t b, uint8x16_t d)
+{
+  vluti4q_lane_p16_x2(a, b, -1); /* { dg-error {lane out of range 0 - 1} } */
+  vluti4q_lane_p16_x2(a, b, 2); /* { dg-error {lane out of range 0 - 1} } */
+
+  vluti4q_laneq_p16_x2(a, d, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti4q_laneq_p16_x2(a, d, 4); /* { dg-error {lane out of range 0 - 3} } */
+}
+
+void
+test_vluti4q_lanef16_x2(float16x8x2_t a, uint8x8_t b, uint8x16_t d)
+{
+  vluti4q_lane_f16_x2(a, b, -1); /* { dg-error {lane out of range 0 - 1} } */
+  vluti4q_lane_f16_x2(a, b, 2); /* { dg-error {lane out of range 0 - 1} } */
+
+  vluti4q_laneq_f16_x2(a, d, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti4q_laneq_f16_x2(a, d, 4); /* { dg-error {lane out of range 0 - 3} } */
+}
+
+void
+test_vluti4q_lanebf16_x2(bfloat16x8x2_t a, uint8x8_t b, uint8x16_t d)
+{
+  vluti4q_lane_bf16_x2(a, b, -1); /* { dg-error {lane out of range 0 - 1} } */
+  vluti4q_lane_bf16_x2(a, b, 2); /* { dg-error {lane out of range 0 - 1} } */
+
+  vluti4q_laneq_bf16_x2(a, d, -1); /* { dg-error {lane out of range 0 - 3} } */
+  vluti4q_laneq_bf16_x2(a, d, 4); /* { dg-error {lane out of range 0 - 3} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/lut-no-flag.c b/gcc/testsuite/gcc.target/aarch64/simd/lut-no-flag.c
new file mode 100644
index 00000000000..d180d8f2150
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/lut-no-flag.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a" } */
+
+#include "arm_neon.h"
+
+void
+test (uint8x8_t a, uint8x8_t b)
+{
+  vluti2_lane_u8 (a, b, 0); /* { dg-error {ACLE function 'vluti2_lane_u8' requires ISA extension 'lut'} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/lut.c b/gcc/testsuite/gcc.target/aarch64/simd/lut.c
new file mode 100644
index 00000000000..fc89b215a93
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/lut.c
@@ -0,0 +1,849 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+lut" } */
+/* { dg-final { check-function-bodies "**" ""} } */
+
+#include "arm_neon.h"
+
+/*
+** test_vluti2_laneu8:
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**	...
+**	ret
+*/
+void
+test_vluti2_laneu8(uint8x8_t a, uint8x8_t b, uint8x16_t results[])
+{
+  results[0] = vluti2_lane_u8(a, b, 0);
+  results[1] = vluti2_lane_u8(a, b, 1);
+}
+
+/*
+** test_vluti2_lanequ8:
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti2_lanequ8(uint8x8_t a, uint8x16_t b, uint8x16_t results[])
+{
+  results[0] = vluti2_laneq_u8(a, b, 0);
+  results[1] = vluti2_laneq_u8(a, b, 1);
+  results[2] = vluti2_laneq_u8(a, b, 2);
+  results[3] = vluti2_laneq_u8(a, b, 3);
+}
+
+/*
+** test_vluti2q_laneu8:
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**	...
+**	ret
+*/
+void
+test_vluti2q_laneu8(uint8x16_t a, uint8x8_t b, uint8x16_t results[])
+{
+  results[0] = vluti2q_lane_u8(a, b, 0);
+  results[1] = vluti2q_lane_u8(a, b, 1);
+}
+
+/*
+** test_vluti2q_lanequ8:
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti2q_lanequ8(uint8x16_t a, uint8x16_t b, uint8x16_t results[])
+{
+  results[0] = vluti2q_laneq_u8(a, b, 0);
+  results[1] = vluti2q_laneq_u8(a, b, 1);
+  results[2] = vluti2q_laneq_u8(a, b, 2);
+  results[3] = vluti2q_laneq_u8(a, b, 3);
+}
+
+/*
+** test_vluti2_lanes8:
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**	...
+**	ret
+*/
+void
+test_vluti2_lanes8(int8x8_t a, uint8x8_t b, int8x16_t results[])
+{
+  results[0] = vluti2_lane_s8(a, b, 0);
+  results[1] = vluti2_lane_s8(a, b, 1);
+}
+
+/*
+** test_vluti2_laneqs8:
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti2_laneqs8(int8x8_t a, uint8x16_t b, int8x16_t results[])
+{
+  results[0] = vluti2_laneq_s8(a, b, 0);
+  results[1] = vluti2_laneq_s8(a, b, 1);
+  results[2] = vluti2_laneq_s8(a, b, 2);
+  results[3] = vluti2_laneq_s8(a, b, 3);
+}
+
+/*
+** test_vluti2q_lanes8:
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**	...
+**	ret
+*/
+void
+test_vluti2q_lanes8(int8x16_t a, uint8x8_t b, int8x16_t results[])
+{
+  results[0] = vluti2q_lane_s8(a, b, 0);
+  results[1] = vluti2q_lane_s8(a, b, 1);
+}
+
+/*
+** test_vluti2q_laneqs8:
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti2q_laneqs8(int8x16_t a, uint8x16_t b, int8x16_t results[])
+{
+  results[0] = vluti2q_laneq_s8(a, b, 0);
+  results[1] = vluti2q_laneq_s8(a, b, 1);
+  results[2] = vluti2q_laneq_s8(a, b, 2);
+  results[3] = vluti2q_laneq_s8(a, b, 3);
+}
+
+/*
+** test_vluti2_lanep8:
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**	...
+**	ret
+*/
+void
+test_vluti2_lanep8(poly8x8_t a, uint8x8_t b, poly8x16_t results[])
+{
+  results[0] = vluti2_lane_p8(a, b, 0);
+  results[1] = vluti2_lane_p8(a, b, 1);
+}
+
+/*
+** test_vluti2_laneqp8:
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti2_laneqp8(poly8x8_t a, uint8x16_t b, poly8x16_t results[])
+{
+  results[0] = vluti2_laneq_p8(a, b, 0);
+  results[1] = vluti2_laneq_p8(a, b, 1);
+  results[2] = vluti2_laneq_p8(a, b, 2);
+  results[3] = vluti2_laneq_p8(a, b, 3);
+}
+
+/*
+** test_vluti2q_lanep8:
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**	...
+**	ret
+*/
+void
+test_vluti2q_lanep8(poly8x16_t a, uint8x8_t b, poly8x16_t results[])
+{
+  results[0] = vluti2q_lane_p8(a, b, 0);
+  results[1] = vluti2q_lane_p8(a, b, 1);
+}
+
+/*
+** test_vluti2q_laneqp8:
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti2q_laneqp8(poly8x16_t a, uint8x16_t b, poly8x16_t results[])
+{
+  results[0] = vluti2q_laneq_p8(a, b, 0);
+  results[1] = vluti2q_laneq_p8(a, b, 1);
+  results[2] = vluti2q_laneq_p8(a, b, 2);
+  results[3] = vluti2q_laneq_p8(a, b, 3);
+}
+
+/*
+** test_vluti2_laneu16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti2_laneu16(uint16x4_t a, uint8x8_t b, uint16x8_t results[])
+{
+  results[0] = vluti2_lane_u16(a, b, 0);
+  results[1] = vluti2_lane_u16(a, b, 1);
+  results[2] = vluti2_lane_u16(a, b, 2);
+  results[3] = vluti2_lane_u16(a, b, 3);
+}
+
+/*
+** test_vluti2_lanequ16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**	...
+**	ret
+*/
+void
+test_vluti2_lanequ16(uint16x4_t a, uint8x16_t b, uint16x8_t results[])
+{
+  results[0] = vluti2_laneq_u16(a, b, 0);
+  results[1] = vluti2_laneq_u16(a, b, 1);
+  results[2] = vluti2_laneq_u16(a, b, 2);
+  results[3] = vluti2_laneq_u16(a, b, 3);
+  results[4] = vluti2_laneq_u16(a, b, 4);
+  results[5] = vluti2_laneq_u16(a, b, 5);
+  results[6] = vluti2_laneq_u16(a, b, 6);
+  results[7] = vluti2_laneq_u16(a, b, 7);
+}
+
+/*
+** test_vluti2q_laneu16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti2q_laneu16(uint16x8_t a, uint8x8_t b, uint16x8_t results[])
+{
+  results[0] = vluti2q_lane_u16(a, b, 0);
+  results[1] = vluti2q_lane_u16(a, b, 1);
+  results[2] = vluti2q_lane_u16(a, b, 2);
+  results[3] = vluti2q_lane_u16(a, b, 3);
+}
+
+/*
+** test_vluti2q_lanequ16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**	...
+**	ret
+*/
+void
+test_vluti2q_lanequ16(uint16x8_t a, uint8x16_t b, uint16x8_t results[])
+{
+  results[0] = vluti2q_laneq_u16(a, b, 0);
+  results[1] = vluti2q_laneq_u16(a, b, 1);
+  results[2] = vluti2q_laneq_u16(a, b, 2);
+  results[3] = vluti2q_laneq_u16(a, b, 3);
+  results[4] = vluti2q_laneq_u16(a, b, 4);
+  results[5] = vluti2q_laneq_u16(a, b, 5);
+  results[6] = vluti2q_laneq_u16(a, b, 6);
+  results[7] = vluti2q_laneq_u16(a, b, 7);
+}
+
+/*
+** test_vluti2_lanes16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti2_lanes16(int16x4_t a, uint8x8_t b, int16x8_t results[])
+{
+  results[0] = vluti2_lane_s16(a, b, 0);
+  results[1] = vluti2_lane_s16(a, b, 1);
+  results[2] = vluti2_lane_s16(a, b, 2);
+  results[3] = vluti2_lane_s16(a, b, 3);
+}
+
+/*
+** test_vluti2_laneqs16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**	...
+**	ret
+*/
+void
+test_vluti2_laneqs16(int16x4_t a, uint8x16_t b, int16x8_t results[])
+{
+  results[0] = vluti2_laneq_s16(a, b, 0);
+  results[1] = vluti2_laneq_s16(a, b, 1);
+  results[2] = vluti2_laneq_s16(a, b, 2);
+  results[3] = vluti2_laneq_s16(a, b, 3);
+  results[4] = vluti2_laneq_s16(a, b, 4);
+  results[5] = vluti2_laneq_s16(a, b, 5);
+  results[6] = vluti2_laneq_s16(a, b, 6);
+  results[7] = vluti2_laneq_s16(a, b, 7);
+}
+
+/*
+** test_vluti2q_lanes16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti2q_lanes16(int16x8_t a, uint8x8_t b, int16x8_t results[])
+{
+  results[0] = vluti2q_lane_s16(a, b, 0);
+  results[1] = vluti2q_lane_s16(a, b, 1);
+  results[2] = vluti2q_lane_s16(a, b, 2);
+  results[3] = vluti2q_lane_s16(a, b, 3);
+}
+
+/*
+** test_vluti2q_laneqs16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**	...
+**	ret
+*/
+void
+test_vluti2q_laneqs16(int16x8_t a, uint8x16_t b, int16x8_t results[])
+{
+  results[0] = vluti2q_laneq_s16(a, b, 0);
+  results[1] = vluti2q_laneq_s16(a, b, 1);
+  results[2] = vluti2q_laneq_s16(a, b, 2);
+  results[3] = vluti2q_laneq_s16(a, b, 3);
+  results[4] = vluti2q_laneq_s16(a, b, 4);
+  results[5] = vluti2q_laneq_s16(a, b, 5);
+  results[6] = vluti2q_laneq_s16(a, b, 6);
+  results[7] = vluti2q_laneq_s16(a, b, 7);
+}
+
+/*
+** test_vluti2_lanep16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti2_lanep16(poly16x4_t a, uint8x8_t b, poly16x8_t results[])
+{
+  results[0] = vluti2_lane_p16(a, b, 0);
+  results[1] = vluti2_lane_p16(a, b, 1);
+  results[2] = vluti2_lane_p16(a, b, 2);
+  results[3] = vluti2_lane_p16(a, b, 3);
+}
+
+/*
+** test_vluti2_laneqp16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**	...
+**	ret
+*/
+void
+test_vluti2_laneqp16(poly16x4_t a, uint8x16_t b, poly16x8_t results[])
+{
+  results[0] = vluti2_laneq_p16(a, b, 0);
+  results[1] = vluti2_laneq_p16(a, b, 1);
+  results[2] = vluti2_laneq_p16(a, b, 2);
+  results[3] = vluti2_laneq_p16(a, b, 3);
+  results[4] = vluti2_laneq_p16(a, b, 4);
+  results[5] = vluti2_laneq_p16(a, b, 5);
+  results[6] = vluti2_laneq_p16(a, b, 6);
+  results[7] = vluti2_laneq_p16(a, b, 7);
+}
+
+/*
+** test_vluti2q_lanep16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti2q_lanep16(poly16x8_t a, uint8x8_t b, poly16x8_t results[])
+{
+  results[0] = vluti2q_lane_p16(a, b, 0);
+  results[1] = vluti2q_lane_p16(a, b, 1);
+  results[2] = vluti2q_lane_p16(a, b, 2);
+  results[3] = vluti2q_lane_p16(a, b, 3);
+}
+
+/*
+** test_vluti2q_laneqp16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**	...
+**	ret
+*/
+void
+test_vluti2q_laneqp16(poly16x8_t a, uint8x16_t b, poly16x8_t results[])
+{
+  results[0] = vluti2q_laneq_p16(a, b, 0);
+  results[1] = vluti2q_laneq_p16(a, b, 1);
+  results[2] = vluti2q_laneq_p16(a, b, 2);
+  results[3] = vluti2q_laneq_p16(a, b, 3);
+  results[4] = vluti2q_laneq_p16(a, b, 4);
+  results[5] = vluti2q_laneq_p16(a, b, 5);
+  results[6] = vluti2q_laneq_p16(a, b, 6);
+  results[7] = vluti2q_laneq_p16(a, b, 7);
+}
+
+/*
+** test_vluti2_lanef16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti2_lanef16(float16x4_t a, uint8x8_t b, float16x8_t results[])
+{
+  results[0] = vluti2_lane_f16(a, b, 0);
+  results[1] = vluti2_lane_f16(a, b, 1);
+  results[2] = vluti2_lane_f16(a, b, 2);
+  results[3] = vluti2_lane_f16(a, b, 3);
+}
+
+/*
+** test_vluti2_laneqf16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**	...
+**	ret
+*/
+void
+test_vluti2_laneqf16(float16x4_t a, uint8x16_t b, float16x8_t results[])
+{
+  results[0] = vluti2_laneq_f16(a, b, 0);
+  results[1] = vluti2_laneq_f16(a, b, 1);
+  results[2] = vluti2_laneq_f16(a, b, 2);
+  results[3] = vluti2_laneq_f16(a, b, 3);
+  results[4] = vluti2_laneq_f16(a, b, 4);
+  results[5] = vluti2_laneq_f16(a, b, 5);
+  results[6] = vluti2_laneq_f16(a, b, 6);
+  results[7] = vluti2_laneq_f16(a, b, 7);
+}
+
+/*
+** test_vluti2q_lanef16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti2q_lanef16(float16x8_t a, uint8x8_t b, float16x8_t results[])
+{
+  results[0] = vluti2q_lane_f16(a, b, 0);
+  results[1] = vluti2q_lane_f16(a, b, 1);
+  results[2] = vluti2q_lane_f16(a, b, 2);
+  results[3] = vluti2q_lane_f16(a, b, 3);
+}
+
+/*
+** test_vluti2q_laneqf16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**	...
+**	ret
+*/
+void
+test_vluti2q_laneqf16(float16x8_t a, uint8x16_t b, float16x8_t results[])
+{
+  results[0] = vluti2q_laneq_f16(a, b, 0);
+  results[1] = vluti2q_laneq_f16(a, b, 1);
+  results[2] = vluti2q_laneq_f16(a, b, 2);
+  results[3] = vluti2q_laneq_f16(a, b, 3);
+  results[4] = vluti2q_laneq_f16(a, b, 4);
+  results[5] = vluti2q_laneq_f16(a, b, 5);
+  results[6] = vluti2q_laneq_f16(a, b, 6);
+  results[7] = vluti2q_laneq_f16(a, b, 7);
+}
+
+/*
+** test_vluti2_lanebf16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti2_lanebf16(bfloat16x4_t a, uint8x8_t b, bfloat16x8_t results[])
+{
+  results[0] = vluti2_lane_bf16(a, b, 0);
+  results[1] = vluti2_lane_bf16(a, b, 1);
+  results[2] = vluti2_lane_bf16(a, b, 2);
+  results[3] = vluti2_lane_bf16(a, b, 3);
+}
+
+/*
+** test_vluti2_laneqbf16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**	...
+**	ret
+*/
+void
+test_vluti2_laneqbf16(bfloat16x4_t a, uint8x16_t b, bfloat16x8_t results[])
+{
+  results[0] = vluti2_laneq_bf16(a, b, 0);
+  results[1] = vluti2_laneq_bf16(a, b, 1);
+  results[2] = vluti2_laneq_bf16(a, b, 2);
+  results[3] = vluti2_laneq_bf16(a, b, 3);
+  results[4] = vluti2_laneq_bf16(a, b, 4);
+  results[5] = vluti2_laneq_bf16(a, b, 5);
+  results[6] = vluti2_laneq_bf16(a, b, 6);
+  results[7] = vluti2_laneq_bf16(a, b, 7);
+}
+
+/*
+** test_vluti2q_lanebf16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti2q_lanebf16(bfloat16x8_t a, uint8x8_t b, bfloat16x8_t results[])
+{
+  results[0] = vluti2q_lane_bf16(a, b, 0);
+  results[1] = vluti2q_lane_bf16(a, b, 1);
+  results[2] = vluti2q_lane_bf16(a, b, 2);
+  results[3] = vluti2q_lane_bf16(a, b, 3);
+}
+
+/*
+** test_vluti2q_laneqbf16:
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**	luti2	v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**	...
+**	ret
+*/
+void
+test_vluti2q_laneqbf16(bfloat16x8_t a, uint8x16_t b, bfloat16x8_t results[])
+{
+  results[0] = vluti2q_laneq_bf16(a, b, 0);
+  results[1] = vluti2q_laneq_bf16(a, b, 1);
+  results[2] = vluti2q_laneq_bf16(a, b, 2);
+  results[3] = vluti2q_laneq_bf16(a, b, 3);
+  results[4] = vluti2q_laneq_bf16(a, b, 4);
+  results[5] = vluti2q_laneq_bf16(a, b, 5);
+  results[6] = vluti2q_laneq_bf16(a, b, 6);
+  results[7] = vluti2q_laneq_bf16(a, b, 7);
+}
+
+/*
+** test_vluti4q_laneu8:
+**	luti4	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**	...
+**	ret
+*/
+void
+test_vluti4q_laneu8(uint8x16_t a, uint8x8_t b, uint8x16_t results[])
+{
+  results[0] = vluti4q_lane_u8(a, b, 0);
+}
+
+/*
+** test_vluti4q_lanequ8:
+**	luti4	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**	luti4	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**	...
+**	ret
+*/
+void
+test_vluti4q_lanequ8(uint8x16_t a, uint8x16_t b, uint8x16_t results[])
+{
+  results[0] = vluti4q_laneq_u8(a, b, 0);
+  results[1] = vluti4q_laneq_u8(a, b, 1);
+}
+
+/*
+** test_vluti4q_lanep8:
+**	luti4	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**	...
+**	ret
+*/
+void
+test_vluti4q_lanep8(poly8x16_t a, uint8x8_t b, poly8x16_t results[])
+{
+  results[0] = vluti4q_lane_p8(a, b, 0);
+}
+
+/*
+** test_vluti4q_laneqp8:
+**	luti4	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**	luti4	v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**	...
+**	ret
+*/
+void
+test_vluti4q_laneqp8(poly8x16_t a, uint8x16_t b, poly8x16_t results[])
+{
+  results[0] = vluti4q_laneq_p8(a, b, 0);
+  results[1] = vluti4q_laneq_p8(a, b, 1);
+}
+
+/*
+** test_vluti4q_laneu16_x2:
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**	...
+**	ret
+*/
+void
+test_vluti4q_laneu16_x2(uint16x8x2_t a, uint8x8_t b, uint16x8_t results[])
+{
+  results[0] = vluti4q_lane_u16_x2(a, b, 0);
+  results[1] = vluti4q_lane_u16_x2(a, b, 1);
+}
+
+/*
+** test_vluti4q_lanequ16_x2:
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti4q_lanequ16_x2(uint16x8x2_t a, uint8x16_t b, uint16x8_t results[])
+{
+  results[0] = vluti4q_laneq_u16_x2(a, b, 0);
+  results[1] = vluti4q_laneq_u16_x2(a, b, 1);
+  results[2] = vluti4q_laneq_u16_x2(a, b, 2);
+  results[3] = vluti4q_laneq_u16_x2(a, b, 3);
+}
+
+/*
+** test_vluti4q_lanes16_x2:
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**	...
+**	ret
+*/
+void
+test_vluti4q_lanes16_x2(int16x8x2_t a, uint8x8_t b, int16x8_t results[])
+{
+  results[0] = vluti4q_lane_s16_x2(a, b, 0);
+  results[1] = vluti4q_lane_s16_x2(a, b, 1);
+}
+
+/*
+** test_vluti4q_laneqs16_x2:
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti4q_laneqs16_x2(int16x8x2_t a, uint8x16_t b, int16x8_t results[])
+{
+  results[0] = vluti4q_laneq_s16_x2(a, b, 0);
+  results[1] = vluti4q_laneq_s16_x2(a, b, 1);
+  results[2] = vluti4q_laneq_s16_x2(a, b, 2);
+  results[3] = vluti4q_laneq_s16_x2(a, b, 3);
+}
+
+/*
+** test_vluti4q_lanep16_x2:
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**	...
+**	ret
+*/
+void
+test_vluti4q_lanep16_x2(poly16x8x2_t a, uint8x8_t b, poly16x8_t results[])
+{
+  results[0] = vluti4q_lane_p16_x2(a, b, 0);
+  results[1] = vluti4q_lane_p16_x2(a, b, 1);
+}
+
+/*
+** test_vluti4q_laneqp16_x2:
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti4q_laneqp16_x2(poly16x8x2_t a, uint8x16_t b, poly16x8_t results[])
+{
+  results[0] = vluti4q_laneq_p16_x2(a, b, 0);
+  results[1] = vluti4q_laneq_p16_x2(a, b, 1);
+  results[2] = vluti4q_laneq_p16_x2(a, b, 2);
+  results[3] = vluti4q_laneq_p16_x2(a, b, 3);
+}
+
+/*
+** test_vluti4q_lanef16_x2:
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**	...
+**	ret
+*/
+void
+test_vluti4q_lanef16_x2(float16x8x2_t a, uint8x8_t b, float16x8_t results[])
+{
+  results[0] = vluti4q_lane_f16_x2(a, b, 0);
+  results[1] = vluti4q_lane_f16_x2(a, b, 1);
+}
+
+/*
+** test_vluti4q_laneqf16_x2:
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti4q_laneqf16_x2(float16x8x2_t a, uint8x16_t b, float16x8_t results[])
+{
+  results[0] = vluti4q_laneq_f16_x2(a, b, 0);
+  results[1] = vluti4q_laneq_f16_x2(a, b, 1);
+  results[2] = vluti4q_laneq_f16_x2(a, b, 2);
+  results[3] = vluti4q_laneq_f16_x2(a, b, 3);
+}
+
+/*
+** test_vluti4q_lanebf16_x2:
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**	...
+**	ret
+*/
+void
+test_vluti4q_lanebf16_x2(bfloat16x8x2_t a, uint8x8_t b, bfloat16x8_t results[])
+{
+  results[0] = vluti4q_lane_bf16_x2(a, b, 0);
+  results[1] = vluti4q_lane_bf16_x2(a, b, 1);
+}
+
+/*
+** test_vluti4q_laneqbf16_x2:
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\]
+**	luti4	v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\]
+**	...
+**	ret
+*/
+void
+test_vluti4q_laneqbf16_x2(bfloat16x8x2_t a, uint8x16_t b, bfloat16x8_t results[])
+{
+  results[0] = vluti4q_laneq_bf16_x2(a, b, 0);
+  results[1] = vluti4q_laneq_bf16_x2(a, b, 1);
+  results[2] = vluti4q_laneq_bf16_x2(a, b, 2);
+  results[3] = vluti4q_laneq_bf16_x2(a, b, 3);
+}
