https://gcc.gnu.org/g:a07a2b8c9e7c2d123f0178875c9110eaf9770b7a

commit r15-5884-ga07a2b8c9e7c2d123f0178875c9110eaf9770b7a
Author: Saurabh Jha <saurabh....@arm.com>
Date:   Tue Dec 3 09:54:01 2024 +0000

    aarch64: Add support for AdvSIMD lut
    
    The AArch64 FEAT_LUT extension is optional from Armv9.2-A and mandatory
    from Armv9.5-A. It introduces instructions for lookup table reads with
    bit indices.
    
    This patch adds support for AdvSIMD lut intrinsics. The intrinsics for
    this extension are implemented as the following builtin functions:
    * vluti2{q}_lane{q}_{u8|s8|p8}
    * vluti2{q}_lane{q}_{u16|s16|p16|f16|bf16}
    * vluti4q_lane{q}_{u8|s8|p8}
    * vluti4q_lane{q}_{u16|s16|p16|f16|bf16}_x2
    
    We also introduce a new approach to doing lane checks for AdvSIMD.
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-builtins.cc
            (aarch64_builtin_signatures): Add binary_lane.
            (aarch64_fntype): Handle it.
            (simd_types): Add 16-bit x2 types.
            (aarch64_pragma_builtins_checker): New class.
            (aarch64_general_check_builtin_call): Use it.
            (aarch64_expand_pragma_builtin): Add support for lut unspecs.
            * config/aarch64/aarch64-option-extensions.def
            (AARCH64_OPT_EXTENSION): Add lut option.
            * config/aarch64/aarch64-simd-pragma-builtins.def
            (ENTRY_BINARY_LANE): Modify to use new ENTRY macro.
            (ENTRY_TERNARY_VLUT8): Macro to declare lut intrinsics.
            (ENTRY_TERNARY_VLUT16): Macro to declare lut intrinsics.
            (REQUIRED_EXTENSIONS): Declare lut intrinsics.
            * config/aarch64/aarch64-simd.md
            (@aarch64_<vluti_uns_op><VLUT:mode><VB:mode>): Instruction
            pattern for luti2 and luti4 intrinsics.
            (@aarch64_lutx2<VLUT:mode><VB:mode>): Instruction pattern for
            luti4x2 intrinsics.
            * config/aarch64/aarch64.h
            (TARGET_LUT): lut flag.
            * config/aarch64/iterators.md: Iterators and attributes for lut.
            * doc/invoke.texi: Document extension in AArch64 Options.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/simd/lut-incorrect-range.c: New test.
            * gcc.target/aarch64/simd/lut-no-flag.c: New test.
            * gcc.target/aarch64/simd/lut.c: New test.
    
    Co-authored-by: Vladimir Miloserdov <vladimir.miloser...@arm.com>
    Co-authored-by: Richard Sandiford <richard.sandif...@arm.com>

Diff:
---
 gcc/config/aarch64/aarch64-builtins.cc             | 132 +++-
 gcc/config/aarch64/aarch64-option-extensions.def   |   2 +
 .../aarch64/aarch64-simd-pragma-builtins.def       |  38 +
 gcc/config/aarch64/aarch64-simd.md                 |  25 +
 gcc/config/aarch64/aarch64.h                       |   3 +
 gcc/config/aarch64/iterators.md                    |  14 +
 gcc/doc/invoke.texi                                |   2 +
 .../gcc.target/aarch64/simd/lut-incorrect-range.c  | 221 ++++++
 .../gcc.target/aarch64/simd/lut-no-flag.c          |  10 +
 gcc/testsuite/gcc.target/aarch64/simd/lut.c        | 849 +++++++++++++++++++++
 10 files changed, 1294 insertions(+), 2 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 8984f0c59b97..f8c8a2721388 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -50,6 +50,8 @@
 #include "builtins.h"
 #include "aarch64-builtins.h"
 
+using namespace aarch64;
+
 #define v8qi_UP  E_V8QImode
 #define v8di_UP  E_V8DImode
 #define v4hi_UP  E_V4HImode
@@ -1600,6 +1602,7 @@ aarch64_init_simd_builtin_functions (bool 
called_from_pragma)
 enum class aarch64_builtin_signatures
 {
   binary,
+  binary_lane,
 };
 
 namespace {
@@ -1623,15 +1626,20 @@ namespace simd_types {
 
   constexpr simd_type f16 { V4HFmode, qualifier_none };
   constexpr simd_type f16q { V8HFmode, qualifier_none };
+  constexpr simd_type f16qx2 { V2x8HFmode, qualifier_none };
   constexpr simd_type p16 { V4HImode, qualifier_poly };
   constexpr simd_type p16q { V8HImode, qualifier_poly };
+  constexpr simd_type p16qx2 { V2x8HImode, qualifier_poly };
   constexpr simd_type s16 { V4HImode, qualifier_none };
   constexpr simd_type s16q { V8HImode, qualifier_none };
+  constexpr simd_type s16qx2 { V2x8HImode, qualifier_none };
   constexpr simd_type u16 { V4HImode, qualifier_unsigned };
   constexpr simd_type u16q { V8HImode, qualifier_unsigned };
+  constexpr simd_type u16qx2 { V2x8HImode, qualifier_unsigned };
 
   constexpr simd_type bf16 { V4BFmode, qualifier_none };
   constexpr simd_type bf16q { V8BFmode, qualifier_none };
+  constexpr simd_type bf16qx2 { V2x8BFmode, qualifier_none };
 
   constexpr simd_type f32 { V2SFmode, qualifier_none };
   constexpr simd_type f32q { V4SFmode, qualifier_none };
@@ -1671,11 +1679,21 @@ aarch64_fntype (const aarch64_pragma_builtins_data 
&builtin_data)
   switch (builtin_data.signature)
     {
     case aarch64_builtin_signatures::binary:
+    case aarch64_builtin_signatures::binary_lane:
       return_type = builtin_data.types[0].type ();
       for (int i = 1; i <= 2; ++i)
        arg_types.quick_push (builtin_data.types[i].type ());
       break;
     }
+  switch (builtin_data.signature)
+    {
+    case aarch64_builtin_signatures::binary_lane:
+      arg_types.quick_push (integer_type_node);
+      break;
+
+    default:
+      break;
+    }
   return build_function_type_array (return_type, arg_types.length (),
                                    arg_types.address ());
 }
@@ -2522,17 +2540,109 @@ aarch64_general_required_extensions (unsigned int code)
   return ext::streaming_compatible (0);
 }
 
+/* Checks calls to intrinsics that are defined using
+   aarch64-simd-pragma-builtins.def.  */
+struct aarch64_pragma_builtins_checker
+{
+  aarch64_pragma_builtins_checker (location_t, tree, unsigned int, tree *,
+                                  const aarch64_pragma_builtins_data &);
+
+  bool require_immediate_range (unsigned int, HOST_WIDE_INT,
+                               HOST_WIDE_INT);
+
+  bool check ();
+
+  location_t location;
+  tree fndecl;
+  unsigned int nargs;
+  array_slice<tree> args;
+  const aarch64_pragma_builtins_data &builtin_data;
+};
+
+/* LOCATION is the location of the call; FNDECL is the FUNCTION_DECL
+   that is being called; NARGS is the number of arguments to the call,
+   which are in a vector starting at FIRST_ARG; and BUILTIN_DATA describes
+   the intrinsic.  */
+aarch64_pragma_builtins_checker::
+aarch64_pragma_builtins_checker (location_t location, tree fndecl,
+                                unsigned int nargs, tree *first_arg,
+                                const aarch64_pragma_builtins_data
+                                   &builtin_data)
+  : location (location), fndecl (fndecl), nargs (nargs),
+    args (first_arg, nargs), builtin_data (builtin_data)
+{
+}
+
+/* Require argument ARGNO to be an integer constant expression in the
+   range [MIN, MAX].  Return true if it was.  */
+bool
+aarch64_pragma_builtins_checker::
+require_immediate_range (unsigned int argno, HOST_WIDE_INT min,
+                        HOST_WIDE_INT max)
+{
+  if (!tree_fits_shwi_p (args[argno]))
+    {
+      report_non_ice (location, fndecl, argno);
+      return false;
+    }
+
+  HOST_WIDE_INT actual = tree_to_shwi (args[argno]);
+  if (actual < min || actual > max)
+    {
+      report_out_of_range (location, fndecl, argno, actual, min, max);
+      return false;
+    }
+
+  return true;
+}
+
+/* Check the arguments to the intrinsic call and return true if they
+   are valid.  */
+bool
+aarch64_pragma_builtins_checker::check ()
+{
+  switch (builtin_data.unspec)
+    {
+    case UNSPEC_LUTI2:
+    case UNSPEC_LUTI4:
+      {
+       auto vector_to_index_mode = builtin_data.types[nargs - 1].mode;
+       int vector_to_index_nunits
+         = GET_MODE_NUNITS (vector_to_index_mode).to_constant ();
+       int output_mode_nunits
+         = GET_MODE_NUNITS (builtin_data.types[0].mode).to_constant ();
+
+       int high;
+       if (builtin_data.unspec == UNSPEC_LUTI2)
+         high = (4 * vector_to_index_nunits / output_mode_nunits) - 1;
+       else
+         high = (2 * vector_to_index_nunits / output_mode_nunits) - 1;
+
+       return require_immediate_range (nargs - 1, 0, high);
+      }
+
+    default:
+      return true;
+    }
+}
+
 bool
 aarch64_general_check_builtin_call (location_t location, vec<location_t>,
                                    unsigned int code, tree fndecl,
-                                   unsigned int nargs ATTRIBUTE_UNUSED,
-                                   tree *args)
+                                   unsigned int nargs, tree *args)
 {
   tree decl = aarch64_builtin_decls[code];
   auto required_extensions = aarch64_general_required_extensions (code);
   if (!aarch64_check_required_extensions (location, decl, required_extensions))
     return false;
 
+  if (auto builtin_data = aarch64_get_pragma_builtin (code))
+    {
+      aarch64_pragma_builtins_checker checker (location, fndecl, nargs, args,
+                                              *builtin_data);
+      return checker.check ();
+    }
+
   switch (code)
     {
     case AARCH64_RSR:
@@ -3442,6 +3552,16 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
                            TYPE_MODE (TREE_TYPE (arg)));
     }
 
+  /* LUTI2 treats the first argument as a vector of 4 elements.  The forms
+     with 128-bit inputs are only provided as a convenience; the upper halves
+     don't actually matter.  */
+  if (builtin_data.unspec == UNSPEC_LUTI2
+      && known_eq (GET_MODE_BITSIZE (ops[1].mode), 128u))
+    {
+      ops[1].mode = aarch64_v64_mode (GET_MODE_INNER (ops[1].mode)).require ();
+      ops[1].value = gen_lowpart (ops[1].mode, ops[1].value);
+    }
+
   insn_code icode;
   switch (builtin_data.unspec)
     {
@@ -3450,6 +3570,14 @@ aarch64_expand_pragma_builtin (tree exp, rtx target,
       icode = code_for_aarch64 (builtin_data.unspec,
                                builtin_data.types[0].mode);
       break;
+
+    case UNSPEC_LUTI2:
+    case UNSPEC_LUTI4:
+      create_integer_operand (ops.safe_push ({}),
+                             builtin_data.unspec == UNSPEC_LUTI2 ? 2 : 4);
+      icode = code_for_aarch64_lut (ops[1].mode, ops[2].mode);
+      break;
+
     default:
       gcc_unreachable ();
     }
diff --git a/gcc/config/aarch64/aarch64-option-extensions.def 
b/gcc/config/aarch64/aarch64-option-extensions.def
index 90abb1c5edd9..0a61b4858b17 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -259,6 +259,8 @@ AARCH64_OPT_EXTENSION("fp8dot2", FP8DOT2, (FP8DOT4), (), 
(), "fp8dot2")
 
 AARCH64_OPT_EXTENSION("ssve-fp8dot2", SSVE_FP8DOT2, (SSVE_FP8DOT4), (), (), 
"ssve-fp8dot2")
 
+AARCH64_OPT_EXTENSION("lut", LUT, (SIMD), (), (), "lut")
+
 #undef AARCH64_OPT_FMV_EXTENSION
 #undef AARCH64_OPT_EXTENSION
 #undef AARCH64_FMV_FEATURE
diff --git a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def 
b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
index e49db23cbd18..db40745e9e34 100644
--- a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
@@ -22,6 +22,10 @@
 #define ENTRY_BINARY(N, T0, T1, T2, U)         \
   ENTRY (N, binary, T0, T1, T2, U)
 
+#undef ENTRY_BINARY_LANE
+#define ENTRY_BINARY_LANE(N, T0, T1, T2, U)    \
+  ENTRY (N, binary_lane, T0, T1, T2, U)
+
 #undef ENTRY_BINARY_VHSDF
 #define ENTRY_BINARY_VHSDF(NAME, UNSPEC)              \
   ENTRY_BINARY (NAME##_f16, f16, f16, f16, UNSPEC)     \
@@ -30,8 +34,42 @@
   ENTRY_BINARY (NAME##q_f32, f32q, f32q, f32q, UNSPEC) \
   ENTRY_BINARY (NAME##q_f64, f64q, f64q, f64q, UNSPEC)
 
+#undef ENTRY_TERNARY_VLUT8
+#define ENTRY_TERNARY_VLUT8(T)                                         \
+  ENTRY_BINARY_LANE (vluti2_lane_##T##8, T##8q, T##8, u8, UNSPEC_LUTI2)        
\
+  ENTRY_BINARY_LANE (vluti2_laneq_##T##8, T##8q, T##8, u8q, UNSPEC_LUTI2) \
+  ENTRY_BINARY_LANE (vluti2q_lane_##T##8, T##8q, T##8q, u8, UNSPEC_LUTI2) \
+  ENTRY_BINARY_LANE (vluti2q_laneq_##T##8, T##8q, T##8q, u8q, UNSPEC_LUTI2) \
+  ENTRY_BINARY_LANE (vluti4q_lane_##T##8, T##8q, T##8q, u8, UNSPEC_LUTI4) \
+  ENTRY_BINARY_LANE (vluti4q_laneq_##T##8, T##8q, T##8q, u8q, UNSPEC_LUTI4)
+
+#undef ENTRY_TERNARY_VLUT16
+#define ENTRY_TERNARY_VLUT16(T)                                                
\
+  ENTRY_BINARY_LANE (vluti2_lane_##T##16, T##16q, T##16, u8, UNSPEC_LUTI2) \
+  ENTRY_BINARY_LANE (vluti2_laneq_##T##16, T##16q, T##16, u8q, UNSPEC_LUTI2) \
+  ENTRY_BINARY_LANE (vluti2q_lane_##T##16, T##16q, T##16q, u8, UNSPEC_LUTI2) \
+  ENTRY_BINARY_LANE (vluti2q_laneq_##T##16, T##16q, T##16q, u8q,       \
+                    UNSPEC_LUTI2)                                      \
+  ENTRY_BINARY_LANE (vluti4q_lane_##T##16_x2, T##16q, T##16qx2, u8,    \
+                    UNSPEC_LUTI4)                                      \
+  ENTRY_BINARY_LANE (vluti4q_laneq_##T##16_x2, T##16q, T##16qx2, u8q,  \
+                    UNSPEC_LUTI4)
+
 // faminmax
 #define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FAMINMAX)
 ENTRY_BINARY_VHSDF (vamax, UNSPEC_FAMAX)
 ENTRY_BINARY_VHSDF (vamin, UNSPEC_FAMIN)
 #undef REQUIRED_EXTENSIONS
+
+// lut
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_LUT)
+ENTRY_TERNARY_VLUT8 (p)
+ENTRY_TERNARY_VLUT8 (s)
+ENTRY_TERNARY_VLUT8 (u)
+
+ENTRY_TERNARY_VLUT16 (bf)
+ENTRY_TERNARY_VLUT16 (f)
+ENTRY_TERNARY_VLUT16 (p)
+ENTRY_TERNARY_VLUT16 (s)
+ENTRY_TERNARY_VLUT16 (u)
+#undef REQUIRED_EXTENSIONS
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index cfe95bd4c316..05cbd38372d3 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -9999,3 +9999,28 @@
   "TARGET_FAMINMAX"
   "<faminmax_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
 )
+
+(define_insn "@aarch64_lut<VLUT:mode><VB:mode>"
+  [(set (match_operand:<VLUT:VCONQ> 0 "register_operand" "=w")
+        (unspec:<VLUT:VCONQ>
+        [(match_operand:VLUT 1 "register_operand" "w")
+          (match_operand:VB 2 "register_operand" "w")
+          (match_operand:SI 3 "const_int_operand")
+         (match_operand:SI 4 "const_int_operand")]
+         UNSPEC_LUTI))]
+  "TARGET_LUT && INTVAL (operands[4]) <= exact_log2 (<VLUT:nunits>)"
+  "luti%4\t%0<VLUT:Vconqtype>, {%1<VLUT:Vconqtype>}, %2[%3]"
+)
+
+;; lutx2
+(define_insn "@aarch64_lut<VLUTx2:mode><VB:mode>"
+  [(set (match_operand:<VSTRUCT_ELT> 0 "register_operand" "=w")
+        (unspec:<VSTRUCT_ELT>
+        [(match_operand:VLUTx2 1 "register_operand" "w")
+          (match_operand:VB 2 "register_operand" "w")
+          (match_operand:SI 3 "const_int_operand")
+         (match_operand:SI 4 "const_int_operand")]
+         UNSPEC_LUTI))]
+  "TARGET_LUT && INTVAL (operands[4]) == 4"
+  "luti%4\t%0.8h, {%S1.8h, %T1.8h}, %2[%3]"
+)
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index c5dcbe176dfd..b1c694e143f2 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -487,6 +487,9 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
 #define TARGET_FAMINMAX AARCH64_HAVE_ISA (FAMINMAX)
 #define TARGET_SVE_FAMINMAX (TARGET_SVE && TARGET_FAMINMAX)
 
+/* Lookup table (LUTI) extension instructions are enabled through +lut.  */
+#define TARGET_LUT AARCH64_HAVE_ISA (LUT)
+
 /* Prefer different predicate registers for the output of a predicated
    operation over re-using an existing input predicate.  */
 #define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 720d79db8e43..90725c7faebc 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -426,6 +426,10 @@
                             (V8HF "TARGET_SIMD_F16INST")
                             V2SF V4SF])
 
+;; Modes available for Advanced SIMD lut operations.
+(define_mode_iterator VLUT [V8QI V16QI V4HI V4HF V4BF])
+(define_mode_iterator VLUTx2 [V2x8HI V2x8HF V2x8BF])
+
 ;; Iterators for single modes, for "@" patterns.
 (define_mode_iterator VNx16QI_ONLY [VNx16QI])
 (define_mode_iterator VNx16SI_ONLY [VNx16SI])
@@ -1109,6 +1113,9 @@
     UNSPEC_FCVTXN      ; Used in aarch64-simd.md.
     UNSPEC_FAMAX       ; Used in aarch64-simd.md.
     UNSPEC_FAMIN       ; Used in aarch64-simd.md.
+    UNSPEC_LUTI                ; Used in aarch64-simd.md.
+    UNSPEC_LUTI2       ; Used in aarch64-simd.md.
+    UNSPEC_LUTI4       ; Used in aarch64-simd.md.
 
     ;; All used in aarch64-sve2.md
     UNSPEC_ADDQV
@@ -1555,6 +1562,12 @@
                          (QI   "8b")  (HI    "8b")
                          (V4BF "8b")  (V8BF  "16b")])
 
+;; Mode to double type mapping.
+(define_mode_attr Vconqtype [(V8QI ".16b") (V16QI ".16b")
+                            (V4HI ".8h") (V8HI ".8h")
+                            (V4HF ".8h") (V8HF ".8h")
+                            (V4BF ".8h") (V8BF ".8h")])
+
 ;; Advanced SIMD vector structure to element modes.
 (define_mode_attr VSTRUCT_ELT [(V2x8QI "V8QI") (V2x4HI "V4HI")
                               (V2x2SI "V2SI") (V2x1DI "DI")
@@ -1685,6 +1698,7 @@
                         (V2SI "V4SI") (V4SI "V4SI")
                         (DI   "V2DI") (V2DI "V2DI")
                         (V4HF "V8HF") (V8HF "V8HF")
+                        (V4BF "V8BF") (V8BF "V8BF")
                         (V2SF "V4SF") (V4SF "V4SF")
                         (V2DF "V2DF") (SI   "V4SI")
                         (HI   "V8HI") (QI   "V16QI")
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index e27a92c270af..e3c2adc25077 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -22007,6 +22007,8 @@ Enable the fp8 (8-bit floating point) to half-precision 
2-way dot product
 extension in streaming mode.
 @item faminmax
 Enable the Floating Point Absolute Maximum/Minimum extension.
+@item lut
+Enable the Lookup Table extension.
 @item sve-b16b16
 Enable the SVE non-widening brain floating-point (@code{bf16}) extension.
 This only has an effect when @code{sve2} or @code{sme2} are also enabled.
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/lut-incorrect-range.c 
b/gcc/testsuite/gcc.target/aarch64/simd/lut-incorrect-range.c
new file mode 100644
index 000000000000..24e5d46d353b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/lut-incorrect-range.c
@@ -0,0 +1,221 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+lut" } */
+
+#include "arm_neon.h"
+
+void
+test_var(uint8x16_t a, uint8x8_t b, uint8x16_t c, int x)
+{
+  vluti2q_lane_u8(a, b, x); /* { dg-error {argument 3 of 'vluti2q_lane_u8' 
must be an integer constant expression} } */
+  vluti2q_laneq_u8(a, c, x); /* { dg-error {argument 3 of 'vluti2q_laneq_u8' 
must be an integer constant expression} } */
+  vluti4q_lane_u8(a, b, x); /* { dg-error {argument 3 of 'vluti4q_lane_u8' 
must be an integer constant expression} } */
+  vluti4q_laneq_u8(a, c, x); /* { dg-error {argument 3 of 'vluti4q_laneq_u8' 
must be an integer constant expression} } */
+}
+
+void
+test_vluti2_laneu8(uint8x8_t a, uint8x8_t b, uint8x16_t c, uint8x16_t d)
+{
+  vluti2_lane_u8(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+  vluti2_lane_u8(a, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+
+  vluti2_laneq_u8(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+  vluti2_laneq_u8(a, d, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+
+  vluti2q_lane_u8(c, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+  vluti2q_lane_u8(c, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+
+  vluti2q_laneq_u8(c, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+  vluti2q_laneq_u8(c, d, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+}
+
+void
+test_vluti2_lanes8(int8x8_t a, uint8x8_t b, int8x16_t c, uint8x16_t d)
+{
+  vluti2_lane_s8(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+  vluti2_lane_s8(a, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+
+  vluti2_laneq_s8(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+  vluti2_laneq_s8(a, d, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+
+  vluti2q_lane_s8(c, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+  vluti2q_lane_s8(c, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+
+  vluti2q_laneq_s8(c, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+  vluti2q_laneq_s8(c, d, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+}
+
+void
+test_vluti2_lanep8(poly8x8_t a, uint8x8_t b, poly8x16_t c, uint8x16_t d)
+{
+  vluti2_lane_p8(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+  vluti2_lane_p8(a, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+
+  vluti2_laneq_p8(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+  vluti2_laneq_p8(a, d, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+
+  vluti2q_lane_p8(c, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+  vluti2q_lane_p8(c, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+
+  vluti2q_laneq_p8(c, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+  vluti2q_laneq_p8(c, d, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+}
+
+void
+test_vluti2_laneu16(uint16x4_t a, uint8x8_t b, uint16x8_t c, uint8x16_t d)
+{
+  vluti2_lane_u16(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+  vluti2_lane_u16(a, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+
+  vluti2_laneq_u16(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+  vluti2_laneq_u16(a, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+
+  vluti2q_lane_u16(c, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+  vluti2q_lane_u16(c, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+
+  vluti2q_laneq_u16(c, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+  vluti2q_laneq_u16(c, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+}
+
+void
+test_vluti2_lanes16(int16x4_t a, uint8x8_t b, int16x8_t c, uint8x16_t d)
+{
+  vluti2_lane_s16(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+  vluti2_lane_s16(a, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+
+  vluti2_laneq_s16(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+  vluti2_laneq_s16(a, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+
+  vluti2q_lane_s16(c, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+  vluti2q_lane_s16(c, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+
+  vluti2q_laneq_s16(c, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+  vluti2q_laneq_s16(c, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+}
+
+void
+test_vluti2_lanep16(poly16x4_t a, uint8x8_t b, poly16x8_t c, uint8x16_t d)
+{
+  vluti2_lane_p16(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+  vluti2_lane_p16(a, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+
+  vluti2_laneq_p16(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+  vluti2_laneq_p16(a, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+
+  vluti2q_lane_p16(c, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+  vluti2q_lane_p16(c, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+
+  vluti2q_laneq_p16(c, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+  vluti2q_laneq_p16(c, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+}
+
+void
+test_vluti2_lanef16(float16x4_t a, uint8x8_t b, float16x8_t c, uint8x16_t d)
+{
+  vluti2_lane_f16(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+  vluti2_lane_f16(a, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+
+  vluti2_laneq_f16(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+  vluti2_laneq_f16(a, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+
+  vluti2q_lane_f16(c, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+  vluti2q_lane_f16(c, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+
+  vluti2q_laneq_f16(c, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+  vluti2q_laneq_f16(c, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+}
+
+void
+test_vluti2_lanebf16(bfloat16x4_t a, uint8x8_t b, bfloat16x8_t c, uint8x16_t d)
+{
+  vluti2_lane_bf16(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+  vluti2_lane_bf16(a, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+
+  vluti2_laneq_bf16(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+  vluti2_laneq_bf16(a, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+
+  vluti2q_lane_bf16(c, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+  vluti2q_lane_bf16(c, b, 4); /* { dg-error {passing 4 to argument 3 [^\n]*, 
which expects a value in the range \[0, 3\]} } */
+
+  vluti2q_laneq_bf16(c, d, -1); /* { dg-error {passing -1 to argument 3 
[^\n]*, which expects a value in the range \[0, 7\]} } */
+  vluti2q_laneq_bf16(c, d, 8); /* { dg-error {passing 8 to argument 3 [^\n]*, 
which expects a value in the range \[0, 7\]} } */
+}
+
+void
+test_vluti4q_laneu8(uint8x16_t a, uint8x8_t b, uint8x16_t d)
+{
+  vluti4q_lane_u8(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects the value 0} } */
+  vluti4q_lane_u8(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects the value 0} } */
+
+  vluti4q_laneq_u8(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+  vluti4q_laneq_u8(a, d, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+}
+
+void
+test_vluti4q_lanes8(int8x16_t a, uint8x8_t b, uint8x16_t d)
+{
+  vluti4q_lane_s8(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects the value 0} } */
+  vluti4q_lane_s8(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects the value 0} } */
+
+  vluti4q_laneq_s8(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+  vluti4q_laneq_s8(a, d, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+}
+
+void
+test_vluti4q_lanep8(poly8x16_t a, uint8x8_t b, uint8x16_t d)
+{
+  vluti4q_lane_p8(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects the value 0} } */
+  vluti4q_lane_p8(a, b, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects the value 0} } */
+
+  vluti4q_laneq_p8(a, d, -1); /* { dg-error {passing -1 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+  vluti4q_laneq_p8(a, d, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+}
+
+void
+test_vluti4q_laneu16_x2(uint16x8x2_t a, uint8x8_t b, uint8x16_t d)
+{
+  vluti4q_lane_u16_x2(a, b, -1); /* { dg-error {passing -1 to argument 3 
[^\n]*, which expects a value in the range \[0, 1\]} } */
+  vluti4q_lane_u16_x2(a, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+
+  vluti4q_laneq_u16_x2(a, d, -1); /* { dg-error {passing -1 to argument 3 
[^\n]*, which expects a value in the range \[0, 3\]} } */
+  vluti4q_laneq_u16_x2(a, d, 4); /* { dg-error {passing 4 to argument 3 
[^\n]*, which expects a value in the range \[0, 3\]} } */
+}
+
+void
+test_vluti4q_lanes16_x2(int16x8x2_t a, uint8x8_t b, uint8x16_t d)
+{
+  vluti4q_lane_s16_x2(a, b, -1); /* { dg-error {passing -1 to argument 3 
[^\n]*, which expects a value in the range \[0, 1\]} } */
+  vluti4q_lane_s16_x2(a, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+
+  vluti4q_laneq_s16_x2(a, d, -1); /* { dg-error {passing -1 to argument 3 
[^\n]*, which expects a value in the range \[0, 3\]} } */
+  vluti4q_laneq_s16_x2(a, d, 4); /* { dg-error {passing 4 to argument 3 
[^\n]*, which expects a value in the range \[0, 3\]} } */
+}
+
+void
+test_vluti4q_lanep16_x2(poly16x8x2_t a, uint8x8_t b, uint8x16_t d)
+{
+  vluti4q_lane_p16_x2(a, b, -1); /* { dg-error {passing -1 to argument 3 
[^\n]*, which expects a value in the range \[0, 1\]} } */
+  vluti4q_lane_p16_x2(a, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+
+  vluti4q_laneq_p16_x2(a, d, -1); /* { dg-error {passing -1 to argument 3 
[^\n]*, which expects a value in the range \[0, 3\]} } */
+  vluti4q_laneq_p16_x2(a, d, 4); /* { dg-error {passing 4 to argument 3 
[^\n]*, which expects a value in the range \[0, 3\]} } */
+}
+
+void
+test_vluti4q_lanef16_x2(float16x8x2_t a, uint8x8_t b, uint8x16_t d)
+{
+  vluti4q_lane_f16_x2(a, b, -1); /* { dg-error {passing -1 to argument 3 
[^\n]*, which expects a value in the range \[0, 1\]} } */
+  vluti4q_lane_f16_x2(a, b, 2); /* { dg-error {passing 2 to argument 3 [^\n]*, 
which expects a value in the range \[0, 1\]} } */
+
+  vluti4q_laneq_f16_x2(a, d, -1); /* { dg-error {passing -1 to argument 3 
[^\n]*, which expects a value in the range \[0, 3\]} } */
+  vluti4q_laneq_f16_x2(a, d, 4); /* { dg-error {passing 4 to argument 3 
[^\n]*, which expects a value in the range \[0, 3\]} } */
+}
+
+void
+test_vluti4q_lanebf16_x2(bfloat16x8x2_t a, uint8x8_t b, uint8x16_t d)
+{
+  vluti4q_lane_bf16_x2(a, b, -1); /* { dg-error {passing -1 to argument 3 
[^\n]*, which expects a value in the range \[0, 1\]} } */
+  vluti4q_lane_bf16_x2(a, b, 2); /* { dg-error {passing 2 to argument 3 
[^\n]*, which expects a value in the range \[0, 1\]} } */
+
+  vluti4q_laneq_bf16_x2(a, d, -1); /* { dg-error {passing -1 to argument 3 
[^\n]*, which expects a value in the range \[0, 3\]} } */
+  vluti4q_laneq_bf16_x2(a, d, 4); /* { dg-error {passing 4 to argument 3 
[^\n]*, which expects a value in the range \[0, 3\]} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/lut-no-flag.c 
b/gcc/testsuite/gcc.target/aarch64/simd/lut-no-flag.c
new file mode 100644
index 000000000000..d180d8f2150e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/lut-no-flag.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv9-a" } */
+
+#include "arm_neon.h"
+
+void
+test (uint8x8_t a, uint8x8_t b)
+{
+  vluti2_lane_u8 (a, b, 0); /* { dg-error {ACLE function 'vluti2_lane_u8' requires ISA extension 'lut'} } */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/lut.c b/gcc/testsuite/gcc.target/aarch64/simd/lut.c
new file mode 100644
index 000000000000..fc89b215a93b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/lut.c
@@ -0,0 +1,849 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+lut" } */
+/* { dg-final { check-function-bodies "**" ""} } */
+
+#include "arm_neon.h"
+
+/*
+** test_vluti2_laneu8:
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**     ...
+**     ret
+*/
+void
+test_vluti2_laneu8(uint8x8_t a, uint8x8_t b, uint8x16_t results[])
+{
+  results[0] = vluti2_lane_u8(a, b, 0);
+  results[1] = vluti2_lane_u8(a, b, 1);
+}
+
+/*
+** test_vluti2_lanequ8:
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti2_lanequ8(uint8x8_t a, uint8x16_t b, uint8x16_t results[])
+{
+  results[0] = vluti2_laneq_u8(a, b, 0);
+  results[1] = vluti2_laneq_u8(a, b, 1);
+  results[2] = vluti2_laneq_u8(a, b, 2);
+  results[3] = vluti2_laneq_u8(a, b, 3);
+}
+
+/*
+** test_vluti2q_laneu8:
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**     ...
+**     ret
+*/
+void
+test_vluti2q_laneu8(uint8x16_t a, uint8x8_t b, uint8x16_t results[])
+{
+  results[0] = vluti2q_lane_u8(a, b, 0);
+  results[1] = vluti2q_lane_u8(a, b, 1);
+}
+
+/*
+** test_vluti2q_lanequ8:
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti2q_lanequ8(uint8x16_t a, uint8x16_t b, uint8x16_t results[])
+{
+  results[0] = vluti2q_laneq_u8(a, b, 0);
+  results[1] = vluti2q_laneq_u8(a, b, 1);
+  results[2] = vluti2q_laneq_u8(a, b, 2);
+  results[3] = vluti2q_laneq_u8(a, b, 3);
+}
+
+/*
+** test_vluti2_lanes8:
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**     ...
+**     ret
+*/
+void
+test_vluti2_lanes8(int8x8_t a, uint8x8_t b, int8x16_t results[])
+{
+  results[0] = vluti2_lane_s8(a, b, 0);
+  results[1] = vluti2_lane_s8(a, b, 1);
+}
+
+/*
+** test_vluti2_laneqs8:
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti2_laneqs8(int8x8_t a, uint8x16_t b, int8x16_t results[])
+{
+  results[0] = vluti2_laneq_s8(a, b, 0);
+  results[1] = vluti2_laneq_s8(a, b, 1);
+  results[2] = vluti2_laneq_s8(a, b, 2);
+  results[3] = vluti2_laneq_s8(a, b, 3);
+}
+
+/*
+** test_vluti2q_lanes8:
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**     ...
+**     ret
+*/
+void
+test_vluti2q_lanes8(int8x16_t a, uint8x8_t b, int8x16_t results[])
+{
+  results[0] = vluti2q_lane_s8(a, b, 0);
+  results[1] = vluti2q_lane_s8(a, b, 1);
+}
+
+/*
+** test_vluti2q_laneqs8:
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti2q_laneqs8(int8x16_t a, uint8x16_t b, int8x16_t results[])
+{
+  results[0] = vluti2q_laneq_s8(a, b, 0);
+  results[1] = vluti2q_laneq_s8(a, b, 1);
+  results[2] = vluti2q_laneq_s8(a, b, 2);
+  results[3] = vluti2q_laneq_s8(a, b, 3);
+}
+
+/*
+** test_vluti2_lanep8:
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**     ...
+**     ret
+*/
+void
+test_vluti2_lanep8(poly8x8_t a, uint8x8_t b, poly8x16_t results[])
+{
+  results[0] = vluti2_lane_p8(a, b, 0);
+  results[1] = vluti2_lane_p8(a, b, 1);
+}
+
+/*
+** test_vluti2_laneqp8:
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti2_laneqp8(poly8x8_t a, uint8x16_t b, poly8x16_t results[])
+{
+  results[0] = vluti2_laneq_p8(a, b, 0);
+  results[1] = vluti2_laneq_p8(a, b, 1);
+  results[2] = vluti2_laneq_p8(a, b, 2);
+  results[3] = vluti2_laneq_p8(a, b, 3);
+}
+
+/*
+** test_vluti2q_lanep8:
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**     ...
+**     ret
+*/
+void
+test_vluti2q_lanep8(poly8x16_t a, uint8x8_t b, poly8x16_t results[])
+{
+  results[0] = vluti2q_lane_p8(a, b, 0);
+  results[1] = vluti2q_lane_p8(a, b, 1);
+}
+
+/*
+** test_vluti2q_laneqp8:
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti2q_laneqp8(poly8x16_t a, uint8x16_t b, poly8x16_t results[])
+{
+  results[0] = vluti2q_laneq_p8(a, b, 0);
+  results[1] = vluti2q_laneq_p8(a, b, 1);
+  results[2] = vluti2q_laneq_p8(a, b, 2);
+  results[3] = vluti2q_laneq_p8(a, b, 3);
+}
+
+/*
+** test_vluti2_laneu16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti2_laneu16(uint16x4_t a, uint8x8_t b, uint16x8_t results[])
+{
+  results[0] = vluti2_lane_u16(a, b, 0);
+  results[1] = vluti2_lane_u16(a, b, 1);
+  results[2] = vluti2_lane_u16(a, b, 2);
+  results[3] = vluti2_lane_u16(a, b, 3);
+}
+
+/*
+** test_vluti2_lanequ16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**     ...
+**     ret
+*/
+void
+test_vluti2_lanequ16(uint16x4_t a, uint8x16_t b, uint16x8_t results[])
+{
+  results[0] = vluti2_laneq_u16(a, b, 0);
+  results[1] = vluti2_laneq_u16(a, b, 1);
+  results[2] = vluti2_laneq_u16(a, b, 2);
+  results[3] = vluti2_laneq_u16(a, b, 3);
+  results[4] = vluti2_laneq_u16(a, b, 4);
+  results[5] = vluti2_laneq_u16(a, b, 5);
+  results[6] = vluti2_laneq_u16(a, b, 6);
+  results[7] = vluti2_laneq_u16(a, b, 7);
+}
+
+/*
+** test_vluti2q_laneu16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti2q_laneu16(uint16x8_t a, uint8x8_t b, uint16x8_t results[])
+{
+  results[0] = vluti2q_lane_u16(a, b, 0);
+  results[1] = vluti2q_lane_u16(a, b, 1);
+  results[2] = vluti2q_lane_u16(a, b, 2);
+  results[3] = vluti2q_lane_u16(a, b, 3);
+}
+
+/*
+** test_vluti2q_lanequ16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**     ...
+**     ret
+*/
+void
+test_vluti2q_lanequ16(uint16x8_t a, uint8x16_t b, uint16x8_t results[])
+{
+  results[0] = vluti2q_laneq_u16(a, b, 0);
+  results[1] = vluti2q_laneq_u16(a, b, 1);
+  results[2] = vluti2q_laneq_u16(a, b, 2);
+  results[3] = vluti2q_laneq_u16(a, b, 3);
+  results[4] = vluti2q_laneq_u16(a, b, 4);
+  results[5] = vluti2q_laneq_u16(a, b, 5);
+  results[6] = vluti2q_laneq_u16(a, b, 6);
+  results[7] = vluti2q_laneq_u16(a, b, 7);
+}
+
+/*
+** test_vluti2_lanes16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti2_lanes16(int16x4_t a, uint8x8_t b, int16x8_t results[])
+{
+  results[0] = vluti2_lane_s16(a, b, 0);
+  results[1] = vluti2_lane_s16(a, b, 1);
+  results[2] = vluti2_lane_s16(a, b, 2);
+  results[3] = vluti2_lane_s16(a, b, 3);
+}
+
+/*
+** test_vluti2_laneqs16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**     ...
+**     ret
+*/
+void
+test_vluti2_laneqs16(int16x4_t a, uint8x16_t b, int16x8_t results[])
+{
+  results[0] = vluti2_laneq_s16(a, b, 0);
+  results[1] = vluti2_laneq_s16(a, b, 1);
+  results[2] = vluti2_laneq_s16(a, b, 2);
+  results[3] = vluti2_laneq_s16(a, b, 3);
+  results[4] = vluti2_laneq_s16(a, b, 4);
+  results[5] = vluti2_laneq_s16(a, b, 5);
+  results[6] = vluti2_laneq_s16(a, b, 6);
+  results[7] = vluti2_laneq_s16(a, b, 7);
+}
+
+/*
+** test_vluti2q_lanes16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti2q_lanes16(int16x8_t a, uint8x8_t b, int16x8_t results[])
+{
+  results[0] = vluti2q_lane_s16(a, b, 0);
+  results[1] = vluti2q_lane_s16(a, b, 1);
+  results[2] = vluti2q_lane_s16(a, b, 2);
+  results[3] = vluti2q_lane_s16(a, b, 3);
+}
+
+/*
+** test_vluti2q_laneqs16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**     ...
+**     ret
+*/
+void
+test_vluti2q_laneqs16(int16x8_t a, uint8x16_t b, int16x8_t results[])
+{
+  results[0] = vluti2q_laneq_s16(a, b, 0);
+  results[1] = vluti2q_laneq_s16(a, b, 1);
+  results[2] = vluti2q_laneq_s16(a, b, 2);
+  results[3] = vluti2q_laneq_s16(a, b, 3);
+  results[4] = vluti2q_laneq_s16(a, b, 4);
+  results[5] = vluti2q_laneq_s16(a, b, 5);
+  results[6] = vluti2q_laneq_s16(a, b, 6);
+  results[7] = vluti2q_laneq_s16(a, b, 7);
+}
+
+/*
+** test_vluti2_lanep16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti2_lanep16(poly16x4_t a, uint8x8_t b, poly16x8_t results[])
+{
+  results[0] = vluti2_lane_p16(a, b, 0);
+  results[1] = vluti2_lane_p16(a, b, 1);
+  results[2] = vluti2_lane_p16(a, b, 2);
+  results[3] = vluti2_lane_p16(a, b, 3);
+}
+
+/*
+** test_vluti2_laneqp16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**     ...
+**     ret
+*/
+void
+test_vluti2_laneqp16(poly16x4_t a, uint8x16_t b, poly16x8_t results[])
+{
+  results[0] = vluti2_laneq_p16(a, b, 0);
+  results[1] = vluti2_laneq_p16(a, b, 1);
+  results[2] = vluti2_laneq_p16(a, b, 2);
+  results[3] = vluti2_laneq_p16(a, b, 3);
+  results[4] = vluti2_laneq_p16(a, b, 4);
+  results[5] = vluti2_laneq_p16(a, b, 5);
+  results[6] = vluti2_laneq_p16(a, b, 6);
+  results[7] = vluti2_laneq_p16(a, b, 7);
+}
+
+/*
+** test_vluti2q_lanep16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti2q_lanep16(poly16x8_t a, uint8x8_t b, poly16x8_t results[])
+{
+  results[0] = vluti2q_lane_p16(a, b, 0);
+  results[1] = vluti2q_lane_p16(a, b, 1);
+  results[2] = vluti2q_lane_p16(a, b, 2);
+  results[3] = vluti2q_lane_p16(a, b, 3);
+}
+
+/*
+** test_vluti2q_laneqp16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**     ...
+**     ret
+*/
+void
+test_vluti2q_laneqp16(poly16x8_t a, uint8x16_t b, poly16x8_t results[])
+{
+  results[0] = vluti2q_laneq_p16(a, b, 0);
+  results[1] = vluti2q_laneq_p16(a, b, 1);
+  results[2] = vluti2q_laneq_p16(a, b, 2);
+  results[3] = vluti2q_laneq_p16(a, b, 3);
+  results[4] = vluti2q_laneq_p16(a, b, 4);
+  results[5] = vluti2q_laneq_p16(a, b, 5);
+  results[6] = vluti2q_laneq_p16(a, b, 6);
+  results[7] = vluti2q_laneq_p16(a, b, 7);
+}
+
+/*
+** test_vluti2_lanef16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti2_lanef16(float16x4_t a, uint8x8_t b, float16x8_t results[])
+{
+  results[0] = vluti2_lane_f16(a, b, 0);
+  results[1] = vluti2_lane_f16(a, b, 1);
+  results[2] = vluti2_lane_f16(a, b, 2);
+  results[3] = vluti2_lane_f16(a, b, 3);
+}
+
+/*
+** test_vluti2_laneqf16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**     ...
+**     ret
+*/
+void
+test_vluti2_laneqf16(float16x4_t a, uint8x16_t b, float16x8_t results[])
+{
+  results[0] = vluti2_laneq_f16(a, b, 0);
+  results[1] = vluti2_laneq_f16(a, b, 1);
+  results[2] = vluti2_laneq_f16(a, b, 2);
+  results[3] = vluti2_laneq_f16(a, b, 3);
+  results[4] = vluti2_laneq_f16(a, b, 4);
+  results[5] = vluti2_laneq_f16(a, b, 5);
+  results[6] = vluti2_laneq_f16(a, b, 6);
+  results[7] = vluti2_laneq_f16(a, b, 7);
+}
+
+/*
+** test_vluti2q_lanef16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti2q_lanef16(float16x8_t a, uint8x8_t b, float16x8_t results[])
+{
+  results[0] = vluti2q_lane_f16(a, b, 0);
+  results[1] = vluti2q_lane_f16(a, b, 1);
+  results[2] = vluti2q_lane_f16(a, b, 2);
+  results[3] = vluti2q_lane_f16(a, b, 3);
+}
+
+/*
+** test_vluti2q_laneqf16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**     ...
+**     ret
+*/
+void
+test_vluti2q_laneqf16(float16x8_t a, uint8x16_t b, float16x8_t results[])
+{
+  results[0] = vluti2q_laneq_f16(a, b, 0);
+  results[1] = vluti2q_laneq_f16(a, b, 1);
+  results[2] = vluti2q_laneq_f16(a, b, 2);
+  results[3] = vluti2q_laneq_f16(a, b, 3);
+  results[4] = vluti2q_laneq_f16(a, b, 4);
+  results[5] = vluti2q_laneq_f16(a, b, 5);
+  results[6] = vluti2q_laneq_f16(a, b, 6);
+  results[7] = vluti2q_laneq_f16(a, b, 7);
+}
+
+/*
+** test_vluti2_lanebf16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti2_lanebf16(bfloat16x4_t a, uint8x8_t b, bfloat16x8_t results[])
+{
+  results[0] = vluti2_lane_bf16(a, b, 0);
+  results[1] = vluti2_lane_bf16(a, b, 1);
+  results[2] = vluti2_lane_bf16(a, b, 2);
+  results[3] = vluti2_lane_bf16(a, b, 3);
+}
+
+/*
+** test_vluti2_laneqbf16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**     ...
+**     ret
+*/
+void
+test_vluti2_laneqbf16(bfloat16x4_t a, uint8x16_t b, bfloat16x8_t results[])
+{
+  results[0] = vluti2_laneq_bf16(a, b, 0);
+  results[1] = vluti2_laneq_bf16(a, b, 1);
+  results[2] = vluti2_laneq_bf16(a, b, 2);
+  results[3] = vluti2_laneq_bf16(a, b, 3);
+  results[4] = vluti2_laneq_bf16(a, b, 4);
+  results[5] = vluti2_laneq_bf16(a, b, 5);
+  results[6] = vluti2_laneq_bf16(a, b, 6);
+  results[7] = vluti2_laneq_bf16(a, b, 7);
+}
+
+/*
+** test_vluti2q_lanebf16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti2q_lanebf16(bfloat16x8_t a, uint8x8_t b, bfloat16x8_t results[])
+{
+  results[0] = vluti2q_lane_bf16(a, b, 0);
+  results[1] = vluti2q_lane_bf16(a, b, 1);
+  results[2] = vluti2q_lane_bf16(a, b, 2);
+  results[3] = vluti2q_lane_bf16(a, b, 3);
+}
+
+/*
+** test_vluti2q_laneqbf16:
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[3\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[4\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[5\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[6\]
+**     luti2   v[0-9]+\.8h, {v[0-9]+\.8h}, v[0-9]+\[7\]
+**     ...
+**     ret
+*/
+void
+test_vluti2q_laneqbf16(bfloat16x8_t a, uint8x16_t b, bfloat16x8_t results[])
+{
+  results[0] = vluti2q_laneq_bf16(a, b, 0);
+  results[1] = vluti2q_laneq_bf16(a, b, 1);
+  results[2] = vluti2q_laneq_bf16(a, b, 2);
+  results[3] = vluti2q_laneq_bf16(a, b, 3);
+  results[4] = vluti2q_laneq_bf16(a, b, 4);
+  results[5] = vluti2q_laneq_bf16(a, b, 5);
+  results[6] = vluti2q_laneq_bf16(a, b, 6);
+  results[7] = vluti2q_laneq_bf16(a, b, 7);
+}
+
+/*
+** test_vluti4q_laneu8:
+**     luti4   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**     ...
+**     ret
+*/
+void
+test_vluti4q_laneu8(uint8x16_t a, uint8x8_t b, uint8x16_t results[])
+{
+  results[0] = vluti4q_lane_u8(a, b, 0);
+}
+
+/*
+** test_vluti4q_lanequ8:
+**     luti4   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**     luti4   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**     ...
+**     ret
+*/
+void
+test_vluti4q_lanequ8(uint8x16_t a, uint8x16_t b, uint8x16_t results[])
+{
+  results[0] = vluti4q_laneq_u8(a, b, 0);
+  results[1] = vluti4q_laneq_u8(a, b, 1);
+}
+
+/*
+** test_vluti4q_lanep8:
+**     luti4   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**     ...
+**     ret
+*/
+void
+test_vluti4q_lanep8(poly8x16_t a, uint8x8_t b, poly8x16_t results[])
+{
+  results[0] = vluti4q_lane_p8(a, b, 0);
+}
+
+/*
+** test_vluti4q_laneqp8:
+**     luti4   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[0\]
+**     luti4   v[0-9]+\.16b, {v[0-9]+\.16b}, v[0-9]+\[1\]
+**     ...
+**     ret
+*/
+void
+test_vluti4q_laneqp8(poly8x16_t a, uint8x16_t b, poly8x16_t results[])
+{
+  results[0] = vluti4q_laneq_p8(a, b, 0);
+  results[1] = vluti4q_laneq_p8(a, b, 1);
+}
+
+/*
+** test_vluti4q_laneu16_x2:
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**     ...
+**     ret
+*/
+void
+test_vluti4q_laneu16_x2(uint16x8x2_t a, uint8x8_t b, uint16x8_t results[])
+{
+  results[0] = vluti4q_lane_u16_x2(a, b, 0);
+  results[1] = vluti4q_lane_u16_x2(a, b, 1);
+}
+
+/*
+** test_vluti4q_lanequ16_x2:
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti4q_lanequ16_x2(uint16x8x2_t a, uint8x16_t b, uint16x8_t results[])
+{
+  results[0] = vluti4q_laneq_u16_x2(a, b, 0);
+  results[1] = vluti4q_laneq_u16_x2(a, b, 1);
+  results[2] = vluti4q_laneq_u16_x2(a, b, 2);
+  results[3] = vluti4q_laneq_u16_x2(a, b, 3);
+}
+
+/*
+** test_vluti4q_lanes16_x2:
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**     ...
+**     ret
+*/
+void
+test_vluti4q_lanes16_x2(int16x8x2_t a, uint8x8_t b, int16x8_t results[])
+{
+  results[0] = vluti4q_lane_s16_x2(a, b, 0);
+  results[1] = vluti4q_lane_s16_x2(a, b, 1);
+}
+
+/*
+** test_vluti4q_laneqs16_x2:
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti4q_laneqs16_x2(int16x8x2_t a, uint8x16_t b, int16x8_t results[])
+{
+  results[0] = vluti4q_laneq_s16_x2(a, b, 0);
+  results[1] = vluti4q_laneq_s16_x2(a, b, 1);
+  results[2] = vluti4q_laneq_s16_x2(a, b, 2);
+  results[3] = vluti4q_laneq_s16_x2(a, b, 3);
+}
+
+/*
+** test_vluti4q_lanep16_x2:
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**     ...
+**     ret
+*/
+void
+test_vluti4q_lanep16_x2(poly16x8x2_t a, uint8x8_t b, poly16x8_t results[])
+{
+  results[0] = vluti4q_lane_p16_x2(a, b, 0);
+  results[1] = vluti4q_lane_p16_x2(a, b, 1);
+}
+
+/*
+** test_vluti4q_laneqp16_x2:
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti4q_laneqp16_x2(poly16x8x2_t a, uint8x16_t b, poly16x8_t results[])
+{
+  results[0] = vluti4q_laneq_p16_x2(a, b, 0);
+  results[1] = vluti4q_laneq_p16_x2(a, b, 1);
+  results[2] = vluti4q_laneq_p16_x2(a, b, 2);
+  results[3] = vluti4q_laneq_p16_x2(a, b, 3);
+}
+
+/*
+** test_vluti4q_lanef16_x2:
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**     ...
+**     ret
+*/
+void
+test_vluti4q_lanef16_x2(float16x8x2_t a, uint8x8_t b, float16x8_t results[])
+{
+  results[0] = vluti4q_lane_f16_x2(a, b, 0);
+  results[1] = vluti4q_lane_f16_x2(a, b, 1);
+}
+
+/*
+** test_vluti4q_laneqf16_x2:
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti4q_laneqf16_x2(float16x8x2_t a, uint8x16_t b, float16x8_t results[])
+{
+  results[0] = vluti4q_laneq_f16_x2(a, b, 0);
+  results[1] = vluti4q_laneq_f16_x2(a, b, 1);
+  results[2] = vluti4q_laneq_f16_x2(a, b, 2);
+  results[3] = vluti4q_laneq_f16_x2(a, b, 3);
+}
+
+/*
+** test_vluti4q_lanebf16_x2:
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**     ...
+**     ret
+*/
+void
+test_vluti4q_lanebf16_x2(bfloat16x8x2_t a, uint8x8_t b, bfloat16x8_t results[])
+{
+  results[0] = vluti4q_lane_bf16_x2(a, b, 0);
+  results[1] = vluti4q_lane_bf16_x2(a, b, 1);
+}
+
+/*
+** test_vluti4q_laneqbf16_x2:
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[0\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[1\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[2\]
+**     luti4   v[0-9]+\.8h, {v[0-9]+\.8h, v[0-9]+\.8h}, v[0-9]+\[3\]
+**     ...
+**     ret
+*/
+void
+test_vluti4q_laneqbf16_x2(bfloat16x8x2_t a, uint8x16_t b, bfloat16x8_t results[])
+{
+  results[0] = vluti4q_laneq_bf16_x2(a, b, 0);
+  results[1] = vluti4q_laneq_bf16_x2(a, b, 1);
+  results[2] = vluti4q_laneq_bf16_x2(a, b, 2);
+  results[3] = vluti4q_laneq_bf16_x2(a, b, 3);
+}

Reply via email to