Add a fold at gimple_fold_builtin to prefer the highpart variant of a builtin
if the arguments are better suited to it. This helps us avoid copying data
between lanes before operation.
E.g. We prefer to use UMULL2 rather than DUP+UMULL for the following:
uint16x8_t
foo(const uint8x16_t s) {
const uint8x16_t f0 = vdupq_n_u8(4);
return vmull_u8(vget_high_u8(s), vget_high_u8(f0));
}
gcc/ChangeLog:
* config/aarch64/aarch64-builtins.cc (LO_HI_PAIRINGS): New macro.
Cover every lo/hi pairing in aarch64-builtin-pairs.def.
(aarch64_get_highpart_builtin): New function. Get the fndecl for
the hi builtin paired with FCODE.
(LO_HI_PAIR): New macro.
(aarch64_object_of_bfr): New function. Parse BIT_FIELD_REF expressions.
(aarch64_duplicate_vector_cst): New function.
(aarch64_nbit_vector_type_p): New function. Check if a type describes
an n-bit vector.
(aarch64_vq_high_half): New function. Helper to identify vector
highparts.
(aarch64_fold_lo_call_to_hi): New function. Perform the fold described
here.
(aarch64_general_gimple_fold_builtin): Add cases for lo builtins.
* config/aarch64/aarch64-builtin-pairs.def: New file. Declare pairings
of lo/hi builtins.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/vabal_combine.c: Removed.
* gcc.target/aarch64/simd/fold_to_highpart_1.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_2.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_3.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_4.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_5.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_6.c: New test.
* gcc.target/aarch64/simd/fold_to_highpart_7.c: New test.
---
gcc/config/aarch64/aarch64-builtin-pairs.def | 81 ++
gcc/config/aarch64/aarch64-builtins.cc | 206 +++++
.../aarch64/simd/fold_to_highpart_1.c | 733 ++++++++++++++++++
.../aarch64/simd/fold_to_highpart_2.c | 86 ++
.../aarch64/simd/fold_to_highpart_3.c | 81 ++
.../aarch64/simd/fold_to_highpart_4.c | 77 ++
.../aarch64/simd/fold_to_highpart_5.c | 38 +
.../aarch64/simd/fold_to_highpart_6.c | 94 +++
.../aarch64/simd/fold_to_highpart_7.c | 36 +
.../gcc.target/aarch64/simd/vabal_combine.c | 72 --
10 files changed, 1432 insertions(+), 72 deletions(-)
create mode 100644 gcc/config/aarch64/aarch64-builtin-pairs.def
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_7.c
delete mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c
diff --git a/gcc/config/aarch64/aarch64-builtin-pairs.def
b/gcc/config/aarch64/aarch64-builtin-pairs.def
new file mode 100644
index 00000000000..e1dc0b71a1c
--- /dev/null
+++ b/gcc/config/aarch64/aarch64-builtin-pairs.def
@@ -0,0 +1,81 @@
+/* Pairings of AArch64 builtins that can be folded into each other.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+/* LO/HI widenable integer modes. */
+#define LO_HI_PAIR_V_WI(T, LO, HI) \
+ LO_HI_PAIR (T##_##LO##v2si, T##_##HI##v4si) \
+ LO_HI_PAIR (T##_##LO##v4hi, T##_##HI##v8hi) \
+ LO_HI_PAIR (T##_##LO##v8qi, T##_##HI##v16qi)
+
+/* LO/HI Single/Half integer modes. */
+#define LO_HI_PAIR_V_HSI(T, LO, HI) \
+ LO_HI_PAIR (T##_##LO##v2si, T##_##HI##v4si) \
+ LO_HI_PAIR (T##_##LO##v4hi, T##_##HI##v8hi)
+
+#define UNOP_LONG_LH_PAIRS \
+ LO_HI_PAIR (UNOP_sxtlv8hi, UNOP_vec_unpacks_hi_v16qi) \
+ LO_HI_PAIR (UNOP_sxtlv4si, UNOP_vec_unpacks_hi_v8hi) \
+ LO_HI_PAIR (UNOP_sxtlv2di, UNOP_vec_unpacks_hi_v4si) \
+ LO_HI_PAIR (UNOPU_uxtlv8hi, UNOPU_vec_unpacku_hi_v16qi) \
+ LO_HI_PAIR (UNOPU_uxtlv4si, UNOPU_vec_unpacku_hi_v8hi) \
+ LO_HI_PAIR (UNOPU_uxtlv2di, UNOPU_vec_unpacku_hi_v4si) \
+ LO_HI_PAIR (UNOP_float_extend_lo_v4sf, UNOP_vec_unpacks_hi_v8hf) \
+ LO_HI_PAIR (UNOP_float_extend_lo_v2df, UNOP_vec_unpacks_hi_v4sf) \
+ LO_HI_PAIR (UNOP_vbfcvtv4bf, UNOP_vbfcvt_highv8bf)
+
+#define BINOP_LONG_LH_PAIRS \
+ LO_HI_PAIR_V_WI (BINOP, saddl, saddl2) \
+ LO_HI_PAIR_V_WI (BINOPU, uaddl, uaddl2) \
+ LO_HI_PAIR_V_WI (BINOP, ssubl, ssubl2) \
+ LO_HI_PAIR_V_WI (BINOPU, usubl, usubl2) \
+ LO_HI_PAIR_V_WI (BINOP, sabdl, sabdl2) \
+ LO_HI_PAIR_V_WI (BINOPU, uabdl, uabdl2) \
+ LO_HI_PAIR_V_WI (BINOP, intrinsic_vec_smult_lo_, vec_widen_smult_hi_) \
+ LO_HI_PAIR_V_WI (BINOPU, intrinsic_vec_umult_lo_, vec_widen_umult_hi_) \
+ LO_HI_PAIR_V_HSI (BINOP, sqdmull, sqdmull2) \
+ LO_HI_PAIR (BINOPP_pmullv8qi, BINOPP_pmull_hiv16qi)
+
+#define BINOP_LONG_N_LH_PAIRS \
+ LO_HI_PAIR_V_HSI (BINOP, smull_n, smull_hi_n) \
+ LO_HI_PAIR_V_HSI (BINOPU, umull_n, umull_hi_n) \
+  LO_HI_PAIR_V_HSI (BINOP, sqdmull_n, sqdmull2_n)
+
+#define BINOP_WIDE_LH_PAIRS \
+ LO_HI_PAIR_V_WI (BINOP, ssubw, ssubw2) \
+ LO_HI_PAIR_V_WI (BINOPU, usubw, usubw2) \
+ LO_HI_PAIR_V_WI (BINOP, saddw, saddw2) \
+ LO_HI_PAIR_V_WI (BINOPU, uaddw, uaddw2)
+
+#define TERNOP_LONG_LH_PAIRS \
+ LO_HI_PAIR_V_WI (TERNOP, smlal, smlal_hi) \
+ LO_HI_PAIR_V_WI (TERNOPU, umlal, umlal_hi) \
+ LO_HI_PAIR_V_WI (TERNOP, smlsl, smlsl_hi) \
+ LO_HI_PAIR_V_WI (TERNOPU, umlsl, umlsl_hi) \
+ LO_HI_PAIR_V_WI (TERNOP, sabal, sabal2) \
+ LO_HI_PAIR_V_WI (TERNOPU, uabal, uabal2) \
+ LO_HI_PAIR_V_HSI (TERNOP, sqdmlal, sqdmlal2) \
+ LO_HI_PAIR_V_HSI (TERNOP, sqdmlsl, sqdmlsl2)
+
+#define TERNOP_LONG_N_LH_PAIRS \
+ LO_HI_PAIR_V_HSI (TERNOP, smlal_n, smlal_hi_n) \
+ LO_HI_PAIR_V_HSI (TERNOPU, umlal_n, umlal_hi_n) \
+ LO_HI_PAIR_V_HSI (TERNOP, smlsl_n, smlsl_hi_n) \
+ LO_HI_PAIR_V_HSI (TERNOPU, umlsl_n, umlsl_hi_n) \
+ LO_HI_PAIR_V_HSI (TERNOP, sqdmlal_n, sqdmlal2_n) \
+ LO_HI_PAIR_V_HSI (TERNOP, sqdmlsl_n, sqdmlsl2_n)
diff --git a/gcc/config/aarch64/aarch64-builtins.cc
b/gcc/config/aarch64/aarch64-builtins.cc
index 128cc365d3d..6cffbdb79a9 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -48,6 +48,8 @@
#include "attribs.h"
#include "gimple-fold.h"
#include "builtins.h"
+#include "tree-pass.h"
+#include "tree-vector-builder.h"
#include "aarch64-builtins.h"
using namespace aarch64;
@@ -737,6 +739,16 @@ static aarch64_simd_builtin_datum
aarch64_simd_builtin_data[] = {
VGET_HIGH_BUILTIN(u64) \
VGET_HIGH_BUILTIN(bf16)
+#include "aarch64-builtin-pairs.def"
+
+#define LO_HI_PAIRINGS \
+ UNOP_LONG_LH_PAIRS \
+ BINOP_WIDE_LH_PAIRS \
+ BINOP_LONG_LH_PAIRS \
+ BINOP_LONG_N_LH_PAIRS \
+ TERNOP_LONG_LH_PAIRS \
+  TERNOP_LONG_N_LH_PAIRS
+
typedef struct
{
const char *name;
@@ -4982,6 +4994,196 @@ aarch64_gimple_fold_pragma_builtin
}
}
+/* Return the fndecl of the builtin paired with FCODE_LO if one
+ exists (see aarch64-builtin-pairs.def), or NULL_TREE if not. */
+static inline tree
+aarch64_get_highpart_builtin (unsigned int fcode_lo)
+{
+#undef LO_HI_PAIR
+#define LO_HI_PAIR(A, B) case AARCH64_SIMD_BUILTIN_##A: \
+ return aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_##B];
+
+ switch (fcode_lo)
+ {
+ LO_HI_PAIRINGS
+ default:
+ return NULL_TREE;
+ }
+}
+
+/* If the SSA_NAME_DEF_STMT of ARG is an assignment to a
+ BIT_FIELD_REF with SIZE and OFFSET, return the object of the
+ BIT_FIELD_REF. Otherwise, return NULL_TREE. */
+static tree
+aarch64_object_of_bfr (const_tree arg, unsigned HOST_WIDE_INT size,
+ unsigned HOST_WIDE_INT offset)
+{
+ if (TREE_CODE (arg) != SSA_NAME)
+ return NULL_TREE;
+
+ gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (arg));
+
+ if (!stmt)
+ return NULL_TREE;
+
+ if (gimple_assign_rhs_code (stmt) != BIT_FIELD_REF)
+ return NULL_TREE;
+
+ tree bf_ref = gimple_assign_rhs1 (stmt);
+
+ if (bit_field_size (bf_ref).to_constant () != size
+ || bit_field_offset (bf_ref).to_constant () != offset)
+ return NULL_TREE;
+
+ return TREE_OPERAND (bf_ref, 0);
+}
+
+/* Build and return a new VECTOR_CST of type OUT_TY using the
+ elements of VEC_IN. */
+static tree
+aarch64_duplicate_vector_cst (const_tree vec_in, tree out_ty)
+{
+ gcc_assert (TREE_CODE (vec_in) == VECTOR_CST
+ && VECTOR_TYPE_P (out_ty));
+ unsigned HOST_WIDE_INT nelts
+ = VECTOR_CST_NELTS (vec_in).to_constant ();
+
+ tree_vector_builder vec_out (out_ty, nelts, 1);
+ for (unsigned i = 0; i < nelts; i++)
+ vec_out.quick_push (VECTOR_CST_ELT (vec_in, i));
+
+ return vec_out.build ();
+}
+
+/* Return true if TYPE denotes a vector type with a known
+ and constant size in bits N. Return false otherwise. */
+static inline bool
+aarch64_nbit_vector_type_p (const_tree type,
+ unsigned HOST_WIDE_INT n)
+{
+ if (!VECTOR_TYPE_P (type))
+ return false;
+
+ return (tree_fits_uhwi_p (TYPE_SIZE (type))
+ && wi::to_widest (TYPE_SIZE (type)) == n);
+}
+
+/* Helper for aarch64_fold_lo_call_to_hi; if ARG is a reference to the
+ upper half of a 128b vector then return the 128b vector. Otherwise,
+ return NULL_TREE. */
+static tree
+aarch64_vq_high_half (const_tree arg)
+{
+ unsigned int offset = BYTES_BIG_ENDIAN ? 0 : 64;
+ tree base = aarch64_object_of_bfr (arg, 64, offset);
+
+ if (!base || !aarch64_nbit_vector_type_p (TREE_TYPE (base), 128))
+ return NULL_TREE;
+
+ return base;
+}
+
+/* Fold a builtin call to its hi equivalent if the arguments
+ are better suited to it.
+
+ Return the new call if so, otherwise nullptr. */
+static gcall *
+aarch64_fold_lo_call_to_hi (unsigned int fcode, gcall *stmt,
+ gimple_stmt_iterator *gsi)
+{
+ /* Punt until as late as possible:
+ 1) By folding away BIT_FIELD_REFs we remove information about the
+ operands that may be useful to other optimizers.
+ 2) For simplicity, we'd like the expression
+
+ x = BIT_FIELD_REF<a, 64, 64>
+
+ to imply that A is not a VECTOR_CST. This assumption is unlikely
+ to hold before constant propagation/folding. */
+ if (!(cfun->curr_properties & PROP_last_full_fold))
+ return nullptr;
+
+ tree vectype_hi = NULL_TREE;
+ tree builtin_hi = aarch64_get_highpart_builtin (fcode);
+ gcc_assert (builtin_hi != NULL_TREE);
+
+ /* Prefer to use the highpart builtin when at least one vector
+ argument is a reference to the upper half of a 128b vector, and
+ all others are VECTOR_CSTs. */
+ auto_vec<unsigned int, 2> vec_constants;
+ auto_vec<unsigned int, 2> vec_highparts;
+ auto_vec<tree, 4> new_args;
+
+ /* The interesting args are those that differ between the lo/hi
+ builtins. Walk the function signatures to find these. */
+ tree types_hi = TYPE_ARG_TYPES (TREE_TYPE (builtin_hi));
+ tree types_lo = TYPE_ARG_TYPES (gimple_call_fntype (stmt));
+ unsigned int argno = 0;
+ while (types_lo != void_list_node && types_hi != void_list_node)
+ {
+ tree type_lo = TREE_VALUE (types_lo);
+ tree type_hi = TREE_VALUE (types_hi);
+ tree curr_arg = gimple_call_arg (stmt, argno);
+ if (!types_compatible_p (type_lo, type_hi))
+ {
+ /* Check our assumptions about this pair. */
+ gcc_assert (aarch64_nbit_vector_type_p (type_lo, 64));
+ if (!vectype_hi)
+ {
+ gcc_assert (aarch64_nbit_vector_type_p (type_hi, 128));
+ vectype_hi = type_hi;
+ }
+ else
+ gcc_assert (type_hi == vectype_hi);
+
+ if (tree vq = aarch64_vq_high_half (curr_arg))
+ {
+ curr_arg = vq;
+ vec_highparts.safe_push (argno);
+ }
+ else if (TREE_CODE (curr_arg) == VECTOR_CST)
+ vec_constants.safe_push (argno);
+ else
+ return nullptr;
+ }
+ new_args.safe_push (curr_arg);
+ argno++;
+ types_hi = TREE_CHAIN (types_hi);
+ types_lo = TREE_CHAIN (types_lo);
+ }
+ gcc_assert (types_lo == void_list_node
+ && types_hi == void_list_node);
+ if (vec_highparts.is_empty ())
+ return nullptr;
+
+ /* Build a valid call to BUILTIN_HI. */
+ for (auto i : vec_constants)
+ new_args[i] = aarch64_duplicate_vector_cst (new_args[i],
+ vectype_hi);
+ for (auto i : vec_highparts)
+ {
+ if (!types_compatible_p (TREE_TYPE (new_args[i]), vectype_hi))
+ {
+ /* Reinterpret this vector to VECTYPE_HI. */
+ tree vce_ssa = make_ssa_name (vectype_hi);
+ tree vce_expr = build1 (VIEW_CONVERT_EXPR, vectype_hi,
+ new_args[i]);
+ gsi_insert_before (gsi,
+ gimple_build_assign (vce_ssa, vce_expr),
+ GSI_SAME_STMT);
+ new_args[i] = vce_ssa;
+ }
+ }
+
+ gcall *new_call
+ = gimple_build_call_vec (builtin_hi, new_args);
+ gimple_call_set_lhs (new_call, gimple_call_lhs (stmt));
+ return new_call;
+}
+
+#undef LO_HI_PAIR
+#define LO_HI_PAIR(A, B) case AARCH64_SIMD_BUILTIN_##A:
+
/* Try to fold STMT, given that it's a call to the built-in function with
subcode FCODE. Return the new statement on success and null on
failure. */
@@ -5168,6 +5370,10 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode,
gcall *stmt,
}
break;
}
+ break;
+ LO_HI_PAIRINGS
+ new_stmt = aarch64_fold_lo_call_to_hi (fcode, stmt, gsi);
+ break;
case AARCH64_SIMD_BUILTIN_LANE_CHECK:
if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2]))
{
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c
b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c
new file mode 100644
index 00000000000..f6dc4e52362
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c
@@ -0,0 +1,733 @@
+/* { dg-do compile } */
+/* { dg-options "-O -march=armv9-a+bf16" } */
+
+#include <arm_neon.h>
+
+/* Prefer the highpart variant of a builtin when its arguments
+ are vector highparts. */
+
+#ifndef TEST_UN_HIGHPARTS
+#define TEST_UN_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_ ## FN ## _ ## SUFF (INTYPE a) \
+ { \
+ return FN ## _ ## SUFF (vget_high_ ## SUFF (a)); \
+ }
+#endif
+
+#ifndef TEST_BIN_W_HIGHPARTS
+#define TEST_BIN_W_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b) \
+ { \
+ return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b)); \
+ }
+#endif
+
+#ifndef TEST_BIN_N_HIGHPARTS
+#define TEST_BIN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_ ## FN ## _ ## SUFF (INTYPE a) \
+ { \
+ return FN ## _ ## SUFF (vget_high_ ## SUFF (a), a[1]); \
+ }
+#endif
+
+#ifndef TEST_TERN_N_HIGHPARTS
+#define TEST_TERN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b) \
+ { \
+ return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b), b[1]); \
+ }
+#endif
+
+#ifndef TEST_BIN_HIGHPARTS
+#define TEST_BIN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+ RETTYPE test_ ## FN ## _ ## SUFF (INTYPE a, INTYPE b) \
+ { \
+ return FN ## _ ## SUFF (vget_high_ ## SUFF (a), \
+ vget_high_ ## SUFF (b)); \
+ }
+#endif
+
+#ifndef TEST_TERN_HIGHPARTS
+#define TEST_TERN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+ RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b, INTYPE c) \
+ { \
+ return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b), \
+ vget_high_ ## SUFF (c)); \
+ }
+#endif
+
+#define TEST_UN_VQW(FN) \
+ TEST_UN_HIGHPARTS (FN, int16x8_t, int8x16_t, s8) \
+ TEST_UN_HIGHPARTS (FN, uint16x8_t, uint8x16_t, u8) \
+ TEST_UN_HIGHPARTS (FN, int32x4_t, int16x8_t, s16) \
+ TEST_UN_HIGHPARTS (FN, uint32x4_t, uint16x8_t, u16) \
+ TEST_UN_HIGHPARTS (FN, int64x2_t, int32x4_t, s32) \
+ TEST_UN_HIGHPARTS (FN, uint64x2_t, uint32x4_t, u32)
+
+#define TEST_BIN_VQW(FN) \
+ TEST_BIN_HIGHPARTS (FN, int16x8_t, int8x16_t, int8x8_t, s8) \
+ TEST_BIN_HIGHPARTS (FN, uint16x8_t, uint8x16_t, uint8x8_t, u8) \
+ TEST_BIN_HIGHPARTS (FN, int32x4_t, int16x8_t, int16x4_t, s16) \
+ TEST_BIN_HIGHPARTS (FN, uint32x4_t, uint16x8_t, uint16x4_t, u16) \
+ TEST_BIN_HIGHPARTS (FN, int64x2_t, int32x4_t, int32x2_t, s32) \
+ TEST_BIN_HIGHPARTS (FN, uint64x2_t, uint32x4_t, uint32x2_t, u32)
+
+#define TEST_BIN_N_VQW(FN) \
+ TEST_BIN_N_HIGHPARTS (FN, int32x4_t, int16x8_t, s16) \
+ TEST_BIN_N_HIGHPARTS (FN, uint32x4_t, uint16x8_t, u16) \
+ TEST_BIN_N_HIGHPARTS (FN, int64x2_t, int32x4_t, s32) \
+ TEST_BIN_N_HIGHPARTS (FN, uint64x2_t, uint32x4_t, u32)
+
+#define TEST_BIN_W_VQW(FN) \
+ TEST_BIN_W_HIGHPARTS (FN, int16x8_t, int8x16_t, s8) \
+ TEST_BIN_W_HIGHPARTS (FN, uint16x8_t, uint8x16_t, u8) \
+ TEST_BIN_W_HIGHPARTS (FN, int32x4_t, int16x8_t, s16) \
+ TEST_BIN_W_HIGHPARTS (FN, uint32x4_t, uint16x8_t, u16) \
+ TEST_BIN_W_HIGHPARTS (FN, int64x2_t, int32x4_t, s32) \
+ TEST_BIN_W_HIGHPARTS (FN, uint64x2_t, uint32x4_t, u32)
+
+#define TEST_TERN_N_VQW(FN) \
+ TEST_TERN_N_HIGHPARTS (FN, int32x4_t, int16x8_t, s16) \
+ TEST_TERN_N_HIGHPARTS (FN, uint32x4_t, uint16x8_t, u16) \
+ TEST_TERN_N_HIGHPARTS (FN, int64x2_t, int32x4_t, s32) \
+ TEST_TERN_N_HIGHPARTS (FN, uint64x2_t, uint32x4_t, u32)
+
+#define TEST_TERN_VQW(FN) \
+ TEST_TERN_HIGHPARTS (FN, int16x8_t, int8x16_t, int8x8_t, s8) \
+ TEST_TERN_HIGHPARTS (FN, uint16x8_t, uint8x16_t, uint8x8_t, u8) \
+ TEST_TERN_HIGHPARTS (FN, int32x4_t, int16x8_t, int16x4_t, s16) \
+ TEST_TERN_HIGHPARTS (FN, uint32x4_t, uint16x8_t, uint16x4_t, u16) \
+ TEST_TERN_HIGHPARTS (FN, int64x2_t, int32x4_t, int32x2_t, s32) \
+ TEST_TERN_HIGHPARTS (FN, uint64x2_t, uint32x4_t, uint32x2_t, u32)
+
+#define TEST_VQDMULL \
+ TEST_BIN_HIGHPARTS (vqdmull, int32x4_t, int16x8_t, int16x4_t, s16) \
+ TEST_BIN_HIGHPARTS (vqdmull, int64x2_t, int32x4_t, int32x2_t, s32)
+
+#define TEST_VQDMULL_N \
+ TEST_BIN_N_HIGHPARTS (vqdmull_n, int32x4_t, int16x8_t, s16) \
+ TEST_BIN_N_HIGHPARTS (vqdmull_n, int64x2_t, int32x4_t, s32)
+
+#define TEST_VQMLAL \
+ TEST_TERN_HIGHPARTS (vqdmlal, int32x4_t, int16x8_t, int16x4_t, s16) \
+ TEST_TERN_HIGHPARTS (vqdmlal, int64x2_t, int32x4_t, int32x2_t, s32)
+
+#define TEST_VQMLAL_N \
+ TEST_TERN_N_HIGHPARTS (vqdmlal_n, int32x4_t, int16x8_t, s16) \
+ TEST_TERN_N_HIGHPARTS (vqdmlal_n, int64x2_t, int32x4_t, s32)
+
+#define TEST_VQMLSL \
+ TEST_TERN_HIGHPARTS (vqdmlsl, int32x4_t, int16x8_t, int16x4_t, s16) \
+ TEST_TERN_HIGHPARTS (vqdmlsl, int64x2_t, int32x4_t, int32x2_t, s32)
+
+#define TEST_VQMLSL_N \
+ TEST_TERN_N_HIGHPARTS (vqdmlsl_n, int32x4_t, int16x8_t, s16) \
+ TEST_TERN_N_HIGHPARTS (vqdmlsl_n, int64x2_t, int32x4_t, s32)
+
+#define TEST_VMOVL \
+ TEST_UN_VQW (vmovl)
+
+#define TEST_VCVT \
+ TEST_UN_HIGHPARTS (vcvt_f32, float32x4_t, float16x8_t, f16) \
+ TEST_UN_HIGHPARTS (vcvt_f32, float32x4_t, bfloat16x8_t, bf16) \
+ TEST_UN_HIGHPARTS (vcvt_f64, float64x2_t, float32x4_t, f32)
+
+#define TEST_VMULL \
+ TEST_BIN_VQW (vmull)
+
+#define TEST_VMULL_N \
+ TEST_BIN_N_VQW (vmull_n)
+
+#define TEST_VADDL \
+ TEST_BIN_VQW (vaddl)
+
+#define TEST_VSUBL \
+ TEST_BIN_VQW (vsubl)
+
+#define TEST_VMLAL \
+ TEST_TERN_VQW (vmlal)
+
+#define TEST_VMLAL_N \
+ TEST_TERN_N_VQW (vmlal_n)
+
+#define TEST_VMLSL \
+ TEST_TERN_VQW (vmlsl)
+
+#define TEST_VMLSL_N \
+ TEST_TERN_N_VQW (vmlsl_n)
+
+#define TEST_VABDL \
+ TEST_BIN_VQW (vabdl)
+
+#define TEST_VABAL \
+ TEST_TERN_VQW (vabal)
+
+#define TEST_VSUBW \
+ TEST_BIN_W_VQW (vsubw)
+
+#define TEST_VADDW \
+ TEST_BIN_W_VQW (vaddw)
+
+/*
+** test_vmovl_s8:
+** sxtl2 v0\.8h, v0\.16b
+** ret
+*/
+
+/*
+** test_vmovl_u8:
+** uxtl2 v0\.8h, v0\.16b
+** ret
+*/
+
+/*
+** test_vmovl_s16:
+** sxtl2 v0\.4s, v0\.8h
+** ret
+*/
+
+/*
+** test_vmovl_u16:
+** uxtl2 v0\.4s, v0\.8h
+** ret
+*/
+
+/*
+** test_vmovl_s32:
+** sxtl2 v0\.2d, v0\.4s
+** ret
+*/
+
+/*
+** test_vmovl_u32:
+** uxtl2 v0\.2d, v0\.4s
+** ret
+*/
+
+TEST_VMOVL
+
+/*
+** test_vcvt_f32_f16:
+** fcvtl2 v0\.4s, v0\.8h
+** ret
+*/
+
+/*
+** test_vcvt_f32_bf16:
+** shll2 v0\.4s, v0\.8h, #16
+** ret
+*/
+
+/*
+** test_vcvt_f64_f32:
+** fcvtl2 v0\.2d, v0\.4s
+** ret
+*/
+
+TEST_VCVT
+
+/*
+** test_vmull_s8:
+** smull2 v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b)
+** ret
+*/
+
+/*
+** test_vmull_u8:
+** umull2 v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b)
+** ret
+*/
+
+/*
+** test_vmull_s16:
+** smull2 v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h)
+** ret
+*/
+
+/*
+** test_vmull_u16:
+** umull2 v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h)
+** ret
+*/
+
+/*
+** test_vmull_s32:
+** smull2 v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s)
+** ret
+*/
+
+/*
+** test_vmull_u32:
+** umull2 v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s)
+** ret
+*/
+
+TEST_VMULL
+
+/*
+** test_vmull_n_s16:
+** smull2 v0\.4s, v0\.8h, v0\.h\[[0-7]\]
+** ret
+*/
+
+/*
+** test_vmull_n_u16:
+** umull2 v0\.4s, v0\.8h, v0\.h\[[0-7]\]
+** ret
+*/
+
+/*
+** test_vmull_n_s32:
+** smull2 v0\.2d, v0\.4s, v0\.s\[[0-3]\]
+** ret
+*/
+
+/*
+** test_vmull_n_u32:
+** umull2 v0\.2d, v0\.4s, v0\.s\[[0-3]\]
+** ret
+*/
+
+TEST_VMULL_N
+
+/*
+** test_vaddl_s8:
+** saddl2 v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b)
+** ret
+*/
+
+/*
+** test_vaddl_u8:
+** uaddl2 v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b)
+** ret
+*/
+
+/*
+** test_vaddl_s16:
+** saddl2 v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h)
+** ret
+*/
+
+/*
+** test_vaddl_u16:
+** uaddl2 v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h)
+** ret
+*/
+
+/*
+** test_vaddl_s32:
+** saddl2 v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s)
+** ret
+*/
+
+/*
+** test_vaddl_u32:
+** uaddl2 v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s)
+** ret
+*/
+
+TEST_VADDL
+
+/*
+** test_vsubl_s8:
+** ssubl2 v0\.8h, v0\.16b, v1\.16b
+** ret
+*/
+
+/*
+** test_vsubl_u8:
+** usubl2 v0\.8h, v0\.16b, v1\.16b
+** ret
+*/
+
+/*
+** test_vsubl_s16:
+** ssubl2 v0\.4s, v0\.8h, v1\.8h
+** ret
+*/
+
+/*
+** test_vsubl_u16:
+** usubl2 v0\.4s, v0\.8h, v1\.8h
+** ret
+*/
+
+/*
+** test_vsubl_s32:
+** ssubl2 v0\.2d, v0\.4s, v1\.4s
+** ret
+*/
+
+/*
+** test_vsubl_u32:
+** usubl2 v0\.2d, v0\.4s, v1\.4s
+** ret
+*/
+
+TEST_VSUBL
+
+/*
+** test_vabal_s8:
+** sabal2 v0\.8h, (v1\.16b, v2\.16b|v2\.16b, v1\.16b)
+** ret
+*/
+
+/*
+** test_vabal_u8:
+** uabal2 v0\.8h, (v1\.16b, v2\.16b|v2\.16b, v1\.16b)
+** ret
+*/
+
+/*
+** test_vabal_s16:
+** sabal2 v0\.4s, (v1\.8h, v2\.8h|v2\.8h, v1\.8h)
+** ret
+*/
+
+/*
+** test_vabal_u16:
+** uabal2 v0\.4s, (v1\.8h, v2\.8h|v2\.8h, v1\.8h)
+** ret
+*/
+
+/*
+** test_vabal_s32:
+** sabal2 v0\.2d, (v1\.4s, v2\.4s|v2\.4s, v1\.4s)
+** ret
+*/
+
+/*
+** test_vabal_u32:
+** uabal2 v0\.2d, (v1\.4s, v2\.4s|v2\.4s, v1\.4s)
+** ret
+*/
+
+TEST_VABAL
+
+/*
+** test_vsubw_s8:
+** ssubw2 v0\.8h, v0\.8h, v1\.16b
+** ret
+*/
+
+/*
+** test_vsubw_u8:
+** usubw2 v0\.8h, v0\.8h, v1\.16b
+** ret
+*/
+
+/*
+** test_vsubw_s16:
+** ssubw2 v0\.4s, v0\.4s, v1\.8h
+** ret
+*/
+
+/*
+** test_vsubw_u16:
+** usubw2 v0\.4s, v0\.4s, v1\.8h
+** ret
+*/
+
+/*
+** test_vsubw_s32:
+** ssubw2 v0\.2d, v0\.2d, v1\.4s
+** ret
+*/
+
+/*
+** test_vsubw_u32:
+** usubw2 v0\.2d, v0\.2d, v1\.4s
+** ret
+*/
+
+TEST_VSUBW
+
+/*
+** test_vaddw_s8:
+** saddw2 v0\.8h, v0\.8h, v1\.16b
+** ret
+*/
+
+/*
+** test_vaddw_u8:
+** uaddw2 v0\.8h, v0\.8h, v1\.16b
+** ret
+*/
+
+/*
+** test_vaddw_s16:
+** saddw2 v0\.4s, v0\.4s, v1\.8h
+** ret
+*/
+
+/*
+** test_vaddw_u16:
+** uaddw2 v0\.4s, v0\.4s, v1\.8h
+** ret
+*/
+
+/*
+** test_vaddw_s32:
+** saddw2 v0\.2d, v0\.2d, v1\.4s
+** ret
+*/
+
+/*
+** test_vaddw_u32:
+** uaddw2 v0\.2d, v0\.2d, v1\.4s
+** ret
+*/
+
+TEST_VADDW
+
+/*
+** test_vabdl_s8:
+** sabdl2 v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b)
+** ret
+*/
+
+/*
+** test_vabdl_u8:
+** uabdl2 v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b)
+** ret
+*/
+
+/*
+** test_vabdl_s16:
+** sabdl2 v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h)
+** ret
+*/
+
+/*
+** test_vabdl_u16:
+** uabdl2 v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h)
+** ret
+*/
+
+/*
+** test_vabdl_s32:
+** sabdl2 v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s)
+** ret
+*/
+
+/*
+** test_vabdl_u32:
+** uabdl2 v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s)
+** ret
+*/
+
+TEST_VABDL
+
+/*
+** test_vmlal_s8:
+** smlal2 v0\.8h, (v1\.16b, v2\.16b|v2\.16b, v1\.16b)
+** ret
+*/
+
+/*
+** test_vmlal_u8:
+** umlal2 v0\.8h, (v1\.16b, v2\.16b|v2\.16b, v1\.16b)
+** ret
+*/
+
+/*
+** test_vmlal_s16:
+** smlal2 v0\.4s, (v1\.8h, v2\.8h|v2\.8h, v1\.8h)
+** ret
+*/
+
+/*
+** test_vmlal_u16:
+** umlal2 v0\.4s, (v1\.8h, v2\.8h|v2\.8h, v1\.8h)
+** ret
+*/
+
+/*
+** test_vmlal_s32:
+** smlal2 v0\.2d, (v1\.4s, v2\.4s|v2\.4s, v1\.4s)
+** ret
+*/
+
+/*
+** test_vmlal_u32:
+** umlal2 v0\.2d, (v1\.4s, v2\.4s|v2\.4s, v1\.4s)
+** ret
+*/
+
+TEST_VMLAL
+
+/*
+** test_vmlal_n_s16:
+** smlal2 v0\.4s, v1\.8h, v1\.h\[[0-7]\]
+** ret
+*/
+
+/*
+** test_vmlal_n_u16:
+** umlal2 v0\.4s, v1\.8h, v1\.h\[[0-7]\]
+** ret
+*/
+
+/*
+** test_vmlal_n_s32:
+** smlal2 v0\.2d, v1\.4s, v1\.s\[[0-3]\]
+** ret
+*/
+
+/*
+** test_vmlal_n_u32:
+** umlal2 v0\.2d, v1\.4s, v1\.s\[[0-3]\]
+** ret
+*/
+
+TEST_VMLAL_N
+
+/*
+** test_vmlsl_s8:
+** smlsl2 v0\.8h, v1\.16b, v2\.16b
+** ret
+*/
+
+/*
+** test_vmlsl_u8:
+** umlsl2 v0\.8h, v1\.16b, v2\.16b
+** ret
+*/
+
+/*
+** test_vmlsl_s16:
+** smlsl2 v0\.4s, v1\.8h, v2\.8h
+** ret
+*/
+
+/*
+** test_vmlsl_u16:
+** umlsl2 v0\.4s, v1\.8h, v2\.8h
+** ret
+*/
+
+/*
+** test_vmlsl_s32:
+** smlsl2 v0\.2d, v1\.4s, v2\.4s
+** ret
+*/
+
+/*
+** test_vmlsl_u32:
+** umlsl2 v0\.2d, v1\.4s, v2\.4s
+** ret
+*/
+
+TEST_VMLSL
+
+/*
+** test_vmlsl_n_s16:
+** smlsl2 v0\.4s, v1\.8h, v1\.h\[[0-7]\]
+** ret
+*/
+
+/*
+** test_vmlsl_n_u16:
+** umlsl2 v0\.4s, v1\.8h, v1\.h\[[0-7]\]
+** ret
+*/
+
+/*
+** test_vmlsl_n_s32:
+** smlsl2 v0\.2d, v1\.4s, v1\.s\[[0-3]\]
+** ret
+*/
+
+/*
+** test_vmlsl_n_u32:
+** umlsl2 v0\.2d, v1\.4s, v1\.s\[[0-3]\]
+** ret
+*/
+
+TEST_VMLSL_N
+
+/*
+** test_vqdmull_s16:
+** sqdmull2 v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h)
+** ret
+*/
+
+/*
+** test_vqdmull_s32:
+** sqdmull2 v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s)
+** ret
+*/
+
+TEST_VQDMULL
+
+/*
+** test_vqdmull_n_s16:
+** sqdmull2 v0\.4s, v0\.8h, v0\.h\[[0-7]\]
+** ret
+*/
+
+/*
+** test_vqdmull_n_s32:
+** sqdmull2 v0\.2d, v0\.4s, v0\.s\[[0-3]\]
+** ret
+*/
+
+TEST_VQDMULL_N
+
+/*
+** test_vqdmlal_s16:
+** sqdmlal2 v0\.4s, (v1\.8h, v2\.8h|v2\.8h, v1\.8h)
+** ret
+*/
+
+/*
+** test_vqdmlal_s32:
+** sqdmlal2 v0\.2d, (v1\.4s, v2\.4s|v2\.4s, v1\.4s)
+** ret
+*/
+
+TEST_VQMLAL
+
+/*
+** test_vqdmlal_n_s16:
+** sqdmlal2 v0\.4s, v1\.8h, v1\.h\[[0-7]\]
+** ret
+*/
+
+/*
+** test_vqdmlal_n_s32:
+** sqdmlal2 v0\.2d, v1\.4s, v1\.s\[[0-3]\]
+** ret
+*/
+
+TEST_VQMLAL_N
+
+/*
+** test_vqdmlsl_s16:
+** sqdmlsl2 v0\.4s, v1\.8h, v2\.8h
+** ret
+*/
+
+/*
+** test_vqdmlsl_s32:
+** sqdmlsl2 v0\.2d, v1\.4s, v2\.4s
+** ret
+*/
+
+TEST_VQMLSL
+
+/*
+** test_vqdmlsl_n_s16:
+** sqdmlsl2 v0\.4s, v1\.8h, v1\.h\[[0-7]\]
+** ret
+*/
+
+/*
+** test_vqdmlsl_n_s32:
+** sqdmlsl2 v0\.2d, v1\.4s, v1\.s\[[0-3]\]
+** ret
+*/
+
+TEST_VQMLSL_N
+
+/* { dg-final { check-function-bodies "**" "" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c
b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c
new file mode 100644
index 00000000000..2dd3eb3268c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c
@@ -0,0 +1,86 @@
+/* { dg-do compile } */
+/* { dg-options "-O -march=armv9-a+bf16" } */
+
+/* Don't fold to the hi builtin unless at least one argument is a true
+ highpart (not that of a VECTOR_CST). */
+
+#define TEST_UN_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_ ## FN ## _ ## SUFF () \
+ { \
+ INTYPE a = vdupq_n_ ## SUFF (0x1A); \
+ return FN ## _ ## SUFF (vget_high_ ## SUFF (a)); \
+ }
+
+#define TEST_BIN_W_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a) \
+ { \
+ INTYPE b = vdupq_n_ ## SUFF (0x1A); \
+ return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b)); \
+ }
+
+#define TEST_BIN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_ ## FN ## _ ## SUFF (INTYPE c) \
+ { \
+ INTYPE a = vdupq_n_ ## SUFF (0x1A); \
+ return FN ## _ ## SUFF (vget_high_ ## SUFF (a), c[1]); \
+ } \
+
+#define TEST_TERN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a) \
+ { \
+ INTYPE b = vdupq_n_ ## SUFF (0x1A); \
+ return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b), b[1]); \
+ } \
+
+#define TEST_BIN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+ RETTYPE test_ ## FN ## _ ## SUFF (H_INTYPE b) \
+ { \
+ INTYPE a = vdupq_n_ ## SUFF (0x1A); \
+ return FN ## _ ## SUFF (vget_high_ ## SUFF (a), b); \
+ } \
+
+#define TEST_TERN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+ RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a, H_INTYPE b) \
+ { \
+ INTYPE c = vdupq_n_ ## SUFF (0x1A); \
+ return FN ## _ ## SUFF (a, vget_high_ ## SUFF (c), b); \
+ } \
+
+#include "fold_to_highpart_1.c"
+
+
+/* { dg-final { scan-assembler-not {uxtl2\t} } } */
+/* { dg-final { scan-assembler-not {sxtl2\t} } } */
+
+/* { dg-final { scan-assembler-not {fcvtl2\t} } } */
+/* { dg-final { scan-assembler-not {shll2\t} } } */
+
+/* { dg-final { scan-assembler-not {umull2\t} } } */
+/* { dg-final { scan-assembler-not {smull2\t} } } */
+
+/* { dg-final { scan-assembler-not {uaddl2\t} } } */
+/* { dg-final { scan-assembler-not {saddl2\t} } } */
+
+/* { dg-final { scan-assembler-not {usubl2\t} } } */
+/* { dg-final { scan-assembler-not {ssubl2\t} } } */
+
+/* { dg-final { scan-assembler-not {uabdl2\t} } } */
+/* { dg-final { scan-assembler-not {sabdl2\t} } } */
+
+/* { dg-final { scan-assembler-not {usubw2\t} } } */
+/* { dg-final { scan-assembler-not {ssubw2\t} } } */
+
+/* { dg-final { scan-assembler-not {uaddw2\t} } } */
+/* { dg-final { scan-assembler-not {saddw2\t} } } */
+
+/* { dg-final { scan-assembler-not {umlal2\t} } } */
+/* { dg-final { scan-assembler-not {smlal2\t} } } */
+
+/* { dg-final { scan-assembler-not {umlsl2\t} } } */
+/* { dg-final { scan-assembler-not {smlsl2\t} } } */
+
+/* { dg-final { scan-assembler-not {sqdmull2\t} } } */
+
+/* { dg-final { scan-assembler-not {sqdmlal2\t} } } */
+
+/* { dg-final { scan-assembler-not {sqdmlsl2\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c
b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c
new file mode 100644
index 00000000000..07c79ca1608
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c
@@ -0,0 +1,81 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+/* PR117850 */
+
+/* For builtins with multiple lo arguments, prefer the hi builtin if
+ at least one is a true highpart and all others are VECTOR_CSTs. */
+
+#define TEST_UN_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+#define TEST_BIN_W_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+#define TEST_BIN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+#define TEST_TERN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+
+#define TEST_BIN_HIGHPART_A1(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_a1_ ## FN ## _ ## SUFF (INTYPE a) \
+ { \
+ INTYPE b = vdupq_n_ ## SUFF (0x1A); \
+ return FN ## _ ## SUFF (vget_high_ ## SUFF (a), \
+ vget_high_ ## SUFF (b)); \
+ }
+
+#define TEST_BIN_HIGHPART_A2(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_a2_ ## FN ## _ ## SUFF (INTYPE a) \
+ { \
+ INTYPE b = vdupq_n_ ## SUFF (0x1A); \
+ return FN ## _ ## SUFF (vget_high_ ## SUFF (b), \
+ vget_high_ ## SUFF (a)); \
+ }
+
+#define TEST_TERN_HIGHPART_A1(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_a1_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b) \
+ { \
+ INTYPE c = vdupq_n_ ## SUFF (0x1A); \
+ return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b), \
+ vget_high_ ## SUFF (c)); \
+ }
+
+#define TEST_TERN_HIGHPART_A2(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_a2_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b) \
+ { \
+ INTYPE c = vdupq_n_ ## SUFF (0x1A); \
+ return FN ## _ ## SUFF (a, vget_high_ ## SUFF (c), \
+ vget_high_ ## SUFF (b)); \
+ }
+
+#define TEST_BIN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+ TEST_BIN_HIGHPART_A1 (FN, RETTYPE, INTYPE, SUFF) \
+ TEST_BIN_HIGHPART_A2 (FN, RETTYPE, INTYPE, SUFF)
+
+#define TEST_TERN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+ TEST_TERN_HIGHPART_A1 (FN, RETTYPE, INTYPE, SUFF) \
+ TEST_TERN_HIGHPART_A2 (FN, RETTYPE, INTYPE, SUFF)
+
+
+#include "fold_to_highpart_1.c"
+
+/* { dg-final { scan-assembler-not {dup\t} } } */
+
+/* { dg-final { scan-assembler-times {smull2\t} 6} } */
+/* { dg-final { scan-assembler-times {umull2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {saddl2\t} 6} } */
+/* { dg-final { scan-assembler-times {uaddl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {ssubl2\t} 6} } */
+/* { dg-final { scan-assembler-times {usubl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {sabdl2\t} 6} } */
+/* { dg-final { scan-assembler-times {uabdl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {smlal2\t} 6} } */
+/* { dg-final { scan-assembler-times {umlal2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {smlsl2\t} 6} } */
+/* { dg-final { scan-assembler-times {umlsl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {sqdmull2\t} 4} } */
+
+/* { dg-final { scan-assembler-times {sqdmlal2\t} 4} } */
+
+/* { dg-final { scan-assembler-times {sqdmlsl2\t} 4} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c
b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c
new file mode 100644
index 00000000000..f77b2355fcf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c
@@ -0,0 +1,77 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+/* For builtins with multiple lo arguments, prefer the hi builtin if
+ at least one is a true highpart and all others are VECTOR_CSTs. */
+
+#define VEC_64b 0x1A2E4A4FFFED773E
+
+#define TEST_UN_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+#define TEST_BIN_W_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+#define TEST_BIN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+#define TEST_TERN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+
+#define TEST_BIN_HIGHPART_A1(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_a1_ ## FN ## _ ## SUFF (INTYPE a) \
+ { \
+ return FN ## _ ## SUFF (vget_high_ ## SUFF (a), \
+ vcreate_ ## SUFF (VEC_64b)); \
+ }
+
+#define TEST_BIN_HIGHPART_A2(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_a2_ ## FN ## _ ## SUFF (INTYPE a) \
+ { \
+ return FN ## _ ## SUFF (vcreate_ ## SUFF (VEC_64b), \
+ vget_high_ ## SUFF (a)); \
+ }
+
+#define TEST_TERN_HIGHPART_A1(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_a1_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b) \
+ { \
+ return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b), \
+ vcreate_ ## SUFF (VEC_64b)); \
+ }
+
+#define TEST_TERN_HIGHPART_A2(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_a2_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b) \
+ { \
+ return FN ## _ ## SUFF (a, vcreate_ ## SUFF (VEC_64b), \
+ vget_high_ ## SUFF (b)); \
+ }
+
+#define TEST_BIN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+ TEST_BIN_HIGHPART_A1 (FN, RETTYPE, INTYPE, SUFF) \
+ TEST_BIN_HIGHPART_A2 (FN, RETTYPE, INTYPE, SUFF)
+
+#define TEST_TERN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+ TEST_TERN_HIGHPART_A1 (FN, RETTYPE, INTYPE, SUFF) \
+ TEST_TERN_HIGHPART_A2 (FN, RETTYPE, INTYPE, SUFF)
+
+
+#include "fold_to_highpart_1.c"
+
+/* { dg-final { scan-assembler-not {dup\t} } } */
+
+/* { dg-final { scan-assembler-times {smull2\t} 6} } */
+/* { dg-final { scan-assembler-times {umull2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {saddl2\t} 6} } */
+/* { dg-final { scan-assembler-times {uaddl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {ssubl2\t} 6} } */
+/* { dg-final { scan-assembler-times {usubl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {sabdl2\t} 6} } */
+/* { dg-final { scan-assembler-times {uabdl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {smlal2\t} 6} } */
+/* { dg-final { scan-assembler-times {umlal2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {smlsl2\t} 6} } */
+/* { dg-final { scan-assembler-times {umlsl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {sqdmull2\t} 4} } */
+
+/* { dg-final { scan-assembler-times {sqdmlal2\t} 4} } */
+
+/* { dg-final { scan-assembler-times {sqdmlsl2\t} 4} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c
new file mode 100644
index 00000000000..046c7a00def
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-options "-O -fdump-tree-optimized" } */
+
+#include "arm_neon.h"
+
+#define VEC_CST_u8 0x0102030405060708
+#define VEC_CST_u16 0x0001000200030004
+#define VEC_CST_u32 0x0000000100000002
+
+/* Extend the 64b VECTOR_CST to the type required by the hi builtin. */
+
+uint16x8_t
+test_u8 (uint8x16_t a)
+{
+ const uint8x8_t b = vcreate_u8 (VEC_CST_u8);
+ return vmull_u8 (vget_high_u8 (a), b);
+}
+
+/* { dg-final { scan-tree-dump-times "\{ 8, 7, 6, 5, 4, 3, 2, 1, 8, 7, 6, 5, 4, 3, 2, 1 \}" 1 "optimized" } } */
+
+uint32x4_t
+test_u16 (uint16x8_t a)
+{
+ const uint16x4_t b = vcreate_u16 (VEC_CST_u16);
+ return vmull_u16 (vget_high_u16 (a), b);
+}
+
+/* { dg-final { scan-tree-dump-times "\{ 4, 3, 2, 1, 4, 3, 2, 1 \}" 1 "optimized" } } */
+
+uint64x2_t
+test_u32 (uint32x4_t a)
+{
+ const uint32x2_t b = vcreate_u32 (VEC_CST_u32);
+ return vmull_u32 (vget_high_u32 (a), b);
+}
+
+/* { dg-final { scan-tree-dump-times "\{ 2, 1, 2, 1 \}" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
new file mode 100644
index 00000000000..5d41cc4e5fd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
@@ -0,0 +1,94 @@
+/* { dg-do compile } */
+/* { dg-options "-O -march=armv9-a+bf16" } */
+
+/* Test that we can still fold when the base type of the vector whose
+ highpart we are referring to is incompatible with that of the hi builtin.
+
+ Use float64x2_t as it is never INTYPE. */
+
+#define TEST_UN_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_ ## FN ## _ ## SUFF (float64x2_t a) \
+ { \
+ INTYPE x = vreinterpretq_ ## SUFF ## _f64 (a); \
+ return FN ## _ ## SUFF (vget_high_ ## SUFF (x)); \
+ }
+
+#define TEST_BIN_W_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a, float64x2_t b) \
+ { \
+ INTYPE x = vreinterpretq_ ## SUFF ## _f64 (b); \
+ return FN ## _ ## SUFF (a, vget_high_ ## SUFF (x)); \
+ }
+
+#define TEST_BIN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_ ## FN ## _ ## SUFF (float64x2_t a) \
+ { \
+ INTYPE x = vreinterpretq_ ## SUFF ## _f64 (a); \
+ return FN ## _ ## SUFF (vget_high_ ## SUFF (x), x[1]); \
+ }
+
+#define TEST_TERN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+ RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a, float64x2_t b) \
+ { \
+ INTYPE x = vreinterpretq_ ## SUFF ## _f64 (b); \
+ return FN ## _ ## SUFF (a, vget_high_ ## SUFF (x), x[1]); \
+ }
+
+#define TEST_BIN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+ RETTYPE test_ ## FN ## _ ## SUFF (float64x2_t a, float64x2_t b) \
+ { \
+ INTYPE x = vreinterpretq_ ## SUFF ## _f64 (a); \
+ INTYPE y = vreinterpretq_ ## SUFF ## _f64 (b); \
+ return FN ## _ ## SUFF (vget_high_ ## SUFF (x), \
+ vget_high_ ## SUFF (y)); \
+ }
+
+#define TEST_TERN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+ RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a, float64x2_t b, float64x2_t c) \
+ { \
+ INTYPE x = vreinterpretq_ ## SUFF ## _f64 (b); \
+ INTYPE y = vreinterpretq_ ## SUFF ## _f64 (c); \
+ return FN ## _ ## SUFF (a, vget_high_ ## SUFF (x), \
+ vget_high_ ## SUFF (y)); \
+ }
+
+#include "fold_to_highpart_1.c"
+
+/* { dg-final { scan-assembler-times {sxtl2\t} 3} } */
+/* { dg-final { scan-assembler-times {uxtl2\t} 3} } */
+
+/* { dg-final { scan-assembler-times {fcvtl2\t} 2} } */
+/* { dg-final { scan-assembler-times {shll2\t} 1} } */
+
+/* { dg-final { scan-assembler-times {smull2\t} 5} } */
+/* { dg-final { scan-assembler-times {umull2\t} 5} } */
+
+/* { dg-final { scan-assembler-times {saddl2\t} 3} } */
+/* { dg-final { scan-assembler-times {uaddl2\t} 3} } */
+
+/* { dg-final { scan-assembler-times {ssubl2\t} 3} } */
+/* { dg-final { scan-assembler-times {usubl2\t} 3} } */
+
+/* { dg-final { scan-assembler-times {sabdl2\t} 3} } */
+/* { dg-final { scan-assembler-times {uabdl2\t} 3} } */
+
+/* { dg-final { scan-assembler-times {saddw2\t} 3} } */
+/* { dg-final { scan-assembler-times {uaddw2\t} 3} } */
+
+/* { dg-final { scan-assembler-times {ssubw2\t} 3} } */
+/* { dg-final { scan-assembler-times {usubw2\t} 3} } */
+
+/* { dg-final { scan-assembler-times {sabdl2\t} 3} } */
+/* { dg-final { scan-assembler-times {uabdl2\t} 3} } */
+
+/* { dg-final { scan-assembler-times {smlal2\t} 5} } */
+/* { dg-final { scan-assembler-times {umlal2\t} 5} } */
+
+/* { dg-final { scan-assembler-times {smlsl2\t} 5} } */
+/* { dg-final { scan-assembler-times {umlsl2\t} 5} } */
+
+/* { dg-final { scan-assembler-times {sqdmull2\t} 4} } */
+
+/* { dg-final { scan-assembler-times {sqdmlal2\t} 4} } */
+
+/* { dg-final { scan-assembler-times {sqdmlsl2\t} 4} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_7.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_7.c
new file mode 100644
index 00000000000..a8daa46ce76
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_7.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-options "-O2 -march=armv8-a+sve" } */
+
+#include <arm_neon_sve_bridge.h>
+
+typedef int32_t int32x8_t __attribute__ ((vector_size (32)));
+typedef int16_t int16x16_t __attribute__ ((vector_size (32)));
+
+/* Edge cases where we don't/can't fold; reject these gracefully.  */
+
+int16x8_t
+test_sizeless_type (svint8_t scalable)
+{
+ return vmovl_s8 (vget_high_s8 (svget_neonq_s8 (scalable)));
+}
+
+int16x8_t
+test_scalar_type (poly128_t foo)
+{
+ return vmovl_s8 (vget_high_s8 (vreinterpretq_s8_p128 (foo)));
+}
+
+int32x4_t
+test_256b_type_1 (int16x16_t foo)
+{
+ return vmovl_s16 ((int16x4_t) { foo[4], foo[5], foo[6], foo[7] });
+}
+
+int64x2_t
+test_256b_type_2 (int32x8_t foo)
+{
+ return vmovl_s32 (vget_high_s32 ((int32x4_t) {foo[0], foo[1], foo[2], foo[3]}));
+}
+
+/* { dg-final { scan-assembler-not {sxtl2\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c b/gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c
deleted file mode 100644
index c51878aa226..00000000000
--- a/gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/* { dg-do compile } */
-/* { dg-options "-O" } */
-/* { dg-final { check-function-bodies "**" "" "" } } */
-
-#include <arm_neon.h>
-
-/*
-** test_vabal_s8:
-** sabal2 v0.8h, v2.16b, v1.16b
-** ret
-*/
-int16x8_t
-test_vabal_s8 (int16x8_t sadv, int8x16_t pv, int8x16_t sv)
-{
- return vabal_s8 (sadv, vget_high_s8 (pv), vget_high_s8 (sv));
-}
-
-/*
-** test_vabal_u8:
-** uabal2 v0.8h, v2.16b, v1.16b
-** ret
-*/
-uint16x8_t
-test_vabal_u8 (uint16x8_t sadv, uint8x16_t pv, uint8x16_t sv)
-{
- return vabal_u8 (sadv, vget_high_u8 (pv), vget_high_u8 (sv));
-}
-
-/*
-** test_vabal_s16:
-** sabal2 v0.4s, v2.8h, v1.8h
-** ret
-*/
-int32x4_t
-test_vabal_s16 (int32x4_t sadv, int16x8_t pv, int16x8_t sv)
-{
- return vabal_s16 (sadv, vget_high_s16 (pv), vget_high_s16 (sv));
-}
-
-/*
-** test_vabal_u16:
-** uabal2 v0.4s, v2.8h, v1.8h
-** ret
-*/
-uint32x4_t
-test_vabal_u16 (uint32x4_t sadv, uint16x8_t pv, uint16x8_t sv)
-{
- return vabal_u16 (sadv, vget_high_u16 (pv), vget_high_u16 (sv));
-}
-
-/*
-** test_vabal_s32:
-** sabal2 v0.2d, v2.4s, v1.4s
-** ret
-*/
-int64x2_t
-test_vabal_s32 (int64x2_t sadv, int32x4_t pv, int32x4_t sv)
-{
- return vabal_s32 (sadv, vget_high_s32 (pv), vget_high_s32 (sv));
-}
-
-/*
-** test_vabal_u32:
-** uabal2 v0.2d, v2.4s, v1.4s
-** ret
-*/
-uint64x2_t
-test_vabal_u32 (uint64x2_t sadv, uint32x4_t pv, uint32x4_t sv)
-{
- return vabal_u32 (sadv, vget_high_u32 (pv), vget_high_u32 (sv));
-}
-
--
2.34.1