Add a fold at gimple_fold_builtin to prefer the highpart variant of
a builtin if at least one argument is a vector highpart and any
others are VECTOR_CSTs that we can cheaply extend to 128 bits.

This eliminates data movement instructions.  For example, we prefer
UMULL2 here over DUP+UMULL:

uint16x8_t
foo (const uint8x16_t s)
{
    const uint8x8_t f0 = vdup_n_u8 (4);
    return vmull_u8 (vget_high_u8 (s), f0);
}
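
Roughly, instead of the following (an illustrative sketch; exact
register allocation and instruction selection may differ)

        dup     d0, v0.d[1]             // extract the highpart of s
        movi    v1.8b, 0x4              // materialize f0
        umull   v0.8h, v0.8b, v1.8b

the constant is extended to 128 bits at compile time and we emit

        movi    v1.16b, 0x4
        umull2  v0.8h, v0.16b, v1.16b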

gcc/ChangeLog:
        PR target/117850
        * config/aarch64/aarch64-builtins.cc (LO_HI_PAIRINGS): New macro.
        (aarch64_get_highpart_builtin): New function.
        (aarch64_v128_highpart_ref): New function.  Helper to look
        for BIT_FIELD_REFs to the high 64 bits of 128-bit vectors.
        (aarch64_build_vector_cst): New function.  Build a new
        VECTOR_CST from the elements of another.
        (aarch64_fold_lo_call_to_hi): New function.  Main logic
        for the fold.
        (aarch64_general_gimple_fold_builtin): Add cases for the
        pairs in aarch64-builtin-pairs.def.
        * config/aarch64/aarch64-builtin-pairs.def: New file.

gcc/testsuite/ChangeLog:
        PR target/117850
        * gcc.target/aarch64/simd/vabal_combine.c: Removed.  This is
        covered by fold_to_highpart_1.c.
        * gcc.target/aarch64/simd/fold_to_highpart_1.c: New test.
        * gcc.target/aarch64/simd/fold_to_highpart_2.c: Likewise.
        * gcc.target/aarch64/simd/fold_to_highpart_3.c: Likewise.
        * gcc.target/aarch64/simd/fold_to_highpart_4.c: Likewise.
        * gcc.target/aarch64/simd/fold_to_highpart_5.c: Likewise.
        * gcc.target/aarch64/simd/fold_to_highpart_6.c: Likewise.
---
 gcc/config/aarch64/aarch64-builtin-pairs.def  |  73 ++
 gcc/config/aarch64/aarch64-builtins.cc        | 183 +++++
 .../aarch64/simd/fold_to_highpart_1.c         | 717 ++++++++++++++++++
 .../aarch64/simd/fold_to_highpart_2.c         |  89 +++
 .../aarch64/simd/fold_to_highpart_3.c         |  83 ++
 .../aarch64/simd/fold_to_highpart_4.c         |  38 +
 .../aarch64/simd/fold_to_highpart_5.c         |  92 +++
 .../aarch64/simd/fold_to_highpart_6.c         |  37 +
 .../gcc.target/aarch64/simd/vabal_combine.c   |  72 --
 9 files changed, 1312 insertions(+), 72 deletions(-)
 create mode 100644 gcc/config/aarch64/aarch64-builtin-pairs.def
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
 delete mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c

diff --git a/gcc/config/aarch64/aarch64-builtin-pairs.def b/gcc/config/aarch64/aarch64-builtin-pairs.def
new file mode 100644
index 00000000000..83cb0e2fe3a
--- /dev/null
+++ b/gcc/config/aarch64/aarch64-builtin-pairs.def
@@ -0,0 +1,73 @@
+/* Pairings of AArch64 builtins that can be folded into each other.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Pairs of single and half integer modes.  */
+#define LO_HI_PAIR_V_HSI(T, LO, HI) \
+  LO_HI_PAIR (T##_##LO##v2si, T##_##HI##v4si) \
+  LO_HI_PAIR (T##_##LO##v4hi, T##_##HI##v8hi)
+
+#define LO_HI_PAIR_V_US_HSI(T, LO, HI) \
+  LO_HI_PAIR_V_HSI (T, s##LO, s##HI) \
+  LO_HI_PAIR_V_HSI (T##U, u##LO, u##HI)
+
+/* Pairs of widenable integer modes.  */
+#define LO_HI_PAIR_V_WI(T, LO, HI) \
+  LO_HI_PAIR_V_HSI (T, LO, HI) \
+  LO_HI_PAIR (T##_##LO##v8qi, T##_##HI##v16qi)
+
+#define LO_HI_PAIR_V_US_WI(T, LO, HI) \
+  LO_HI_PAIR_V_WI (T, s##LO, s##HI) \
+  LO_HI_PAIR_V_WI (T##U, u##LO, u##HI)
+
+#define UNOP_LONG_LH_PAIRS \
+  LO_HI_PAIR (UNOP_sxtlv8hi,  UNOP_vec_unpacks_hi_v16qi) \
+  LO_HI_PAIR (UNOP_sxtlv4si,  UNOP_vec_unpacks_hi_v8hi) \
+  LO_HI_PAIR (UNOP_sxtlv2di,  UNOP_vec_unpacks_hi_v4si) \
+  LO_HI_PAIR (UNOPU_uxtlv8hi, UNOPU_vec_unpacku_hi_v16qi) \
+  LO_HI_PAIR (UNOPU_uxtlv4si, UNOPU_vec_unpacku_hi_v8hi) \
+  LO_HI_PAIR (UNOPU_uxtlv2di, UNOPU_vec_unpacku_hi_v4si)
+
+#define BINOP_LONG_LH_PAIRS \
+  LO_HI_PAIR_V_US_WI (BINOP,  addl, addl2) \
+  LO_HI_PAIR_V_US_WI (BINOP,  subl, subl2) \
+  LO_HI_PAIR_V_US_WI (BINOP,  abdl, abdl2) \
+  LO_HI_PAIR_V_WI (BINOP,  intrinsic_vec_smult_lo_, vec_widen_smult_hi_) \
+  LO_HI_PAIR_V_WI (BINOPU, intrinsic_vec_umult_lo_, vec_widen_umult_hi_) \
+  LO_HI_PAIR_V_HSI (BINOP,  sqdmull, sqdmull2) \
+  LO_HI_PAIR (BINOPP_pmullv8qi, BINOPP_pmull_hiv16qi)
+
+#define BINOP_LONG_N_LH_PAIRS \
+  LO_HI_PAIR_V_US_HSI (BINOP,  mull_n, mull_hi_n) \
+  LO_HI_PAIR_V_HSI (BINOP,  sqdmull_n, sqdmull2_n)
+
+#define BINOP_WIDE_LH_PAIRS \
+  LO_HI_PAIR_V_US_WI (BINOP,  subw, subw2) \
+  LO_HI_PAIR_V_US_WI (BINOP,  addw, addw2)
+
+#define TERNOP_LONG_LH_PAIRS \
+  LO_HI_PAIR_V_US_WI (TERNOP,  mlal, mlal_hi) \
+  LO_HI_PAIR_V_US_WI (TERNOP,  mlsl, mlsl_hi) \
+  LO_HI_PAIR_V_US_WI (TERNOP,  abal, abal2) \
+  LO_HI_PAIR_V_HSI (TERNOP, sqdmlal, sqdmlal2) \
+  LO_HI_PAIR_V_HSI (TERNOP, sqdmlsl, sqdmlsl2)
+
+#define TERNOP_LONG_N_LH_PAIRS \
+  LO_HI_PAIR_V_US_HSI (TERNOP,  mlal_n, mlal_hi_n) \
+  LO_HI_PAIR_V_US_HSI (TERNOP,  mlsl_n, mlsl_hi_n) \
+  LO_HI_PAIR_V_HSI (TERNOP,  sqdmlal_n, sqdmlal2_n)
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index 93f939a9c83..38dd3430e7d 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -49,6 +49,8 @@
 #include "attribs.h"
 #include "gimple-fold.h"
 #include "builtins.h"
+#include "tree-pass.h"
+#include "tree-vector-builder.h"
 #include "aarch64-builtins.h"
 
 using namespace aarch64;
@@ -738,6 +740,16 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = {
   VGET_HIGH_BUILTIN(u64) \
   VGET_HIGH_BUILTIN(bf16)
 
+#include "aarch64-builtin-pairs.def"
+
+#define LO_HI_PAIRINGS \
+  UNOP_LONG_LH_PAIRS \
+  BINOP_LONG_LH_PAIRS \
+  BINOP_LONG_N_LH_PAIRS \
+  BINOP_WIDE_LH_PAIRS \
+  TERNOP_LONG_LH_PAIRS \
+  TERNOP_LONG_N_LH_PAIRS
+
 typedef struct
 {
   const char *name;
@@ -5004,6 +5016,173 @@ aarch64_gimple_fold_pragma_builtin
     }
 }
 
+/* Return the fndecl of the builtin paired with FCODE_LO if one
+   exists (see aarch64-builtin-pairs.def), or NULL_TREE if not.  */
+static inline tree
+aarch64_get_highpart_builtin (unsigned int fcode_lo)
+{
+#undef LO_HI_PAIR
+#define LO_HI_PAIR(A, B) case AARCH64_SIMD_BUILTIN_##A:   \
+  return aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_##B];
+
+  switch (fcode_lo)
+    {
+      LO_HI_PAIRINGS
+      default:
+       return NULL_TREE;
+    }
+}
+
+/* If REF describes the high half of a 128-bit vector, return that
+   vector.  Otherwise, return NULL_TREE.  */
+static tree
+aarch64_v128_highpart_ref (const_tree ref)
+{
+  if (TREE_CODE (ref) != SSA_NAME)
+    return NULL_TREE;
+
+  gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (ref));
+  if (!stmt || gimple_assign_rhs_code (stmt) != BIT_FIELD_REF)
+    return NULL_TREE;
+
+  /* Look for a BIT_FIELD_REF that denotes the most significant 64
+     bits of a 128-bit vector.  */
+  tree bf_ref = gimple_assign_rhs1 (stmt);
+  unsigned int offset = BYTES_BIG_ENDIAN ? 0 : 64;
+
+  if (bit_field_size (bf_ref).to_constant () != 64
+      || bit_field_offset (bf_ref).to_constant () != offset)
+    return NULL_TREE;
+
+  tree obj = TREE_OPERAND (bf_ref, 0);
+  tree type = TREE_TYPE (obj);
+
+  if (VECTOR_TYPE_P (type) && tree_fits_uhwi_p (TYPE_SIZE (type))
+      && tree_to_uhwi (TYPE_SIZE (type)) == 128)
+    return obj;
+
+  return NULL_TREE;
+}
+
+/* Build and return a new VECTOR_CST of type OUT_TY using the
+   elements of VEC_IN.  */
+static tree
+aarch64_build_vector_cst (const_tree vec_in, tree out_ty)
+{
+  gcc_assert (TREE_CODE (vec_in) == VECTOR_CST
+             && VECTOR_TYPE_P (out_ty));
+  unsigned HOST_WIDE_INT nelts
+    = VECTOR_CST_NELTS (vec_in).to_constant ();
+
+  tree_vector_builder vec_out (out_ty, nelts, 1);
+  for (unsigned i = 0; i < nelts; i++)
+    vec_out.quick_push (VECTOR_CST_ELT (vec_in, i));
+
+  return vec_out.build ();
+}
+
+/* Try to fold STMT, a call to a lowpart-operating builtin, to its
+   highpart-operating equivalent if doing so would save unnecessary
+   data movement instructions.
+
+   Return the new call if so, otherwise nullptr.  */
+static gcall *
+aarch64_fold_lo_call_to_hi (unsigned int fcode, gcall *stmt,
+                           gimple_stmt_iterator *gsi)
+{
+  /* Punt until as late as possible:
+    1) By folding away BIT_FIELD_REFs we remove information about the
+    operands that may be useful to other optimizers.
+
+    2) For simplicity, we'd like the expression
+
+       X = BIT_FIELD_REF<A, SIZE, POS>
+
+    to imply that A is not a VECTOR_CST.  This assumption is unlikely
+    to hold before constant prop/folding.  */
+  if (!(cfun->curr_properties & PROP_last_full_fold))
+    return nullptr;
+
+  tree builtin_hi = aarch64_get_highpart_builtin (fcode);
+  gcc_assert (builtin_hi != NULL_TREE);
+
+  /* Prefer to use the highpart builtin when at least one vector
+     argument is a reference to the high half of a 128b vector, and
+     all others are VECTOR_CSTs that we can extend to 128b.  */
+  auto_vec<unsigned int, 2> vec_constants;
+  auto_vec<unsigned int, 2> vec_highparts;
+  /* The arguments and signature of the new call.  */
+  auto_vec<tree, 4> call_args;
+  auto_vec<tree, 4> call_types;
+
+  /* The interesting args are those that differ between the lo/hi
+     builtins.  Walk the function signatures to find these.  */
+  tree types_hi = TYPE_ARG_TYPES (TREE_TYPE (builtin_hi));
+  tree types_lo = TYPE_ARG_TYPES (gimple_call_fntype (stmt));
+  unsigned int argno = 0;
+  while (types_lo != void_list_node && types_hi != void_list_node)
+    {
+      tree type_lo = TREE_VALUE (types_lo);
+      tree type_hi = TREE_VALUE (types_hi);
+      tree arg = gimple_call_arg (stmt, argno);
+      if (!types_compatible_p (type_lo, type_hi))
+       {
+         /* Check our assumptions about this pair.  */
+         gcc_assert (wi::to_widest (TYPE_SIZE (type_lo)) == 64
+                     && wi::to_widest (TYPE_SIZE (type_hi)) == 128);
+
+         tree vq = aarch64_v128_highpart_ref (arg);
+         if (vq && is_gimple_reg (vq))
+           {
+             vec_highparts.safe_push (argno);
+             arg = vq;
+           }
+         else if (TREE_CODE (arg) == VECTOR_CST)
+           vec_constants.safe_push (argno);
+         else
+           return nullptr;
+       }
+      call_args.safe_push (arg);
+      call_types.safe_push (type_hi);
+
+      argno++;
+      types_hi = TREE_CHAIN (types_hi);
+      types_lo = TREE_CHAIN (types_lo);
+    }
+  gcc_assert (types_lo == void_list_node
+             && types_hi == void_list_node);
+
+  if (vec_highparts.is_empty ())
+    return nullptr;
+
+  /* Build and return a new call to BUILTIN_HI.  */
+  for (auto i : vec_constants)
+    call_args[i] = aarch64_build_vector_cst (call_args[i],
+                                            call_types[i]);
+  for (auto i : vec_highparts)
+    {
+      if (!types_compatible_p (TREE_TYPE (call_args[i]),
+                              call_types[i]))
+       {
+         tree vce_ssa = make_ssa_name (call_types[i]);
+         tree vce_expr = build1 (VIEW_CONVERT_EXPR,
+                                 call_types[i], call_args[i]);
+         gsi_insert_before (gsi,
+                            gimple_build_assign (vce_ssa, vce_expr),
+                            GSI_SAME_STMT);
+         call_args[i] = vce_ssa;
+       }
+    }
+
+  gcall *new_call
+    = gimple_build_call_vec (builtin_hi, call_args);
+  gimple_call_set_lhs (new_call, gimple_call_lhs (stmt));
+  return new_call;
+}
+
+#undef LO_HI_PAIR
+#define LO_HI_PAIR(A, B) case AARCH64_SIMD_BUILTIN_##A:
+
 /* Try to fold STMT, given that it's a call to the built-in function with
    subcode FCODE.  Return the new statement on success and null on
    failure.  */
@@ -5190,6 +5369,10 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
            }
          break;
        }
+      break;
+    LO_HI_PAIRINGS
+       new_stmt = aarch64_fold_lo_call_to_hi (fcode, stmt, gsi);
+       break;
     case AARCH64_SIMD_BUILTIN_LANE_CHECK:
       if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2]))
        {
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c
new file mode 100644
index 00000000000..4f38d1fdd3e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c
@@ -0,0 +1,717 @@
+/* { dg-do compile } */
+/* { dg-options "-O -march=armv9-a+bf16" } */
+
+#include <arm_neon.h>
+
+/* We should use the highpart builtin/instruction where doing so
+   would avoid data movement instructions.  This case, where all
+   arguments are non-constant vector highparts, can be handled
+   by either gimple_fold_builtin or combine.  */
+
+#ifndef TEST_UN_HIGHPARTS
+#define TEST_UN_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+  RETTYPE test_##FN##_##SUFF (INTYPE a)                     \
+  {                                                  \
+    return FN##_##SUFF (vget_high_##SUFF (a));      \
+  }
+#endif
+
+#ifndef TEST_BIN_W_HIGHPARTS
+#define TEST_BIN_W_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)  \
+  RETTYPE test_##FN##_##SUFF (RETTYPE a, INTYPE b)      \
+  {                                                      \
+    return FN##_##SUFF (a, vget_high_##SUFF (b));       \
+  }
+#endif
+
+#ifndef TEST_BIN_N_HIGHPARTS
+#define TEST_BIN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)    \
+  RETTYPE test_##FN##_##SUFF (INTYPE a)                           \
+  {                                                        \
+    return FN##_##SUFF (vget_high_##SUFF (a), a[1]);      \
+  }
+#endif
+
+#ifndef TEST_TERN_N_HIGHPARTS
+#define TEST_TERN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)      \
+  RETTYPE test_##FN##_##SUFF (RETTYPE a, INTYPE b)           \
+  {                                                           \
+    return FN##_##SUFF (a, vget_high_##SUFF (b), b[1]);              \
+  }
+#endif
+
+#ifndef TEST_BIN_HIGHPARTS
+#define TEST_BIN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+  RETTYPE test_##FN##_##SUFF (INTYPE a, INTYPE b)              \
+  {                                                             \
+    return FN##_##SUFF (vget_high_##SUFF (a),                  \
+                       vget_high_##SUFF (b));                  \
+  }
+#endif
+
+#ifndef TEST_TERN_HIGHPARTS
+#define TEST_TERN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF)   \
+  RETTYPE test_##FN##_##SUFF (RETTYPE a, INTYPE b, INTYPE c)      \
+  {                                                                \
+    return FN##_##SUFF(a, vget_high_##SUFF (b),                           \
+                      vget_high_##SUFF (c));                      \
+  }
+#endif
+
+#define TEST_UNOP(FN) \
+  TEST_UN_HIGHPARTS (FN, int16x8_t,  int8x16_t,  s8)  \
+  TEST_UN_HIGHPARTS (FN, uint16x8_t, uint8x16_t, u8)  \
+  TEST_UN_HIGHPARTS (FN, int32x4_t,  int16x8_t,  s16) \
+  TEST_UN_HIGHPARTS (FN, uint32x4_t, uint16x8_t, u16) \
+  TEST_UN_HIGHPARTS (FN, int64x2_t,  int32x4_t,  s32) \
+  TEST_UN_HIGHPARTS (FN, uint64x2_t, uint32x4_t, u32)
+
+#define TEST_BINOP(FN)                                            \
+  TEST_BIN_HIGHPARTS (FN, int16x8_t,  int8x16_t,  int8x8_t,   s8)  \
+  TEST_BIN_HIGHPARTS (FN, uint16x8_t, uint8x16_t, uint8x8_t,  u8)  \
+  TEST_BIN_HIGHPARTS (FN, int32x4_t,  int16x8_t,  int16x4_t,  s16) \
+  TEST_BIN_HIGHPARTS (FN, uint32x4_t, uint16x8_t, uint16x4_t, u16) \
+  TEST_BIN_HIGHPARTS (FN, int64x2_t,  int32x4_t,  int32x2_t,  s32) \
+  TEST_BIN_HIGHPARTS (FN, uint64x2_t, uint32x4_t, uint32x2_t, u32)
+
+#define TEST_BINOP_N(FN)                                \
+  TEST_BIN_N_HIGHPARTS (FN, int32x4_t,  int16x8_t,  s16) \
+  TEST_BIN_N_HIGHPARTS (FN, uint32x4_t, uint16x8_t, u16) \
+  TEST_BIN_N_HIGHPARTS (FN, int64x2_t,  int32x4_t,  s32) \
+  TEST_BIN_N_HIGHPARTS (FN, uint64x2_t, uint32x4_t, u32)
+
+#define TEST_BINOP_W(FN)                                \
+  TEST_BIN_W_HIGHPARTS (FN, int16x8_t,  int8x16_t,   s8) \
+  TEST_BIN_W_HIGHPARTS (FN, uint16x8_t, uint8x16_t,  u8) \
+  TEST_BIN_W_HIGHPARTS (FN, int32x4_t,  int16x8_t,  s16) \
+  TEST_BIN_W_HIGHPARTS (FN, uint32x4_t, uint16x8_t, u16) \
+  TEST_BIN_W_HIGHPARTS (FN, int64x2_t,  int32x4_t,  s32) \
+  TEST_BIN_W_HIGHPARTS (FN, uint64x2_t, uint32x4_t, u32)
+
+#define TEST_TERNOP_N(FN)                                \
+  TEST_TERN_N_HIGHPARTS (FN, int32x4_t,  int16x8_t,  s16) \
+  TEST_TERN_N_HIGHPARTS (FN, uint32x4_t, uint16x8_t, u16) \
+  TEST_TERN_N_HIGHPARTS (FN, int64x2_t,  int32x4_t,  s32) \
+  TEST_TERN_N_HIGHPARTS (FN, uint64x2_t, uint32x4_t, u32)
+
+#define TEST_TERNOP(FN)                                           \
+  TEST_TERN_HIGHPARTS (FN, int16x8_t,  int8x16_t,  int8x8_t,   s8)  \
+  TEST_TERN_HIGHPARTS (FN, uint16x8_t, uint8x16_t, uint8x8_t,  u8)  \
+  TEST_TERN_HIGHPARTS (FN, int32x4_t,  int16x8_t,  int16x4_t,  s16) \
+  TEST_TERN_HIGHPARTS (FN, uint32x4_t, uint16x8_t, uint16x4_t, u16) \
+  TEST_TERN_HIGHPARTS (FN, int64x2_t,  int32x4_t,  int32x2_t,  s32) \
+  TEST_TERN_HIGHPARTS (FN, uint64x2_t, uint32x4_t, uint32x2_t, u32)
+
+#define TEST_VQDMULL                                                 \
+  TEST_BIN_HIGHPARTS (vqdmull, int32x4_t, int16x8_t, int16x4_t, s16) \
+  TEST_BIN_HIGHPARTS (vqdmull, int64x2_t, int32x4_t, int32x2_t, s32)
+
+#define TEST_VQDMULL_N                                        \
+  TEST_BIN_N_HIGHPARTS (vqdmull_n, int32x4_t, int16x8_t, s16) \
+  TEST_BIN_N_HIGHPARTS (vqdmull_n, int64x2_t, int32x4_t, s32)
+
+#define TEST_VQMLAL                                                   \
+  TEST_TERN_HIGHPARTS (vqdmlal, int32x4_t, int16x8_t, int16x4_t, s16) \
+  TEST_TERN_HIGHPARTS (vqdmlal, int64x2_t, int32x4_t, int32x2_t, s32)
+
+#define TEST_VQMLAL_N                                          \
+  TEST_TERN_N_HIGHPARTS (vqdmlal_n, int32x4_t, int16x8_t, s16) \
+  TEST_TERN_N_HIGHPARTS (vqdmlal_n, int64x2_t, int32x4_t, s32)
+
+#define TEST_VQMLSL                                                   \
+  TEST_TERN_HIGHPARTS (vqdmlsl, int32x4_t, int16x8_t, int16x4_t, s16) \
+  TEST_TERN_HIGHPARTS (vqdmlsl, int64x2_t, int32x4_t, int32x2_t, s32)
+
+#define TEST_VQMLSL_N                                          \
+  TEST_TERN_N_HIGHPARTS (vqdmlsl_n, int32x4_t, int16x8_t, s16) \
+  TEST_TERN_N_HIGHPARTS (vqdmlsl_n, int64x2_t, int32x4_t, s32)
+
+#define TEST_VMOVL \
+  TEST_UNOP (vmovl)
+
+#define TEST_VMULL \
+  TEST_BINOP (vmull) \
+  TEST_BIN_HIGHPARTS (vmull, poly16x8_t, poly8x16_t, poly8x8_t, p8)
+
+#define TEST_VMULL_N \
+  TEST_BINOP_N (vmull_n)
+
+#define TEST_VADDL \
+  TEST_BINOP (vaddl)
+
+#define TEST_VSUBL \
+  TEST_BINOP (vsubl)
+
+#define TEST_VMLAL \
+  TEST_TERNOP (vmlal)
+
+#define TEST_VMLAL_N \
+  TEST_TERNOP_N (vmlal_n)
+
+#define TEST_VMLSL \
+  TEST_TERNOP (vmlsl)
+
+#define TEST_VMLSL_N \
+  TEST_TERNOP_N (vmlsl_n)
+
+#define TEST_VABDL \
+  TEST_BINOP (vabdl)
+
+#define TEST_VABAL \
+  TEST_TERNOP (vabal)
+
+#define TEST_VSUBW \
+  TEST_BINOP_W (vsubw)
+
+#define TEST_VADDW \
+  TEST_BINOP_W (vaddw)
+
+/*
+** test_vmovl_s8:
+**     sxtl2   v0\.8h, v0\.16b
+**     ret
+*/
+
+/*
+** test_vmovl_u8:
+**     uxtl2   v0\.8h, v0\.16b
+**     ret
+*/
+
+/*
+** test_vmovl_s16:
+**     sxtl2   v0\.4s, v0\.8h
+**     ret
+*/
+
+/*
+** test_vmovl_u16:
+**     uxtl2   v0\.4s, v0\.8h
+**     ret
+*/
+
+/*
+** test_vmovl_s32:
+**     sxtl2   v0\.2d, v0\.4s
+**     ret
+*/
+
+/*
+** test_vmovl_u32:
+**     uxtl2   v0\.2d, v0\.4s
+**     ret
+*/
+
+TEST_VMOVL
+
+/*
+** test_vmull_s8:
+**     smull2  v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b)
+**     ret
+*/
+
+/*
+** test_vmull_u8:
+**     umull2  v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b)
+**     ret
+*/
+
+/*
+** test_vmull_s16:
+**     smull2  v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h)
+**     ret
+*/
+
+/*
+** test_vmull_u16:
+**     umull2  v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h)
+**     ret
+*/
+
+/*
+** test_vmull_s32:
+**     smull2  v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s)
+**     ret
+*/
+
+/*
+** test_vmull_u32:
+**     umull2  v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s)
+**     ret
+*/
+
+/*
+** test_vmull_p8:
+**     pmull2  v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b)
+**     ret
+*/
+
+TEST_VMULL
+
+/*
+** test_vmull_n_s16:
+**     smull2  v0\.4s, v0\.8h, v0\.h\[[0-7]\]
+**     ret
+*/
+
+/*
+** test_vmull_n_u16:
+**     umull2  v0\.4s, v0\.8h, v0\.h\[[0-7]\]
+**     ret
+*/
+
+/*
+** test_vmull_n_s32:
+**     smull2  v0\.2d, v0\.4s, v0\.s\[[0-3]\]
+**     ret
+*/
+
+/*
+** test_vmull_n_u32:
+**     umull2  v0\.2d, v0\.4s, v0\.s\[[0-3]\]
+**     ret
+*/
+
+TEST_VMULL_N
+
+/*
+** test_vaddl_s8:
+**     saddl2  v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b)
+**     ret
+*/
+
+/*
+** test_vaddl_u8:
+**     uaddl2  v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b)
+**     ret
+*/
+
+/*
+** test_vaddl_s16:
+**     saddl2  v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h)
+**     ret
+*/
+
+/*
+** test_vaddl_u16:
+**     uaddl2  v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h)
+**     ret
+*/
+
+/*
+** test_vaddl_s32:
+**     saddl2  v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s)
+**     ret
+*/
+
+/*
+** test_vaddl_u32:
+**     uaddl2  v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s)
+**     ret
+*/
+
+TEST_VADDL
+
+/*
+** test_vsubl_s8:
+**     ssubl2  v0\.8h, v0\.16b, v1\.16b
+**     ret
+*/
+
+/*
+** test_vsubl_u8:
+**     usubl2  v0\.8h, v0\.16b, v1\.16b
+**     ret
+*/
+
+/*
+** test_vsubl_s16:
+**     ssubl2  v0\.4s, v0\.8h, v1\.8h
+**     ret
+*/
+
+/*
+** test_vsubl_u16:
+**     usubl2  v0\.4s, v0\.8h, v1\.8h
+**     ret
+*/
+
+/*
+** test_vsubl_s32:
+**     ssubl2  v0\.2d, v0\.4s, v1\.4s
+**     ret
+*/
+
+/*
+** test_vsubl_u32:
+**     usubl2  v0\.2d, v0\.4s, v1\.4s
+**     ret
+*/
+
+TEST_VSUBL
+
+/*
+** test_vabal_s8:
+**     sabal2  v0\.8h, (v1\.16b, v2\.16b|v2\.16b, v1\.16b)
+**     ret
+*/
+
+/*
+** test_vabal_u8:
+**     uabal2  v0\.8h, (v1\.16b, v2\.16b|v2\.16b, v1\.16b)
+**     ret
+*/
+
+/*
+** test_vabal_s16:
+**     sabal2  v0\.4s, (v1\.8h, v2\.8h|v2\.8h, v1\.8h)
+**     ret
+*/
+
+/*
+** test_vabal_u16:
+**     uabal2  v0\.4s, (v1\.8h, v2\.8h|v2\.8h, v1\.8h)
+**     ret
+*/
+
+/*
+** test_vabal_s32:
+**     sabal2  v0\.2d, (v1\.4s, v2\.4s|v2\.4s, v1\.4s)
+**     ret
+*/
+
+/*
+** test_vabal_u32:
+**     uabal2  v0\.2d, (v1\.4s, v2\.4s|v2\.4s, v1\.4s)
+**     ret
+*/
+
+TEST_VABAL
+
+/*
+** test_vsubw_s8:
+**     ssubw2  v0\.8h, v0\.8h, v1\.16b
+**     ret
+*/
+
+/*
+** test_vsubw_u8:
+**     usubw2  v0\.8h, v0\.8h, v1\.16b
+**     ret
+*/
+
+/*
+** test_vsubw_s16:
+**     ssubw2  v0\.4s, v0\.4s, v1\.8h
+**     ret
+*/
+
+/*
+** test_vsubw_u16:
+**     usubw2  v0\.4s, v0\.4s, v1\.8h
+**     ret
+*/
+
+/*
+** test_vsubw_s32:
+**     ssubw2  v0\.2d, v0\.2d, v1\.4s
+**     ret
+*/
+
+/*
+** test_vsubw_u32:
+**     usubw2  v0\.2d, v0\.2d, v1\.4s
+**     ret
+*/
+
+TEST_VSUBW
+
+/*
+** test_vaddw_s8:
+**     saddw2  v0\.8h, v0\.8h, v1\.16b
+**     ret
+*/
+
+/*
+** test_vaddw_u8:
+**     uaddw2  v0\.8h, v0\.8h, v1\.16b
+**     ret
+*/
+
+/*
+** test_vaddw_s16:
+**     saddw2  v0\.4s, v0\.4s, v1\.8h
+**     ret
+*/
+
+/*
+** test_vaddw_u16:
+**     uaddw2  v0\.4s, v0\.4s, v1\.8h
+**     ret
+*/
+
+/*
+** test_vaddw_s32:
+**     saddw2  v0\.2d, v0\.2d, v1\.4s
+**     ret
+*/
+
+/*
+** test_vaddw_u32:
+**     uaddw2  v0\.2d, v0\.2d, v1\.4s
+**     ret
+*/
+
+TEST_VADDW
+
+/*
+** test_vabdl_s8:
+**     sabdl2  v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b)
+**     ret
+*/
+
+/*
+** test_vabdl_u8:
+**     uabdl2  v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b)
+**     ret
+*/
+
+/*
+** test_vabdl_s16:
+**     sabdl2  v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h)
+**     ret
+*/
+
+/*
+** test_vabdl_u16:
+**     uabdl2  v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h)
+**     ret
+*/
+
+/*
+** test_vabdl_s32:
+**     sabdl2  v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s)
+**     ret
+*/
+
+/*
+** test_vabdl_u32:
+**     uabdl2  v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s)
+**     ret
+*/
+
+TEST_VABDL
+
+/*
+** test_vmlal_s8:
+**     smlal2  v0\.8h, (v1\.16b, v2\.16b|v2\.16b, v1\.16b)
+**     ret
+*/
+
+/*
+** test_vmlal_u8:
+**     umlal2  v0\.8h, (v1\.16b, v2\.16b|v2\.16b, v1\.16b)
+**     ret
+*/
+
+/*
+** test_vmlal_s16:
+**     smlal2  v0\.4s, (v1\.8h, v2\.8h|v2\.8h, v1\.8h)
+**     ret
+*/
+
+/*
+** test_vmlal_u16:
+**     umlal2  v0\.4s, (v1\.8h, v2\.8h|v2\.8h, v1\.8h)
+**     ret
+*/
+
+/*
+** test_vmlal_s32:
+**     smlal2  v0\.2d, (v1\.4s, v2\.4s|v2\.4s, v1\.4s)
+**     ret
+*/
+
+/*
+** test_vmlal_u32:
+**     umlal2  v0\.2d, (v1\.4s, v2\.4s|v2\.4s, v1\.4s)
+**     ret
+*/
+
+TEST_VMLAL
+
+/*
+** test_vmlal_n_s16:
+**     smlal2  v0\.4s, v1\.8h, v1\.h\[[0-7]\]
+**     ret
+*/
+
+/*
+** test_vmlal_n_u16:
+**     umlal2  v0\.4s, v1\.8h, v1\.h\[[0-7]\]
+**     ret
+*/
+
+/*
+** test_vmlal_n_s32:
+**     smlal2  v0\.2d, v1\.4s, v1\.s\[[0-3]\]
+**     ret
+*/
+
+/*
+** test_vmlal_n_u32:
+**     umlal2  v0\.2d, v1\.4s, v1\.s\[[0-3]\]
+**     ret
+*/
+
+TEST_VMLAL_N
+
+/*
+** test_vmlsl_s8:
+**     smlsl2  v0\.8h, v1\.16b, v2\.16b
+**     ret
+*/
+
+/*
+** test_vmlsl_u8:
+**     umlsl2  v0\.8h, v1\.16b, v2\.16b
+**     ret
+*/
+
+/*
+** test_vmlsl_s16:
+**     smlsl2  v0\.4s, v1\.8h, v2\.8h
+**     ret
+*/
+
+/*
+** test_vmlsl_u16:
+**     umlsl2  v0\.4s, v1\.8h, v2\.8h
+**     ret
+*/
+
+/*
+** test_vmlsl_s32:
+**     smlsl2  v0\.2d, v1\.4s, v2\.4s
+**     ret
+*/
+
+/*
+** test_vmlsl_u32:
+**     umlsl2  v0\.2d, v1\.4s, v2\.4s
+**     ret
+*/
+
+TEST_VMLSL
+
+/*
+** test_vmlsl_n_s16:
+**     smlsl2  v0\.4s, v1\.8h, v1\.h\[[0-7]\]
+**     ret
+*/
+
+/*
+** test_vmlsl_n_u16:
+**     umlsl2  v0\.4s, v1\.8h, v1\.h\[[0-7]\]
+**     ret
+*/
+
+/*
+** test_vmlsl_n_s32:
+**     smlsl2  v0\.2d, v1\.4s, v1\.s\[[0-3]\]
+**     ret
+*/
+
+/*
+** test_vmlsl_n_u32:
+**     umlsl2  v0\.2d, v1\.4s, v1\.s\[[0-3]\]
+**     ret
+*/
+
+TEST_VMLSL_N
+
+/*
+** test_vqdmull_s16:
+**     sqdmull2        v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h)
+**     ret
+*/
+
+/*
+** test_vqdmull_s32:
+**     sqdmull2        v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s)
+**     ret
+*/
+
+TEST_VQDMULL
+
+/*
+** test_vqdmull_n_s16:
+**     sqdmull2        v0\.4s, v0\.8h, v0\.h\[[0-7]\]
+**     ret
+*/
+
+/*
+** test_vqdmull_n_s32:
+**     sqdmull2        v0\.2d, v0\.4s, v0\.s\[[0-3]\]
+**     ret
+*/
+
+TEST_VQDMULL_N
+
+/*
+** test_vqdmlal_s16:
+**     sqdmlal2        v0\.4s, (v1\.8h, v2\.8h|v2\.8h, v1\.8h)
+**     ret
+*/
+
+/*
+** test_vqdmlal_s32:
+**     sqdmlal2        v0\.2d, (v1\.4s, v2\.4s|v2\.4s, v1\.4s)
+**     ret
+*/
+
+TEST_VQMLAL
+
+/*
+** test_vqdmlal_n_s16:
+**     sqdmlal2        v0\.4s, v1\.8h, v1\.h\[[0-7]\]
+**     ret
+*/
+
+/*
+** test_vqdmlal_n_s32:
+**     sqdmlal2        v0\.2d, v1\.4s, v1\.s\[[0-3]\]
+**     ret
+*/
+
+TEST_VQMLAL_N
+
+/*
+** test_vqdmlsl_s16:
+**     sqdmlsl2        v0\.4s, v1\.8h, v2\.8h
+**     ret
+*/
+
+/*
+** test_vqdmlsl_s32:
+**     sqdmlsl2        v0\.2d, v1\.4s, v2\.4s
+**     ret
+*/
+
+TEST_VQMLSL
+
+/*
+** test_vqdmlsl_n_s16:
+**     sqdmlsl2        v0\.4s, v1\.8h, v1\.h\[[0-7]\]
+**     ret
+*/
+
+/*
+** test_vqdmlsl_n_s32:
+**     sqdmlsl2        v0\.2d, v1\.4s, v1\.s\[[0-3]\]
+**     ret
+*/
+
+TEST_VQMLSL_N
+
+/* { dg-final { check-function-bodies "**" ""} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c
new file mode 100644
index 00000000000..02687cba5d0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c
@@ -0,0 +1,89 @@
+/* { dg-do compile } */
+/* { dg-options "-O -march=armv9-a+bf16" } */
+
+/* We should not use the highpart builtin unless doing so would avoid
+   data movement instructions.  That is, unless at least one argument
+   is a reference to the highpart of a non-constant vector (on the
+   stack).  */
+
+#define TEST_UN_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+  RETTYPE test_##FN##_## SUFF ()                    \
+  {                                                  \
+    INTYPE a = vdupq_n_##SUFF (0x1A);               \
+    return FN##_##SUFF (vget_high_##SUFF (a));      \
+  }
+
+#define TEST_BIN_W_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+  RETTYPE test_##FN##_##SUFF (RETTYPE a)               \
+  {                                                     \
+    INTYPE b = vdupq_n_##SUFF (0x1A);                  \
+    return FN##_##SUFF (a, vget_high_##SUFF (b));      \
+  }
+
+#define TEST_BIN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)     \
+  RETTYPE test_##FN##_##SUFF (INTYPE c)                            \
+  {                                                        \
+    INTYPE a = vdupq_n_##SUFF (0x1A);                      \
+    return FN##_##SUFF (vget_high_##SUFF (a), c[1]);       \
+  }
+
+#define TEST_TERN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)      \
+  RETTYPE test_##FN##_##SUFF (RETTYPE a)                     \
+  {                                                           \
+    INTYPE b = vdupq_n_##SUFF (0x1A);                        \
+    return FN##_##SUFF (a, vget_high_##SUFF (b), b[1]);      \
+  }
+
+#define TEST_BIN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+  RETTYPE test_##FN##_## SUFF (H_INTYPE b)                     \
+  {                                                             \
+    INTYPE a = vdupq_n_##SUFF (0x1A);                          \
+    return FN##_##SUFF (vget_high_##SUFF (a), b);              \
+  }
+
+#define TEST_TERN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+  RETTYPE test_##FN##_##SUFF (RETTYPE a, H_INTYPE b)            \
+  {                                                              \
+    INTYPE c = vdupq_n_##SUFF (0x1A);                           \
+    return FN##_##SUFF (a, vget_high_##SUFF (c), b);            \
+  }
+
+#include "fold_to_highpart_1.c"
+
+
+/* { dg-final { scan-assembler-not {uxtl2\t} } } */
+/* { dg-final { scan-assembler-not {sxtl2\t} } } */
+
+/* { dg-final { scan-assembler-not {umull2\t} } } */
+/* { dg-final { scan-assembler-not {smull2\t} } } */
+/* { dg-final { scan-assembler-not {pmull2\t} } } */
+
+/* { dg-final { scan-assembler-not {uaddl2\t} } } */
+/* { dg-final { scan-assembler-not {saddl2\t} } } */
+
+/* { dg-final { scan-assembler-not {usubl2\t} } } */
+/* { dg-final { scan-assembler-not {ssubl2\t} } } */
+
+/* { dg-final { scan-assembler-not {uabal2\t} } } */
+/* { dg-final { scan-assembler-not {sabal2\t} } } */
+
+/* { dg-final { scan-assembler-not {uabdl2\t} } } */
+/* { dg-final { scan-assembler-not {sabdl2\t} } } */
+
+/* { dg-final { scan-assembler-not {usubw2\t} } } */
+/* { dg-final { scan-assembler-not {ssubw2\t} } } */
+
+/* { dg-final { scan-assembler-not {uaddw2\t} } } */
+/* { dg-final { scan-assembler-not {saddw2\t} } } */
+
+/* { dg-final { scan-assembler-not {umlal2\t} } } */
+/* { dg-final { scan-assembler-not {smlal2\t} } } */
+
+/* { dg-final { scan-assembler-not {umlsl2\t} } } */
+/* { dg-final { scan-assembler-not {smlsl2\t} } } */
+
+/* { dg-final { scan-assembler-not {sqdmull2\t} } } */
+
+/* { dg-final { scan-assembler-not {sqdmlal2\t} } } */
+
+/* { dg-final { scan-assembler-not {sqdmlsl2\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c
new file mode 100644
index 00000000000..a6fea299f25
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c
@@ -0,0 +1,83 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+/* PR117850 */
+
+/* We should use the highpart builtin where doing so would avoid
+   data movement instructions.  We avoid a DUP here after extending
+   the VECTOR_CSTs to 128 bits.  */
+
+#define TEST_UN_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+#define TEST_BIN_W_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+#define TEST_BIN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+#define TEST_TERN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)
+
+#define TEST_BIN_HIGHPART_A1(FN, RETTYPE, INTYPE, SUFF) \
+  RETTYPE test_a1_##FN##_##SUFF (INTYPE a)             \
+  {                                                     \
+    INTYPE b = vdupq_n_##SUFF (0x1A);                  \
+    return FN##_##SUFF (vget_high_##SUFF (a),          \
+                       vget_high_##SUFF (b));          \
+  }
+
+#define TEST_BIN_HIGHPART_A2(FN, RETTYPE, INTYPE, SUFF) \
+  RETTYPE test_a2_##FN##_##SUFF (INTYPE a)             \
+  {                                                     \
+    INTYPE b = vdupq_n_##SUFF (0x1A);                  \
+    return FN##_##SUFF (vget_high_##SUFF (b),          \
+                       vget_high_##SUFF (a));          \
+  }
+
+#define TEST_TERN_HIGHPART_A1(FN, RETTYPE, INTYPE, SUFF)    \
+  RETTYPE test_a1_##FN##_##SUFF (RETTYPE a, INTYPE b)      \
+  {                                                         \
+    INTYPE c = vdupq_n_##SUFF (0x1A);                      \
+    return FN##_##SUFF (a, vget_high_##SUFF (b),           \
+                       vget_high_##SUFF (c));              \
+  }
+
+#define TEST_TERN_HIGHPART_A2(FN, RETTYPE, INTYPE, SUFF)    \
+  RETTYPE test_a2_##FN##_##SUFF (RETTYPE a, INTYPE b)      \
+  {                                                         \
+    INTYPE c = vdupq_n_##SUFF (0x1A);                      \
+    return FN##_##SUFF (a, vget_high_##SUFF (c),           \
+                       vget_high_##SUFF (b));              \
+  }
+
+#define TEST_BIN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+  TEST_BIN_HIGHPART_A1 (FN, RETTYPE, INTYPE, SUFF)              \
+  TEST_BIN_HIGHPART_A2 (FN, RETTYPE, INTYPE, SUFF)
+
+#define TEST_TERN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \
+  TEST_TERN_HIGHPART_A1 (FN, RETTYPE, INTYPE, SUFF)              \
+  TEST_TERN_HIGHPART_A2 (FN, RETTYPE, INTYPE, SUFF)
+
+
+#include "fold_to_highpart_1.c"
+
+/* { dg-final { scan-assembler-not {dup\t} } } */
+
+/* { dg-final { scan-assembler-times {smull2\t} 6} } */
+/* { dg-final { scan-assembler-times {umull2\t} 6} } */
+/* { dg-final { scan-assembler-times {pmull2\t} 2} } */
+
+/* { dg-final { scan-assembler-times {saddl2\t} 6} } */
+/* { dg-final { scan-assembler-times {uaddl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {ssubl2\t} 6} } */
+/* { dg-final { scan-assembler-times {usubl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {sabdl2\t} 6} } */
+/* { dg-final { scan-assembler-times {uabdl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {smlal2\t} 6} } */
+/* { dg-final { scan-assembler-times {umlal2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {smlsl2\t} 6} } */
+/* { dg-final { scan-assembler-times {umlsl2\t} 6} } */
+
+/* { dg-final { scan-assembler-times {sqdmull2\t} 4} } */
+
+/* { dg-final { scan-assembler-times {sqdmlal2\t} 4} } */
+
+/* { dg-final { scan-assembler-times {sqdmlsl2\t} 4} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c
new file mode 100644
index 00000000000..046c7a00def
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-options "-O -fdump-tree-optimized" } */
+
+#include "arm_neon.h"
+
+#define VEC_CST_u8  0x0102030405060708
+#define VEC_CST_u16 0x0001000200030004
+#define VEC_CST_u32 0x0000000100000002
+
+/* Extend the 64b VECTOR_CST to the type required by the hi builtin.  */
+
+uint16x8_t
+test_u8 (uint8x16_t a)
+{
+  const uint8x8_t b = vcreate_u8 (VEC_CST_u8);
+  return vmull_u8 (vget_high_u8 (a), b);
+}
+
+/* { dg-final { scan-tree-dump-times "\{ 8, 7, 6, 5, 4, 3, 2, 1, 8, 7, 6, 5, 4, 3, 2, 1 \}" 1 "optimized" } } */
+
+uint32x4_t
+test_u16 (uint16x8_t a)
+{
+  const uint16x4_t b = vcreate_u16 (VEC_CST_u16);
+  return vmull_u16 (vget_high_u16 (a), b);
+}
+
+/* { dg-final { scan-tree-dump-times "\{ 4, 3, 2, 1, 4, 3, 2, 1 \}" 1 "optimized" } } */
+
+uint64x2_t
+test_u32 (uint32x4_t a)
+{
+  const uint32x2_t b = vcreate_u32 (VEC_CST_u32);
+  return vmull_u32 (vget_high_u32 (a), b);
+}
+
+/* { dg-final { scan-tree-dump-times "\{ 2, 1, 2, 1 \}" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c
new file mode 100644
index 00000000000..dd47fe94e94
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c
@@ -0,0 +1,92 @@
+/* { dg-do compile } */
+/* { dg-options "-O -march=armv9-a+bf16" } */
+
+/* Test that we can still fold when the base type of the vector whose
+   highpart we are referring to is incompatible with that of the hi builtin.
+
+   Use float64x2_t as it is never INTYPE.  */
+
+#define TEST_UN_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \
+  RETTYPE test_##FN##_##SUFF (float64x2_t a)        \
+  {                                                  \
+    INTYPE x = vreinterpretq_##SUFF##_f64 (a);      \
+    return FN##_##SUFF(vget_high_##SUFF (x));       \
+  }
+
+#define TEST_BIN_W_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)       \
+  RETTYPE test_##FN##_##SUFF (RETTYPE a, float64x2_t b)              \
+  {                                                           \
+    INTYPE x = vreinterpretq_##SUFF##_f64 (b);               \
+    return FN##_##SUFF (a, vget_high_##SUFF (x));            \
+  }
+
+#define TEST_BIN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)    \
+  RETTYPE test_##FN##_##SUFF (float64x2_t a)              \
+  {                                                        \
+    INTYPE x = vreinterpretq_##SUFF##_f64 (a);            \
+    return FN##_##SUFF (vget_high_##SUFF (x), x[1]);      \
+  }
+
+#define TEST_TERN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF)         \
+  RETTYPE test_##FN##_##SUFF (RETTYPE a, float64x2_t b)                 \
+  {                                                              \
+    INTYPE x = vreinterpretq_##SUFF##_f64 (b);                  \
+    return FN##_##SUFF (a, vget_high_##SUFF (x), x[1]);         \
+  }
+
+#define TEST_BIN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF)   \
+  RETTYPE test_##FN##_##SUFF (float64x2_t a, float64x2_t b)      \
+  {                                                               \
+    INTYPE x = vreinterpretq_##SUFF##_f64 (a);                   \
+    INTYPE y = vreinterpretq_##SUFF##_f64 (b);                   \
+    return FN##_##SUFF (vget_high_##SUFF (x),                    \
+                       vget_high_##SUFF (y));                    \
+  }
+
+#define TEST_TERN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF)       \
+  RETTYPE test_##FN##_##SUFF (RETTYPE a, float64x2_t b, float64x2_t c) \
+  {                                                                    \
+    INTYPE x = vreinterpretq_##SUFF##_f64 (b);                         \
+    INTYPE y = vreinterpretq_##SUFF##_f64 (c);                         \
+    return FN##_##SUFF (a, vget_high_## SUFF (x),                       \
+                       vget_high_## SUFF (y));                         \
+  }
+
+#include "fold_to_highpart_1.c"
+
+/* { dg-final { scan-assembler-times {sxtl2\t} 3} } */
+/* { dg-final { scan-assembler-times {uxtl2\t} 3} } */
+
+/* { dg-final { scan-assembler-times {smull2\t} 5} } */
+/* { dg-final { scan-assembler-times {umull2\t} 5} } */
+/* { dg-final { scan-assembler-times {pmull2\t} 1} } */
+
+/* { dg-final { scan-assembler-times {saddl2\t} 3} } */
+/* { dg-final { scan-assembler-times {uaddl2\t} 3} } */
+
+/* { dg-final { scan-assembler-times {ssubl2\t} 3} } */
+/* { dg-final { scan-assembler-times {usubl2\t} 3} } */
+
+/* { dg-final { scan-assembler-times {sabdl2\t} 3} } */
+/* { dg-final { scan-assembler-times {uabdl2\t} 3} } */
+
+/* { dg-final { scan-assembler-times {saddw2\t} 3} } */
+/* { dg-final { scan-assembler-times {uaddw2\t} 3} } */
+
+/* { dg-final { scan-assembler-times {ssubw2\t} 3} } */
+/* { dg-final { scan-assembler-times {usubw2\t} 3} } */
+
+/* { dg-final { scan-assembler-times {sabdl2\t} 3} } */
+/* { dg-final { scan-assembler-times {uabdl2\t} 3} } */
+
+/* { dg-final { scan-assembler-times {smlal2\t} 5} } */
+/* { dg-final { scan-assembler-times {umlal2\t} 5} } */
+
+/* { dg-final { scan-assembler-times {smlsl2\t} 5} } */
+/* { dg-final { scan-assembler-times {umlsl2\t} 5} } */
+
+/* { dg-final { scan-assembler-times {sqdmull2\t} 4} } */
+
+/* { dg-final { scan-assembler-times {sqdmlal2\t} 4} } */
+
+/* { dg-final { scan-assembler-times {sqdmlsl2\t} 4} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
new file mode 100644
index 00000000000..3570d4da34b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target aarch64_little_endian } */
+/* { dg-options "-O2 -march=armv8-a+sve" } */
+
+#include <arm_neon_sve_bridge.h>
+
+typedef int16_t int16x16_t __attribute__ ((vector_size (32)));
+
+/* Edge cases where we don't/can't fold, reject these gracefully.  */
+
+int8x16_t z;
+
+int16x8_t
+test_addressable ()
+{
+  return vmovl_s8 (vget_high_s8 (z));
+}
+
+int16x8_t
+test_scalable_type (svint8_t scalable)
+{
+  return vmovl_s8 (vget_high_s8 (svget_neonq_s8 (scalable)));
+}
+
+int16x8_t
+test_scalar_type (__int128_t foo)
+{
+  return vmovl_s8 (vget_high_s8 (vreinterpretq_s8_p128 (foo)));
+}
+
+int32x4_t
+test_256b_type (int16x16_t foo)
+{
+  return vmovl_s16 ((int16x4_t) { foo[4], foo[5], foo[6], foo[7] });
+}
+
+/* { dg-final { scan-assembler-not {sxtl2\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c b/gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c
deleted file mode 100644
index c51878aa226..00000000000
--- a/gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/* { dg-do compile } */
-/* { dg-options "-O" } */
-/* { dg-final { check-function-bodies "**" "" "" } } */
-
-#include <arm_neon.h>
-
-/*
-** test_vabal_s8:
-**      sabal2 v0.8h, v2.16b, v1.16b
-**      ret
-*/
-int16x8_t
-test_vabal_s8 (int16x8_t sadv, int8x16_t pv, int8x16_t sv)
-{
-  return vabal_s8 (sadv, vget_high_s8 (pv), vget_high_s8 (sv));
-}
-
-/*
-** test_vabal_u8:
-**      uabal2 v0.8h, v2.16b, v1.16b
-**      ret
-*/
-uint16x8_t
-test_vabal_u8 (uint16x8_t sadv, uint8x16_t pv, uint8x16_t sv)
-{
-  return vabal_u8 (sadv, vget_high_u8 (pv), vget_high_u8 (sv));
-}
-
-/*
-** test_vabal_s16:
-**      sabal2 v0.4s, v2.8h, v1.8h
-**      ret
-*/
-int32x4_t
-test_vabal_s16 (int32x4_t sadv, int16x8_t pv, int16x8_t sv)
-{
-  return vabal_s16 (sadv, vget_high_s16 (pv), vget_high_s16 (sv));
-}
-
-/*
-** test_vabal_u16:
-**      uabal2 v0.4s, v2.8h, v1.8h
-**      ret
-*/
-uint32x4_t
-test_vabal_u16 (uint32x4_t sadv, uint16x8_t pv, uint16x8_t sv)
-{
-  return vabal_u16 (sadv, vget_high_u16 (pv), vget_high_u16 (sv));
-}
-
-/*
-** test_vabal_s32:
-**      sabal2 v0.2d, v2.4s, v1.4s
-**      ret
-*/
-int64x2_t
-test_vabal_s32 (int64x2_t sadv, int32x4_t pv, int32x4_t sv)
-{
-  return vabal_s32 (sadv, vget_high_s32 (pv), vget_high_s32 (sv));
-}
-
-/*
-** test_vabal_u32:
-**      uabal2 v0.2d, v2.4s, v1.4s
-**      ret
-*/
-uint64x2_t
-test_vabal_u32 (uint64x2_t sadv, uint32x4_t pv, uint32x4_t sv)
-{
-  return vabal_u32 (sadv, vget_high_u32 (pv), vget_high_u32 (sv));
-}
-
-- 
2.34.1
