This patch improves auto-vectorization for loops with known small trip
counts by enabling the use of subvectors - bit fields of the original,
wider vectors.  A subvector must have the same vector element type as
the original vector and enough bits for all of the vector elements
processed in the loop.  Using subvectors is beneficial because machine
instructions operating on narrower vectors usually perform better.

To enable this optimization, the patch introduces a new target hook
that lets the vectorizer query the backend for a suitable subvector
type, given the original vector type and the number of elements to be
processed in the small-trip-count loop.  The hook also takes a
could_trap parameter: if the operation could trap, the subvector must
be filled with exactly the requested number of elements; otherwise the
subvector is allowed to have more bits than needed.  The optimization
is currently enabled for AArch64 only, which uses AdvSIMD vectors as
subvectors of SVE vectors for higher instruction throughput.
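As a rough standalone sketch (plain C, not code from the patch; the
function name and the use of plain integer arguments instead of GCC
tree types are illustrative assumptions), the size check done by the
AArch64 implementation of the hook boils down to the following:

  /* Return how many lanes the AdvSIMD subvector would have for ELEM_CNT
     elements of SCALAR_PREC bits each, or 0 if no subvector is usable.  */
  unsigned
  subvector_lanes (unsigned scalar_prec, unsigned elem_cnt, int could_trap)
  {
    unsigned data_bits = scalar_prec * elem_cnt;
    if (could_trap)
      /* A trapping operation must only see defined lanes, so the data
         has to fill a 64-bit or 128-bit AdvSIMD register exactly.  */
      return (elem_cnt > 1 && (data_bits == 64 || data_bits == 128))
             ? elem_cnt : 0;
    /* Otherwise up to 128 bits of data fit in a full 128-bit AdvSIMD
       register; the unused upper lanes are simply ignored.  */
    return data_bits <= 128 ? 128 / scalar_prec : 0;
  }

For example, subvector_lanes (16, 5, 0) is 8, so five int16_t elements
map to the 128-bit v.8h form used below, while subvector_lanes (16, 5, 1)
is 0 because 80 bits do not exactly fill a 64-bit or 128-bit register.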
As an example, consider this loop operating on an array of 16-bit
integers:

  for (int i = 0; i < 5; i++) {
    a[i] = a[i] < 0 ? -a[i] : a[i];
  }

Before this patch, the generated AArch64 code would be:

        ptrue   p7.h, vl5
        ptrue   p6.b, all
        ld1h    z31.h, p7/z, [x0]
        abs     z31.h, p6/m, z31.h
        st1h    z31.h, p7, [x0]

After this patch, it is optimized to:

        ptrue   p7.h, vl5
        ld1h    z31.h, p7/z, [x0]
        abs     v31.8h, v31.8h
        st1h    z31.h, p7, [x0]

Besides using the AdvSIMD form of abs, this also eliminates the second
ptrue in this case.

Bootstrapped and tested on aarch64-linux-gnu and x86_64-linux-gnu.

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (aarch64_find_subvector_type):
	Implement target hook for finding subvectors for AArch64.
	* doc/tm.texi: Document the new target hook.
	* doc/tm.texi.in: Document the new target hook.
	* expmed.cc (extract_bit_field_as_subreg): Support expanding
	BIT_FIELD_REF for subvector types to SUBREG in RTL.
	* match.pd: Prevent simplification of BIT_FIELD_REF for
	subvector types to VIEW_CONVERT.
	* target.def: New target hook definition.
	* targhooks.cc (default_vectorize_find_subvector_type): Provide
	default implementation for the target hook.
	* targhooks.h (default_vectorize_find_subvector_type): Declare.
	* tree-cfg.cc (verify_types_in_gimple_reference): Update GIMPLE
	verification for BIT_FIELD_REF used for subvectors.
	* tree-vect-stmts.cc (vectorizable_operation): Output vectorized
	GIMPLE with subvector types.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/cond_unary_6.c: Adjust loop trip counts
	to avoid triggering this new optimization.
	* gcc.target/aarch64/vect-subvector-1.c: New test.
	* gcc.target/aarch64/vect-subvector-2.c: New test.
---
 gcc/config/aarch64/aarch64.cc                 | 39 ++++++++
 gcc/doc/tm.texi                               | 12 +++
 gcc/doc/tm.texi.in                            |  2 +
 gcc/expmed.cc                                 |  5 +-
 gcc/match.pd                                  |  3 +-
 gcc/target.def                                | 17 ++++
 gcc/targhooks.cc                              |  8 ++
 gcc/targhooks.h                               |  3 +
 .../gcc.target/aarch64/sve/cond_unary_6.c     |  4 +-
 .../gcc.target/aarch64/vect-subvector-1.c     | 28 ++++++
 .../gcc.target/aarch64/vect-subvector-2.c     | 28 ++++++
 gcc/tree-cfg.cc                               |  8 ++
 gcc/tree-vect-stmts.cc                        | 90 ++++++++++++++++++-
 13 files changed, 240 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-subvector-1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-subvector-2.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index fff8d9da49d..700f1646706 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -17012,6 +17012,42 @@ aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
     }
 }
 
+/* Implement TARGET_VECTORIZE_FIND_SUBVECTOR_TYPE.  */
+static tree
+aarch64_find_subvector_type (tree vectype, unsigned HOST_WIDE_INT elem_cnt,
+                             bool could_trap)
+{
+  gcc_assert (VECTOR_TYPE_P (vectype));
+
+  /* AArch64 AdvSIMD vectors are treated as subvectors of SVE for all
+     vectorization preferences except "sve-only".  */
+  if (aarch64_autovec_preference == AARCH64_AUTOVEC_SVE_ONLY)
+    return NULL_TREE;
+
+  /* No subvectors for AdvSIMD or partial vectors, since elements in partial
+     vectors could be non-consecutive.  */
+  machine_mode mode = TYPE_MODE (vectype);
+  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+  if ((vec_flags & VEC_ADVSIMD) || (vec_flags & VEC_PARTIAL))
+    return NULL_TREE;
+
+  tree innertype = TREE_TYPE (vectype);
+  unsigned int scalar_prec = TYPE_PRECISION (innertype);
+  unsigned int data_bits = elem_cnt * scalar_prec;
+
+  /* If the operation could trap, we can use AdvSIMD vectors only if they
+     contain full vectors of data.  */
+  if (could_trap)
+    return (elem_cnt > 1 && (data_bits == 64 || data_bits == 128))
+           ? build_vector_type (innertype, elem_cnt) : NULL_TREE;
+
+  /* Without possible trapping, AdvSIMD vectors can be used as long as the
+     data occupies no more than 128 bits.  */
+  gcc_assert (multiple_p (128, scalar_prec));
+  return data_bits <= 128 ? build_vector_type (innertype, 128 / scalar_prec)
+                          : NULL_TREE;
+}
+
 /* Return true if an access of kind KIND for STMT_INFO (or NODE if SLP)
    represents one vector of an LD[234] or ST[234] operation.  Return the total
    number of vectors (2, 3 or 4) if so, otherwise return a value outside that
@@ -31963,6 +31999,9 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_VECTORIZE_CREATE_COSTS
 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
 
+#undef TARGET_VECTORIZE_FIND_SUBVECTOR_TYPE
+#define TARGET_VECTORIZE_FIND_SUBVECTOR_TYPE aarch64_find_subvector_type
+
 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
   aarch64_builtin_vectorization_cost
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 5e305643b3a..95177d255ba 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6495,6 +6495,18 @@ current cost model is for the scalar version of a loop or block; otherwise
 it is for the vector version.
 @end deftypefn
 
+@deftypefn {Target Hook} tree TARGET_VECTORIZE_FIND_SUBVECTOR_TYPE (tree @var{vectype}, unsigned HOST_WIDE_INT @var{elem_cnt}, bool @var{could_trap})
+This hook returns a subvector type of the given vector type @var{vectype}.
+A subvector here refers to a bit field of the original vector.  It must
+have the same vector element type as the original vector and enough bits for
+@var{elem_cnt} elements.  If @var{could_trap} is true, the subvector must be
+filled with exactly @var{elem_cnt} elements to avoid trapping caused by
+undefined values.  Otherwise, the subvector is allowed to have more bits than
+needed.
+
+The default version of this hook returns NULL_TREE.
+@end deftypefn
+
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_GATHER (const_tree @var{mem_vectype}, const_tree @var{index_type}, int @var{scale})
 Target builtin that implements vector gather operation.  @var{mem_vectype}
 is the vector type of the load and @var{index_type} is scalar type of
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index eccc4d88493..c9e1b3b281d 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4307,6 +4307,8 @@ address; but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_CREATE_COSTS
 
+@hook TARGET_VECTORIZE_FIND_SUBVECTOR_TYPE
+
 @hook TARGET_VECTORIZE_BUILTIN_GATHER
 
 @hook TARGET_VECTORIZE_BUILTIN_SCATTER
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 8cf10d9c73b..5a8b848ff92 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -1643,9 +1643,10 @@ extract_bit_field_as_subreg (machine_mode mode, rtx op0,
 {
   poly_uint64 bytenum;
   if (multiple_p (bitnum, BITS_PER_UNIT, &bytenum)
-      && known_eq (bitsize, GET_MODE_BITSIZE (mode))
       && lowpart_bit_field_p (bitnum, bitsize, op0_mode)
-      && TRULY_NOOP_TRUNCATION_MODES_P (mode, op0_mode))
+      && TRULY_NOOP_TRUNCATION_MODES_P (mode, op0_mode)
+      && (known_eq (bitsize, GET_MODE_BITSIZE (mode))
+          || known_eq (bitsize, GET_MODE_BITSIZE (op0_mode))))
     return force_subreg (mode, op0, op0_mode, bytenum);
   return NULL_RTX;
 }
diff --git a/gcc/match.pd b/gcc/match.pd
index ab496d923cc..dbb0be925e5 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -9467,7 +9467,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 
 (simplify
  (BIT_FIELD_REF @0 @1 integer_zerop)
- (if (tree_int_cst_equal (@1, TYPE_SIZE (TREE_TYPE (@0))))
+ (if (tree_int_cst_equal (@1, TYPE_SIZE (TREE_TYPE (@0)))
+      && tree_int_cst_equal (@1, TYPE_SIZE (type)))
  (view_convert @0)))
 
 (simplify
diff --git a/gcc/target.def b/gcc/target.def
index 38903eb567a..45482958ef8 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2095,6 +2095,23 @@ it is for the vector version.",
  (vec_info *vinfo, bool costing_for_scalar),
  default_vectorize_create_costs)
 
+/* Target function to find a possible subvector type which can be used to
+   substitute the original vector type.  */
+DEFHOOK
+(find_subvector_type,
+ "This hook returns a subvector type of the given vector type @var{vectype}.\n\
+A subvector here refers to a bit field of the original vector.  It must\n\
+have the same vector element type as the original vector and enough bits for\n\
+@var{elem_cnt} elements.  If @var{could_trap} is true, the subvector must be\n\
+filled with exactly @var{elem_cnt} elements to avoid trapping caused by\n\
+undefined values.  Otherwise, the subvector is allowed to have more bits than\n\
+needed.\n\
+\n\
+The default version of this hook returns NULL_TREE.",
+ tree,
+ (tree vectype, unsigned HOST_WIDE_INT elem_cnt, bool could_trap),
+ default_vectorize_find_subvector_type)
+
 HOOK_VECTOR_END (vectorize)
 
 #undef HOOK_PREFIX
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index c79458e374e..8460d068b1a 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1641,6 +1641,14 @@ default_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
   return new vector_costs (vinfo, costing_for_scalar);
 }
 
+/* By default, do not use subvector types in vectorization.  */
+
+tree
+default_vectorize_find_subvector_type (tree, unsigned HOST_WIDE_INT, bool)
+{
+  return NULL_TREE;
+}
+
 /* Determine whether or not a pointer mode is valid.  Assume defaults of
    ptr_mode or Pmode - can be overridden.  */
 bool
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index f16b58798c2..e8c33e54739 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -125,6 +125,9 @@ extern opt_machine_mode default_get_mask_mode (machine_mode);
 extern bool default_empty_mask_is_expensive (unsigned);
 extern bool default_conditional_operation_is_expensive (unsigned);
 extern vector_costs *default_vectorize_create_costs (vec_info *, bool);
+extern tree default_vectorize_find_subvector_type (tree,
+                                                   unsigned HOST_WIDE_INT,
+                                                   bool);
 
 /* OpenACC hooks.  */
 extern bool default_goacc_validate_dims (tree, int [], int, unsigned);
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_6.c
index c49a3040b21..437849eff8b 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_6.c
@@ -29,9 +29,9 @@
 #define TEST_ALL(T) \
   TEST_TYPES (T, int16_t, int8_t, 7) \
   TEST_TYPES (T, int32_t, int8_t, 3) \
-  TEST_TYPES (T, int32_t, int16_t, 3) \
+  TEST_TYPES (T, int32_t, int16_t, 9) \
   TEST_TYPES (T, int64_t, int8_t, 5) \
-  TEST_TYPES (T, int64_t, int16_t, 5) \
+  TEST_TYPES (T, int64_t, int16_t, 9) \
   TEST_TYPES (T, int64_t, int32_t, 5)
 
 TEST_ALL (DEF_LOOP)
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-subvector-1.c b/gcc/testsuite/gcc.target/aarch64/vect-subvector-1.c
new file mode 100644
index 00000000000..1096e143686
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-subvector-1.c
@@ -0,0 +1,28 @@
+/* Test if AdvSIMD subvectors are used in vectorization with SVE.  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8-a+sve" } */
+
+#include <stdint.h>
+
+void unary (int16_t *restrict a) {
+  for (int i = 0; i < 5; i++) {
+    a[i] = a[i] < 0 ? -a[i] : a[i];
+  }
+}
+
+void binary (int16_t *restrict a, int16_t * restrict b) {
+  for (int i = 0; i < 7; i++) {
+    a[i] = a[i] > b[i] ? a[i] : b[i];
+  }
+}
+
+void ternary (int8_t *restrict a, int8_t * restrict b,
+              int8_t *restrict c) {
+  for (int i = 0; i < 12; i++) {
+    a[i] = a[i] - b[i] * c[i];
+  }
+}
+
+/* { dg-final { scan-assembler-times {\tabs\tv[0-9]+\.8h, v[0-9]+\.8h} 1 } } */
+/* { dg-final { scan-assembler-times {\tsmax\tv[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h} 1 } } */
+/* { dg-final { scan-assembler-times {\tmls\tv[0-9]+\.16b, v[0-9]+\.16b, v[0-9]+\.16b} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-subvector-2.c b/gcc/testsuite/gcc.target/aarch64/vect-subvector-2.c
new file mode 100644
index 00000000000..ca88a48a2ff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-subvector-2.c
@@ -0,0 +1,28 @@
+/* Test if AdvSIMD subvectors are used in vectorization with SVE.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=armv8-a+sve" } */
+
+#include <stdint.h>
+
+void unary (__fp16 *restrict a) {
+  for (int i = 0; i < 5; i++) {
+    a[i] = a[i] < 0 ? -a[i] : a[i];
+  }
+}
+
+void binary (float *restrict a, float * restrict b) {
+  for (int i = 0; i < 3; i++) {
+    a[i] = a[i] > b[i] ? a[i] : b[i];
+  }
+}
+
+void ternary (float *restrict a, float * restrict b,
+              float *restrict c) {
+  for (int i = 0; i < 3; i++) {
+    a[i] = a[i] - b[i] * c[i];
+  }
+}
+
+/* { dg-final { scan-assembler-times {\tfabs\tv[0-9]+\.8h, v[0-9]+\.8h} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmaxnm\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]+\.4s} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmls\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]+\.4s} 1 } } */
diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
index 6a95b82ff40..76b468d26d3 100644
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -3158,7 +3158,15 @@ verify_types_in_gimple_reference (tree expr, bool require_lvalue)
                      "field size of %qs", code_name);
               return true;
             }
+          else if (VECTOR_TYPE_P (TREE_TYPE (expr))
+                   && !multiple_p (size, element_precision (expr)))
+            {
+              error ("field size is not a multiple of vector element size in "
+                     "%qs", code_name);
+              return true;
+            }
           else if (!INTEGRAL_TYPE_P (TREE_TYPE (expr))
+                   && !VECTOR_TYPE_P (TREE_TYPE (expr))
                    && TYPE_MODE (TREE_TYPE (expr)) != BLKmode
                    && maybe_ne (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (expr))),
                                 size))
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index ea0b4262781..24817609a67 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -7063,8 +7063,8 @@ vectorizable_operation (vec_info *vinfo,
      Similarly, if this operation is part of a reduction, a fully-masked
      loop should only change the active lanes of the reduction chain,
      keeping the inactive lanes as-is.  */
-  bool mask_out_inactive = ((!is_invariant && gimple_could_trap_p (stmt))
-                            || reduc_idx >= 0);
+  bool could_trap = gimple_could_trap_p (stmt);
+  bool mask_out_inactive = ((!is_invariant && could_trap) || reduc_idx >= 0);
 
   if (!vec_stmt) /* transformation not required.  */
     {
@@ -7168,6 +7168,44 @@ vectorizable_operation (vec_info *vinfo,
   else
     vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
+  /* For loops with known small iteration count, operating on subvectors (bit
+     fields of original vectors) may be more beneficial than operating on the
+     original vectors.  This checks if a subvector type is available for the
+     current operation and sets SUBVECTYPE accordingly.  */
+  tree subvectype = NULL_TREE, subvectype2 = NULL_TREE,
+       subvectype3 = NULL_TREE, subvectype_out = NULL_TREE;
+  if (masked_loop_p
+      && reduc_idx == -1
+      && vec_num * ncopies == 1
+      && loop_vinfo && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    {
+      unsigned HOST_WIDE_INT niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
+      subvectype = targetm.vectorize.find_subvector_type (vectype,
+                                                          niters,
+                                                          could_trap);
+      if (subvectype != NULL_TREE)
+        {
+          /* The output and other operands could have different signedness, so
+             we find subvector types for them separately.  */
+          subvectype_out = targetm.vectorize.find_subvector_type (vectype_out,
+                                                                  niters,
+                                                                  could_trap);
+          if (op_type == binary_op || op_type == ternary_op)
+            subvectype2 = targetm.vectorize.find_subvector_type (vectype2,
+                                                                 niters,
+                                                                 could_trap);
+          if (op_type == ternary_op)
+            subvectype3 = targetm.vectorize.find_subvector_type (vectype3,
+                                                                 niters,
+                                                                 could_trap);
+
+          /* Check if the optab for the subvector type is implemented.  */
+          optab = optab_for_tree_code (code, subvectype, optab_default);
+          if (!optab || !can_implement_p (optab, TYPE_MODE (subvectype)))
+            subvectype = NULL_TREE;
+        }
+    }
+
   /* In case the vectorization factor (VF) is bigger than the
      number of elements that we can fit in a vectype (nunits), we have
      to generate more than one vector stmt - i.e - we need to "unroll" the
@@ -7319,6 +7357,54 @@ vectorizable_operation (vec_info *vinfo,
           gimple_assign_set_lhs (new_stmt, new_temp);
           vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
         }
+      else if (subvectype != NULL_TREE)
+        {
+          tree bitsize = TYPE_SIZE (subvectype);
+          tree offset = bitsize_int (0);
+
+          /* Build GIMPLE statements with BIT_FIELD_REF to cast each operand to
+             the subvector type.  */
+          new_stmt = gimple_build_assign (NULL_TREE, BIT_FIELD_REF,
+                                          build3 (BIT_FIELD_REF, subvectype,
+                                                  vop0, bitsize, offset));
+          vop0 = make_ssa_name (subvectype, new_stmt);
+          gimple_assign_set_lhs (new_stmt, vop0);
+          vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+          if (vop1)
+            {
+              new_stmt = gimple_build_assign (NULL_TREE, BIT_FIELD_REF,
+                                              build3 (BIT_FIELD_REF,
+                                                      subvectype2,
+                                                      vop1, bitsize, offset));
+              vop1 = make_ssa_name (subvectype2, new_stmt);
+              gimple_assign_set_lhs (new_stmt, vop1);
+              vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+            }
+          if (vop2)
+            {
+              new_stmt = gimple_build_assign (NULL_TREE, BIT_FIELD_REF,
+                                              build3 (BIT_FIELD_REF,
+                                                      subvectype3,
+                                                      vop2, bitsize, offset));
+              vop2 = make_ssa_name (subvectype3, new_stmt);
+              gimple_assign_set_lhs (new_stmt, vop2);
+              vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+            }
+
+          /* Build the vectorized statement based on the subvector type.  */
+          new_stmt = gimple_build_assign (NULL_TREE, code, vop0, vop1, vop2);
+          new_temp = make_ssa_name (subvectype_out, new_stmt);
+          gimple_assign_set_lhs (new_stmt, new_temp);
+          vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+
+          /* Build another GIMPLE statement to cast the vector result back.  */
+          new_stmt = gimple_build_assign (NULL_TREE, BIT_FIELD_REF,
+                                          build3 (BIT_FIELD_REF, vectype,
+                                                  new_temp, bitsize, offset));
+          new_temp = make_ssa_name (vectype, new_stmt);
+          gimple_assign_set_lhs (new_stmt, new_temp);
+          vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+        }
       else if ((masked_loop_p || len_loop_p) && mask_out_inactive)
         {
           tree mask;
-- 
2.43.0