[gcc r17-2098] AArch64: Cap suggested unroll factor for small known-niters loops

Pengfei Li via Gcc-cvs Thu, 02 Jul 2026 11:47:49 -0700

https://gcc.gnu.org/g:ac91bb5b441cd7330c1644937830fe83201c69c3


commit r17-2098-gac91bb5b441cd7330c1644937830fe83201c69c3
Author: Pengfei Li <[email protected]>
Date:   Mon Jun 29 08:30:45 2026 +0000

    AArch64: Cap suggested unroll factor for small known-niters loops
    
    The AArch64 backend can suggest an unroll factor to the vectorizer in
    order to expose more ILP. However, in some cases the suggested value is
    larger than needed. For the test cases added by this patch, the AArch64
    backend suggests an unroll factor of 4, but the loops only need 1 or 2
    SVE vector iterations respectively to cover their 10 or 20 scalar
    iterations.
    
    This patch caps the suggested unroll factor with CEIL (niters, VF) for
    small known-niters loops. CEIL is used rather than truncating division
    so that the completely unrolled vector loop still covers all scalar
    iterations. Reducing the unroll factor below the number of required
    vector iterations could require a separate epilogue loop and lead to
    worse code generation.
    
    Bootstrapped and tested on aarch64-linux-gnu.
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64.cc
            (aarch64_vector_costs::determine_suggested_unroll_factor): Add a
            loop_vec_info parameter.
            (determine_suggested_unroll_factor): Cap the suggested unroll for
            small-niters loops.
            (aarch64_vector_costs::finish_cost): Pass loop_vinfo to
            determine_suggested_unroll_factor.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/sve/vect-unroll-1.c: New test.
            * gcc.target/aarch64/sve/vect-unroll-2.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64.cc                        | 18 +++++++++++++++---
 gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-1.c | 17 +++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-2.c | 17 +++++++++++++++++
 3 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 3785a8e722d0..78f1eae8336c 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -17570,7 +17570,7 @@ private:
   unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
                                 unsigned int);
   bool prefer_unrolled_loop () const;
-  unsigned int determine_suggested_unroll_factor ();
+  unsigned int determine_suggested_unroll_factor (loop_vec_info loop_vinfo);
 
   /* True if we have performed one-time initialization based on the
      vec_info.  */
@@ -19132,7 +19132,8 @@ adjust_body_cost_sve (const aarch64_vec_op_count *ops,
 }
 
 unsigned int
-aarch64_vector_costs::determine_suggested_unroll_factor ()
+aarch64_vector_costs::
+determine_suggested_unroll_factor (loop_vec_info loop_vinfo)
 {
   bool sve = m_vec_flags & VEC_ANY_SVE;
   /* If we are trying to unroll an Advanced SIMD main loop that contains
@@ -19189,6 +19190,16 @@ aarch64_vector_costs::determine_suggested_unroll_factor ()
       max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
     }
 
+  /* For known iteration loops, cap suggested unroll factor to avoid redundant
+     unrolled chunks.  Use CEIL rather than truncating division to make sure
+     the completely unrolled vector loop covers all scalar iterations.  */
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    {
+      unsigned int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
+      unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
+      max_unroll_factor = MIN (max_unroll_factor, CEIL (niters, estimated_vf));
+    }
+
   /* Make sure unroll factor is power of 2.  */
   return 1 << ceil_log2 (max_unroll_factor);
 }
@@ -19380,7 +19391,8 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
     {
       m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
                                             m_costs[vect_body]);
-      m_suggested_unroll_factor = determine_suggested_unroll_factor ();
+      m_suggested_unroll_factor
+       = determine_suggested_unroll_factor (loop_vinfo);
 
       /* For gather and scatters there's an additional overhead for the first
         iteration.  For low count loops they're not beneficial so model the
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-1.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-1.c
new file mode 100644
index 000000000000..8548f504db02
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-1.c
@@ -0,0 +1,17 @@
+/* Check that the loop is not unrolled.  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=neoverse-v2 -mautovec-preference=sve-only" } */
+
+#include <stdint.h>
+#include <stdlib.h>
+
+int
+foo (uint8_t *p1, uint8_t *p2)
+{
+  int sum = 0;
+  for (int i = 0; i < 10; i++)
+    sum += abs (p1[i] - p2[i]);
+  return sum;
+}
+
+/* { dg-final { scan-assembler-not {\tld1b\t[^\n]*, mul vl} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-2.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-2.c
new file mode 100644
index 000000000000..e5f8c457d30b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-2.c
@@ -0,0 +1,17 @@
+/* Check that the loop is unrolled by 2 rather than 4 for small niters.  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=neoverse-v2 -mautovec-preference=sve-only" } */
+
+#include <stdint.h>
+#include <stdlib.h>
+
+int
+foo (uint8_t *p1, uint8_t *p2)
+{
+  int sum = 0;
+  for (int i = 0; i < 20; i++)
+    sum += abs (p1[i] - p2[i]);
+  return sum;
+}
+
+/* { dg-final { scan-assembler-times {\tld1b\t[^\n]*, mul vl} 2 } } */

[gcc r17-2098] AArch64: Cap suggested unroll factor for small known-niters loops

Reply via email to