https://gcc.gnu.org/g:ac91bb5b441cd7330c1644937830fe83201c69c3
commit r17-2098-gac91bb5b441cd7330c1644937830fe83201c69c3 Author: Pengfei Li <[email protected]> Date: Mon Jun 29 08:30:45 2026 +0000 AArch64: Cap suggested unroll factor for small known-niters loops The AArch64 backend can suggest an unroll factor to the vectorizer in order to expose more ILP. However, in some cases the suggested value is larger than needed. For the test cases added by this patch, the AArch64 backend suggests an unroll factor of 4, but the loops only need 1 or 2 SVE vector iterations, respectively, to cover their 10 or 20 scalar iterations. This patch caps the suggested unroll factor with CEIL (niters, VF) for small known-niters loops. CEIL is used rather than truncating division so that the completely unrolled vector loop still covers all scalar iterations. Note that the capped value is still rounded up to a power of 2 afterwards, so the final suggested factor can exceed CEIL (niters, VF) when that value is not itself a power of 2. Reducing the unroll factor below the number of required vector iterations could require a separate epilogue loop and lead to worse code generation. Bootstrapped and tested on aarch64-linux-gnu. gcc/ChangeLog: * config/aarch64/aarch64.cc (aarch64_vector_costs::determine_suggested_unroll_factor): Add a loop_vec_info parameter. (determine_suggested_unroll_factor): Cap the suggested unroll for small-niters loops. (aarch64_vector_costs::finish_cost): Pass loop_vinfo to determine_suggested_unroll_factor. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/vect-unroll-1.c: New test. * gcc.target/aarch64/sve/vect-unroll-2.c: New test. 
Diff: --- gcc/config/aarch64/aarch64.cc | 18 +++++++++++++++--- gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-1.c | 17 +++++++++++++++++ gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-2.c | 17 +++++++++++++++++ 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 3785a8e722d0..78f1eae8336c 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -17570,7 +17570,7 @@ private: unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *, unsigned int); bool prefer_unrolled_loop () const; - unsigned int determine_suggested_unroll_factor (); + unsigned int determine_suggested_unroll_factor (loop_vec_info loop_vinfo); /* True if we have performed one-time initialization based on the vec_info. */ @@ -19132,7 +19132,8 @@ adjust_body_cost_sve (const aarch64_vec_op_count *ops, } unsigned int -aarch64_vector_costs::determine_suggested_unroll_factor () +aarch64_vector_costs:: +determine_suggested_unroll_factor (loop_vec_info loop_vinfo) { bool sve = m_vec_flags & VEC_ANY_SVE; /* If we are trying to unroll an Advanced SIMD main loop that contains @@ -19189,6 +19190,16 @@ aarch64_vector_costs::determine_suggested_unroll_factor () max_unroll_factor = MAX (max_unroll_factor, unroll_factor); } + /* For known iteration loops, cap suggested unroll factor to avoid redundant + unrolled chunks. Use CEIL rather than truncating division to make sure + the completely unrolled vector loop covers all scalar iterations. */ + if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) + { + unsigned int niters = LOOP_VINFO_INT_NITERS (loop_vinfo); + unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo); + max_unroll_factor = MIN (max_unroll_factor, CEIL (niters, estimated_vf)); + } + /* Make sure unroll factor is power of 2. 
*/ return 1 << ceil_log2 (max_unroll_factor); } @@ -19380,7 +19391,8 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs) { m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs, m_costs[vect_body]); - m_suggested_unroll_factor = determine_suggested_unroll_factor (); + m_suggested_unroll_factor + = determine_suggested_unroll_factor (loop_vinfo); /* For gather and scatters there's an additional overhead for the first iteration. For low count loops they're not beneficial so model the diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-1.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-1.c new file mode 100644 index 000000000000..8548f504db02 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-1.c @@ -0,0 +1,17 @@ +/* Check that the loop is not unrolled. */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mtune=neoverse-v2 -mautovec-preference=sve-only" } */ + +#include <stdint.h> +#include <stdlib.h> + +int +foo (uint8_t *p1, uint8_t *p2) +{ + int sum = 0; + for (int i = 0; i < 10; i++) + sum += abs (p1[i] - p2[i]); + return sum; +} + +/* { dg-final { scan-assembler-not {\tld1b\t[^\n]*, mul vl} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-2.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-2.c new file mode 100644 index 000000000000..e5f8c457d30b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-unroll-2.c @@ -0,0 +1,17 @@ +/* Check that the loop is unrolled by 2 rather than 4 for small niters. */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mtune=neoverse-v2 -mautovec-preference=sve-only" } */ + +#include <stdint.h> +#include <stdlib.h> + +int +foo (uint8_t *p1, uint8_t *p2) +{ + int sum = 0; + for (int i = 0; i < 20; i++) + sum += abs (p1[i] - p2[i]); + return sum; +} + +/* { dg-final { scan-assembler-times {\tld1b\t[^\n]*, mul vl} 2 } } */
