https://gcc.gnu.org/g:7f87bfa4a7302ce663db51fb073a40045052cc11
commit r16-1646-g7f87bfa4a7302ce663db51fb073a40045052cc11 Author: Tamar Christina <tamar.christ...@arm.com> Date: Tue Jun 24 07:14:27 2025 +0100 middle-end: Apply loop->unroll directly in vectorizer Consider the loop: void f1 (int *restrict a, int n) { #pragma GCC unroll 4 for (int i = 0; i < n; i++) a[i] *= 2; } Today this is vectorized and then unrolled 3x by the RTL unroller due to the use of the pragma. This is unfortunate because the pragma was intended for the scalar loop but we end up with an unrolled vector loop and a longer path to the entry which has a low enough VF requirement to enter. This patch instead seeds the suggested_unroll_factor with the value the user requested and uses it to maintain the total VF that the user wanted the scalar loop to maintain. In effect it applies the unrolling inside the vector loop itself. This has benefits for things like reductions, as it allows us to split the accumulator and so the unrolled loop is more efficient. For early-break it allows the cbranch call to be shared between the unrolled elements, giving you more effective unrolling because it doesn't need the repeated cbranch, which can be expensive. The target can then choose to create multiple epilogues to deal with the "rest". The example above now generates: .L4: ldr q31, [x2] add v31.4s, v31.4s, v31.4s str q31, [x2], 16 cmp x2, x3 bne .L4 as V4SI maintains the requested VF, but e.g. pragma unroll 8 generates: .L4: ldp q30, q31, [x2] add v30.4s, v30.4s, v30.4s add v31.4s, v31.4s, v31.4s stp q30, q31, [x2], 32 cmp x3, x2 bne .L4 gcc/ChangeLog: * doc/extend.texi: Document pragma unroll interaction with vectorizer. * tree-vectorizer.h (LOOP_VINFO_USER_UNROLL): New. (class _loop_vec_info): Add user_unroll. * tree-vect-loop.cc (vect_analyze_loop_1): Set suggested_unroll_factor and retry. (_loop_vec_info::_loop_vec_info): Initialize user_unroll. (vect_transform_loop): Clear the loop->unroll value if the pragma was used. 
gcc/testsuite/ChangeLog: * gcc.target/aarch64/unroll-vect.c: New test. Diff: --- gcc/doc/extend.texi | 5 ++ gcc/testsuite/gcc.target/aarch64/unroll-vect.c | 20 ++++++++ gcc/tree-vect-loop.cc | 63 +++++++++++++++++++------- gcc/tree-vectorizer.h | 5 ++ 4 files changed, 77 insertions(+), 16 deletions(-) diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi index 69c651207464..7da99f77ec82 100644 --- a/gcc/doc/extend.texi +++ b/gcc/doc/extend.texi @@ -10382,6 +10382,11 @@ loop or a @code{#pragma GCC ivdep}, and applies only to the loop that follows. @var{n} is an integer constant expression specifying the unrolling factor. The values of @math{0} and @math{1} block any unrolling of the loop. +If the loop was vectorized the unroll factor specified will be used to seed the +vectorizer unroll factor. Whether the loop is unrolled or not will be +determined by target costing. The resulting vectorized loop may still be +unrolled more in later passes depending on the target costing. + @end table @node Thread-Local diff --git a/gcc/testsuite/gcc.target/aarch64/unroll-vect.c b/gcc/testsuite/gcc.target/aarch64/unroll-vect.c new file mode 100644 index 000000000000..3cb774ba9578 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/unroll-vect.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -march=armv8-a --param aarch64-autovec-preference=asimd-only -std=gnu99" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +/* +** f1: +** ... +** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s +** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s +** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s +** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s +** ... 
+*/ +void f1 (int *restrict a, int n) +{ +#pragma GCC unroll 16 + for (int i = 0; i < n; i++) + a[i] *= 2; +} + diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index eb2eb8b1fc08..9ee8e50ee75a 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -1074,6 +1074,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) peeling_for_gaps (false), peeling_for_niter (false), early_breaks (false), + user_unroll (false), no_data_dependencies (false), has_mask_store (false), scalar_loop_scaling (profile_probability::uninitialized ()), @@ -3429,27 +3430,50 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared, res ? "succeeded" : "failed", GET_MODE_NAME (loop_vinfo->vector_mode)); - if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) && suggested_unroll_factor > 1) + auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll; + if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) + /* Check to see if the user wants to unroll or if the target wants to. */ + && (suggested_unroll_factor > 1 || user_unroll > 1)) { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, + if (suggested_unroll_factor == 1) + { + int assumed_vf = vect_vf_for_cost (loop_vinfo); + suggested_unroll_factor = user_unroll / assumed_vf; + if (suggested_unroll_factor > 1) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "setting unroll factor to %d based on user requested " + "unroll factor %d and suggested vectorization " + "factor: %d\n", + suggested_unroll_factor, user_unroll, assumed_vf); + } + } + + if (suggested_unroll_factor > 1) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "***** Re-trying analysis for unrolling" " with unroll factor %d and slp %s.\n", suggested_unroll_factor, slp_done_for_suggested_uf ? 
"on" : "off"); - loop_vec_info unroll_vinfo - = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL); - unroll_vinfo->vector_mode = vector_mode; - unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor; - opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL, - slp_done_for_suggested_uf); - if (new_res) - { - delete loop_vinfo; - loop_vinfo = unroll_vinfo; - } - else - delete unroll_vinfo; + loop_vec_info unroll_vinfo + = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL); + unroll_vinfo->vector_mode = vector_mode; + unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor; + opt_result new_res + = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL, + slp_done_for_suggested_uf); + if (new_res) + { + delete loop_vinfo; + loop_vinfo = unroll_vinfo; + LOOP_VINFO_USER_UNROLL (loop_vinfo) = user_unroll > 1; + } + else + delete unroll_vinfo; + } } /* Remember the autodetected vector mode. */ @@ -12042,6 +12066,13 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to" " variable-length vectorization factor\n"); } + + /* When we have unrolled the loop due to a user requested value we should + leave it up to the RTL unroll heuristics to determine if it's still worth + while to unroll more. */ + if (LOOP_VINFO_USER_UNROLL (loop_vinfo)) + loop->unroll = 0; + /* Free SLP instances here because otherwise stmt reference counting won't work. */ slp_instance instance; diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 6ccafaf18e63..dc60b4184ee0 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -985,6 +985,10 @@ public: /* Main loop IV cond. */ gcond* loop_iv_cond; + /* True if we have an unroll factor requested by the user through pragma GCC + unroll. */ + bool user_unroll; + /* True if there are no loop carried data dependencies in the loop. 
If loop->safelen <= 1, then this is always true, either the loop didn't have any loop carried data dependencies, or the loop is being @@ -1110,6 +1114,7 @@ public: #define LOOP_VINFO_CHECK_UNEQUAL_ADDRS(L) (L)->check_unequal_addrs #define LOOP_VINFO_CHECK_NONZERO(L) (L)->check_nonzero #define LOOP_VINFO_LOWER_BOUNDS(L) (L)->lower_bounds +#define LOOP_VINFO_USER_UNROLL(L) (L)->user_unroll #define LOOP_VINFO_GROUPED_STORES(L) (L)->grouped_stores #define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances #define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor