https://gcc.gnu.org/g:7f87bfa4a7302ce663db51fb073a40045052cc11

commit r16-1646-g7f87bfa4a7302ce663db51fb073a40045052cc11
Author: Tamar Christina <tamar.christ...@arm.com>
Date:   Tue Jun 24 07:14:27 2025 +0100

    middle-end: Apply loop->unroll directly in vectorizer
    
    Consider the loop
    
    void f1 (int *restrict a, int n)
    {
    #pragma GCC unroll 4
      for (int i = 0; i < n; i++)
        a[i] *= 2;
    }
    
    Today this loop is vectorized and then unrolled 3x by the RTL unroller due to
    the use of the pragma.  This is unfortunate because the pragma was intended for
    the scalar loop but we end up with an unrolled vector loop and a longer path to
    the entry which has a low enough VF requirement to enter.
    
    This patch instead seeds the suggested_unroll_factor with the value the user
    requested and uses it to maintain the total VF that the user wanted the
    scalar loop to maintain.
    
    In effect it applies the unrolling inside the vector loop itself.  This has
    benefits for things like reductions, as it allows us to split the accumulator
    and so the unrolled loop is more efficient.  For early-break it allows the
    cbranch call to be shared between the unrolled elements, giving you more
    effective unrolling because it doesn't need the repeated cbranch which can be
    expensive.
    
    The target can then choose to create multiple epilogues to deal with the "rest".
    
    The example above now generates:
    
    .L4:
            ldr     q31, [x2]
            add     v31.4s, v31.4s, v31.4s
            str     q31, [x2], 16
            cmp     x2, x3
            bne     .L4
    
    as V4SI maintains the requested VF, but e.g. pragma unroll 8 generates:
    
    .L4:
            ldp     q30, q31, [x2]
            add     v30.4s, v30.4s, v30.4s
            add     v31.4s, v31.4s, v31.4s
            stp     q30, q31, [x2], 32
            cmp     x3, x2
            bne     .L4
    
    gcc/ChangeLog:
    
            * doc/extend.texi: Document pragma unroll interaction with vectorizer.
            * tree-vectorizer.h (LOOP_VINFO_USER_UNROLL): New.
            (class _loop_vec_info): Add user_unroll.
            * tree-vect-loop.cc (vect_analyze_loop_1): Set
            suggested_unroll_factor and retry.
            (_loop_vec_info::_loop_vec_info): Initialize user_unroll.
            (vect_transform_loop): Clear the loop->unroll value if the pragma was
            used.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/unroll-vect.c: New test.

Diff:
---
 gcc/doc/extend.texi                            |  5 ++
 gcc/testsuite/gcc.target/aarch64/unroll-vect.c | 20 ++++++++
 gcc/tree-vect-loop.cc                          | 63 +++++++++++++++++++-------
 gcc/tree-vectorizer.h                          |  5 ++
 4 files changed, 77 insertions(+), 16 deletions(-)

diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 69c651207464..7da99f77ec82 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -10382,6 +10382,11 @@ loop or a @code{#pragma GCC ivdep}, and applies only to the loop that follows.
 @var{n} is an integer constant expression specifying the unrolling factor.
 The values of @math{0} and @math{1} block any unrolling of the loop.
 
+If the loop was vectorized the unroll factor specified will be used to seed the
+vectorizer unroll factor.  Whether the loop is unrolled or not will be
+determined by target costing.  The resulting vectorized loop may still be
+unrolled more in later passes depending on the target costing.
+
 @end table
 
 @node Thread-Local
diff --git a/gcc/testsuite/gcc.target/aarch64/unroll-vect.c b/gcc/testsuite/gcc.target/aarch64/unroll-vect.c
new file mode 100644
index 000000000000..3cb774ba9578
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/unroll-vect.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv8-a --param aarch64-autovec-preference=asimd-only -std=gnu99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** f1:
+**     ...
+**     add     v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**     add     v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**     add     v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**     add     v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**     ...
+*/
+void f1 (int *restrict a, int n)
+{
+#pragma GCC unroll 16
+  for (int i = 0; i < n; i++)
+    a[i] *= 2;
+}
+
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index eb2eb8b1fc08..9ee8e50ee75a 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -1074,6 +1074,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     peeling_for_gaps (false),
     peeling_for_niter (false),
     early_breaks (false),
+    user_unroll (false),
     no_data_dependencies (false),
     has_mask_store (false),
     scalar_loop_scaling (profile_probability::uninitialized ()),
@@ -3429,27 +3430,50 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
                     res ? "succeeded" : "failed",
                     GET_MODE_NAME (loop_vinfo->vector_mode));
 
-  if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) && suggested_unroll_factor > 1)
+  auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
+  if (res && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+      /* Check to see if the user wants to unroll or if the target wants to.  */
+      && (suggested_unroll_factor > 1 || user_unroll > 1))
     {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_NOTE, vect_location,
+      if (suggested_unroll_factor == 1)
+       {
+         int assumed_vf = vect_vf_for_cost (loop_vinfo);
+         suggested_unroll_factor = user_unroll / assumed_vf;
+         if (suggested_unroll_factor > 1)
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_NOTE, vect_location,
+                        "setting unroll factor to %d based on user requested "
+                        "unroll factor %d and suggested vectorization "
+                        "factor: %d\n",
+                        suggested_unroll_factor, user_unroll, assumed_vf);
+           }
+       }
+
+       if (suggested_unroll_factor > 1)
+         {
+           if (dump_enabled_p ())
+             dump_printf_loc (MSG_NOTE, vect_location,
                         "***** Re-trying analysis for unrolling"
                         " with unroll factor %d and slp %s.\n",
                         suggested_unroll_factor,
                         slp_done_for_suggested_uf ? "on" : "off");
-      loop_vec_info unroll_vinfo
-       = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
-      unroll_vinfo->vector_mode = vector_mode;
-      unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
-      opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
-                                               slp_done_for_suggested_uf);
-      if (new_res)
-       {
-         delete loop_vinfo;
-         loop_vinfo = unroll_vinfo;
-       }
-      else
-       delete unroll_vinfo;
+           loop_vec_info unroll_vinfo
+               = vect_create_loop_vinfo (loop, shared, loop_form_info, NULL);
+           unroll_vinfo->vector_mode = vector_mode;
+           unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
+           opt_result new_res
+               = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
+                                      slp_done_for_suggested_uf);
+           if (new_res)
+             {
+               delete loop_vinfo;
+               loop_vinfo = unroll_vinfo;
+               LOOP_VINFO_USER_UNROLL (loop_vinfo) = user_unroll > 1;
+             }
+           else
+             delete unroll_vinfo;
+         }
     }
 
   /* Remember the autodetected vector mode.  */
@@ -12042,6 +12066,13 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
        dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
                         " variable-length vectorization factor\n");
     }
+
+  /* When we have unrolled the loop due to a user requested value we should
+     leave it up to the RTL unroll heuristics to determine if it's still worth
+     while to unroll more.  */
+  if (LOOP_VINFO_USER_UNROLL (loop_vinfo))
+    loop->unroll = 0;
+
   /* Free SLP instances here because otherwise stmt reference counting
      won't work.  */
   slp_instance instance;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 6ccafaf18e63..dc60b4184ee0 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -985,6 +985,10 @@ public:
   /* Main loop IV cond.  */
   gcond* loop_iv_cond;
 
+  /* True if we have an unroll factor requested by the user through pragma GCC
+     unroll.  */
+  bool user_unroll;
+
   /* True if there are no loop carried data dependencies in the loop.
      If loop->safelen <= 1, then this is always true, either the loop
      didn't have any loop carried data dependencies, or the loop is being
@@ -1110,6 +1114,7 @@ public:
 #define LOOP_VINFO_CHECK_UNEQUAL_ADDRS(L)  (L)->check_unequal_addrs
 #define LOOP_VINFO_CHECK_NONZERO(L)        (L)->check_nonzero
 #define LOOP_VINFO_LOWER_BOUNDS(L)         (L)->lower_bounds
+#define LOOP_VINFO_USER_UNROLL(L)          (L)->user_unroll
 #define LOOP_VINFO_GROUPED_STORES(L)       (L)->grouped_stores
 #define LOOP_VINFO_SLP_INSTANCES(L)        (L)->slp_instances
 #define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor

Reply via email to