Hi All,

Consider the loop

void f1 (int *restrict a, int n)
{
#pragma GCC unroll 4 requested
  for (int i = 0; i < n; i++)
    a[i] *= 2;
}

Which today is vectorized and then unrolled 3x by the RTL unroller due to the
use of the pragma.  This is unfortunate because the pragma was intended for the
scalar loop but we end up with an unrolled vector loop and a longer path to the
entry which has a low enough VF requirement to enter.

This patch instead seeds the suggested_unroll_factor with the value the user
requested and instead uses it to maintain the total VF that the user wanted the
scalar loop to maintain.

In effect it applies the unrolling inside the vector loop itself.  This has the
benefits for things like reductions, as it allows us to split the accumulator
and so the unrolled loop is more efficient.  For early-break it allows the
cbranch call to be shared between the unrolled elements, giving you more
effective unrolling because it doesn't need the repeated cbranch which can be
expensive.

The target can then choose to create multiple epilogues to deal with the "rest".

The example above now generates:

.L4:
        ldr     q31, [x2]
        add     v31.4s, v31.4s, v31.4s
        str     q31, [x2], 16
        cmp     x2, x3
        bne     .L4

as V4SI maintains the requested VF, but e.g. pragma unroll 8 generates:

.L4:
        ldp     q30, q31, [x2]
        add     v30.4s, v30.4s, v30.4s
        add     v31.4s, v31.4s, v31.4s
        stp     q30, q31, [x2], 32
        cmp     x3, x2
        bne     .L4

Bootstrapped Regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
-m32, -m64 and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * tree-vectorizer.h (vector_costs::set_suggested_unroll_factor,
        LOOP_VINFO_USER_UNROLL): New.
        (class _loop_vec_info): Add user_unroll.
        * tree-vect-loop.cc (vect_estimate_min_profitable_iters): Set
        suggested_unroll_factor before calling backend costing.
        (_loop_vec_info::_loop_vec_info): Initialize user_unroll.
        (vect_transform_loop): Clear the loop->unroll value if the pragma was
        used.

---
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 
fe6f3cf188e40396b299ff9e814cc402bc2d4e2d..a13e4978bc7ed651be3a65d243e84c5aaf706f65
 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -1073,6 +1073,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, 
vec_info_shared *shared)
     peeling_for_gaps (false),
     peeling_for_niter (false),
     early_breaks (false),
+    user_unroll (false),
     no_data_dependencies (false),
     has_mask_store (false),
     scalar_loop_scaling (profile_probability::uninitialized ()),
@@ -4983,6 +4984,26 @@ vect_estimate_min_profitable_iters (loop_vec_info 
loop_vinfo,
        }
     }
 
+  /* Seed the target cost model with what the user requested if the unroll
+     factor is larger than 1 vector VF.  */
+  auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
+  if (user_unroll > 1)
+    {
+      LOOP_VINFO_USER_UNROLL (loop_vinfo) = true;
+      int unroll_fact = user_unroll / assumed_vf;
+      unroll_fact = 1 << ceil_log2 (unroll_fact);
+      if (unroll_fact > 1)
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                        "setting unroll factor to %d based on user requested "
+                        "unroll factor %d and suggested vectorization "
+                        "factor: %d\n",
+                        unroll_fact, user_unroll, assumed_vf);
+         loop_vinfo->vector_costs->set_suggested_unroll_factor (unroll_fact);
+       }
+    }
+
   /* Complete the target-specific cost calculations.  */
   loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
   vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
@@ -12364,14 +12385,20 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple 
*loop_vectorized_call)
                         GET_MODE_NAME (loop_vinfo->vector_mode));
     }
 
-  /* Loops vectorized with a variable factor won't benefit from
+  /* Loops vectorized would have already taken into account unrolling specified
+     by the user as the suggested unroll factor, as such we need to prevent the
+     RTL unroller from unrolling twice.  The only exception is static known
+     iterations where we would have expected the loop to be fully unrolled.
+     Loops vectorized with a variable factor won't benefit from
      unrolling/peeling.  */
-  if (!vf.is_constant ())
+  if (LOOP_VINFO_USER_UNROLL (loop_vinfo)
+      || !vf.is_constant ())
     {
       loop->unroll = 1;
       if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
-                        " variable-length vectorization factor\n");
+                        " having already considered vector unrolling or "
+                        "variable-length vectorization factor.\n");
     }
   /* Free SLP instances here because otherwise stmt reference counting
      won't work.  */
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 
a2f33a5ecd60288fe7f28ee639ff8b6a77667796..5675309ab16dc7f7f0b98e229e4798075e6cdd7e
 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -970,6 +970,10 @@ public:
   /* Main loop IV cond.  */
   gcond* loop_iv_cond;
 
+  /* True if we have an unroll factor requested by the user through pragma GCC
+     unroll.  */
+  bool user_unroll;
+
   /* True if there are no loop carried data dependencies in the loop.
      If loop->safelen <= 1, then this is always true, either the loop
      didn't have any loop carried data dependencies, or the loop is being
@@ -1094,6 +1098,7 @@ public:
 #define LOOP_VINFO_CHECK_UNEQUAL_ADDRS(L)  (L)->check_unequal_addrs
 #define LOOP_VINFO_CHECK_NONZERO(L)        (L)->check_nonzero
 #define LOOP_VINFO_LOWER_BOUNDS(L)         (L)->lower_bounds
+#define LOOP_VINFO_USER_UNROLL(L)          (L)->user_unroll
 #define LOOP_VINFO_GROUPED_STORES(L)       (L)->grouped_stores
 #define LOOP_VINFO_SLP_INSTANCES(L)        (L)->slp_instances
 #define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor
@@ -1693,6 +1698,7 @@ public:
   unsigned int outside_cost () const;
   unsigned int total_cost () const;
   unsigned int suggested_unroll_factor () const;
+  void set_suggested_unroll_factor (unsigned int);
   machine_mode suggested_epilogue_mode () const;
 
 protected:
@@ -1791,6 +1797,15 @@ vector_costs::suggested_unroll_factor () const
   return m_suggested_unroll_factor;
 }
 
+/* Set the suggested unroll factor.  */
+
+inline void
+vector_costs::set_suggested_unroll_factor (unsigned unroll_fact)
+{
+  gcc_checking_assert (!m_finished);
+  m_suggested_unroll_factor = unroll_fact;
+}
+
 /* Return the suggested epilogue mode.  */
 
 inline machine_mode


-- 
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index fe6f3cf188e40396b299ff9e814cc402bc2d4e2d..a13e4978bc7ed651be3a65d243e84c5aaf706f65 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -1073,6 +1073,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     peeling_for_gaps (false),
     peeling_for_niter (false),
     early_breaks (false),
+    user_unroll (false),
     no_data_dependencies (false),
     has_mask_store (false),
     scalar_loop_scaling (profile_probability::uninitialized ()),
@@ -4983,6 +4984,26 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 	}
     }
 
+  /* Seed the target cost model with what the user requested if the unroll
+     factor is larger than 1 vector VF.  */
+  auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
+  if (user_unroll > 1)
+    {
+      LOOP_VINFO_USER_UNROLL (loop_vinfo) = true;
+      int unroll_fact = user_unroll / assumed_vf;
+      unroll_fact = 1 << ceil_log2 (unroll_fact);
+      if (unroll_fact > 1)
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_NOTE, vect_location,
+			 "setting unroll factor to %d based on user requested "
+			 "unroll factor %d and suggested vectorization "
+			 "factor: %d\n",
+			 unroll_fact, user_unroll, assumed_vf);
+	  loop_vinfo->vector_costs->set_suggested_unroll_factor (unroll_fact);
+	}
+    }
+
   /* Complete the target-specific cost calculations.  */
   loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
   vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
@@ -12364,14 +12385,20 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 			 GET_MODE_NAME (loop_vinfo->vector_mode));
     }
 
-  /* Loops vectorized with a variable factor won't benefit from
+  /* Loops vectorized would have already taken into account unrolling specified
+     by the user as the suggested unroll factor, as such we need to prevent the
+     RTL unroller from unrolling twice.  The only exception is static known
+     iterations where we would have expected the loop to be fully unrolled.
+     Loops vectorized with a variable factor won't benefit from
      unrolling/peeling.  */
-  if (!vf.is_constant ())
+  if (LOOP_VINFO_USER_UNROLL (loop_vinfo)
+      || !vf.is_constant ())
     {
       loop->unroll = 1;
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
-			 " variable-length vectorization factor\n");
+			 " having already considered vector unrolling or "
+			 "variable-length vectorization factor.\n");
     }
   /* Free SLP instances here because otherwise stmt reference counting
      won't work.  */
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index a2f33a5ecd60288fe7f28ee639ff8b6a77667796..5675309ab16dc7f7f0b98e229e4798075e6cdd7e 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -970,6 +970,10 @@ public:
   /* Main loop IV cond.  */
   gcond* loop_iv_cond;
 
+  /* True if we have an unroll factor requested by the user through pragma GCC
+     unroll.  */
+  bool user_unroll;
+
   /* True if there are no loop carried data dependencies in the loop.
      If loop->safelen <= 1, then this is always true, either the loop
      didn't have any loop carried data dependencies, or the loop is being
@@ -1094,6 +1098,7 @@ public:
 #define LOOP_VINFO_CHECK_UNEQUAL_ADDRS(L)  (L)->check_unequal_addrs
 #define LOOP_VINFO_CHECK_NONZERO(L)        (L)->check_nonzero
 #define LOOP_VINFO_LOWER_BOUNDS(L)         (L)->lower_bounds
+#define LOOP_VINFO_USER_UNROLL(L)          (L)->user_unroll
 #define LOOP_VINFO_GROUPED_STORES(L)       (L)->grouped_stores
 #define LOOP_VINFO_SLP_INSTANCES(L)        (L)->slp_instances
 #define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor
@@ -1693,6 +1698,7 @@ public:
   unsigned int outside_cost () const;
   unsigned int total_cost () const;
   unsigned int suggested_unroll_factor () const;
+  void set_suggested_unroll_factor (unsigned int);
   machine_mode suggested_epilogue_mode () const;
 
 protected:
@@ -1791,6 +1797,15 @@ vector_costs::suggested_unroll_factor () const
   return m_suggested_unroll_factor;
 }
 
+/* Set the suggested unroll factor.  */
+
+inline void
+vector_costs::set_suggested_unroll_factor (unsigned unroll_fact)
+{
+  gcc_checking_assert (!m_finished);
+  m_suggested_unroll_factor = unroll_fact;
+}
+
 /* Return the suggested epilogue mode.  */
 
 inline machine_mode

Reply via email to