Hi Richard(s),

I am trying to tackle PR88915 and get GCC to further vectorize the "fallback" loop created by loop versioning, just as it already does when loop versioning is not required.

I have a prototype patch that needs further testing, but at first glance it seems to be achieving the desired outcome. I was wondering whether you had any specific concerns with my current approach.

On top of this change, I am looking at the iteration checks and alias checks generated for every "vectorized version". That is, with this patch I see:
if (iterations_check_VF_0 () && alias_check_VF_0 ())
  vectorized_for_VF_0 ();
else if (iterations_check_VF_1 () && alias_check_VF_1 ())
  vectorized_for_VF_1 ();
...
else
  scalar_loop ();

The alias checks are not always short and may cause unnecessary performance hits. Instead I am now trying to change the checks to produce the following form:

if (iterations_check_VF_0 ())
{
  if (alias_check_VF_0 ())
   {
     vectorized_for_VF_0 ();
   }
  else
    goto VF_1_check;  // or scalar_loop
}
else if (iterations_check_VF_1 ())
  {
VF_1_check:

    if (alias_check_VF_1 ())
      vectorized_for_VF_1 ();
    else
      goto VF_2_check; // or scalar_loop
  }
...
else
  scalar_loop ();


I am not yet sure whether to try the next VF after an alias check fails, or to just fall back to the scalar loop instead.

Cheers,
Andre
diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
index 2f8ab106d03a8927087ee8038e08a825f6e1e237..04d874b70ddfb8a3f5175dcddf00fef6d33f3219 100644
--- a/gcc/cfgloop.h
+++ b/gcc/cfgloop.h
@@ -266,6 +266,8 @@ struct GTY ((chain_next ("%h.next"))) loop {
      the basic-block from being collected but its index can still be
      reused.  */
   basic_block former_header;
+
+  unsigned long max_vf_limit;
 };
 
 /* Set if the loop is known to be infinite.  */
diff --git a/gcc/cfgloop.c b/gcc/cfgloop.c
index f64326b944e630075ced7035937f4601a1cb6c66..07d633b678b52943d3ab82e8d61b80cd712431ac 100644
--- a/gcc/cfgloop.c
+++ b/gcc/cfgloop.c
@@ -355,6 +355,7 @@ alloc_loop (void)
   loop->nb_iterations_upper_bound = 0;
   loop->nb_iterations_likely_upper_bound = 0;
   loop->nb_iterations_estimate = 0;
+  loop->max_vf_limit = MAX_VECTORIZATION_FACTOR;
   return loop;
 }
 
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index bd8fffb1704787d0a611fc02ee29054422596cbb..89529138b9cefb7f822bca72da06df519eff1a28 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -2968,7 +2968,8 @@ vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
 struct loop *
 vect_loop_versioning (loop_vec_info loop_vinfo,
 		      unsigned int th, bool check_profitability,
-		      poly_uint64 versioning_threshold)
+		      poly_uint64 versioning_threshold,
+		      vec<loop_p> &more_loops)
 {
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
   struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
@@ -3143,6 +3144,19 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
       nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
 			    prob, prob.invert (), prob, prob.invert (), true);
       gcc_assert (nloop);
+
+      if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+	{
+
+	  /* Add the scalar fallback loop to the MORE_LOOPS vector to be looked
+	     at later.  Also make sure it is never vectorized for the original
+	     vf by setting the limit of the maximum vf to the original vf minus
+	     one.  */
+	  nloop->max_vf_limit
+	    = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1;
+	  more_loops.safe_push (nloop);
+	}
+
       nloop = get_loop_copy (loop);
 
       /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index b49ab152012a5c7fe9cc0564e58d296447f9ffb1..081885c378200661237ef22d2b011fc775e21218 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1862,7 +1862,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
 {
   opt_result ok = opt_result::success ();
   int res;
-  unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
+  unsigned int max_vf = LOOP_VINFO_LOOP (loop_vinfo)->max_vf_limit;
   poly_uint64 min_vf = 2;
 
   /* The first group of checks is independent of the vector size.  */
@@ -8468,7 +8468,7 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
    Returns scalar epilogue loop if any.  */
 
 struct loop *
-vect_transform_loop (loop_vec_info loop_vinfo)
+vect_transform_loop (loop_vec_info loop_vinfo, vec<loop_p> &more_loops)
 {
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   struct loop *epilogue = NULL;
@@ -8530,7 +8530,8 @@ vect_transform_loop (loop_vec_info loop_vinfo)
 	}
       struct loop *sloop
 	= vect_loop_versioning (loop_vinfo, th, check_profitability,
-				versioning_threshold);
+				versioning_threshold, more_loops);
+
       sloop->force_vectorize = false;
       check_profitability = false;
     }
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index f7432f0584762fd28d54f2978dc59f2df443e991..53d66b72d3ba6e15681209153b57736630e40e3b 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1089,8 +1089,6 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo)
    conversion.  */
 #define MAX_INTERM_CVT_STEPS         3
 
-#define MAX_VECTORIZATION_FACTOR INT_MAX
-
 /* Nonzero if TYPE represents a (scalar) boolean type or type
    in the middle-end compatible with it (unsigned precision 1 integral
    types).  Used to determine which types should be vectorized as
@@ -1473,7 +1471,7 @@ extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge);
 struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *,
 						     struct loop *, edge);
 struct loop *vect_loop_versioning (loop_vec_info, unsigned int, bool,
-				   poly_uint64);
+				   poly_uint64, vec<loop_p> &);
 extern struct loop *vect_do_peeling (loop_vec_info, tree, tree,
 				     tree *, tree *, tree *, int, bool, bool);
 extern void vect_prepare_for_masked_peels (loop_vec_info);
@@ -1614,7 +1612,7 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
 				unsigned int, tree, unsigned int);
 
 /* Drive for loop transformation stage.  */
-extern struct loop *vect_transform_loop (loop_vec_info);
+extern struct loop *vect_transform_loop (loop_vec_info, vec<loop_p> &);
 extern opt_loop_vec_info vect_analyze_loop_form (struct loop *,
 						 vec_info_shared *);
 extern bool vectorizable_live_operation (stmt_vec_info, gimple_stmt_iterator *,
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 325ef58722d21a65ab896a9358677b07111b060b..d63d532d5fe474904ff84b23912a2ed9cfd6194a 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -868,7 +868,8 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
 		      unsigned *num_vectorized_loops,
 		      loop_p loop, loop_vec_info orig_loop_vinfo,
 		      gimple *loop_vectorized_call,
-		      gimple *loop_dist_alias_call)
+		      gimple *loop_dist_alias_call,
+		      vec<loop_p> &more_loops)
 {
   unsigned ret = 0;
   vec_info_shared shared;
@@ -979,7 +980,7 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
 			 "loop vectorized using variable length vectors\n");
     }
 
-  loop_p new_loop = vect_transform_loop (loop_vinfo);
+  loop_p new_loop = vect_transform_loop (loop_vinfo, more_loops);
   (*num_vectorized_loops)++;
   /* Now that the loop has been vectorized, allow it to be unrolled
      etc.  */
@@ -1013,7 +1014,7 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
   /* Epilogue of vectorized loop must be vectorized too.  */
   if (new_loop)
     ret |= try_vectorize_loop_1 (simduid_to_vf_htab, num_vectorized_loops,
-				 new_loop, loop_vinfo, NULL, NULL);
+				 new_loop, loop_vinfo, NULL, NULL, more_loops);
 
   return ret;
 }
@@ -1022,7 +1023,8 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
 
 static unsigned
 try_vectorize_loop (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
-		    unsigned *num_vectorized_loops, loop_p loop)
+		    unsigned *num_vectorized_loops, loop_p loop,
+		    vec<loop_p> &more_loops)
 {
   if (!((flag_tree_loop_vectorize
 	 && optimize_loop_nest_for_speed_p (loop))
@@ -1032,7 +1034,8 @@ try_vectorize_loop (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
   return try_vectorize_loop_1 (simduid_to_vf_htab, num_vectorized_loops,
 			       loop, NULL,
 			       vect_loop_vectorized_call (loop),
-			       vect_loop_dist_alias_call (loop));
+			       vect_loop_dist_alias_call (loop),
+			       more_loops);
 }
 
 
@@ -1051,6 +1054,7 @@ vectorize_loops (void)
   hash_table<simd_array_to_simduid> *simd_array_to_simduid_htab = NULL;
   bool any_ifcvt_loops = false;
   unsigned ret = 0;
+  auto_vec<loop_p> more_loops;
 
   vect_loops_num = number_of_loops (cfun);
 
@@ -1105,14 +1109,19 @@ vectorize_loops (void)
 		    vector_loop->dont_vectorize = true;
 		    ret |= try_vectorize_loop (simduid_to_vf_htab,
 					       &num_vectorized_loops,
-					       vector_loop);
+					       vector_loop,
+					       more_loops);
 		  }
 	      }
 	  }
       }
     else
       ret |= try_vectorize_loop (simduid_to_vf_htab, &num_vectorized_loops,
-				 loop);
+				 loop, more_loops);
+
+  while (!more_loops.is_empty ())
+    try_vectorize_loop (simduid_to_vf_htab, &num_vectorized_loops,
+			more_loops.pop (), more_loops);
 
   vect_location = dump_user_location_t ();
 
diff --git a/gcc/tree.h b/gcc/tree.h
index 3dce602dfbaca03f568e1c3638d56dfe3a3fd01c..b1c41131e9d1637784a1024d5c301252a06f89e1 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -22,6 +22,8 @@ along with GCC; see the file COPYING3.  If not see
 
 #include "tree-core.h"
 
+#define MAX_VECTORIZATION_FACTOR INT_MAX
+
 /* Convert a target-independent built-in function code to a combined_fn.  */
 
 inline combined_fn

Reply via email to