Hi Richard(s),
I am trying to tackle PR88915 and get GCC to further vectorize the
"fallback" loop when doing loop versioning as it does when loop
versioning is not required.
I have a prototype patch that needs further testing, but at first glance
it seems to be achieving the desired outcome.
I was wondering whether you had any specific concerns with my current
approach.
On top of this change I am looking at the iteration and alias checks
generated for every "vectorized version". I.e. with this patch I see:
if (iterations_check_VF_0 () && alias_check_VF_0 ())
vectorized_for_VF_0 ();
else if (iterations_check_VF_1 () && alias_check_VF_1 ())
vectorized_for_VF_1 ();
...
else
scalar_loop ();
The alias checks are not always short and may cause unnecessary
performance hits. Instead I am now trying to change the checks to
produce the following form:
if (iterations_check_VF_0 ())
{
if (alias_check_VF_0 ())
{
vectorized_for_VF_0 ();
}
else
goto VF_1_check; // or scalar_loop
}
else if (iterations_check_VF_1 ())
{
VF_1_check:
if (alias_check_VF_1 ())
vectorized_for_VF_1 ();
else
goto VF_2_check; // or scalar_loop
}
...
else
scalar_loop ();
I am not yet sure whether to try the next VF after an alias check fails
or to just fall back to the scalar loop instead.
Cheers,
Andre
diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
index 2f8ab106d03a8927087ee8038e08a825f6e1e237..04d874b70ddfb8a3f5175dcddf00fef6d33f3219 100644
--- a/gcc/cfgloop.h
+++ b/gcc/cfgloop.h
@@ -266,6 +266,8 @@ struct GTY ((chain_next ("%h.next"))) loop {
the basic-block from being collected but its index can still be
reused. */
basic_block former_header;
+
+ unsigned long max_vf_limit;
};
/* Set if the loop is known to be infinite. */
diff --git a/gcc/cfgloop.c b/gcc/cfgloop.c
index f64326b944e630075ced7035937f4601a1cb6c66..07d633b678b52943d3ab82e8d61b80cd712431ac 100644
--- a/gcc/cfgloop.c
+++ b/gcc/cfgloop.c
@@ -355,6 +355,7 @@ alloc_loop (void)
loop->nb_iterations_upper_bound = 0;
loop->nb_iterations_likely_upper_bound = 0;
loop->nb_iterations_estimate = 0;
+ loop->max_vf_limit = MAX_VECTORIZATION_FACTOR;
return loop;
}
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index bd8fffb1704787d0a611fc02ee29054422596cbb..89529138b9cefb7f822bca72da06df519eff1a28 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -2968,7 +2968,8 @@ vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
struct loop *
vect_loop_versioning (loop_vec_info loop_vinfo,
unsigned int th, bool check_profitability,
- poly_uint64 versioning_threshold)
+ poly_uint64 versioning_threshold,
+ vec<loop_p> &more_loops)
{
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
@@ -3143,6 +3144,19 @@ vect_loop_versioning (loop_vec_info loop_vinfo,
nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
prob, prob.invert (), prob, prob.invert (), true);
gcc_assert (nloop);
+
+ if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ {
+
+ /* Add the scalar fallback loop to the MORE_LOOPS vector to be looked
+ at later. Also make sure it is never vectorized for the original
+ vf by setting the limit of the maximum vf to the original vf minus
+ one. */
+ nloop->max_vf_limit
+ = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1;
+ more_loops.safe_push (nloop);
+ }
+
nloop = get_loop_copy (loop);
/* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index b49ab152012a5c7fe9cc0564e58d296447f9ffb1..081885c378200661237ef22d2b011fc775e21218 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1862,7 +1862,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
{
opt_result ok = opt_result::success ();
int res;
- unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
+ unsigned int max_vf = LOOP_VINFO_LOOP (loop_vinfo)->max_vf_limit;
poly_uint64 min_vf = 2;
/* The first group of checks is independent of the vector size. */
@@ -8468,7 +8468,7 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
Returns scalar epilogue loop if any. */
struct loop *
-vect_transform_loop (loop_vec_info loop_vinfo)
+vect_transform_loop (loop_vec_info loop_vinfo, vec<loop_p> &more_loops)
{
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
struct loop *epilogue = NULL;
@@ -8530,7 +8530,8 @@ vect_transform_loop (loop_vec_info loop_vinfo)
}
struct loop *sloop
= vect_loop_versioning (loop_vinfo, th, check_profitability,
- versioning_threshold);
+ versioning_threshold, more_loops);
+
sloop->force_vectorize = false;
check_profitability = false;
}
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index f7432f0584762fd28d54f2978dc59f2df443e991..53d66b72d3ba6e15681209153b57736630e40e3b 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1089,8 +1089,6 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo)
conversion. */
#define MAX_INTERM_CVT_STEPS 3
-#define MAX_VECTORIZATION_FACTOR INT_MAX
-
/* Nonzero if TYPE represents a (scalar) boolean type or type
in the middle-end compatible with it (unsigned precision 1 integral
types). Used to determine which types should be vectorized as
@@ -1473,7 +1471,7 @@ extern bool slpeel_can_duplicate_loop_p (const struct loop *, const_edge);
struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *,
struct loop *, edge);
struct loop *vect_loop_versioning (loop_vec_info, unsigned int, bool,
- poly_uint64);
+ poly_uint64, vec<loop_p> &);
extern struct loop *vect_do_peeling (loop_vec_info, tree, tree,
tree *, tree *, tree *, int, bool, bool);
extern void vect_prepare_for_masked_peels (loop_vec_info);
@@ -1614,7 +1612,7 @@ extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
unsigned int, tree, unsigned int);
/* Drive for loop transformation stage. */
-extern struct loop *vect_transform_loop (loop_vec_info);
+extern struct loop *vect_transform_loop (loop_vec_info, vec<loop_p> &);
extern opt_loop_vec_info vect_analyze_loop_form (struct loop *,
vec_info_shared *);
extern bool vectorizable_live_operation (stmt_vec_info, gimple_stmt_iterator *,
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 325ef58722d21a65ab896a9358677b07111b060b..d63d532d5fe474904ff84b23912a2ed9cfd6194a 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -868,7 +868,8 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
unsigned *num_vectorized_loops,
loop_p loop, loop_vec_info orig_loop_vinfo,
gimple *loop_vectorized_call,
- gimple *loop_dist_alias_call)
+ gimple *loop_dist_alias_call,
+ vec<loop_p> &more_loops)
{
unsigned ret = 0;
vec_info_shared shared;
@@ -979,7 +980,7 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
"loop vectorized using variable length vectors\n");
}
- loop_p new_loop = vect_transform_loop (loop_vinfo);
+ loop_p new_loop = vect_transform_loop (loop_vinfo, more_loops);
(*num_vectorized_loops)++;
/* Now that the loop has been vectorized, allow it to be unrolled
etc. */
@@ -1013,7 +1014,7 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
/* Epilogue of vectorized loop must be vectorized too. */
if (new_loop)
ret |= try_vectorize_loop_1 (simduid_to_vf_htab, num_vectorized_loops,
- new_loop, loop_vinfo, NULL, NULL);
+ new_loop, loop_vinfo, NULL, NULL, more_loops);
return ret;
}
@@ -1022,7 +1023,8 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
static unsigned
try_vectorize_loop (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
- unsigned *num_vectorized_loops, loop_p loop)
+ unsigned *num_vectorized_loops, loop_p loop,
+ vec<loop_p> &more_loops)
{
if (!((flag_tree_loop_vectorize
&& optimize_loop_nest_for_speed_p (loop))
@@ -1032,7 +1034,8 @@ try_vectorize_loop (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
return try_vectorize_loop_1 (simduid_to_vf_htab, num_vectorized_loops,
loop, NULL,
vect_loop_vectorized_call (loop),
- vect_loop_dist_alias_call (loop));
+ vect_loop_dist_alias_call (loop),
+ more_loops);
}
@@ -1051,6 +1054,7 @@ vectorize_loops (void)
hash_table<simd_array_to_simduid> *simd_array_to_simduid_htab = NULL;
bool any_ifcvt_loops = false;
unsigned ret = 0;
+ auto_vec<loop_p> more_loops;
vect_loops_num = number_of_loops (cfun);
@@ -1105,14 +1109,19 @@ vectorize_loops (void)
vector_loop->dont_vectorize = true;
ret |= try_vectorize_loop (simduid_to_vf_htab,
&num_vectorized_loops,
- vector_loop);
+ vector_loop,
+ more_loops);
}
}
}
}
else
ret |= try_vectorize_loop (simduid_to_vf_htab, &num_vectorized_loops,
- loop);
+ loop, more_loops);
+
+ while (!more_loops.is_empty ())
+ try_vectorize_loop (simduid_to_vf_htab, &num_vectorized_loops,
+ more_loops.pop (), more_loops);
vect_location = dump_user_location_t ();
diff --git a/gcc/tree.h b/gcc/tree.h
index 3dce602dfbaca03f568e1c3638d56dfe3a3fd01c..b1c41131e9d1637784a1024d5c301252a06f89e1 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -22,6 +22,8 @@ along with GCC; see the file COPYING3. If not see
#include "tree-core.h"
+#define MAX_VECTORIZATION_FACTOR INT_MAX
+
/* Convert a target-independent built-in function code to a combined_fn. */
inline combined_fn