Hi,
This patch fixes the issue reported in PR79347 by calculating and maintaining
profile counter information on the fly in vect_do_peeling. Because we first
peel the prologue loop, then peel the epilogue loop, and only then add the
guarding edge that skips the prolog and vector loops when niter is small, this
patch uses a trick: it first scales down the counters of the loop before
peeling, and then scales the counters back up after adding the aforementioned
guarding edge. Otherwise, more work would be needed to calculate the counters
for the prolog and vector loops. After this patch, the number of profile
counter mismatches for the tramp3d benchmark improves from:
tramp3d-v4.cpp.157t.ifcvt:296
tramp3d-v4.cpp.158t.vect:1118
tramp3d-v4.cpp.159t.dce6:1118
tramp3d-v4.cpp.160t.pcom:1118
tramp3d-v4.cpp.161t.cunroll:1019
tramp3d-v4.cpp.162t.slp1:1019
tramp3d-v4.cpp.164t.ivopts:1019
tramp3d-v4.cpp.165t.lim4:1019
tramp3d-v4.cpp.166t.loopdone:1007
tramp3d-v4.cpp.167t.no_loop:31
...
tramp3d-v4.cpp.226t.optimized:1009
to:
tramp3d-v4.cpp.157t.ifcvt:296
tramp3d-v4.cpp.158t.vect:814
tramp3d-v4.cpp.159t.dce6:814
tramp3d-v4.cpp.160t.pcom:814
tramp3d-v4.cpp.161t.cunroll:723
tramp3d-v4.cpp.162t.slp1:723
tramp3d-v4.cpp.164t.ivopts:723
tramp3d-v4.cpp.165t.lim4:723
tramp3d-v4.cpp.166t.loopdone:711
tramp3d-v4.cpp.167t.no_loop:31
...
tramp3d-v4.cpp.226t.optimized:831
Bootstrapped and tested on x86_64 and AArch64. Is it OK?
BTW, with this patch, the vectorizer only introduces mismatches through the
code below in vect_transform_loop:
/* Reduce loop iterations by the vectorization factor. */
scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vf),
expected_iterations / vf);
Although it makes sense to scale down according to the vectorization factor,
doing so definitely introduces a mismatch between vector_loop's frequency and
the rest of the program. I also believe it is not that useful to scale here,
especially without profiling information. At the very least, we need to make
vector_loop's frequency consistent with the rest of the program.
Thanks,
bin
2017-02-13 Bin Cheng <bin.ch...@arm.com>
PR tree-optimization/79347
* tree-vect-loop-manip.c (apply_probability_for_bb): New function.
(vect_do_peeling): Maintain profile counters during peeling.
gcc/testsuite/ChangeLog
2017-02-13 Bin Cheng <bin.ch...@arm.com>
PR tree-optimization/79347
* gcc.dg/vect/pr79347.c: New test.
diff --git a/gcc/testsuite/gcc.dg/vect/pr79347.c
b/gcc/testsuite/gcc.dg/vect/pr79347.c
new file mode 100644
index 0000000..586c638
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr79347.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fdump-tree-vect-all" } */
+
+short *a;
+int c;
+void n(void)
+{
+ for (int i = 0; i<c;i++)
+ a[i]++;
+}
+
+/* { dg-final { scan-tree-dump-times "Invalid sum of " 2 "vect" } } */
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index f29449c..e6c481c 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -1562,6 +1562,17 @@ slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
}
+/* Apply probability PROB to basic block BB and its single succ edge. */
+
+static void
+apply_probability_for_bb (basic_block bb, int prob)
+{
+ bb->frequency = apply_probability (bb->frequency, prob);
+ bb->count = apply_probability (bb->count, prob);
+ gcc_assert (single_succ_p (bb));
+ single_succ_edge (bb)->count = bb->count;
+}
+
/* Function vect_do_peeling.
Input:
@@ -1690,7 +1701,18 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters,
tree nitersm1,
may be preferred. */
basic_block anchor = loop_preheader_edge (loop)->src;
if (skip_vector)
- split_edge (loop_preheader_edge (loop));
+ {
+ split_edge (loop_preheader_edge (loop));
+
+ /* Due to the order in which we peel prolog and epilog, we first
+ propagate probability to the whole loop. The purpose is to
+ avoid adjusting probabilities of both prolog and vector loops
+ separately. Note in this case, the probability of epilog loop
+ needs to be scaled back later. */
+ basic_block bb_before_loop = loop_preheader_edge (loop)->src;
+ apply_probability_for_bb (bb_before_loop, prob_vector);
+ scale_loop_profile (loop, prob_vector, bound);
+ }
tree niters_prolog = build_int_cst (type, 0);
source_location loop_loc = find_loop_location (loop);
@@ -1727,6 +1749,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters,
tree nitersm1,
guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
niters_prolog, build_int_cst (type, 0));
guard_bb = loop_preheader_edge (prolog)->src;
+ basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
guard_to = split_edge (loop_preheader_edge (loop));
guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
guard_to, guard_bb,
@@ -1734,6 +1757,8 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters,
tree nitersm1,
e = EDGE_PRED (guard_to, 0);
e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
+
+ apply_probability_for_bb (bb_after_prolog, prob_prolog);
scale_loop_profile (prolog, prob_prolog, bound_prolog);
}
/* Update init address of DRs. */
@@ -1796,9 +1821,18 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters,
tree nitersm1,
e = EDGE_PRED (guard_to, 0);
e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
- scale_loop_profile (epilog, prob_vector, bound_scalar);
+
+ /* Simply propagate profile info from guard_bb to guard_to which is
+ a merge point of control flow. */
+ guard_to->frequency = guard_bb->frequency;
+ guard_to->count = guard_bb->count;
+ single_succ_edge (guard_to)->count = guard_to->count;
+ /* Scale probability of epilog loop back. */
+ int scale_up = REG_BR_PROB_BASE * REG_BR_PROB_BASE / prob_vector;
+ scale_loop_frequencies (epilog, scale_up, REG_BR_PROB_BASE);
}
+ basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
tree niters_vector_mult_vf;
/* If loop is peeled for non-zero constant times, now niters refers to
orig_niters - prolog_peeling, it won't overflow even the orig_niters
@@ -1826,6 +1860,15 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters,
tree nitersm1,
inverse_probability (prob_epilog));
slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
single_exit (epilog));
+ /* Only need to handle basic block before epilog loop if it's not
+ the guard_bb, which is the case when skip_vector is true. */
+ if (guard_bb != bb_before_epilog)
+ {
+ prob_epilog = (combine_probabilities (prob_vector, prob_epilog)
+ + inverse_probability (prob_vector));
+
+ apply_probability_for_bb (bb_before_epilog, prob_epilog);
+ }
scale_loop_profile (epilog, prob_epilog, bound);
}
else