Hi,
with recent fixes to profile updating I noticed that we get more regressions
compared to GCC 9 in Firefox testing. This is because the Firefox train run
does not cover all the benchmarks, and GCC 9, thanks to profile-updating bugs,
sometimes optimized code for speed even if it was not trained.
While in general one should have a reasonable train run, in some cases that is
not practical. For example, the Skia library has vector code optimized for
different ISAs, and thus Firefox renders quickly only if it was trained on the
same CPU it runs on.
This patch adds the flag -fprofile-partial-training, which makes GCC optimize
untrained functions as if -fprofile-use was not given. This nullifies the code
size improvements of FDO, but can be used in cases where full training is not
feasible (and one can enable it for just portions of the program).
Previously the only good answer was to disable profiling for a given function,
but that needs to be done quite precisely and is in general hard to arrange.
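To illustrate the intended use, here is a hypothetical example (the file and
function names are invented). render_avx2 is never executed on the train
machine, so with plain -fprofile-use it gets a read zero count and is
optimized for size; with the new flag it keeps its static estimates:

  /* render.c: hypothetical example of a partially trained program.  */
  #include <stdio.h>

  __attribute__ ((noinline)) static void
  render_generic (void)
  {
    puts ("generic path");
  }

  __attribute__ ((noinline)) static void
  render_avx2 (void)
  {
    /* Not executed during the train run; with -fprofile-partial-training
       this function is optimized as if built without profile feedback.  */
    puts ("avx2 path");
  }

  int
  main (int argc, char **argv)
  {
    if (argc > 1)
      render_avx2 ();
    else
      render_generic ();
    return 0;
  }

  /* Train without arguments so render_avx2 stays untrained:
       gcc -O2 -fprofile-generate render.c && ./a.out
       gcc -O2 -fprofile-use -fprofile-partial-training render.c  */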
The patch works by
1) not setting PROFILE_READ for functions with entry count 0,
2) making the inliner and ipa-cp drop the profile to a local one when all
trained executions are redirected to clones, and
3) reducing the quality of branch probabilities of branches leading to never
executed regions to GUESSED. This is necessary to prevent GCC from
propagating the zeros back as precise counts (see the sketch below).
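As a toy sketch of point 3 (the function and values are made up): in a
trained function, a branch whose target was never executed would get a
precise zero count; degrading the probability quality to GUESSED keeps
later passes from propagating that zero back as reliable data:

  #include <stdlib.h>

  /* Assume x is never negative during the train run, so the edge into
     the abort path has a read count of zero.  With the patch the branch
     probabilities here are downgraded to GUESSED instead of keeping a
     precise never.  */
  int
  process (int x)
  {
    if (x < 0)
      abort ();
    return x * 2;
  }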
Bootstrapped/regtested x86_64-linux. I plan to commit it tomorrow if there
are no complaints. Feedback is welcome!
Honza
* cgraphclones.c (localize_profile): New function.
(cgraph_node::create_clone): Use it for partial profiles.
* common.opt (fprofile-partial-training): New flag.
* doc/invoke.texi (-fprofile-partial-training): Document.
* ipa-cp.c (update_profiling_info): For partial profiles do not
set function profile to zero.
* profile.c (compute_branch_probabilities): With partial profile
watch if edge count is zero and turn all probabilities of the
block to guessed.
(compute_branch_probabilities): For partial profiles do not apply
the profile when the entry count is zero.
(branch_prob): Record loop iteration estimates only when the loop
header count is reliable.
* tree-profile.c (tree_profiling): Only do value_profile_transformations
when profile is read.
Index: cgraphclones.c
===================================================================
--- cgraphclones.c (revision 278944)
+++ cgraphclones.c (working copy)
@@ -307,6 +307,22 @@ dump_callgraph_transformation (const cgr
}
}
+/* Turn profile of N into a local profile.  */
+
+static void
+localize_profile (cgraph_node *n)
+{
+ n->count = n->count.guessed_local ();
+ for (cgraph_edge *e = n->callees; e; e = e->next_callee)
+ {
+ e->count = e->count.guessed_local ();
+ if (!e->inline_failed)
+ localize_profile (e->callee);
+ }
+ for (cgraph_edge *e = n->indirect_calls; e; e = e->next_callee)
+ e->count = e->count.guessed_local ();
+}
+
/* Create node representing clone of N executed COUNT times. Decrease
the execution counts from original node too.
The new clone will have decl set to DECL that may or may not be the same
@@ -340,6 +356,7 @@ cgraph_node::create_clone (tree new_decl
cgraph_edge *e;
unsigned i;
profile_count old_count = count;
+ bool nonzero = count.ipa ().nonzero_p ();
if (new_inlined_to)
dump_callgraph_transformation (this, new_inlined_to, "inlining to");
@@ -426,6 +446,15 @@ cgraph_node::create_clone (tree new_decl
if (call_duplication_hook)
symtab->call_cgraph_duplication_hooks (this, new_node);
+ /* With partial train run we do not want to assume that original's
+ count is zero whenever we redirect all executed edges to the clone.
+ Simply drop the profile to a local one in this case. */
+ if (update_original
+ && opt_for_fn (decl, flag_partial_profile_training)
+ && nonzero
+ && count.ipa_p ()
+ && !count.ipa ().nonzero_p ())
+ localize_profile (this);
if (!new_inlined_to)
dump_callgraph_transformation (this, new_node, suffix);
Index: common.opt
===================================================================
--- common.opt (revision 278944)
+++ common.opt (working copy)
@@ -2160,6 +2160,10 @@ fprofile-generate=
Common Joined RejectNegative
Enable common options for generating profile info for profile feedback
directed optimizations, and set -fprofile-dir=.
+fprofile-partial-training
+Common Report Var(flag_partial_profile_training) Optimization
Do not assume that functions never executed during the train run are cold.
+
fprofile-use
Common Var(flag_profile_use)
Enable common options for performing profile feedback directed optimizations.
Index: doc/invoke.texi
===================================================================
--- doc/invoke.texi (revision 278944)
+++ doc/invoke.texi (working copy)
@@ -453,8 +453,8 @@ Objective-C and Objective-C++ Dialects}.
-fpartial-inlining -fpeel-loops -fpredictive-commoning @gol
-fprefetch-loop-arrays @gol
-fprofile-correction @gol
--fprofile-use -fprofile-use=@var{path} -fprofile-values @gol
--fprofile-reorder-functions @gol
+-fprofile-use -fprofile-use=@var{path} -fprofile-partial-training @gol
+-fprofile-values -fprofile-reorder-functions @gol
-freciprocal-math -free -frename-registers -freorder-blocks @gol
-freorder-blocks-algorithm=@var{algorithm} @gol
-freorder-blocks-and-partition -freorder-functions @gol
@@ -10634,6 +10634,17 @@ default, GCC emits an error message when
This option is enabled by @option{-fauto-profile}.
+@item -fprofile-partial-training
+@opindex fprofile-partial-training
+With @code{-fprofile-use} all portions of programs not executed during the
+train run are optimized aggressively for size rather than speed. In some
+cases it is not practical to train all possible hot paths in the program.
+(For example, a program may contain functions specific to given hardware,
+and training may not cover all hardware configurations the program is run
+on.) With @code{-fprofile-partial-training} profile feedback will be ignored
+for all functions not executed during the train run, leading them to be
+optimized as if they were compiled without profile feedback.
+
@item -fprofile-use
@itemx -fprofile-use=@var{path}
@opindex fprofile-use
Index: ipa-cp.c
===================================================================
--- ipa-cp.c (revision 278944)
+++ ipa-cp.c (working copy)
@@ -4295,6 +4295,15 @@ update_profiling_info (struct cgraph_nod
remainder = orig_node_count.combine_with_ipa_count (orig_node_count.ipa ()
- new_sum.ipa ());
+
+ /* With partial train run we do not want to assume that original's
+ count is zero whenever we redirect all executed edges to the clone.
+ Simply drop the profile to a local one in this case. */
+ if (remainder.ipa_p () && !remainder.ipa ().nonzero_p ()
+ && orig_node->count.ipa_p () && orig_node->count.ipa ().nonzero_p ()
+ && flag_partial_profile_training)
+ remainder = remainder.guessed_local ();
+
new_sum = orig_node_count.combine_with_ipa_count (new_sum);
new_node->count = new_sum;
orig_node->count = remainder;
Index: profile.c
===================================================================
--- profile.c (revision 278944)
+++ profile.c (working copy)
@@ -635,9 +635,20 @@ compute_branch_probabilities (unsigned c
}
if (bb_gcov_count (bb))
{
+ bool set_to_guessed = false;
FOR_EACH_EDGE (e, ei, bb->succs)
- e->probability = profile_probability::probability_in_gcov_type
- (edge_gcov_count (e), bb_gcov_count (bb));
+ {
+ bool prev_never = e->probability == profile_probability::never ();
+ e->probability = profile_probability::probability_in_gcov_type
+ (edge_gcov_count (e), bb_gcov_count (bb));
+ if (e->probability == profile_probability::never ()
+ && !prev_never
+ && flag_partial_profile_training)
+ set_to_guessed = true;
+ }
+ if (set_to_guessed)
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ e->probability = e->probability.guessed ();
if (bb->index >= NUM_FIXED_BLOCKS
&& block_ends_with_condjump_p (bb)
&& EDGE_COUNT (bb->succs) >= 2)
@@ -697,17 +708,23 @@ compute_branch_probabilities (unsigned c
}
}
- if (exec_counts)
+ if (exec_counts
+ && (bb_gcov_count (ENTRY_BLOCK_PTR_FOR_FN (cfun))
+ || !flag_partial_profile_training))
profile_status_for_fn (cfun) = PROFILE_READ;
/* If we have real data, use them! */
if (bb_gcov_count (ENTRY_BLOCK_PTR_FOR_FN (cfun))
|| !flag_guess_branch_prob)
FOR_ALL_BB_FN (bb, cfun)
- bb->count = profile_count::from_gcov_type (bb_gcov_count (bb));
+ if (bb_gcov_count (bb) || !flag_partial_profile_training)
+ bb->count = profile_count::from_gcov_type (bb_gcov_count (bb));
+ else
+ bb->count = profile_count::guessed_zero ();
/* If function was not trained, preserve local estimates including statically
determined zero counts. */
- else if (profile_status_for_fn (cfun) == PROFILE_READ)
+ else if (profile_status_for_fn (cfun) == PROFILE_READ
+ && !flag_partial_profile_training)
FOR_ALL_BB_FN (bb, cfun)
if (!(bb->count == profile_count::zero ()))
bb->count = bb->count.global0 ();
@@ -1417,7 +1434,7 @@ branch_prob (bool thunk)
/* At this moment we have precise loop iteration count estimates.
Record them to loop structure before the profile gets out of date. */
FOR_EACH_LOOP (loop, 0)
- if (loop->header->count > 0)
+ if (loop->header->count > 0 && loop->header->count.reliable_p ())
{
gcov_type nit = expected_loop_iterations_unbounded (loop);
widest_int bound = gcov_type_to_wide_int (nit);
Index: tree-profile.c
===================================================================
--- tree-profile.c (revision 278944)
+++ tree-profile.c (working copy)
@@ -785,7 +785,8 @@ tree_profiling (void)
if (flag_branch_probabilities
&& !thunk
&& flag_profile_values
- && flag_value_profile_transformations)
+ && flag_value_profile_transformations
+ && profile_status_for_fn (cfun) == PROFILE_READ)
gimple_value_profile_transformations ();
/* The above could hose dominator info. Currently there is