https://gcc.gnu.org/g:e416c8097fc87513e05c2d104c63488f733758c0
commit r16-1401-ge416c8097fc87513e05c2d104c63488f733758c0 Author: Jan Hubicka <hubi...@ucw.cz> Date: Tue Jun 10 21:32:40 2025 +0200 More API for IPA profile manipulation This patch attempts to make IPA profile manipulation easier. It introduces node->scale_profile_to (count) which can be used to scale profile to a given IPA count. If IPA count is zero, then local profile is preserved and proper variant of global0 count is used. node->make_profile_local this can be used to drop IPA profile but keep local profile node->make_profile_global0 this can be used to make IPA profile 0 but keep local profile. Most of this can be accomplished by existing apply_scale. I.e. - node->scale_profile_to (count) corresponds to node->apply_scale (count, node->count), - node->make_profile_local corresponds to node->apply_scale (node->count.guessed_local (), node->count) I think the new API is clearer about the intent and less error prone. Also it handles some side cases when entry block of profile happens to be 0, but body is non-zero (by profile inconsistencies). In this case the scaling API did kind of random things. I noticed three bugs in ipa-cp (two already in released GCCs, while one was introduced by my last patch): @@ -4528,7 +4528,7 @@ lenient_count_portion_handling (profile_count remainder, cgraph_node *orig_node) if (remainder.ipa_p () && !remainder.ipa ().nonzero_p () && orig_node->count.ipa_p () && orig_node->count.ipa ().nonzero_p () && opt_for_fn (orig_node->decl, flag_profile_partial_training)) - remainder = remainder.guessed_local (); + remainder = orig_node->count.guessed_local (); The code was intended to drop IPA profile to local when remainder is 0. In this case orig_node->count is some non-zero count but all of control flow was redirected to a clone which means that remainder is 0 (adjusted). Doing remainder = remainder.guessed_local (); will turn it into 0 (guessed_local) and the scaling will then multiply all counts by 0 and turn them into guessed local. 
We want to keep original count but reduce the quality. i.e. remainder = orig_node->count.guessed_local (); Second problem is: /* TODO: Profile has alreay gone astray, keep what we have but lower it to global0 category. */ remainder = orig_node->count.global0 (); global0 means that converting to ipa count will be precise 0. Since we lost track it should be adjusted 0 :) Finally in: new_sum = orig_node_count.combine_with_ipa_count (new_sum); orig_node->count = remainder; new_node->apply_scale (new_sum, new_node->count); if (!orig_edges_processed) orig_node->apply_scale (remainder, orig_node->count); orig_node->scale_profile_to (remainder); orig_node->count is first set to remainder and then scaling is done (which in turn does nothing). This is a bug I introduced in my last patch, which should have removed orig_node->count = remainder. As a result, counts of cgraph edges are now not adjusted correctly. I am sorry for that. gcc/ChangeLog: * cgraph.cc (cgraph_node::make_profile_local): New member function. (cgraph_node::make_profile_global0): New member function. (cgraph_node::apply_scale): Do not call adjust_for_ipa_scaling. (cgraph_node::scale_profile_to): New member function. * cgraph.h (cgraph_node::make_profile_local, cgraph_node::make_profile_global0, cgraph_node::scale_profile_to): Declare. * ipa-cp.cc (lenient_count_portion_handling): Fix logic dropping count to local. (update_counts_for_self_gen_clones): Use scale_profile_to. (update_profiling_info): Use make_profile_local, make_profile_global0 and scale_profile_to. (update_specialized_profile): Likewise. * ipa-inline-transform.cc (clone_inlined_nodes): Call adjust_for_ipa_scaling. 
Diff: --- gcc/cgraph.cc | 114 +++++++++++++++++++++++++++++++++++++++++--- gcc/cgraph.h | 14 +++++- gcc/ipa-cp.cc | 53 ++++++-------------- gcc/ipa-inline-transform.cc | 5 +- 4 files changed, 140 insertions(+), 46 deletions(-) diff --git a/gcc/cgraph.cc b/gcc/cgraph.cc index 4a037a7bab10..2f31260207df 100644 --- a/gcc/cgraph.cc +++ b/gcc/cgraph.cc @@ -179,26 +179,128 @@ cgraph_node::function_version (void) return cgraph_fnver_htab->find (&key); } -/* Scale profile by NUM/DEN. Walk into inlined clones. */ +/* If profile is IPA, turn it into local one. */ +void +cgraph_node::make_profile_local () +{ + if (!count.ipa ().initialized_p ()) + return; + if (!(count == profile_count::zero ())) + count = count.guessed_local (); + for (cgraph_edge *e = callees; e; e = e->next_callee) + { + if (!e->inline_failed) + e->callee->make_profile_local (); + if (!(e->count == profile_count::zero ())) + e->count = e->count.guessed_local (); + } + for (cgraph_edge *e = indirect_calls; e; e = e->next_callee) + if (!(e->count == profile_count::zero ())) + e->count = e->count.guessed_local (); +} +/* Turn profile to global0. Walk into inlined functions. 
+ QUALITY must be GUESSED_GLOBAL0 or GUESSED_GLOBAL0_ADJUSTED */ void -cgraph_node::apply_scale (profile_count num, profile_count den) +cgraph_node::make_profile_global0 (profile_quality quality) { - struct cgraph_edge *e; + if (count == profile_count::zero ()) + ; + else if (quality == GUESSED_GLOBAL0) + { + if (count.quality () == GUESSED_GLOBAL0) + return; + count = count.global0 (); + } + else if (quality == GUESSED_GLOBAL0_ADJUSTED) + { + if (count.quality () == GUESSED_GLOBAL0 + || count.quality () == GUESSED_GLOBAL0_ADJUSTED) + return; + count = count.global0adjusted (); + } + else + gcc_unreachable (); + for (cgraph_edge *e = callees; e; e = e->next_callee) + { + if (!e->inline_failed) + e->callee->make_profile_global0 (quality); + if (e->count == profile_count::zero ()) + ; + else if (quality == GUESSED_GLOBAL0) + e->count = e->count.global0 (); + else if (quality == GUESSED_GLOBAL0_ADJUSTED) + e->count = e->count.global0adjusted (); + else + gcc_unreachable (); + } + for (cgraph_edge *e = indirect_calls; e; e = e->next_callee) + if (e->count == profile_count::zero ()) + ; + else if (quality == GUESSED_GLOBAL0) + e->count = e->count.global0 (); + else if (quality == GUESSED_GLOBAL0_ADJUSTED) + e->count = e->count.global0adjusted (); + else + gcc_unreachable (); +} - profile_count::adjust_for_ipa_scaling (&num, &den); +/* Scale profile by NUM/DEN. Walk into inlined functions. */ - for (e = callees; e; e = e->next_callee) +void +cgraph_node::apply_scale (profile_count num, profile_count den) +{ + if (num == den) + return; + + for (cgraph_edge *e = callees; e; e = e->next_callee) { if (!e->inline_failed) e->callee->apply_scale (num, den); e->count = e->count.apply_scale (num, den); } - for (e = indirect_calls; e; e = e->next_callee) + for (cgraph_edge *e = indirect_calls; e; e = e->next_callee) e->count = e->count.apply_scale (num, den); count = count.apply_scale (num, den); } +/* Scale profile to given IPA_COUNT. 
+ IPA_COUNT should pass ipa_p () with a single exception. + It can be also GUESSED_LOCAL in case we want to + drop any IPA info about the profile. */ + +void +cgraph_node::scale_profile_to (profile_count ipa_count) +{ + /* If we do not know the adjustment, it is better to keep profile + as it is. */ + if (!ipa_count.initialized_p () + || ipa_count == count) + return; + /* ipa-cp converts value to guessed-local in case it believes + that we lost track of IPA profile. */ + if (ipa_count.quality () == GUESSED_LOCAL) + { + make_profile_local (); + return; + } + if (ipa_count == profile_count::zero ()) + { + make_profile_global0 (GUESSED_GLOBAL0); + return; + } + if (ipa_count == profile_count::adjusted_zero ()) + { + make_profile_global0 (GUESSED_GLOBAL0_ADJUSTED); + return; + } + gcc_assert (ipa_count.ipa () == ipa_count + && !inlined_to); + profile_count num = count.combine_with_ipa_count (ipa_count); + profile_count den = count; + profile_count::adjust_for_ipa_scaling (&num, &den); + apply_scale (num, den); +} + /* Insert a new cgraph_function_version_info node into cgraph_fnver_htab corresponding to cgraph_node NODE. */ cgraph_function_version_info * diff --git a/gcc/cgraph.h b/gcc/cgraph.h index ba9a8a25e396..970ac2cba376 100644 --- a/gcc/cgraph.h +++ b/gcc/cgraph.h @@ -1256,9 +1256,21 @@ struct GTY((tag ("SYMTAB_FUNCTION"))) cgraph_node : public symtab_node it is not used in any other non-standard way. */ bool only_called_directly_p (void); - /* Scale profile by NUM/DEN. Walk into inlined clones. */ + /* Turn profile to global0. Walk into inlined functions. */ + void make_profile_local (); + + /* Turn profile to global0. Walk into inlined functions. */ + void make_profile_global0 (profile_quality quality); + + /* Scale profile by NUM/DEN. Walk into inlined funtion. */ void apply_scale (profile_count num, profile_count den); + /* Scale profile to given IPA_COUNT. + IPA_COUNT should pass ipa_p () with a single exception. 
+ It can be also GUESSED_LOCAL in case we want to + drop any IPA info about the profile. */ + void scale_profile_to (profile_count ipa_count); + /* Return true when function is only called directly or it has alias. i.e. it is not externally visible, address was not taken and it is not used in any other non-standard way. */ diff --git a/gcc/ipa-cp.cc b/gcc/ipa-cp.cc index 92e234e67162..3bf0117612e5 100644 --- a/gcc/ipa-cp.cc +++ b/gcc/ipa-cp.cc @@ -4528,7 +4528,7 @@ lenient_count_portion_handling (profile_count remainder, cgraph_node *orig_node) if (remainder.ipa_p () && !remainder.ipa ().nonzero_p () && orig_node->count.ipa_p () && orig_node->count.ipa ().nonzero_p () && opt_for_fn (orig_node->decl, flag_profile_partial_training)) - remainder = remainder.guessed_local (); + remainder = orig_node->count.guessed_local (); return remainder; } @@ -4669,7 +4669,7 @@ update_counts_for_self_gen_clones (cgraph_node *orig_node, profile_count new_count = (redist_sum / self_gen_clones.length () + other_edges_count[i]); new_count = lenient_count_portion_handling (new_count, orig_node); - n->apply_scale (new_count, n->count); + n->scale_profile_to (new_count); for (cgraph_edge *cs = n->callees; cs; cs = cs->next_callee) processed_edges.add (cs); @@ -4769,16 +4769,12 @@ update_profiling_info (struct cgraph_node *orig_node, bool orig_edges_processed = false; if (new_sum > orig_node_count) { - /* TODO: Profile has alreay gone astray, keep what we have but lower it - to global0 category. */ - remainder = orig_node->count.global0 (); - - for (cgraph_edge *cs = orig_node->callees; cs; cs = cs->next_callee) - cs->count = cs->count.global0 (); - for (cgraph_edge *cs = orig_node->indirect_calls; - cs; - cs = cs->next_callee) - cs->count = cs->count.global0 (); + /* Profile has alreay gone astray, keep what we have but lower it + to global0adjusted or to local if we have partial training. 
*/ + if (opt_for_fn (orig_node->decl, flag_profile_partial_training)) + orig_node->make_profile_local (); + else + orig_node->make_profile_global0 (GUESSED_GLOBAL0_ADJUSTED); orig_edges_processed = true; } else if (stats.rec_count_sum.nonzero_p ()) @@ -4805,23 +4801,9 @@ update_profiling_info (struct cgraph_node *orig_node, are still unmodified copies of ORIG_NODE's. Just clear the latter and bail out. */ if (opt_for_fn (orig_node->decl, flag_profile_partial_training)) - orig_node->count = orig_node->count.guessed_local (); + orig_node->make_profile_local (); else - orig_node->count = orig_node->count.global0adjusted (); - for (cgraph_edge *cs = orig_node->callees; - cs; - cs = cs->next_callee) - if (opt_for_fn (orig_node->decl, flag_profile_partial_training)) - cs->count = orig_node->count.guessed_local (); - else - cs->count = orig_node->count.global0adjusted (); - for (cgraph_edge *cs = orig_node->indirect_calls; - cs; - cs = cs->next_callee) - if (opt_for_fn (orig_node->decl, flag_profile_partial_training)) - cs->count = orig_node->count.guessed_local (); - else - cs->count = orig_node->count.global0adjusted (); + orig_node->make_profile_global0 (GUESSED_GLOBAL0_ADJUSTED); return; } } @@ -4870,13 +4852,10 @@ update_profiling_info (struct cgraph_node *orig_node, remainder = lenient_count_portion_handling (orig_node_count - new_sum, orig_node); - new_sum = orig_node_count.combine_with_ipa_count (new_sum); - orig_node->count = remainder; - - new_node->apply_scale (new_sum, new_node->count); + new_node->scale_profile_to (new_sum); if (!orig_edges_processed) - orig_node->apply_scale (remainder, orig_node->count); + orig_node->scale_profile_to (remainder); if (dump_file) { @@ -4906,13 +4885,11 @@ update_specialized_profile (struct cgraph_node *new_node, || !redirected_sum.nonzero_p ()) return; - orig_node->apply_scale + orig_node->scale_profile_to (lenient_count_portion_handling (orig_node->count.ipa () - redirected_sum, - orig_node), - orig_node->count); + 
orig_node)); - new_node->apply_scale (new_node->count.ipa () + redirected_sum, - new_node->count); + new_node->scale_profile_to (new_node->count.ipa () + redirected_sum); if (dump_file) { diff --git a/gcc/ipa-inline-transform.cc b/gcc/ipa-inline-transform.cc index 3c6a84570b7f..07a102440022 100644 --- a/gcc/ipa-inline-transform.cc +++ b/gcc/ipa-inline-transform.cc @@ -191,7 +191,10 @@ clone_inlined_nodes (struct cgraph_edge *e, bool duplicate, } duplicate = false; e->callee->externally_visible = false; - e->callee->apply_scale (e->count, e->callee->count); + profile_count num = e->count; + profile_count den = e->callee->count; + profile_count::adjust_for_ipa_scaling (&num, &den); + e->callee->apply_scale (num, den); dump_callgraph_transformation (e->callee, inlining_into, "inlining to");