https://gcc.gnu.org/g:260252e7dc07bd6e201c76c24d858efaea4a1a78
commit r16-1545-g260252e7dc07bd6e201c76c24d858efaea4a1a78 Author: Jan Hubicka <hubi...@ucw.cz> Date: Tue Jun 17 17:26:18 2025 +0200 Improve static and AFDO profile combination This patch makes afdo_adjust_guessed_profile more agressive on finding scales on the boundaries of connected components with no annotation. Originaly I looked for edges into or out of the component with known AFDO counts and I also haled edges from basic block with known AFDO count and known static probability estimate. Common problem is with components not containing any in edges, but only out edges (i.e. those with ENTRY_BLOCK). In this case I added logic that looks for edges out of the component to BBs with known AFDO count. If all flow to the BB is either from the component or has AFDO count, we can deterine scale precisely. It may happen that there are edges from other components. In this case we know upper bound and use it, since it is better than nothing. I also noticed that some components have 0 count in all profile and then scaling gives up, which is fixed. I also optimized the code a bit by replacing map holding current component with an array holding component ID and broke out saling logic into separate functions. The patch fixes perl regression I introduced in last change. according to https://lnt.opensuse.org/db_default/v4/SPEC/67674 there were improvements (percentage is runtime change): 538.imagick_r -32.52% 549.fotonik3d_r -22.68% 520.omnetpp_r -12.37% 503.bwaves_r -8.71% 508.namd_r -5.10% 526.blender_r -2.11% and regressions: 554.roms_r 45.95% 527.cam4_r 21.69% 511.povray_r 13.59% 500.perlbench_r 10.19% 507.cactuBSSN_r 9.81% 510.parest_r 9.69% 548.exchange2_r 8.42% 502.gcc_r 5.10% 544.nab_r 3.76% 519.lbm_r 2.34% 541.leela_r 2.16% 525.x264_r 2.14% This is a bit wild, but hope things will settle donw once we chase out obvious problems (such as losing the profile of functions that has not been inlined). gcc/ChangeLog: * auto-profile.cc (afdo_indirect_call): Compute speculative edge probability. (add_scale): Break out from ... (scale_bbs): Break out from ... (afdo_adjust_guessed_profile): ... here; use componet array instead of current_component hash_map; handle components with only 0 profile; be more agressive on finding scales along the boundary. Diff: --- gcc/auto-profile.cc | 244 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 159 insertions(+), 85 deletions(-) diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc index 8918d1e9f09d..3272cbec9b07 100644 --- a/gcc/auto-profile.cc +++ b/gcc/auto-profile.cc @@ -1076,10 +1076,10 @@ afdo_indirect_call (gimple_stmt_iterator *gsi, const icall_target_map &map, fprintf (dump_file, "\n"); } - /* FIXME: Count should be initialized. */ struct cgraph_edge *new_edge - = indirect_edge->make_speculative (direct_call, - profile_count::uninitialized ()); + = indirect_edge->make_speculative + (direct_call, + gimple_bb (stmt)->count.apply_scale (99, 100)); cgraph_edge::redirect_call_stmt_to_callee (new_edge); gimple_remove_histogram_value (cfun, stmt, hist); inline_call (new_edge, true, NULL, NULL, false); @@ -1549,6 +1549,60 @@ cmp (const void *a, const void *b) return 0; } +/* Add scale ORIG/ANNOTATED to SCALES. */ + +static void +add_scale (vec <sreal> *scales, profile_count annotated, profile_count orig) +{ + if (dump_file) + { + orig.dump (dump_file); + fprintf (dump_file, " should be "); + annotated.dump (dump_file); + fprintf (dump_file, "\n"); + } + if (orig.nonzero_p ()) + { + sreal scale + = annotated.guessed_local () + .to_sreal_scale (orig); + if (dump_file) + fprintf (dump_file, " adding scale %.16f\n", + scale.to_double ()); + scales->safe_push (scale); + } +} + +/* Scale counts of all basic blocks in BBS by SCALE and convert them to + IPA quality. */ + +static void +scale_bbs (const vec <basic_block> &bbs, sreal scale) +{ + if (dump_file) + fprintf (dump_file, " Scaling by %.16f\n", scale.to_double ()); + for (basic_block b : bbs) + if (!(b->count == profile_count::zero ()) + && b->count.initialized_p ()) + { + profile_count o = b->count; + b->count = b->count.force_guessed () * scale; + + /* If we scaled to 0, make it auto-fdo since that is treated + less agressively. */ + if (!b->count.nonzero_p () && o.nonzero_p ()) + b->count = profile_count::zero ().afdo (); + if (dump_file) + { + fprintf (dump_file, " bb %i count updated ", b->index); + o.dump (dump_file); + fprintf (dump_file, " -> "); + b->count.dump (dump_file); + fprintf (dump_file, "\n"); + } + } +} + /* In case given basic block was fully optimized out, AutoFDO will have no data about it. In this case try to preserve static profile. Identify connected components (in undirected form of CFG) which has @@ -1558,26 +1612,33 @@ cmp (const void *a, const void *b) void afdo_adjust_guessed_profile (bb_set *annotated_bb) { - auto_sbitmap visited (last_basic_block_for_fn (cfun)); /* Basic blocks of connected component currently processed. */ - auto_vec <basic_block, 20> bbs (n_basic_blocks_for_fn (cfun) + 1); + auto_vec <basic_block, 20> bbs (n_basic_blocks_for_fn (cfun)); /* Scale factors found. */ - auto_vec <sreal, 20> scales (n_basic_blocks_for_fn (cfun) + 1); - auto_vec <basic_block, 20> stack (n_basic_blocks_for_fn (cfun) + 1); - - bitmap_clear (visited); + auto_vec <sreal, 20> scales; + auto_vec <basic_block, 20> stack (n_basic_blocks_for_fn (cfun)); basic_block seed_bb; - FOR_BB_BETWEEN (seed_bb, ENTRY_BLOCK_PTR_FOR_FN (cfun), - EXIT_BLOCK_PTR_FOR_FN (cfun), next_bb) - if (!is_bb_annotated (seed_bb, *annotated_bb) - && bitmap_set_bit (visited, seed_bb->index)) + unsigned int component_id = 1; + + /* Map from basic block to its component. + 0 is used for univisited BBs, + 1 means that BB is annotated, + >=2 is an id of the component BB belongs to. */ + auto_vec <unsigned int, 20> component; + component.safe_grow (last_basic_block_for_fn (cfun)); + FOR_ALL_BB_FN (seed_bb, cfun) + component[seed_bb->index] + = is_bb_annotated (seed_bb, *annotated_bb) ? 1 : 0; + FOR_ALL_BB_FN (seed_bb, cfun) + if (!component[seed_bb->index]) { - hash_set <basic_block> current_component; - stack.quick_push (seed_bb); + component_id++; bbs.truncate (0); scales.truncate (0); + component[seed_bb->index] = component_id; + profile_count max_count = profile_count::zero (); /* Identify connected component starting in BB. */ if (dump_file) @@ -1588,19 +1649,33 @@ afdo_adjust_guessed_profile (bb_set *annotated_bb) basic_block b = stack.pop (); bbs.quick_push (b); - current_component.add (b); + max_count = max_count.max (b->count); for (edge e: b->preds) - if (!is_bb_annotated (e->src, *annotated_bb) - && bitmap_set_bit (visited, e->src->index)) - stack.quick_push (e->src); + if (!component[e->src->index]) + { + stack.quick_push (e->src); + component[e->src->index] = component_id; + } for (edge e: b->succs) - if (!is_bb_annotated (e->dest, *annotated_bb) - && bitmap_set_bit (visited, e->dest->index)) - stack.quick_push (e->dest); + if (!component[e->dest->index]) + { + stack.quick_push (e->dest); + component[e->dest->index] = component_id; + } } while (!stack.is_empty ()); + /* If all blocks in components has 0 count, we do not need + to scale, only we must convert to IPA quality. */ + if (!max_count.nonzero_p ()) + { + if (dump_file) + fprintf (dump_file, " All counts are 0; scale = 1\n"); + scale_bbs (bbs, 1); + continue; + } + /* Now visit the component and try to figure out its desired frequency. */ for (basic_block b : bbs) @@ -1653,84 +1728,83 @@ afdo_adjust_guessed_profile (bb_set *annotated_bb) } else { - gcc_checking_assert (current_component.contains (e->src)); current_component_count += e->count (); + gcc_checking_assert (component[e->src->index] == component_id); } if (boundary && current_component_count.initialized_p ()) { - profile_count in_count = b->count - current_component_count; if (dump_file) - { - fprintf (dump_file, " bb %i in count ", b->index); - in_count.dump (dump_file); - fprintf (dump_file, " should be "); - annotated_count.dump (dump_file); - fprintf (dump_file, "\n"); - } - if (in_count.nonzero_p ()) - { - sreal scale - = annotated_count.guessed_local () - .to_sreal_scale (in_count); - if (dump_file) - fprintf (dump_file, " adding scale %.16f\n", - scale.to_double ()); - scales.safe_push (scale); - } + fprintf (dump_file, " bb %i in count ", b->index); + add_scale (&scales, + annotated_count, + b->count - current_component_count); } for (edge e: b->succs) if (AFDO_EINFO (e)->is_annotated ()) { - profile_count out_count = e->count (); - profile_count annotated_count = AFDO_EINFO (e)->get_count (); if (dump_file) - { - fprintf (dump_file, " edge %i->%i count ", - b->index, e->dest->index); - out_count.dump (dump_file); - fprintf (dump_file, " should be "); - annotated_count.dump (dump_file); - fprintf (dump_file, "\n"); - } - if (out_count.nonzero_p ()) - { - sreal scale - = annotated_count.guessed_local () - .to_sreal_scale (out_count); - if (dump_file) - fprintf (dump_file, " adding scale %.16f\n", - scale.to_double ()); - scales.safe_push (scale); - } + fprintf (dump_file, " edge %i->%i count ", + b->index, e->dest->index); + add_scale (&scales, AFDO_EINFO (e)->get_count (), e->count ()); + } + else if (is_bb_annotated (e->dest, *annotated_bb)) + { + profile_count annotated_count = e->dest->count; + profile_count out_count = profile_count::zero (); + bool ok = true; + for (edge e2: e->dest->preds) + if (AFDO_EINFO (e2)->is_annotated ()) + annotated_count -= AFDO_EINFO (e2)->get_count (); + else if (component[e->src->index] == component_id) + out_count += e->count (); + else if (e->probability.nonzero_p ()) + { + ok = false; + break; + } + if (!ok) + continue; + if (dump_file) + fprintf (dump_file, + " edge %i->%i has annotated sucessor; count ", + b->index, e->dest->index); + add_scale (&scales, annotated_count, e->count ()); } } + + /* If we failed to find annotated entry or exit edge, + look for exit edges and scale profile so the dest + BB get all flow it needs. This is inprecise because + the edge is not annotated and thus BB has more than + one such predecessor. */ + if (!scales.length ()) + for (basic_block b : bbs) + if (b->count.nonzero_p ()) + for (edge e: b->succs) + if (is_bb_annotated (e->dest, *annotated_bb)) + { + profile_count annotated_count = e->dest->count; + for (edge e2: e->dest->preds) + if (AFDO_EINFO (e2)->is_annotated ()) + annotated_count -= AFDO_EINFO (e2)->get_count (); + if (dump_file) + fprintf (dump_file, + " edge %i->%i has annotated sucessor;" + " upper bound count ", + b->index, e->dest->index); + add_scale (&scales, annotated_count, e->count ()); + } if (!scales.length ()) - continue; + { + if (dump_file) + fprintf (dump_file, + " Can not determine count from the boundary; giving up"); + continue; + } + gcc_checking_assert (scales.length ()); scales.qsort (cmp); - sreal scale = scales[scales.length () / 2]; - if (dump_file) - fprintf (dump_file, " Scaling by %.16f\n", scale.to_double ()); - for (basic_block b : bbs) - if (!(b->count == profile_count::zero ()) - && b->count.initialized_p ()) - { - profile_count o = b->count; - b->count = b->count.force_guessed () * scale; - - /* If we scaled to 0, make it auto-fdo since that is treated - less agressively. */ - if (!b->count.nonzero_p () && o.nonzero_p ()) - b->count = profile_count::zero ().afdo (); - if (dump_file) - { - fprintf (dump_file, " bb %i count updated ", b->index); - o.dump (dump_file); - fprintf (dump_file, " -> "); - b->count.dump (dump_file); - fprintf (dump_file, "\n"); - } - } + scale_bbs (bbs, scales[scales.length () / 2]); } }