Hi, the main difference between normal profile feedback and auto-fdo is that with profile feedback every basic block with a non-zero profile has an incoming edge with a non-zero profile. With auto-profile it is possible that none of the predecessors was sampled, and the tool also has a cutoff parameter which makes it ignore small counts.
This becomes a problem when one tries to specialize code and scale the profile. For example, if an inline function happens to have a hot loop with non-zero counts but its entry count is zero, and we want to inline it to a call with a non-zero count X, we want to scale the body by X/0, which we currently turn into X/1. This is a problem since I added logic to scale up the auto-profiles (to get some extra bits of precision), so X is often a large value and multiplying by X is not the right answer at all. The multiply factor should be <= 1. Iterating this a few times will make the counts cap and we will lose any useful info. The original implementation avoided this by doing all inlines before AFDO readback, but this is not possible with LTO (unless we move AFDO readback to WPA or add support for context sensitive profiles). I think I can get the scaling to work reasonably well and then we can look into possible benefits of context sensitive profiling, which can be implemented atop both AFDO and FDO. This patch adds a cutoff value to profile_info which is initialized by profile feedback to 1 and by auto-profile to the scale factor (since we do not know the cutoff create_gcov used; llvm's tool streams it and we probably should too). Then force_nonzero forces every value smaller than cutoff/2 + 1 up to cutoff/2 + 1, which should keep scaling factors in reasonable ranges. Bootstrapped/regtested x86_64-linux. gcc/ChangeLog: * auto-profile.cc (autofdo_source_profile::read): Scale cutoff. (read_autofdo_file): Initialize cutoff. * coverage.cc (read_counts_file): Initialize cutoff to 1. * gcov-io.h (struct gcov_summary): Add cutoff field. * ipa-inline.cc (inline_small_functions): max_count can be non-zero also with auto_profile. * lto-cgraph.cc (output_profile_summary): Write cutoff and sum_max. (input_profile_summary): Read cutoff and sum_max. (merge_profile_summaries): Initialize and scale global cutoffs and sum_max. 
* profile-count.cc: Include profile.h (profile_count::force_nonzero): move here from ...; use cutoff. * profile-count.h: (profile_count::force_nonzero): ... here. gcc/testsuite/ChangeLog: * gcc.dg/tree-prof/clone-merge-1.c: diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc index 64f4cda1b52..ea237bd484c 100644 --- a/gcc/auto-profile.cc +++ b/gcc/auto-profile.cc @@ -2522,6 +2590,7 @@ autofdo_source_profile::read () afdo_count_scale = MAX (((gcov_type)1 << (profile_count::n_bits / 2)) / afdo_profile_info->sum_max, 1); + afdo_profile_info->cutoff *= afdo_count_scale; afdo_hot_bb_threshod = hot_frac ? afdo_profile_info->sum_max * afdo_count_scale / hot_frac @@ -2531,10 +2600,12 @@ autofdo_source_profile::read () fprintf (dump_file, "Max count in profile %" PRIu64 "\n" "Setting scale %" PRIu64 "\n" "Scaled max count %" PRIu64 "\n" + "Cutoff %" PRIu64 "\n" "Hot count threshold %" PRIu64 "\n\n", (int64_t)afdo_profile_info->sum_max, (int64_t)afdo_count_scale, (int64_t)(afdo_profile_info->sum_max * afdo_count_scale), + (int64_t)afdo_profile_info->cutoff, (int64_t)afdo_hot_bb_threshod); afdo_profile_info->sum_max *= afdo_count_scale; return true; @@ -3865,6 +3936,7 @@ read_autofdo_file (void) autofdo::afdo_profile_info = XNEW (gcov_summary); autofdo::afdo_profile_info->runs = 1; autofdo::afdo_profile_info->sum_max = 0; + autofdo::afdo_profile_info->cutoff = 1; /* Read the profile from the profile file. 
*/ autofdo::read_profile (); diff --git a/gcc/coverage.cc b/gcc/coverage.cc index dd3ed2ed842..75a24c61448 100644 --- a/gcc/coverage.cc +++ b/gcc/coverage.cc @@ -238,6 +238,7 @@ read_counts_file (void) gcov_profile_info = profile_info = XCNEW (gcov_summary); profile_info->runs = gcov_read_unsigned (); profile_info->sum_max = gcov_read_unsigned (); + profile_info->cutoff = 1; } else if (GCOV_TAG_IS_COUNTER (tag) && fn_ident) { diff --git a/gcc/gcov-io.h b/gcc/gcov-io.h index d48291c1fe3..f3e3a1c08da 100644 --- a/gcc/gcov-io.h +++ b/gcc/gcov-io.h @@ -349,6 +349,11 @@ struct gcov_summary { gcov_unsigned_t runs; /* Number of program runs. */ gcov_type sum_max; /* Sum of individual run max values. */ + gcov_type cutoff; /* Values smaller than this value are not + reliable (0 may mean non-zero). + For read profile cutoff is typically 1 + however when we scale up or use auto-fdo + it may become bigger value. */ }; #if !defined(inhibit_libc) diff --git a/gcc/ipa-inline.cc b/gcc/ipa-inline.cc index ca605b027dc..0cf97a80687 100644 --- a/gcc/ipa-inline.cc +++ b/gcc/ipa-inline.cc @@ -2222,6 +2222,7 @@ inline_small_functions (void) gcc_assert (in_lto_p || !(max_count > 0) + || flag_auto_profile || (profile_info && flag_branch_probabilities)); while (!edge_heap.empty ()) diff --git a/gcc/lto-cgraph.cc b/gcc/lto-cgraph.cc index ec34f659d6a..b0b355cbadd 100644 --- a/gcc/lto-cgraph.cc +++ b/gcc/lto-cgraph.cc @@ -718,11 +718,12 @@ output_profile_summary (struct lto_simple_output_block *ob) { if (profile_info) { - /* We do not output num and run_max, they are not used by - GCC profile feedback and they are difficult to merge from multiple - units. */ unsigned runs = (profile_info->runs); streamer_write_uhwi_stream (ob->main_stream, runs); + streamer_write_gcov_count_stream (ob->main_stream, + profile_info->sum_max); + streamer_write_gcov_count_stream (ob->main_stream, + profile_info->cutoff); /* IPA-profile computes hot bb threshold based on cumulated whole program profile. 
We need to stream it down to ltrans. */ @@ -1678,6 +1679,8 @@ input_profile_summary (class lto_input_block *ib, if (runs) { file_data->profile_info.runs = runs; + file_data->profile_info.sum_max = streamer_read_gcov_count (ib); + file_data->profile_info.cutoff = streamer_read_gcov_count (ib); /* IPA-profile computes hot bb threshold based on cumulated whole program profile. We need to stream it down to ltrans. */ @@ -1719,6 +1722,8 @@ merge_profile_summaries (struct lto_file_decl_data **file_data_vec) profile_info = XCNEW (gcov_summary); profile_info->runs = max_runs; + profile_info->sum_max = 0; + profile_info->cutoff = 0; /* If merging already happent at WPA time, we are done. */ if (flag_ltrans) @@ -1735,6 +1740,14 @@ merge_profile_summaries (struct lto_file_decl_data **file_data_vec) scale = RDIV (node->count_materialization_scale * max_runs, node->lto_file_data->profile_info.runs); + gcov_type sum_max = RDIV (node->lto_file_data->profile_info.sum_max * max_runs, + node->lto_file_data->profile_info.runs); + gcov_type cutoff = RDIV (node->lto_file_data->profile_info.cutoff * max_runs, + node->lto_file_data->profile_info.runs); + if (sum_max > profile_info->sum_max) + profile_info->sum_max = sum_max; + if (cutoff > profile_info->cutoff) + profile_info->cutoff = cutoff; node->count_materialization_scale = scale; if (scale < 0) fatal_error (input_location, "Profile information in %s corrupted", diff --git a/gcc/profile-count.cc b/gcc/profile-count.cc index 21477008b70..8f05a79a437 100644 --- a/gcc/profile-count.cc +++ b/gcc/profile-count.cc @@ -32,6 +32,7 @@ along with GCC; see the file COPYING3. If not see #include "cgraph.h" #include "wide-int.h" #include "sreal.h" +#include "profile.h" /* Names from profile_quality enum values. */ @@ -570,3 +571,27 @@ profile_count::operator*= (const sreal &num) { return *this * num; } + +/* Make counter forcibly nonzero. 
*/ +profile_count +profile_count::force_nonzero () const +{ + if (!initialized_p ()) + return *this; + profile_count ret = *this; + /* Generally values are forced non-zero to handle inconsistent profile + where count 0 needs to be scaled up to non-zero. + + Use cutoff value here to avoid situation where profile has large + cutoff and we perform count = count * num / den where num is non-zero + and den is 0. If profile was scaled by large factor, forcing value + to 1 would lead to large scale factor. */ + gcov_unsigned_t small = profile_info ? profile_info->cutoff / 2 + 1 + : 1; + if (ret.m_val < small) + { + ret.m_val = small; + ret.m_quality = MIN (m_quality, ADJUSTED); + } + return ret; +} diff --git a/gcc/profile-count.h b/gcc/profile-count.h index 216054033c5..20c03a29238 100644 --- a/gcc/profile-count.h +++ b/gcc/profile-count.h @@ -1112,18 +1112,7 @@ public: } /* Make counter forcibly nonzero. */ - profile_count force_nonzero () const - { - if (!initialized_p ()) - return *this; - profile_count ret = *this; - if (ret.m_val == 0) - { - ret.m_val = 1; - ret.m_quality = MIN (m_quality, ADJUSTED); - } - return ret; - } + profile_count force_nonzero () const; profile_count max (profile_count other) const { diff --git a/gcc/testsuite/gcc.dg/tree-prof/clone-merge-1.c b/gcc/testsuite/gcc.dg/tree-prof/clone-merge-1.c index 43a909054b5..904dd0cfb28 100644 --- a/gcc/testsuite/gcc.dg/tree-prof/clone-merge-1.c +++ b/gcc/testsuite/gcc.dg/tree-prof/clone-merge-1.c @@ -31,4 +31,4 @@ int main() } /* We will have profiles for test2 and test2.constprop.0 that will have to be merged, */ -/* { dg-final-use-autofdo { scan-ipa-dump "Merging duplicate symbol test2" "afdo_offline"} } */