Hi, this patch moves AFDO inlining from the early inliner into a specialized one. The reason is that the early inliner is by design non-recursive, while the AFDO inliner needs to recurse. In the past Google handled this by increasing the number of early inliner iterations, but it can be done easily and cheaply without that by simply recursing into inlined functions.
I will also look into moving VPT to the early inliner now. Bootstrapped/regtested x86_64-linux, committed. gcc/ChangeLog: * auto-profile.cc (get_inline_stack): Add fn parameter. * ipa-inline.cc (want_early_inline_function_p): Do not care about AFDO. (inline_functions_by_afdo): New function. (early_inliner): Use it. gcc/testsuite/ChangeLog: * gcc.dg/tree-prof/indir-call-prof-2.c: Fix template. gcc/testsuite/ChangeLog: * gcc.dg/tree-prof/afdo-vpt-earlyinline.c: Update template. * gcc.dg/tree-prof/indir-call-prof-2.c: Likewise. * gcc.dg/tree-prof/afdo-inline.c: New test. diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc index 3272cbec9b0..07580f8cc99 100644 --- a/gcc/auto-profile.cc +++ b/gcc/auto-profile.cc @@ -386,7 +386,8 @@ get_function_decl_from_block (tree block) /* Store inline stack for STMT in STACK. */ static void -get_inline_stack (location_t locus, inline_stack *stack) +get_inline_stack (location_t locus, inline_stack *stack, + tree fn = current_function_decl) { if (LOCATION_LOCUS (locus) == UNKNOWN_LOCATION) return; @@ -408,9 +409,7 @@ get_inline_stack (location_t locus, inline_stack *stack) locus = tmp_locus; } } - stack->safe_push ( - std::make_pair (current_function_decl, - get_combined_location (locus, current_function_decl))); + stack->safe_push (std::make_pair (fn, get_combined_location (locus, fn))); } /* Return STMT's combined location, which is a 32bit integer in which @@ -822,7 +821,19 @@ autofdo_source_profile::get_callsite_total_count ( { inline_stack stack; stack.safe_push (std::make_pair (edge->callee->decl, 0)); - get_inline_stack (gimple_location (edge->call_stmt), &stack); + + cgraph_edge *e = edge; + do + { + get_inline_stack (gimple_location (e->call_stmt), &stack, + e->caller->decl); + /* If caller is inlined, continue building stack. 
*/ + if (!e->caller->inlined_to) + e = NULL; + else + e = e->caller->callers; + } + while (e); function_instance *s = get_function_instance_by_inline_stack (stack); if (s == NULL diff --git a/gcc/ipa-inline.cc b/gcc/ipa-inline.cc index 35e5496d846..c4ea3782091 100644 --- a/gcc/ipa-inline.cc +++ b/gcc/ipa-inline.cc @@ -782,14 +782,6 @@ want_early_inline_function_p (struct cgraph_edge *e) if (DECL_DISREGARD_INLINE_LIMITS (callee->decl)) ; - /* For AutoFDO, we need to make sure that before profile summary, all - hot paths' IR look exactly the same as profiled binary. As a result, - in einliner, we will disregard size limit and inline those callsites - that are: - * inlined in the profiled binary, and - * the cloned callee has enough samples to be considered "hot". */ - else if (flag_auto_profile && afdo_callsite_hot_enough_for_early_inline (e)) - ; else if (!DECL_DECLARED_INLINE_P (callee->decl) && !opt_for_fn (e->caller->decl, flag_inline_small_functions)) { @@ -3117,6 +3109,81 @@ early_inline_small_functions (struct cgraph_node *node) return inlined; } +/* With auto-fdo inline all functions that was inlined in the train run + and inlining seems useful. That is there are enough samples in the callee + function. + + Unlike early inlining, we inline recursively. + TODO: We should also integrate VPT. 
*/ + +static bool +inline_functions_by_afdo (struct cgraph_node *node) +{ + if (!flag_auto_profile) + return false; + struct cgraph_edge *e; + bool inlined = false; + + for (e = node->callees; e; e = e->next_callee) + { + struct cgraph_node *callee = e->callee->ultimate_alias_target (); + + if (!e->inline_failed) + { + inlined |= inline_functions_by_afdo (e->callee); + continue; + } + if (!afdo_callsite_hot_enough_for_early_inline (e)) + continue; + + if (callee->definition + && !ipa_fn_summaries->get (callee)) + compute_fn_summary (callee, true); + + if (!can_early_inline_edge_p (e)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, e->call_stmt, + "Not inlining %C -> %C using auto-profile, %s.", + e->caller, e->callee, + cgraph_inline_failed_string (e->inline_failed)); + continue; + } + /* We can handle recursive inlining by first producing + inline clone. */ + if (e->recursive_p ()) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, e->call_stmt, + "Not inlining %C recursively" + " using auto-profile.\n", + e->callee); + continue; + } + + if (dump_enabled_p ()) + { + if (e->caller->inlined_to) + dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, e->call_stmt, + "Inlining using auto-profile %C into %C " + "which is transitively inlined to %C.\n", + callee, e->caller, e->caller->inlined_to); + else + dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, e->call_stmt, + "Inlining using auto-profile %C into %C.\n", + callee, e->caller); + } + inline_call (e, true, NULL, NULL, false); + inlined |= inline_functions_by_afdo (e->callee); + inlined = true; + } + + if (inlined && !node->inlined_to) + ipa_update_overall_fn_summary (node); + + return inlined; +} + unsigned int early_inliner (function *fun) { @@ -3192,9 +3259,12 @@ early_inliner (function *fun) /* We iterate incremental inlining to get trivial cases of indirect inlining. 
*/ while (iterations < opt_for_fn (node->decl, - param_early_inliner_max_iterations) - && early_inline_small_functions (node)) + param_early_inliner_max_iterations)) { + bool inlined = early_inline_small_functions (node); + inlined |= inline_functions_by_afdo (node); + if (!inlined) + break; timevar_push (TV_INTEGRATION); todo |= optimize_inline_calls (current_function_decl); diff --git a/gcc/testsuite/gcc.dg/tree-prof/afdo-inline.c b/gcc/testsuite/gcc.dg/tree-prof/afdo-inline.c new file mode 100644 index 00000000000..b67b3cb895a --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-prof/afdo-inline.c @@ -0,0 +1,27 @@ +/* { dg-options "-O2 -fdump-tree-einline-details --param early-inlining-insns=1" } */ +/* { dg-require-profiling "-fauto-profile" } */ +volatile int a[1000]; +int reta (int i) +{ + if (a[i]) + __builtin_printf ("It is one\n"); + if (a[i] == 2) + __builtin_printf ("It is two\n"); + return a[i]; +} +int test () +{ + int s = 0; + for (int pos = 0; pos < 1000; pos++) + reta(pos); + if (s) + __builtin_printf ("sum error\n"); +} +int main() +{ + for (int i = 0; i < 10000; i++) + test(); + return 0; +} +/* { dg-final-use-autofdo { scan-tree-dump "Inlining using auto-profile test" "einline"} } */ +/* { dg-final-use-autofdo { scan-tree-dump "Inlining using auto-profile reta.*transitively inlined to main" "einline"} } */ diff --git a/gcc/testsuite/gcc.dg/tree-prof/afdo-vpt-earlyinline.c b/gcc/testsuite/gcc.dg/tree-prof/afdo-vpt-earlyinline.c index 3b51ea9f8a9..48a404942db 100644 --- a/gcc/testsuite/gcc.dg/tree-prof/afdo-vpt-earlyinline.c +++ b/gcc/testsuite/gcc.dg/tree-prof/afdo-vpt-earlyinline.c @@ -1,4 +1,4 @@ -/* { dg-options "-O2 -fdump-ipa-afdo-details -fdump-tree-einline-details" } */ +/* { dg-options "-O2 -fdump-ipa-afdo-details -fdump-tree-einline-details --param early-inlining-insns=1" } */ /* { dg-require-profiling "-fauto-profile" } */ volatile int array[1000]; @@ -25,8 +25,8 @@ int main() test(&p); return 0; } -/* { dg-final-use-autofdo { scan-tree-dump 
"Inlining test" "einline"} } */ +/* { dg-final-use-autofdo { scan-tree-dump "Inlining using auto-profile test" "einline"} } */ /* { dg-final-use-autofdo { scan-ipa-dump "Checking indirect call -> direct call reta" "afdo"} } */ -/* { dg-final-use-autofdo { scan-ipa-dump "looks good" "afdo"} } */ +/* { dg-final-use-autofdo { scan-ipa-dump-times "looks good" 0 "afdo"} } */ /* If we inlined reta->test->main, it will contian array[pos]. */ /* { dg-final-use-autofdo { scan-ipa-dump "array.pos_" "afdo"} } */ diff --git a/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c b/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c index 1d64d9f3f62..53cc753cab5 100644 --- a/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c +++ b/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c @@ -31,5 +31,5 @@ main (void) } /* { dg-final-use-not-autofdo { scan-ipa-dump "Indirect call -> direct call.* add1 .will resolve by ipa-profile" "profile"} } */ /* { dg-final-use-not-autofdo { scan-ipa-dump "Indirect call -> direct call.* sub1 .will resolve by ipa-profile" "profile"} } */ -/* { dg-final-use-autofdo { scan-ipa-dump "Inlining add1/1 into main/4." "afdo"} } */ -/* { dg-final-use-autofdo { scan-ipa-dump "Inlining sub1/2 into main/4." "afdo"} } */ +/* { dg-final-use-autofdo { scan-ipa-dump "Inlining add1/. into main/" "afdo"} } */ +/* { dg-final-use-autofdo { scan-ipa-dump "Inlining sub1/. into main/" "afdo"} } */