https://gcc.gnu.org/g:8f40a8e8f8d1ebe931d52f914533036c2f950814

commit r16-1598-g8f40a8e8f8d1ebe931d52f914533036c2f950814
Author: Jan Hubicka <hubi...@ucw.cz>
Date:   Wed Jun 18 12:10:25 2025 +0200

    Implement afdo inliner
    
    This patch moves afdo inlining from early inliner into specialized one.
    The reason is that early inliner is by design non-recursive while afdo
    inliner needs to recurse.  In the past google handled it by increasing
    early inliner iterations, but it can be done easily and cheaply without
    it by simply recursing into inlined functions.
    
    I will also look into moving VPT to early inliner now.
    
    Bootstrapped/regtested x86_64-linux, committed.
    
    gcc/ChangeLog:
    
            * auto-profile.cc (get_inline_stack): Add fn parameter.
            * ipa-inline.cc (want_early_inline_function_p): Do not care
            about AFDO.
            (inline_functions_by_afdo): New function.
            (early_inliner): Use it.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.dg/tree-prof/afdo-vpt-earlyinline.c: Update template.
            * gcc.dg/tree-prof/indir-call-prof-2.c: Likewise.
            * gcc.dg/tree-prof/afdo-inline.c: New test.

Diff:
---
 gcc/auto-profile.cc                                | 21 +++--
 gcc/ipa-inline.cc                                  | 90 +++++++++++++++++++---
 gcc/testsuite/gcc.dg/tree-prof/afdo-inline.c       | 27 +++++++
 .../gcc.dg/tree-prof/afdo-vpt-earlyinline.c        |  6 +-
 gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c |  4 +-
 5 files changed, 128 insertions(+), 20 deletions(-)

diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index 3272cbec9b07..07580f8cc998 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -386,7 +386,8 @@ get_function_decl_from_block (tree block)
 /* Store inline stack for STMT in STACK.  */
 
 static void
-get_inline_stack (location_t locus, inline_stack *stack)
+get_inline_stack (location_t locus, inline_stack *stack,
+                 tree fn = current_function_decl)
 {
   if (LOCATION_LOCUS (locus) == UNKNOWN_LOCATION)
     return;
@@ -408,9 +409,7 @@ get_inline_stack (location_t locus, inline_stack *stack)
           locus = tmp_locus;
         }
     }
-  stack->safe_push (
-      std::make_pair (current_function_decl,
-                      get_combined_location (locus, current_function_decl)));
+  stack->safe_push (std::make_pair (fn, get_combined_location (locus, fn)));
 }
 
 /* Return STMT's combined location, which is a 32bit integer in which
@@ -822,7 +821,19 @@ autofdo_source_profile::get_callsite_total_count (
 {
   inline_stack stack;
   stack.safe_push (std::make_pair (edge->callee->decl, 0));
-  get_inline_stack (gimple_location (edge->call_stmt), &stack);
+
+  cgraph_edge *e = edge;
+  do
+    {
+      get_inline_stack (gimple_location (e->call_stmt), &stack,
+                       e->caller->decl);
+      /* If caller is inlined, continue building stack.  */
+      if (!e->caller->inlined_to)
+       e = NULL;
+      else
+       e = e->caller->callers;
+    }
+  while (e);
 
   function_instance *s = get_function_instance_by_inline_stack (stack);
   if (s == NULL
diff --git a/gcc/ipa-inline.cc b/gcc/ipa-inline.cc
index 35e5496d8463..c4ea37820913 100644
--- a/gcc/ipa-inline.cc
+++ b/gcc/ipa-inline.cc
@@ -782,14 +782,6 @@ want_early_inline_function_p (struct cgraph_edge *e)
 
   if (DECL_DISREGARD_INLINE_LIMITS (callee->decl))
     ;
-  /* For AutoFDO, we need to make sure that before profile summary, all
-     hot paths' IR look exactly the same as profiled binary. As a result,
-     in einliner, we will disregard size limit and inline those callsites
-     that are:
-       * inlined in the profiled binary, and
-       * the cloned callee has enough samples to be considered "hot".  */
-  else if (flag_auto_profile && afdo_callsite_hot_enough_for_early_inline (e))
-    ;
   else if (!DECL_DECLARED_INLINE_P (callee->decl)
           && !opt_for_fn (e->caller->decl, flag_inline_small_functions))
     {
@@ -3117,6 +3109,81 @@ early_inline_small_functions (struct cgraph_node *node)
   return inlined;
 }
 
+/* With auto-fdo inline all functions that was inlined in the train run
+   and inlining seems useful.  That is there are enough samples in the callee
+   function.
+
+   Unlike early inlining, we inline recursively.
+   TODO: We should also integrate VPT.  */
+
+static bool
+inline_functions_by_afdo (struct cgraph_node *node)
+{
+  if (!flag_auto_profile)
+    return false;
+  struct cgraph_edge *e;
+  bool inlined = false;
+
+  for (e = node->callees; e; e = e->next_callee)
+    {
+      struct cgraph_node *callee = e->callee->ultimate_alias_target ();
+
+      if (!e->inline_failed)
+       {
+         inlined |= inline_functions_by_afdo (e->callee);
+         continue;
+       }
+      if (!afdo_callsite_hot_enough_for_early_inline (e))
+       continue;
+
+      if (callee->definition
+         && !ipa_fn_summaries->get (callee))
+       compute_fn_summary (callee, true);
+
+      if (!can_early_inline_edge_p (e))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, e->call_stmt,
+                            "Not inlining %C -> %C using auto-profile, %s.",
+                            e->caller, e->callee,
+                            cgraph_inline_failed_string (e->inline_failed));
+         continue;
+       }
+      /* We can handle recursive inlining by first producing
+        inline clone.  */
+      if (e->recursive_p ())
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, e->call_stmt,
+                            "Not inlining %C recursively"
+                            " using auto-profile.\n",
+                            e->callee);
+         continue;
+       }
+
+      if (dump_enabled_p ())
+       {
+         if (e->caller->inlined_to)
+           dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, e->call_stmt,
+                            "Inlining using auto-profile %C into %C "
+                            "which is transitively inlined to %C.\n",
+                            callee, e->caller, e->caller->inlined_to);
+         else
+           dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, e->call_stmt,
+                            "Inlining using auto-profile %C into %C.\n",
+                            callee, e->caller);
+       }
+      inline_call (e, true, NULL, NULL, false);
+      inlined |= inline_functions_by_afdo (e->callee);
+      inlined = true;
+    }
+
+  if (inlined && !node->inlined_to)
+    ipa_update_overall_fn_summary (node);
+
+  return inlined;
+}
+
 unsigned int
 early_inliner (function *fun)
 {
@@ -3192,9 +3259,12 @@ early_inliner (function *fun)
       /* We iterate incremental inlining to get trivial cases of indirect
         inlining.  */
       while (iterations < opt_for_fn (node->decl,
-                                     param_early_inliner_max_iterations)
-            && early_inline_small_functions (node))
+                                     param_early_inliner_max_iterations))
        {
+         bool inlined = early_inline_small_functions (node);
+         inlined |= inline_functions_by_afdo (node);
+         if (!inlined)
+           break;
          timevar_push (TV_INTEGRATION);
          todo |= optimize_inline_calls (current_function_decl);
 
diff --git a/gcc/testsuite/gcc.dg/tree-prof/afdo-inline.c 
b/gcc/testsuite/gcc.dg/tree-prof/afdo-inline.c
new file mode 100644
index 000000000000..b67b3cb895a8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-prof/afdo-inline.c
@@ -0,0 +1,27 @@
+/* { dg-options "-O2 -fdump-tree-einline-details --param 
early-inlining-insns=1" } */
+/* { dg-require-profiling "-fauto-profile" } */ 
+volatile int a[1000];
+int reta (int i)
+{
+       if (a[i])
+               __builtin_printf ("It is one\n");
+       if (a[i] == 2)
+               __builtin_printf ("It is two\n");
+       return a[i];
+}
+int test ()
+{
+       int s = 0;
+       for (int pos = 0; pos < 1000; pos++)
+         reta(pos);
+       if (s)
+               __builtin_printf ("sum error\n");
+}
+int main()
+{
+       for (int i = 0; i < 10000; i++)
+               test();
+       return 0;
+}
+/* { dg-final-use-autofdo { scan-tree-dump "Inlining using auto-profile test" 
"einline"} } */
+/* { dg-final-use-autofdo { scan-tree-dump "Inlining using auto-profile 
reta.*transitively inlined to main" "einline"} } */
diff --git a/gcc/testsuite/gcc.dg/tree-prof/afdo-vpt-earlyinline.c 
b/gcc/testsuite/gcc.dg/tree-prof/afdo-vpt-earlyinline.c
index 3b51ea9f8a9d..48a404942db7 100644
--- a/gcc/testsuite/gcc.dg/tree-prof/afdo-vpt-earlyinline.c
+++ b/gcc/testsuite/gcc.dg/tree-prof/afdo-vpt-earlyinline.c
@@ -1,4 +1,4 @@
-/* { dg-options "-O2 -fdump-ipa-afdo-details -fdump-tree-einline-details" } */
+/* { dg-options "-O2 -fdump-ipa-afdo-details -fdump-tree-einline-details 
--param early-inlining-insns=1" } */
 /* { dg-require-profiling "-fauto-profile" } */ 
 
 volatile int array[1000];
@@ -25,8 +25,8 @@ int main()
                test(&p);
        return 0;
 }
-/* { dg-final-use-autofdo { scan-tree-dump "Inlining test" "einline"} } */
+/* { dg-final-use-autofdo { scan-tree-dump "Inlining using auto-profile test" 
"einline"} } */
 /* { dg-final-use-autofdo { scan-ipa-dump "Checking indirect call -> direct 
call reta" "afdo"} } */
-/* { dg-final-use-autofdo { scan-ipa-dump "looks good" "afdo"} } */
+/* { dg-final-use-autofdo { scan-ipa-dump-times "looks good" 0 "afdo"} } */
 /* If we inlined reta->test->main, it will contian array[pos].  */
 /* { dg-final-use-autofdo { scan-ipa-dump "array.pos_" "afdo"} } */
diff --git a/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c 
b/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c
index 1d64d9f3f622..53cc753cab53 100644
--- a/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c
+++ b/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c
@@ -31,5 +31,5 @@ main (void)
 }
 /* { dg-final-use-not-autofdo { scan-ipa-dump "Indirect call -> direct call.* 
add1 .will resolve by ipa-profile" "profile"} } */
 /* { dg-final-use-not-autofdo { scan-ipa-dump "Indirect call -> direct call.* 
sub1 .will resolve by ipa-profile" "profile"} } */
-/* { dg-final-use-autofdo { scan-ipa-dump "Inlining add1/1 into main/4." 
"afdo"} } */
-/* { dg-final-use-autofdo { scan-ipa-dump "Inlining sub1/2 into main/4." 
"afdo"} } */
+/* { dg-final-use-autofdo { scan-ipa-dump "Inlining add1/. into main/" "afdo"} 
} */
+/* { dg-final-use-autofdo { scan-ipa-dump "Inlining sub1/. into main/" "afdo"} 
} */

Reply via email to