https://gcc.gnu.org/g:260252e7dc07bd6e201c76c24d858efaea4a1a78

commit r16-1545-g260252e7dc07bd6e201c76c24d858efaea4a1a78
Author: Jan Hubicka <hubi...@ucw.cz>
Date:   Tue Jun 17 17:26:18 2025 +0200

    Improve static and AFDO profile combination
    
    This patch makes afdo_adjust_guessed_profile more agressive on finding 
scales
    on the boundaries of connected components with no annotation.  Originaly I
    looked for edges into or out of the component with known AFDO counts and I 
also
    haled edges from basic block with known AFDO count and known static 
probability
    estimate.
    
    Common problem is with components not containing any in edges, but only out
    edges (i.e.  those with ENTRY_BLOCK).  In this case I added logic that looks
    for edges out of the component to BBs with known AFDO count.  If all flow to
    the BB is either from the component or has AFDO count, we can deterine scale
    precisely.  It may happen that there are edges from other components. In 
this
    case we know upper bound and use it, since it is better than nothing.
    
    I also noticed that some components have 0 count in all profile and then 
scaling
    gives up, which is fixed.  I also optimized the code a bit by replacing
    map holding current component with an array holding component ID and broke 
out
    saling logic into separate functions.
    
    The patch fixes perl regression I introduced in last change.
    according to
    https://lnt.opensuse.org/db_default/v4/SPEC/67674
    there were improvements (percentage is runtime change):
    
    538.imagick_r   -32.52%
    549.fotonik3d_r -22.68%
    520.omnetpp_r   -12.37%
    503.bwaves_r    -8.71%
    508.namd_r      -5.10%
    526.blender_r   -2.11%
    
    and regressions:
    
    554.roms_r      45.95%
    527.cam4_r      21.69%
    511.povray_r    13.59%
    500.perlbench_r 10.19%
    507.cactuBSSN_r 9.81%
    510.parest_r    9.69%
    548.exchange2_r 8.42%
    502.gcc_r       5.10%
    544.nab_r       3.76%
    519.lbm_r       2.34%
    541.leela_r     2.16%
    525.x264_r      2.14%
    
    This is a bit wild, but hope things will settle donw once we chase out
    obvious problems (such as losing the profile of functions that has not been
    inlined).
    
    gcc/ChangeLog:
    
            * auto-profile.cc (afdo_indirect_call): Compute speculative edge
            probability.
            (add_scale): Break out from ...
            (scale_bbs): Break out from ...
            (afdo_adjust_guessed_profile): ... here; use componet array instead 
of
            current_component hash_map; handle components with only 0 profile;
            be more agressive on finding scales along the boundary.

Diff:
---
 gcc/auto-profile.cc | 244 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 159 insertions(+), 85 deletions(-)

diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index 8918d1e9f09d..3272cbec9b07 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -1076,10 +1076,10 @@ afdo_indirect_call (gimple_stmt_iterator *gsi, const 
icall_target_map &map,
       fprintf (dump_file, "\n");
     }
 
-  /* FIXME: Count should be initialized.  */
   struct cgraph_edge *new_edge
-      = indirect_edge->make_speculative (direct_call,
-                                        profile_count::uninitialized ());
+      = indirect_edge->make_speculative
+               (direct_call,
+                gimple_bb (stmt)->count.apply_scale (99, 100));
   cgraph_edge::redirect_call_stmt_to_callee (new_edge);
   gimple_remove_histogram_value (cfun, stmt, hist);
   inline_call (new_edge, true, NULL, NULL, false);
@@ -1549,6 +1549,60 @@ cmp (const void *a, const void *b)
   return 0;
 }
 
+/* Add scale ORIG/ANNOTATED to SCALES.  */
+
+static void
+add_scale (vec <sreal> *scales, profile_count annotated, profile_count orig)
+{
+  if (dump_file)
+    {
+      orig.dump (dump_file);
+      fprintf (dump_file, " should be ");
+      annotated.dump (dump_file);
+      fprintf (dump_file, "\n");
+    }
+  if (orig.nonzero_p ())
+    {
+      sreal scale
+       = annotated.guessed_local ()
+               .to_sreal_scale (orig);
+      if (dump_file)
+       fprintf (dump_file, "    adding scale %.16f\n",
+                scale.to_double ());
+      scales->safe_push (scale);
+    }
+}
+
+/* Scale counts of all basic blocks in BBS by SCALE and convert them to
+   IPA quality.  */
+
+static void
+scale_bbs (const vec <basic_block> &bbs, sreal scale)
+{
+  if (dump_file)
+    fprintf (dump_file, "  Scaling by %.16f\n", scale.to_double ());
+  for (basic_block b : bbs)
+    if (!(b->count == profile_count::zero ())
+       && b->count.initialized_p ())
+      {
+       profile_count o = b->count;
+       b->count = b->count.force_guessed () * scale;
+
+       /* If we scaled to 0, make it auto-fdo since that is treated
+          less agressively.  */
+       if (!b->count.nonzero_p () && o.nonzero_p ())
+         b->count = profile_count::zero ().afdo ();
+       if (dump_file)
+         {
+           fprintf (dump_file, "    bb %i count updated ", b->index);
+           o.dump (dump_file);
+           fprintf (dump_file, " -> ");
+           b->count.dump (dump_file);
+           fprintf (dump_file, "\n");
+         }
+      }
+}
+
 /* In case given basic block was fully optimized out, AutoFDO
    will have no data about it.  In this case try to preserve static profile.
    Identify connected components (in undirected form of CFG) which has
@@ -1558,26 +1612,33 @@ cmp (const void *a, const void *b)
 void
 afdo_adjust_guessed_profile (bb_set *annotated_bb)
 {
-  auto_sbitmap visited (last_basic_block_for_fn (cfun));
   /* Basic blocks of connected component currently processed.  */
-  auto_vec <basic_block, 20> bbs (n_basic_blocks_for_fn (cfun) + 1);
+  auto_vec <basic_block, 20> bbs (n_basic_blocks_for_fn (cfun));
   /* Scale factors found.  */
-  auto_vec <sreal, 20> scales (n_basic_blocks_for_fn (cfun) + 1);
-  auto_vec <basic_block, 20> stack (n_basic_blocks_for_fn (cfun) + 1);
-
-  bitmap_clear (visited);
+  auto_vec <sreal, 20> scales;
+  auto_vec <basic_block, 20> stack (n_basic_blocks_for_fn (cfun));
 
   basic_block seed_bb;
-  FOR_BB_BETWEEN (seed_bb, ENTRY_BLOCK_PTR_FOR_FN (cfun),
-                 EXIT_BLOCK_PTR_FOR_FN (cfun), next_bb)
-   if (!is_bb_annotated (seed_bb, *annotated_bb)
-       && bitmap_set_bit (visited, seed_bb->index))
+  unsigned int component_id = 1;
+
+  /* Map from basic block to its component.
+     0   is used for univisited BBs,
+     1   means that BB is annotated,
+     >=2 is an id of the component BB belongs to.  */
+  auto_vec <unsigned int, 20> component;
+  component.safe_grow (last_basic_block_for_fn (cfun));
+  FOR_ALL_BB_FN (seed_bb, cfun)
+    component[seed_bb->index]
+       = is_bb_annotated (seed_bb, *annotated_bb) ? 1 : 0;
+  FOR_ALL_BB_FN (seed_bb, cfun)
+   if (!component[seed_bb->index])
      {
-       hash_set <basic_block> current_component;
-
        stack.quick_push (seed_bb);
+       component_id++;
        bbs.truncate (0);
        scales.truncate (0);
+       component[seed_bb->index] = component_id;
+       profile_count max_count = profile_count::zero ();
 
        /* Identify connected component starting in BB.  */
        if (dump_file)
@@ -1588,19 +1649,33 @@ afdo_adjust_guessed_profile (bb_set *annotated_bb)
           basic_block b = stack.pop ();
 
           bbs.quick_push (b);
-          current_component.add (b);
+          max_count = max_count.max (b->count);
 
           for (edge e: b->preds)
-            if (!is_bb_annotated (e->src, *annotated_bb)
-                && bitmap_set_bit (visited, e->src->index))
-               stack.quick_push (e->src);
+            if (!component[e->src->index])
+              {
+                 stack.quick_push (e->src);
+                 component[e->src->index] = component_id;
+              }
           for (edge e: b->succs)
-            if (!is_bb_annotated (e->dest, *annotated_bb)
-                && bitmap_set_bit (visited, e->dest->index))
-               stack.quick_push (e->dest);
+            if (!component[e->dest->index])
+              {
+                 stack.quick_push (e->dest);
+                 component[e->dest->index] = component_id;
+              }
         }
        while (!stack.is_empty ());
 
+       /* If all blocks in components has 0 count, we do not need
+         to scale, only we must convert to IPA quality.  */
+       if (!max_count.nonzero_p ())
+        {
+          if (dump_file)
+            fprintf (dump_file, "  All counts are 0; scale = 1\n");
+          scale_bbs (bbs, 1);
+          continue;
+        }
+
        /* Now visit the component and try to figure out its desired
          frequency.  */
        for (basic_block b : bbs)
@@ -1653,84 +1728,83 @@ afdo_adjust_guessed_profile (bb_set *annotated_bb)
               }
             else
               {
-                gcc_checking_assert (current_component.contains (e->src));
                 current_component_count += e->count ();
+                gcc_checking_assert (component[e->src->index] == component_id);
               }
           if (boundary && current_component_count.initialized_p ())
             {
-              profile_count in_count = b->count - current_component_count;
               if (dump_file)
-                {
-                  fprintf (dump_file, "    bb %i in count ", b->index);
-                  in_count.dump (dump_file);
-                  fprintf (dump_file, " should be ");
-                  annotated_count.dump (dump_file);
-                  fprintf (dump_file, "\n");
-                }
-              if (in_count.nonzero_p ())
-                {
-                  sreal scale
-                    = annotated_count.guessed_local ()
-                            .to_sreal_scale (in_count);
-                  if (dump_file)
-                    fprintf (dump_file, "    adding scale %.16f\n",
-                             scale.to_double ());
-                  scales.safe_push (scale);
-                }
+                fprintf (dump_file, "    bb %i in count ", b->index);
+              add_scale (&scales,
+                         annotated_count,
+                         b->count - current_component_count);
             }
           for (edge e: b->succs)
             if (AFDO_EINFO (e)->is_annotated ())
               {
-                profile_count out_count = e->count ();
-                profile_count annotated_count = AFDO_EINFO (e)->get_count ();
                 if (dump_file)
-                  {
-                    fprintf (dump_file, "    edge %i->%i count ",
-                             b->index, e->dest->index);
-                    out_count.dump (dump_file);
-                    fprintf (dump_file, " should be ");
-                    annotated_count.dump (dump_file);
-                    fprintf (dump_file, "\n");
-                  }
-                if (out_count.nonzero_p ())
-                  {
-                    sreal scale
-                      = annotated_count.guessed_local ()
-                              .to_sreal_scale (out_count);
-                    if (dump_file)
-                      fprintf (dump_file, "    adding scale %.16f\n",
-                               scale.to_double ());
-                    scales.safe_push (scale);
-                  }
+                  fprintf (dump_file, "    edge %i->%i count ",
+                           b->index, e->dest->index);
+                add_scale (&scales, AFDO_EINFO (e)->get_count (), e->count ());
+              }
+            else if (is_bb_annotated (e->dest, *annotated_bb))
+              {
+                profile_count annotated_count = e->dest->count;
+                profile_count out_count = profile_count::zero ();
+                bool ok = true;
+                for (edge e2: e->dest->preds)
+                  if (AFDO_EINFO (e2)->is_annotated ())
+                    annotated_count -= AFDO_EINFO (e2)->get_count ();
+                  else if (component[e->src->index] == component_id)
+                    out_count += e->count ();
+                  else if (e->probability.nonzero_p ())
+                    {
+                      ok = false;
+                      break;
+                    }
+                if (!ok)
+                  continue;
+                if (dump_file)
+                  fprintf (dump_file,
+                           "    edge %i->%i has annotated sucessor; count ",
+                           b->index, e->dest->index);
+                add_scale (&scales, annotated_count, e->count ());
               }
 
         }
+
+       /* If we failed to find annotated entry or exit edge,
+         look for exit edges and scale profile so the dest
+         BB get all flow it needs.  This is inprecise because
+         the edge is not annotated and thus BB has more than
+         one such predecessor.  */
+       if (!scales.length ())
+        for (basic_block b : bbs)
+          if (b->count.nonzero_p ())
+            for (edge e: b->succs)
+              if (is_bb_annotated (e->dest, *annotated_bb))
+                {
+                  profile_count annotated_count = e->dest->count;
+                  for (edge e2: e->dest->preds)
+                    if (AFDO_EINFO (e2)->is_annotated ())
+                      annotated_count -= AFDO_EINFO (e2)->get_count ();
+                  if (dump_file)
+                    fprintf (dump_file,
+                             "    edge %i->%i has annotated sucessor;"
+                             " upper bound count ",
+                             b->index, e->dest->index);
+                  add_scale (&scales, annotated_count, e->count ());
+                }
        if (!scales.length ())
-        continue;
+        {
+          if (dump_file)
+            fprintf (dump_file,
+                     "  Can not determine count from the boundary; giving up");
+          continue;
+        }
+       gcc_checking_assert (scales.length ());
        scales.qsort (cmp);
-       sreal scale = scales[scales.length () / 2];
-       if (dump_file)
-        fprintf (dump_file, "  Scaling by %.16f\n", scale.to_double ());
-       for (basic_block b : bbs)
-        if (!(b->count == profile_count::zero ())
-            && b->count.initialized_p ())
-          {
-            profile_count o = b->count;
-            b->count = b->count.force_guessed () * scale;
-
-            /* If we scaled to 0, make it auto-fdo since that is treated
-               less agressively.  */
-            if (!b->count.nonzero_p () && o.nonzero_p ())
-              b->count = profile_count::zero ().afdo ();
-            if (dump_file)
-              {
-                fprintf (dump_file, "    bb %i count updated ", b->index);
-                o.dump (dump_file);
-                fprintf (dump_file, " -> ");
-                b->count.dump (dump_file);
-                fprintf (dump_file, "\n");
-              }
-          }
+       scale_bbs (bbs, scales[scales.length () / 2]);
      }
 }

Reply via email to