The following patch changes the inliner callsite filter with FDO/LIPO.
Previously, cold callsites were unconditionally rejected. Now the
callsite may still be inlined if the _caller_ is sufficiently hot (max
count of any bb in the function is above hot threshold). This gives
about 0.5 - 1% geomean performance on x86-64 (depending on microarch)
on internal benchmarks with < 1% average code size increase.
Bootstrapped and reg tested. Ok for google/gcc-4_6?
Mark
2011-08-23 Mark Heffernan <[email protected]>
* basic-block.h (maybe_hot_frequency_p): Add prototype.
* cgraph.c (dump_cgraph_node): Add field to dump.
(cgraph_clone_node): Handle new field.
* cgraph.h (cgraph_node): New field max_bb_count.
* cgraphbuild.c (rebuild_cgraph_edges): Compute max_bb_count.
* cgraphunit.c (cgraph_copy_node_for_versioning): Handle new field.
* common.opt (finline-hot-caller): New option.
* ipa-inline.c (cgraph_mark_inline_edge): Update max_bb_count.
(edge_hot_enough_p): New function.
(cgraph_decide_inlining_of_small_functions): Call edge_hot_enough_p.
* predict.c (maybe_hot_frequency_p): Remove static keyword and
guard with profile_info check.
* testsuite/gcc.dg/tree-prof/inliner-1.c: Add flag.
* testsuite/gcc.dg/tree-prof/lipo/inliner-1_0.c: Add flag.
Index: cgraphbuild.c
===================================================================
--- cgraphbuild.c (revision 177964)
+++ cgraphbuild.c (working copy)
@@ -591,9 +591,12 @@ rebuild_cgraph_edges (void)
ipa_remove_all_references (&node->ref_list);
node->count = ENTRY_BLOCK_PTR->count;
+ node->max_bb_count = 0;
FOR_EACH_BB (bb)
{
+ if (bb->count > node->max_bb_count)
+ node->max_bb_count = bb->count;
for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
{
gimple stmt = gsi_stmt (gsi);
Index: cgraph.c
===================================================================
--- cgraph.c (revision 177964)
+++ cgraph.c (working copy)
@@ -1904,6 +1904,9 @@ dump_cgraph_node (FILE *f, struct cgraph
if (node->count)
fprintf (f, " executed "HOST_WIDEST_INT_PRINT_DEC"x",
(HOST_WIDEST_INT)node->count);
+ if (node->max_bb_count)
+ fprintf (f, " hottest bb executed "HOST_WIDEST_INT_PRINT_DEC"x",
+ (HOST_WIDEST_INT)node->max_bb_count);
if (node->local.inline_summary.self_time)
fprintf (f, " %i time, %i benefit", node->local.inline_summary.self_time,
node->local.inline_summary.time_inlining_benefit);
@@ -2234,6 +2237,9 @@ cgraph_clone_node (struct cgraph_node *n
new_node->global = n->global;
new_node->rtl = n->rtl;
new_node->count = count;
+ new_node->max_bb_count = count;
+ if (n->count)
+ new_node->max_bb_count = count * n->max_bb_count / n->count;
new_node->is_versioned_clone = n->is_versioned_clone;
new_node->frequency = n->frequency;
new_node->clone = n->clone;
@@ -2252,6 +2258,9 @@ cgraph_clone_node (struct cgraph_node *n
n->count -= count;
if (n->count < 0)
n->count = 0;
+ n->max_bb_count -= new_node->max_bb_count;
+ if (n->max_bb_count < 0)
+ n->max_bb_count = 0;
}
FOR_EACH_VEC_ELT (cgraph_edge_p, redirect_callers, i, e)
Index: cgraph.h
===================================================================
--- cgraph.h (revision 177964)
+++ cgraph.h (working copy)
@@ -235,6 +235,8 @@ struct GTY((chain_next ("%h.next"), chai
/* Expected number of executions: calculated in profile.c. */
gcov_type count;
+ /* Maximum count of any basic block in the function. */
+ gcov_type max_bb_count;
/* How to scale counts at materialization time; used to merge
LTO units with different number of profile runs. */
int count_materialization_scale;
Index: cgraphunit.c
===================================================================
--- cgraphunit.c (revision 177964)
+++ cgraphunit.c (working copy)
@@ -2187,6 +2187,7 @@ cgraph_copy_node_for_versioning (struct
new_version->rtl = old_version->rtl;
new_version->reachable = true;
new_version->count = old_version->count;
+ new_version->max_bb_count = old_version->max_bb_count;
new_version->is_versioned_clone = true;
for (e = old_version->callees; e; e=e->next_callee)
Index: testsuite/gcc.dg/tree-prof/inliner-1.c
===================================================================
--- testsuite/gcc.dg/tree-prof/inliner-1.c (revision 177964)
+++ testsuite/gcc.dg/tree-prof/inliner-1.c (working copy)
@@ -1,4 +1,4 @@
-/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-options "-O2 -fno-inline-hot-caller -fdump-tree-optimized" } */
int a;
int b[100];
void abort (void);
@@ -34,7 +34,7 @@ main ()
return 0;
}
-/* cold function should be inlined, while hot function should not.
+/* cold function should not be inlined, while hot function should be.
Look for "cold_function () [tail call];" call statement not for the
declaration or other apperances of the string in dump. */
/* { dg-final-use { scan-tree-dump "cold_function ..;" "optimized"} } */
Index: testsuite/gcc.dg/tree-prof/lipo/inliner-1_0.c
===================================================================
--- testsuite/gcc.dg/tree-prof/lipo/inliner-1_0.c (revision 177964)
+++ testsuite/gcc.dg/tree-prof/lipo/inliner-1_0.c (working copy)
@@ -1,4 +1,4 @@
-/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-options "-O2 -fno-inline-hot-caller -fdump-tree-optimized" } */
int a;
int b[100];
void abort (void);
@@ -34,7 +34,7 @@ main ()
return 0;
}
-/* cold function should be inlined, while hot function should not.
+/* cold function should not be inlined, while hot function should be.
Look for "cold_function () [tail call];" call statement not for the
declaration or other apperances of the string in dump. */
/* { dg-final-use { scan-tree-dump "cold_function ..;" "optimized"} } */
Index: ipa-inline.c
===================================================================
--- ipa-inline.c (revision 177964)
+++ ipa-inline.c (working copy)
@@ -332,6 +332,9 @@ cgraph_mark_inline_edge (struct cgraph_e
new_size = cgraph_estimate_size_after_inlining (to, what);
to->global.size = new_size;
to->global.time = cgraph_estimate_time_after_inlining (freq, to, what);
+
+ if (to->max_bb_count < e->callee->max_bb_count)
+ to->max_bb_count = e->callee->max_bb_count;
}
gcc_assert (what->global.inlined_to == to);
if (new_size > old_size)
@@ -1057,6 +1060,19 @@ add_new_edges_to_heap (fibheap_t heap, V
}
}
+/* Returns true if an edge or its caller are hot enough to
+ be considered for inlining. */
+
+static bool
+edge_hot_enough_p (struct cgraph_edge *edge)
+{
+ if (cgraph_maybe_hot_edge_p (edge))
+ return true;
+ if (flag_inline_hot_caller && maybe_hot_count_p (edge->caller->max_bb_count))
+ return true;
+ return false;
+}
+
/* We use greedy algorithm for inlining of small functions:
All inline candidates are put into prioritized heap based on estimated
@@ -1201,7 +1217,7 @@ cgraph_decide_inlining_of_small_function
if (edge->callee->local.disregard_inline_limits)
;
- else if (!cgraph_maybe_hot_edge_p (edge))
+ else if (!edge_hot_enough_p (edge))
not_good = CIF_UNLIKELY_CALL;
else if (!flag_inline_functions
&& !DECL_DECLARED_INLINE_P (edge->callee->decl))
Index: predict.c
===================================================================
--- predict.c (revision 177964)
+++ predict.c (working copy)
@@ -131,13 +131,13 @@ maybe_hot_frequency_p (int freq)
return true;
}
-/* Return TRUE if frequency FREQ is considered to be hot. */
+/* Return TRUE if count COUNT is considered to be hot.  */
-static inline bool
+bool
maybe_hot_count_p (gcov_type count)
{
- if (profile_status != PROFILE_READ)
- return true;
+ if (!profile_info)
+ return false;
/* Code executed at most once is not hot. */
if (profile_info->runs >= count)
return false;
Index: common.opt
===================================================================
--- common.opt (revision 177964)
+++ common.opt (working copy)
@@ -1327,6 +1327,10 @@ finline-limit=
Common RejectNegative Joined UInteger
-finline-limit=<number> Limit the size of inlined functions to <number>
+finline-hot-caller
+Common Report Var(flag_inline_hot_caller) Init(1) Optimization
+Consider inlining cold callsites if the caller includes hot code
+
finstrument-functions
Common Report Var(flag_instrument_function_entry_exit)
Instrument function entry and exit with profiling calls
Index: basic-block.h
===================================================================
--- basic-block.h (revision 177964)
+++ basic-block.h (working copy)
@@ -744,6 +744,7 @@ extern struct edge_list *pre_edge_rev_lc
extern void compute_available (sbitmap *, sbitmap *, sbitmap *, sbitmap *);
/* In predict.c */
+extern bool maybe_hot_count_p (gcov_type);
extern bool maybe_hot_bb_p (const_basic_block);
extern bool maybe_hot_edge_p (edge);
extern bool probably_never_executed_bb_p (const_basic_block);