From 1b33857d03375fda3cc9730daca96f5def2d285a Mon Sep 17 00:00:00 2001
From: Prachi Godbole <pgodbole@nvidia.com>
Date: Mon, 28 Jul 2025 23:14:11 -0700
Subject: [PATCH] [Patch] Address compile time issues for locality cloning pass

This patch attempts to reduce compile time for locality cloning pass by
reducing recursive calls to partition_callchain ().  This is achieved by
precomputing caller-callee information into locality_info.  locality_info
stores all callees of a node, either directly or via inlined nodes thereby
avoiding calls to partition_callchain () for inlined nodes which are already
partitioned with their inlined_to nodes.  locality_info also stores precomputed
accumulated incoming edge frequencies per unique caller and avoids repeated
computation within partition_callchain ().

Approximately 45% compile time improvement is observed for the
bootstrap-lto-locality config, which now takes only 2-5% more time than
plain bootstrap-lto.

This patch also handles appropriate memory management of pass specific data
structures.

Bootstrapped and tested on aarch64-none-linux-gnu.
Ok for mainline?

Thanks,
Prachi

Signed-off-by: Prachi Godbole <pgodbole@nvidia.com>

config/ChangeLog:

	* bootstrap-lto-locality.mk (STAGE2_CFLAGS): Add param
	lto-max-locality-partition.
	(STAGE3_CFLAGS): Ditto.
	(STAGEprofile_CFLAGS): Remove -fipa-reorder-for-locality.
	(STAGEtrain_CFLAGS): Ditto.

gcc/ChangeLog:

	* ipa-locality-cloning.cc (struct locality_info): New struct.
	(loc_infos): Ditto.
	(get_locality_info): New function.
	(populate_callee_locality_info): Ditto.
	(populate_caller_locality_info): Ditto.
	(create_locality_info): Ditto.
	(adjust_recursive_callees): Access node_to_clone by reference.
	(inline_clones): Access node_to_clone and clone_to_node by reference.
	(clone_node_as_needed): Ditto.
	(accumulate_incoming_edge_frequency): Remove function.
	(clone_node_p): New function.
	(partition_callchain): Change prototype.
	(locality_determine_ipa_order): Call create_locality_info ().
	(locality_determine_static_order): Ditto.
	(locality_partition_and_clone): Update call to partition_callchain ()
	according to its new prototype.
	(lc_execute): Allocate and free node_to_ch_info, node_to_clone,
	clone_to_node.
---
 config/bootstrap-lto-locality.mk |  10 +-
 gcc/ipa-locality-cloning.cc      | 302 +++++++++++++++++++++----------
 2 files changed, 209 insertions(+), 103 deletions(-)

diff --git a/config/bootstrap-lto-locality.mk b/config/bootstrap-lto-locality.mk
index b31565c4c52..a5103724c09 100644
--- a/config/bootstrap-lto-locality.mk
+++ b/config/bootstrap-lto-locality.mk
@@ -1,9 +1,11 @@
 # This option enables LTO and locality partitioning for stage2 and stage3 in slim mode
 
-STAGE2_CFLAGS += -flto=jobserver -frandom-seed=1 -fipa-reorder-for-locality
-STAGE3_CFLAGS += -flto=jobserver -frandom-seed=1 -fipa-reorder-for-locality
-STAGEprofile_CFLAGS += -flto=jobserver -frandom-seed=1 -fipa-reorder-for-locality
-STAGEtrain_CFLAGS += -flto=jobserver -frandom-seed=1 -fipa-reorder-for-locality
+STAGE2_CFLAGS += -flto=jobserver -frandom-seed=1 -fipa-reorder-for-locality \
+		 --param lto-max-locality-partition=500000
+STAGE3_CFLAGS += -flto=jobserver -frandom-seed=1 -fipa-reorder-for-locality \
+		 --param lto-max-locality-partition=500000
+STAGEprofile_CFLAGS += -flto=jobserver -frandom-seed=1
+STAGEtrain_CFLAGS += -flto=jobserver -frandom-seed=1
 STAGEfeedback_CFLAGS += -flto=jobserver -frandom-seed=1 -fipa-reorder-for-locality
 
 # assumes the host supports the linker plugin
diff --git a/gcc/ipa-locality-cloning.cc b/gcc/ipa-locality-cloning.cc
index 2684046bd2d..a9adb6adfdf 100644
--- a/gcc/ipa-locality-cloning.cc
+++ b/gcc/ipa-locality-cloning.cc
@@ -73,9 +73,9 @@ vec<locality_partition> locality_partitions;
 
 /* Map from original node to its latest clone.  Gets overwritten whenever a new
    clone is created from the same node.  */
-hash_map<cgraph_node *, cgraph_node *> node_to_clone;
+static hash_map<cgraph_node *, cgraph_node *> *node_to_clone;
 /* Map from clone to its original node.  */
-hash_map<cgraph_node *, cgraph_node *> clone_to_node;
+static hash_map<cgraph_node *, cgraph_node *> *clone_to_node;
 
 /* Data structure to hold static heuristics and orders for cgraph_nodes.  */
 struct locality_order
@@ -86,6 +86,92 @@ struct locality_order
   {}
 };
 
+/* Data structure to hold precomputed callchain information.  */
+struct locality_info
+{
+  cgraph_node *node;
+
+  /* Consolidated callees, including callees of inlined nodes.  */
+  hash_set<cgraph_node *> unique_callees;
+  auto_vec<cgraph_edge *> all_callees;
+
+  /* Accumulated caller->node edge frequencies for unique callers.  */
+  hash_map<cgraph_node *, sreal> caller_freq;
+};
+
+/* Pool allocation for locality_info.  */
+static object_allocator<locality_info> loc_infos ("IPA locality callchain");
+static hash_map<cgraph_node *, locality_info *> *node_to_ch_info;
+
+/* Return locality_info for NODE if present, otherwise return NULL.  */
+static inline locality_info *
+get_locality_info (cgraph_node *node)
+{
+  locality_info **ninfo = node_to_ch_info->get (node);
+  if (ninfo)
+    return *ninfo;
+  return NULL;
+}
+
+/* Populate locality_info for NODE from its direct callees and callees via
+   inlined nodes.  N is used to iterate callees of NODE and callees of inlined
+   callees of NODE.  */
+static void
+populate_callee_locality_info (cgraph_node *node, cgraph_node *n,
+			       locality_info *info)
+{
+  for (cgraph_edge *e = n->callees; e; e = e->next_callee)
+    {
+      cgraph_node *c = e->callee;
+      if (c->inlined_to == node)
+	populate_callee_locality_info (node, c, info);
+      else
+	{
+	  if (!info->unique_callees.add (c))
+	    info->all_callees.safe_push (e);
+	}
+    }
+}
+
+/* Populate locality_info for NODE from its direct callers.  */
+static void
+populate_caller_locality_info (cgraph_node *node, locality_info *info)
+{
+  struct cgraph_edge *e;
+  for (e = node->callers; e; e = e->next_caller)
+    {
+      /* Make a local decision about all edges for EDGE->caller but not the
+	 other nodes already in the partition.  Their edges will be visited
+	 later or may have been visited before and not fit the
+	 cut-off criteria.  */
+      if (auto cfreq = info->caller_freq.get (e->caller))
+	(*cfreq) = (*cfreq) + e->sreal_frequency ();
+      else
+	info->caller_freq.put (e->caller, e->sreal_frequency ());
+    }
+}
+
+/* Initialize locality_info for node V.  If CLONE_P is true, V is a locality
+   clone; populate only callee information for locality clones because caller
+   info is needed only for cloning decisions and clones are not cloned again.
+   Populate both caller and callee info for non-clone nodes.  */
+
+static inline void
+create_locality_info (cgraph_node *v, bool clone_p = false)
+{
+  locality_info **info = node_to_ch_info->get (v);
+  gcc_assert (!info);
+
+  locality_info *vinfo = loc_infos.allocate ();
+  vinfo->node = v;
+  node_to_ch_info->put (v, vinfo);
+
+  /* Locality clones are not cloned again.  */
+  if (!clone_p)
+    populate_caller_locality_info (v, vinfo);
+  populate_callee_locality_info (v, v, vinfo);
+}
+
 /* Return true if NODE is already in some partition.  */
 static inline bool
 node_partitioned_p (cgraph_node *node)
@@ -511,7 +597,7 @@ adjust_recursive_callees (cgraph_node *clone, cgraph_node *new_callee,
       cgraph_node *callee = e->callee;
       if (callee == orig_callee)
 	{
-	  cgraph_node **cl = node_to_clone.get (orig_callee);
+	  cgraph_node **cl = node_to_clone->get (orig_callee);
 	  gcc_assert (cl && *cl == new_callee);
 	  e->redirect_callee_duplicating_thunks (new_callee);
 	  if (dump_file)
@@ -571,8 +657,8 @@ inline_clones (cgraph_node *caller, cgraph_node *orig_inlined_to)
 				 "locality_clone" /*suffix*/);
       edge->redirect_callee (cl);
 
-      node_to_clone.put (callee, cl);
-      clone_to_node.put (cl, callee);
+      node_to_clone->put (callee, cl);
+      clone_to_node->put (cl, callee);
 
       if (callee->thunk)
 	{
@@ -659,11 +745,11 @@ clone_node_as_needed (cgraph_edge *edge, locality_partition partition,
      a -> b or ac -> b or ac -> bc0  */
 
   cgraph_node *orig_cnode = cnode;
-  cgraph_node **o_cnode = clone_to_node.get (cnode);
+  cgraph_node **o_cnode = clone_to_node->get (cnode);
   if (o_cnode)
     orig_cnode = *o_cnode;
 
-  cgraph_node **cnode_cl = node_to_clone.get (orig_cnode);
+  cgraph_node **cnode_cl = node_to_clone->get (orig_cnode);
 
   if (cnode_cl && node_in_partition_p (partition, *cnode_cl))
     {
@@ -714,8 +800,8 @@ clone_node_as_needed (cgraph_edge *edge, locality_partition partition,
   if (!cloned_node)
     return NULL;
 
-  node_to_clone.put (cnode, cloned_node);
-  clone_to_node.put (cloned_node, cnode);
+  node_to_clone->put (cnode, cloned_node);
+  clone_to_node->put (cloned_node, cnode);
 
   adjust_recursive_callees (cloned_node, cloned_node, cnode);
   symtab->call_cgraph_duplication_hooks (cnode, cloned_node);
@@ -727,25 +813,6 @@ clone_node_as_needed (cgraph_edge *edge, locality_partition partition,
   return cloned_node;
 }
 
-/* Accumulate frequency of all edges from EDGE->caller to EDGE->callee.  */
-
-static sreal
-accumulate_incoming_edge_frequency (cgraph_edge *edge)
-{
-  sreal count = 0;
-  struct cgraph_edge *e;
-  for (e = edge->callee->callers; e; e = e->next_caller)
-    {
-      /* Make a local decision about all edges for EDGE->caller but not the
-	 other nodes already in the partition.  Their edges will be visited
-	 later or may have been visited before and not fit the
-	 cut-off criteria.  */
-      if (e->caller == edge->caller)
-	count += e->sreal_frequency ();
-    }
-  return count;
-}
-
 /* Determine if EDGE->CALLEE is suitable for cloning.  It is assummed that the
    callee is not an inlined node.  */
 
@@ -801,94 +868,123 @@ suitable_for_locality_cloning_p (cgraph_edge *edge,
   return true;
 }
 
-/* Map from caller to all callees already visited for partitioning.  */
-hash_map<cgraph_node *, auto_vec<cgraph_node *> > caller_to_callees;
+/* Return true if edge->callee->ultimate_alias_target can be cloned.  */
+static bool
+clone_node_p (cgraph_edge *edge, lto_locality_cloning_model cloning_model,
+	      double freq_cutoff, int size)
+{
+  cgraph_node *node = edge->callee->ultimate_alias_target ();
+
+  if (!suitable_for_locality_cloning_p (edge, cloning_model))
+    return false;
+
+  if (!node->alias)
+    if (ipa_size_summaries->get (node)->size >= size)
+      return false;
+
+  if (freq_cutoff != 0.0)
+    {
+      locality_info *info = get_locality_info (node);
+      gcc_assert (info);
+      if (auto cfreq = info->caller_freq.get (edge->caller))
+	{
+	  if ((*cfreq).to_double () < freq_cutoff)
+	    return false;
+	}
+      else if (edge->sreal_frequency ().to_double () < freq_cutoff)
+	return false;
+    }
+
+  return true;
+}
 
-/* Partition EDGE->CALLEE into PARTITION or clone if already partitioned and
+/* Partition NODE's callees into PARTITION or clone if already partitioned and
    satisfies cloning criteria such as CLONING_MODEL, REAL_FREQ and SIZE
-   cut-offs and CLONE_FURTHER_P set by previous caller.  */
+   cut-offs.  */
 
 /* callgraph can have multiple caller to callee edges for multiple callsites
    For the first such edge, we make decisions about cutoffs and cloning because
    we redirect ALL callsites to cloned callee, not just one of them.  */
 
 static void
-partition_callchain (cgraph_edge *edge, locality_partition partition,
-		     bool clone_further_p,
+partition_callchain (cgraph_node *node, locality_partition &partition,
 		     lto_locality_cloning_model cloning_model,
-		     double freq_cutoff, int size, int &cl_num)
+		     double freq_cutoff, int size, int &cl_num,
+		     int &npartitions, int64_t partition_size)
 {
   /* Aliases are added in the same partition as their targets.
      Aliases are not cloned and their callees are not processed separately.  */
-  cgraph_node *node = edge->callee->ultimate_alias_target ();
-  cgraph_node *caller = edge->caller;
-  cgraph_node *caller_node = node, *cl_node = NULL;
-
-  /* Already visited the caller to callee edges.  */
-  auto_vec<cgraph_node *> &callees = caller_to_callees.get_or_insert (caller);
-  if (std::find (callees.begin (), callees.end (), node) != callees.end ())
-    return;
-
-  callees.safe_push (node);
-
-  if (node->get_partitioning_class () == SYMBOL_PARTITION)
+  cgraph_node *cl_node = NULL;
+  if (partition->insns > partition_size)
+    partition = create_partition (npartitions);
+
+  /* Iterate over all unique callees of NODE, direct callees and callees via
+     inlined nodes.  This avoids calling partition_callchain () separately for
+     inlined nodes which themselves are already partitioned along with their
+     inlined_to nodes.  */
+  locality_info *info = get_locality_info (node);
+  for (unsigned i = 0; i < info->all_callees.length (); i++)
     {
-      if (!node_partitioned_p (node))
+      cgraph_edge *e = info->all_callees[i];
+      cgraph_node *n = e->callee->ultimate_alias_target ();
+      if (n->get_partitioning_class () == SYMBOL_PARTITION)
 	{
-	  add_node_to_partition (partition, node);
-	  if (dump_file)
-	    fprintf (dump_file, "Partitioned node: %s\n",
-		     node->dump_asm_name ());
-	}
-      else if (cloning_model >= LTO_LOCALITY_NON_INTERPOSABLE_CLONING
-	       && !node_in_partition_p (partition, node))
-	{
-	  /* Non-inlined node, or alias, already partitioned
-	     If cut-off, don't clone callees but partition unpartitioned
-	     callees.
-	     size is node + inlined nodes.  */
-	  if (clone_further_p)
+	  if (!node_partitioned_p (n))
 	    {
-	      if (!node->alias)
-		if (ipa_size_summaries->get (node)->size >= size)
-		  clone_further_p = false;
-
-	      if (freq_cutoff != 0.0)
-		{
-		  sreal acc_freq = accumulate_incoming_edge_frequency (edge);
-		  if (acc_freq.to_double () < freq_cutoff)
-		    clone_further_p = false;
-		}
+	      add_node_to_partition (partition, n);
+	      if (dump_file)
+		fprintf (dump_file, "Partitioned node: %s\n",
+			 n->dump_asm_name ());
+	      partition_callchain (n, partition, cloning_model, freq_cutoff,
+				   size, cl_num, npartitions, partition_size);
 	    }
+	  else if (cloning_model >= LTO_LOCALITY_NON_INTERPOSABLE_CLONING
+		   && (!e->callee->alias)
+		   && node_in_partition_p (partition, e->caller)
+		   && (!node_in_partition_p (partition, n)))
 
-	  if (!suitable_for_locality_cloning_p (edge, cloning_model))
-	    clone_further_p = false;
-
-	  if (clone_further_p)
 	    {
-	      /* Try to clone NODE and its inline chain.  */
-	      if (dump_file)
-		fprintf (dump_file, "Cloning node: %s\n",
-			 node->dump_asm_name ());
-	      cl_node = clone_node_as_needed (edge, partition, cl_num,
-					      cloning_model);
-	      if (cl_node)
+	      /* 3 possible scenarios if N is already partitioned but not
+		 present in PARTITION:
+		 1.  There's a clone of N present in PARTITION, redirect to that
+		     clone, no need to check for suitability.
+		 2.  N itself is a locality clone, cloned as part of another
+		     callchain.  If a clone of N's original node is present in
+		     PARTITION, redirect to it without checking for suitability.
+		     Cloned node itself is not cloned again.
+		     Example: suppose N = B_clone ().
+		     In partition X, edge A->B was transformed to A->B_clone0.
+		     In current partition, A was cloned to A_clone0 and now
+		     B_clone0 is visited via edge A_clone0->B_clone0.  If a
+		     B_clonei is present, redirect A_clone0 to it, otherwise do
+		     nothing.
+		 3.  N is not a locality clone and no clone of N is present in
+		     PARTITION, check for suitability and clone.  */
+	      cgraph_node *orig_cnode = n;
+	      cgraph_node **o_cnode = clone_to_node->get (n);
+	      if (o_cnode)
+		orig_cnode = *o_cnode;
+
+	      cgraph_node **cnode_cl = node_to_clone->get (orig_cnode);
+
+	      if ((cnode_cl && node_in_partition_p (partition, *cnode_cl))
+		  || (orig_cnode == n
+		      && clone_node_p (e, cloning_model, freq_cutoff, size)))
 		{
-		  add_node_to_partition (partition, cl_node);
-		  caller_node = cl_node;
+		  cl_node = clone_node_as_needed (e, partition, cl_num,
+						  cloning_model);
+		  if (cl_node)
+		    {
+		      create_locality_info (cl_node, true);
+		      add_node_to_partition (partition, cl_node);
+		      partition_callchain (cl_node, partition, cloning_model,
+					   freq_cutoff, size, cl_num,
+					   npartitions, partition_size);
+		    }
 		}
-	      else
-		caller_node = NULL;
 	    }
 	}
     }
-  else if (!node->inlined_to)
-    return;
-
-  if (caller_node)
-    for (cgraph_edge *e = caller_node->callees; e; e = e->next_callee)
-      partition_callchain (e, partition, clone_further_p, cloning_model,
-			   freq_cutoff, size, cl_num);
 }
 
 /* Determine whether NODE is an entrypoint to a callchain.  */
@@ -925,6 +1021,7 @@ locality_determine_ipa_order (auto_vec<locality_order *> *order)
   FOR_EACH_DEFINED_FUNCTION (node)
     if (node->get_partitioning_class () == SYMBOL_PARTITION)
       {
+	create_locality_info (node);
 	if (node->no_reorder)
 	  {
 	    if (dump_file)
@@ -971,6 +1068,7 @@ locality_determine_static_order (auto_vec<locality_order *> *order)
   FOR_EACH_DEFINED_FUNCTION (node)
     if (node->get_partitioning_class () == SYMBOL_PARTITION)
       {
+	create_locality_info (node);
 	if (node->no_reorder)
 	  {
 	    if (dump_file)
@@ -1057,17 +1155,15 @@ locality_partition_and_clone (int max_locality_partition_size,
       if (dump_file)
 	fprintf (dump_file, "Ordered Node: %s\n", node->dump_asm_name ());
 
-      for (cgraph_edge *edge = node->callees; edge; edge = edge->next_callee)
-	{
-	  /* Recursively partition the callchain of edge->callee.  */
-	  partition_callchain (edge, partition, true, cloning_model, real_freq,
-			       size, cl_num);
-	}
+      partition_callchain (node, partition, cloning_model, real_freq, size,
+			   cl_num, npartitions, partition_size);
     }
 
   for (unsigned i = 0; i < order.length (); i++)
     delete order[i];
   order = vNULL;
+
+  loc_infos.release ();
 }
 
 /* Entry point to locality-clone pass.  */
@@ -1078,11 +1174,19 @@ lc_execute (void)
   FOR_EACH_SYMBOL (node)
     node->aux = NULL;
 
+  node_to_ch_info = new hash_map<cgraph_node *, locality_info *>;
+  node_to_clone = new hash_map<cgraph_node *, cgraph_node *>;
+  clone_to_node = new hash_map<cgraph_node *, cgraph_node *>;
+
   locality_partition_and_clone (param_max_locality_partition_size,
 				flag_lto_locality_cloning,
 				param_lto_locality_frequency,
 				param_lto_locality_size);
 
+  delete node_to_ch_info;
+  delete node_to_clone;
+  delete clone_to_node;
+
   FOR_EACH_SYMBOL (node)
     node->aux = NULL;
   return 0;
-- 
2.34.1

