On Fri, 30 Aug 2013, Jan Hubicka wrote:

> Hi,
> this patch makes COMDAT profiles right with LTO. (made jointly with Martin 
> Liska)
> 
> To recap the issue:  COMDAT functions are produced many times. Each copy gets
> its own set of counters and depending on inlining and linker decision, one 
> or more of copies of a given COMDAT function will get executed on runtime.
> After reading profile in, the profiles can be wrong when inlining/linker
> decision differs at -fprofile-use stage.
> 
> This has nasty effect of optimizing important stuff for size.
> 
> Now with LTO the problem is solved, since early inlining things works 
> magically
> well.  Catch is that for large projects, we tend to explode with
> -fprofile-generate -flto and we explicitely ask users to not use -flto when
> generating profile.  This brings the problem back.
> 
> This patch makes profiles of multiple copies of given comdat to be merged 
> during
> LTO symtab merging.  This is done in the busy way: both functions are read 
> into
> memory, compared if CFGs match, profiles are merged, cgraph profile is updated
> based on CFG profile and one of the bodies is released from memory.  The other
> body will then be streamed by WPA as if the function was born during WPA time.
> 
> This is not terribly cheap by design (we load COMDATs into WPA memory), but
> reading happens only when COMDAT gets merged and more than one of them has
> non-0 count from profile feedback. Moreover this whole path is executed only
> when -fno-lto is used for -fprofile-generate.
> 
> The compile time/memory impact seems under 1% both on GCC and firefox.  For 
> GCC
> profiledbootstrap we merge 1600 functions, mostly vector accestors and tree
> checking (I tried checking enabled build). 
> 
> To make things even ugier, there is nasty special case where we already merged
> function declarations, but we absolutely need to read two different bodies of
> the same function.  I did so by copying the declaration of function I am going
> to remove.  This provoke checking failure on labels that are in global stream
> and thus have wrong context pointers in one of the bodies.  This is harmless
> since we are going to throw it away, but it requires special case silencing of
> the sanity check for LTO stream in.  We may want to avoid streaming in gimple
> statements completely, but we need to merge statement histograms so that will
> require a bit of massaging of the on-disk format to make this possible.
> (histograms are currently stored into statement section that needs to be
> changed). I plan to look into this incrementally.
> 
> Now for longer term, we want to make function CFGs independent of gimple body
> and we want to decide on instrumentation at linktime, so we won't require user
> to rebuild with -fprofile-use, just relink.  I plan to work on this, but 
> not for 4.9.  Thus I hope we can go with this fix - the COMDAT profile loss
> issue has proved itself to be very difficult to deal with and very common
> for C++ programs.
> 
> Bootstrapped/regtested x86_64-liux, profiledbootstrapped and tested
> on firefox.
> 
> If there will be no real opposition, I will go ahead with this patch during
> weekend, so it is picked by periodic testers.
> 
> Honza
> 
>       * lto-symtab.c (lto_cgraph_replace_node): Merge function profiles.
>       * cgraph.c (cgraph_get_body): Update call of input_function_body.
>       * ipa-utils.c: Include lto-streamer.h and ipa-inline.h
>       (ipa_merge_profiles): New function.
>       * ipa-utils.h (ipa_merge_profiles): Declare.
>       * lto-streamer-in.c (lto_input_function_body): Change to use
>       cgraph_node as parameter.
>       (lto_read_body): Take cgraph node as parameter.
> Index: lto-symtab.c
> ===================================================================
> --- lto-symtab.c      (revision 202099)
> +++ lto-symtab.c      (working copy)
> @@ -80,6 +82,7 @@
>    /* Redirect incomming references.  */
>    ipa_clone_referring ((symtab_node)prevailing_node, &node->symbol.ref_list);
>  
> +  ipa_merge_profiles (prevailing_node, node);
>    lto_free_function_in_decl_state_for_node ((symtab_node)node);
>  
>    if (node->symbol.decl != prevailing_node->symbol.decl)
> Index: cgraph.c
> ===================================================================
> --- cgraph.c  (revision 202099)
> +++ cgraph.c  (working copy)
> @@ -3109,7 +3109,7 @@
>  
>    gcc_assert (DECL_STRUCT_FUNCTION (decl) == NULL);
>  
> -  lto_input_function_body (file_data, node->symbol.decl, data);
> +  lto_input_function_body (file_data, node, data);
>    lto_stats.num_function_bodies++;
>    lto_free_section_data (file_data, LTO_section_function_body, name,
>                        data, len);
> Index: ipa-utils.c
> ===================================================================
> --- ipa-utils.c       (revision 202099)
> +++ ipa-utils.c       (working copy)
> @@ -37,6 +37,8 @@
>  #include "flags.h"
>  #include "diagnostic.h"
>  #include "langhooks.h"
> +#include "lto-streamer.h"
> +#include "ipa-inline.h"
>  
>  /* Debugging function for postorder and inorder code. NOTE is a string
>     that is printed before the nodes are printed.  ORDER is an array of
> @@ -618,3 +620,174 @@
>  {
>    dump_varpool_node_set (stderr, set);
>  }
> +
> +
> +/* SRC and DST are going to be merged.  Take SRC's profile and merge it into
> +   DST so it is not going to be lost.  Destroy SRC's body on the way.  */
> +
> +void
> +ipa_merge_profiles (struct cgraph_node *dst,
> +                 struct cgraph_node *src)
> +{
> +  tree oldsrcdecl = src->symbol.decl;
> +  struct function *srccfun, *dstcfun;
> +  bool match = true;
> +
> +  if (!src->symbol.definition
> +      || !dst->symbol.definition)
> +    return;
> +  if (src->frequency < dst->frequency)
> +    src->frequency = dst->frequency;
> +  if (!dst->count)
> +    return;
> +  if (cgraph_dump_file)
> +    {
> +      fprintf (cgraph_dump_file, "Merging profiles of %s/%i to %s/%i\n",
> +            xstrdup (cgraph_node_name (src)), src->symbol.order,
> +            xstrdup (cgraph_node_name (dst)), dst->symbol.order);
> +    }
> +  dst->count += src->count;
> +
> +  /* This is ugly.  We need to get both function bodies into memory.
> +     If declaration is merged, we need to duplicate it to be able
> +     to load body that is being replaced.  This makes symbol table
> +     temporarily inconsistent.  */
> +  if (src->symbol.decl == dst->symbol.decl)
> +    {
> +      void **slot;
> +      struct lto_in_decl_state temp;
> +      struct lto_in_decl_state *state;
> +
> +      /* We are going to move the decl, we want to remove its file decl data.
> +      and link these with the new decl. */
> +      temp.fn_decl = src->symbol.decl;
> +      slot = htab_find_slot (src->symbol.lto_file_data->function_decl_states,
> +                          &temp, NO_INSERT);
> +      state = (lto_in_decl_state *)*slot;
> +      htab_clear_slot (src->symbol.lto_file_data->function_decl_states, 
> slot);
> +      gcc_assert (state);
> +
> +      /* Duplicate the decl and be sure it does not link into body of DST.  
> */
> +      src->symbol.decl = copy_node (src->symbol.decl);
> +      DECL_STRUCT_FUNCTION (src->symbol.decl) = NULL;
> +      DECL_ARGUMENTS (src->symbol.decl) = NULL;
> +      DECL_INITIAL (src->symbol.decl) = NULL;
> +      DECL_RESULT (src->symbol.decl) = NULL;
> +
> +      /* Associate the decl state with new declaration, so LTO streamer
> +      can look it up.  */
> +      state->fn_decl = src->symbol.decl;
> +      slot = htab_find_slot (src->symbol.lto_file_data->function_decl_states,
> +                          state, INSERT);
> +      gcc_assert (!*slot);
> +      *slot = state;
> +    }
> +  cgraph_get_body (src);
> +  cgraph_get_body (dst);
> +  srccfun = DECL_STRUCT_FUNCTION (src->symbol.decl);
> +  dstcfun = DECL_STRUCT_FUNCTION (dst->symbol.decl);
> +  if (n_basic_blocks_for_function (srccfun)
> +      != n_basic_blocks_for_function (dstcfun))
> +    {
> +      if (cgraph_dump_file)
> +     fprintf (cgraph_dump_file,
> +              "Giving up; number of basic block mismatch.\n");
> +      match = false;
> +    }
> +  else if (last_basic_block_for_function (srccfun)
> +        != last_basic_block_for_function (dstcfun))
> +    {
> +      if (cgraph_dump_file)
> +     fprintf (cgraph_dump_file,
> +              "Giving up; last block mismatch.\n");
> +      match = false;
> +    }
> +  else 
> +    {
> +      basic_block srcbb, dstbb;
> +
> +      FOR_ALL_BB_FN (srcbb, srccfun)
> +     {
> +       unsigned int i;
> +
> +       dstbb = BASIC_BLOCK_FOR_FUNCTION (dstcfun, srcbb->index);
> +       if (dstbb == NULL)
> +         {
> +           if (cgraph_dump_file)
> +             fprintf (cgraph_dump_file,
> +                      "No matching block for bb %i.\n",
> +                      srcbb->index);
> +           match = false;
> +           break;
> +         }
> +       if (EDGE_COUNT (srcbb->succs) != EDGE_COUNT (dstbb->succs))
> +         {
> +           if (cgraph_dump_file)
> +             fprintf (cgraph_dump_file,
> +                      "Edge count mistmatch for bb %i.\n",
> +                      srcbb->index);
> +           match = false;
> +           break;
> +         }
> +       for (i = 0; i < EDGE_COUNT (srcbb->succs); i++)
> +         {
> +           edge srce = EDGE_SUCC (srcbb, i);
> +           edge dste = EDGE_SUCC (dstbb, i);
> +           if (srce->dest->index != dste->dest->index)
> +             {
> +               if (cgraph_dump_file)
> +                 fprintf (cgraph_dump_file,
> +                          "Succ edge mistmatch for bb %i.\n",
> +                          srce->dest->index);
> +               match = false;
> +               break;
> +             }
> +         }
> +     }
> +    }
> +  if (match)
> +    {
> +      struct cgraph_edge *e;
> +      basic_block srcbb, dstbb;
> +
> +      /* TODO: merge also statement histograms.  */
> +      FOR_ALL_BB_FN (srcbb, srccfun)
> +     {
> +       unsigned int i;
> +
> +       dstbb = BASIC_BLOCK_FOR_FUNCTION (dstcfun, srcbb->index);
> +       dstbb->count += srcbb->count;
> +       for (i = 0; i < EDGE_COUNT (srcbb->succs); i++)
> +         {
> +           edge srce = EDGE_SUCC (srcbb, i);
> +           edge dste = EDGE_SUCC (dstbb, i);
> +           dste->count += srce->count;
> +         }
> +     }
> +      push_cfun (dstcfun);
> +      counts_to_freqs ();
> +      compute_function_frequency ();
> +      pop_cfun ();
> +      for (e = dst->callees; e; e = e->next_callee)
> +     {
> +       gcc_assert (!e->speculative);
> +       e->count = gimple_bb (e->call_stmt)->count;
> +       e->frequency = compute_call_stmt_bb_frequency
> +                          (dst->symbol.decl,
> +                           gimple_bb (e->call_stmt));
> +     }
> +      for (e = dst->indirect_calls; e; e = e->next_callee)
> +     {
> +       gcc_assert (!e->speculative);
> +       e->count = gimple_bb (e->call_stmt)->count;
> +       e->frequency = compute_call_stmt_bb_frequency
> +                          (dst->symbol.decl,
> +                           gimple_bb (e->call_stmt));
> +     }
> +      cgraph_release_function_body (src);
> +      inline_update_overall_summary (dst);
> +    }
> +  /* TODO: if there is no match, we can scale up.  */
> +  src->symbol.decl = oldsrcdecl;
> +}
> +
> Index: ipa-utils.h
> ===================================================================
> --- ipa-utils.h       (revision 202099)
> +++ ipa-utils.h       (working copy)
> @@ -44,6 +44,8 @@
>  vec<cgraph_node_ptr> ipa_get_nodes_in_cycle (struct cgraph_node *);
>  int ipa_reverse_postorder (struct cgraph_node **);
>  tree get_base_var (tree);
> +void ipa_merge_profiles (struct cgraph_node *dst,
> +                      struct cgraph_node *src);
>  
>  /* In ipa-devirt.c  */
>  
> Index: gimple-streamer-in.c
> ===================================================================
> --- gimple-streamer-in.c      (revision 202099)
> +++ gimple-streamer-in.c      (working copy)
> @@ -284,7 +284,11 @@
>      }
>    else if (code == GIMPLE_LABEL)
>      gcc_assert (emit_label_in_global_context_p (gimple_label_label (stmt))
> -             || DECL_CONTEXT (gimple_label_label (stmt)) == fn->decl);
> +             || DECL_CONTEXT (gimple_label_label (stmt)) == fn->decl
> +             /* Do not sanity check before decl merging is completed.
> +                This is needed for profile merging during symtab
> +                resolution.  */
> +             || cgraph_state == CGRAPH_LTO_STREAMING);

Please instead remove this assert and put the checking into
tree-cfg.c:verify_gimple_label where it should need no special-casing
on cgraph_state.

Otherwise the non-profile bits look ok.

Thanks,
Richard.

>    else if (code == GIMPLE_ASM)
>      {
>        unsigned i;
> Index: lto-streamer.h
> ===================================================================
> --- lto-streamer.h    (revision 202099)
> +++ lto-streamer.h    (working copy)
> @@ -834,7 +834,8 @@
>  /* In lto-streamer-in.c */
>  extern void lto_input_cgraph (struct lto_file_decl_data *, const char *);
>  extern void lto_reader_init (void);
> -extern void lto_input_function_body (struct lto_file_decl_data *, tree,
> +extern void lto_input_function_body (struct lto_file_decl_data *,
> +                                  struct cgraph_node *,
>                                    const char *);
>  extern void lto_input_constructors_and_inits (struct lto_file_decl_data *,
>                                             const char *);
> Index: lto-streamer-in.c
> ===================================================================
> --- lto-streamer-in.c (revision 202099)
> +++ lto-streamer-in.c (working copy)
> @@ -1001,14 +1064,14 @@
>  }
>  
>  
> -/* Read the body from DATA for function FN_DECL and fill it in.
> +/* Read the body from DATA for function NODE and fill it in.
>     FILE_DATA are the global decls and types.  SECTION_TYPE is either
>     LTO_section_function_body or LTO_section_static_initializer.  If
>     section type is LTO_section_function_body, FN must be the decl for
>     that function.  */
>  
>  static void
> -lto_read_body (struct lto_file_decl_data *file_data, tree fn_decl,
> +lto_read_body (struct lto_file_decl_data *file_data, struct cgraph_node 
> *node,
>              const char *data, enum lto_section_type section_type)
>  {
>    const struct lto_function_header *header;
> @@ -1018,6 +1081,7 @@
>    int string_offset;
>    struct lto_input_block ib_cfg;
>    struct lto_input_block ib_main;
> +  tree fn_decl = node->symbol.decl;
>  
>    header = (const struct lto_function_header *) data;
>    cfg_offset = sizeof (struct lto_function_header);
> @@ -1044,7 +1108,6 @@
>    if (section_type == LTO_section_function_body)
>      {
>        struct lto_in_decl_state *decl_state;
> -      struct cgraph_node *node = cgraph_get_node (fn_decl);
>        unsigned from;
>  
>        gcc_checking_assert (node);
> @@ -1094,14 +1157,14 @@
>  }
>  
>  
> -/* Read the body of FN_DECL using DATA.  FILE_DATA holds the global
> +/* Read the body of NODE using DATA.  FILE_DATA holds the global
>     decls and types.  */
>  
>  void
>  lto_input_function_body (struct lto_file_decl_data *file_data,
> -                      tree fn_decl, const char *data)
> +                      struct cgraph_node *node, const char *data)
>  {
> -  lto_read_body (file_data, fn_decl, data, LTO_section_function_body);
> +  lto_read_body (file_data, node, data, LTO_section_function_body);
>  }
>  
>  
> 
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imend

Reply via email to