On Thu, 11 Apr 2019, Jan Hubicka wrote:

> Hi,
> the LTO streaming forks for every partition. With the number of
> partitions increased to 128 and relatively large memory usage (around
> 5GB) needed to WPA firefox, this causes the kernel to spend a lot of time,
> probably copying the page tables.
> 
> This patch makes the streamer fork only lto_parallelism times
> and stream num_partitions/lto_parallelism partitions in each thread.
> I have also added a parameter because currently -flto=jobserver leads
> to unlimited parallelism.  This should be fixed by connecting to Make's
> jobserver and building our own mini jobserver to distribute partitions
> between worker threads, but this seems a bit too involved for a last minute
> change in stage4.  I plan to work on this and hopefully backport it to the .2
> release.
> 
> I have tested the performance on my 32CPU 64threads box and got best
> wall time with 32 partitions and therefore I set it by default.  I get
> 
> --param max-lto-streaming-parallelism=1
> Time variable                                   usr           sys          
> wall               GGC
>  phase stream out                   :  50.65 ( 30%)  20.66 ( 61%)  71.38 ( 
> 35%)     921 kB (  0%)
>  TOTAL                              : 170.73         33.69        204.64      
>   7459610 kB
> 
> --param max-lto-streaming-parallelism=4
>  phase stream out                   :  13.79 ( 11%)   6.80 ( 35%)  20.94 ( 
> 14%)     155 kB (  0%)
>  TOTAL                              : 130.26         19.68        150.46      
>   7458844 kB
> 
> --param max-lto-streaming-parallelism=8
>  phase stream out                   :   8.94 (  7%)   5.21 ( 29%)  14.15 ( 
> 10%)      83 kB (  0%)
>  TOTAL                              : 125.28         18.09        143.54      
>   7458773 kB
> 
> --param max-lto-streaming-parallelism=16
>  phase stream out                   :   4.56 (  4%)   4.34 ( 25%)   9.46 (  
> 7%)      35 kB (  0%)
>  TOTAL                              : 122.60         17.21        140.56      
>   7458725 kB
> 
> --param max-lto-streaming-parallelism=32
>  phase stream out                   :   2.34 (  2%)   5.69 ( 31%)   8.03 (  
> 6%)      15 kB (  0%)
>  TOTAL                              : 118.53         18.36        137.08      
>   7458705 kB
> 
> --param max-lto-streaming-parallelism=64
>  phase stream out                   :   1.63 (  1%)  15.76 ( 55%)  17.40 ( 
> 12%)      13 kB (  0%)
>  TOTAL                              : 122.17         28.66        151.00      
>   7458702 kB
> 
> --param max-lto-streaming-parallelism=256
>  phase stream out                   :   1.28 (  1%)   9.24 ( 41%)  10.53 (  
> 8%)      13 kB (  0%)
>  TOTAL                              : 116.78         22.56        139.53      
>   7458702 kB
> 
> Note that it is a bit odd that 64 leads to worse results than full
> parallelism but it seems to reproduce relatively well. Also the usr/sys
> times for streaming are not representative since they do not account for the sys
> time of the forked threads. I am not sure where the fork time is
> accounted.
> 
> Generally it seems that the forking performance is not at all that
> bad and scales reasonably, but I still think we should limit the default to
> something less than the 128 we do now. Definitely there are diminishing
> returns after increasing from 16 or 32, and memory use goes up
> noticeably. With current trunk memory use also does not seem terribly
> bad (less global stream streaming makes the workers cheaper) and in all
> memory traces I collected it is dominated by the compilation stage during
> the full rebuild.
> 
> I did similar tests for cc1 binary. There the relative time spent in
> streaming is lower so it goes from 17% to 1% (for parallelism 1 and 32
> respectively)
> 
> Bootstrapped/regtested x86_64-linux, OK?

Please document the new param in invoke.texi.  Otherwise looks good
to me.  Btw, do we actually allocate garbage at write-out time?
Thus, would using threads work as well?

Thanks,
Richard.

>       * params.def (PARAM_MAX_LTO_STREAMING_PARALLELISM): New parameter.
>       * lto.c (do_stream_out): Rename to ...
>       (stream_out): ... this one; move original code to ...
>       (stream_out_partitions_1, stream_out_partitions): ... these new
>       functions.
>       (lto_wpa_write_files): Honor lto_parallelism.
> Index: params.def
> ===================================================================
> --- params.def        (revision 270143)
> +++ params.def        (working copy)
> @@ -1146,6 +1146,11 @@ DEFPARAM (MAX_PARTITION_SIZE,
>         "Maximal size of a partition for LTO (in estimated instructions).",
>         1000000, 0, INT_MAX)
>  
> +DEFPARAM (PARAM_MAX_LTO_STREAMING_PARALLELISM,
> +       "max-lto-streaming-parallelism",
> +       "maximal number of LTO partitions streamed in parallel.",
> +       32, 1, 0)
> +
>  /* Diagnostic parameters.  */
>  
>  DEFPARAM (CXX_MAX_NAMESPACES_FOR_DIAGNOSTIC_HELP,
> Index: lto/lto.c
> ===================================================================
> --- lto/lto.c (revision 270143)
> +++ lto/lto.c (working copy)
> @@ -2304,7 +2304,7 @@ static lto_file *current_lto_file;
>  /* Actually stream out ENCODER into TEMP_FILENAME.  */
>  
>  static void
> -do_stream_out (char *temp_filename, lto_symtab_encoder_t encoder, int part)
> +stream_out (char *temp_filename, lto_symtab_encoder_t encoder, int part)
>  {
>    lto_file *file = lto_obj_file_open (temp_filename, true);
>    if (!file)
> @@ -2352,19 +2352,31 @@ wait_for_child ()
>  }
>  #endif
>  
> +static void
> +stream_out_partitions_1 (char *temp_filename, int blen, int min, int max)
> +{
> +   /* Write all the nodes in SET.  */
> +   for (int p = min; p < max; p ++)
> +     {
> +       sprintf (temp_filename + blen, "%u.o", p);
> +       stream_out (temp_filename, ltrans_partitions[p]->encoder, p);
> +       ltrans_partitions[p]->encoder = NULL;
> +     }
> +}
> +
>  /* Stream out ENCODER into TEMP_FILENAME
>     Fork if that seems to help.  */
>  
>  static void
> -stream_out (char *temp_filename, lto_symtab_encoder_t encoder,
> -         bool ARG_UNUSED (last), int part)
> +stream_out_partitions (char *temp_filename, int blen, int min, int max,
> +                    bool ARG_UNUSED (last))
>  {
>  #ifdef HAVE_WORKING_FORK
>    static int nruns;
>  
>    if (lto_parallelism <= 1)
>      {
> -      do_stream_out (temp_filename, encoder, part);
> +      stream_out_partitions_1 (temp_filename, blen, min, max);
>        return;
>      }
>  
> @@ -2384,12 +2396,12 @@ stream_out (char *temp_filename, lto_sym
>        if (!cpid)
>       {
>         setproctitle ("lto1-wpa-streaming");
> -       do_stream_out (temp_filename, encoder, part);
> +          stream_out_partitions_1 (temp_filename, blen, min, max);
>         exit (0);
>       }
>        /* Fork failed; lets do the job ourseleves.  */
>        else if (cpid == -1)
> -        do_stream_out (temp_filename, encoder, part);
> +        stream_out_partitions_1 (temp_filename, blen, min, max);
>        else
>       nruns++;
>      }
> @@ -2397,13 +2409,13 @@ stream_out (char *temp_filename, lto_sym
>    else
>      {
>        int i;
> -      do_stream_out (temp_filename, encoder, part);
> +      stream_out_partitions_1 (temp_filename, blen, min, max);
>        for (i = 0; i < nruns; i++)
>       wait_for_child ();
>      }
>    asm_nodes_output = true;
>  #else
> -  do_stream_out (temp_filename, encoder, part);
> +  stream_out_partitions_1 (temp_filename, blen, min, max);
>  #endif
>  }
>  
> @@ -2445,6 +2457,13 @@ lto_wpa_write_files (void)
>    blen = strlen (temp_filename);
>  
>    n_sets = ltrans_partitions.length ();
> +  unsigned sets_per_worker = n_sets;
> +  if (lto_parallelism > 1)
> +    {
> +      if (lto_parallelism > (int)n_sets)
> +     lto_parallelism = n_sets;
> +      sets_per_worker = (n_sets + lto_parallelism - 1) / lto_parallelism;
> +    }
>  
>    for (i = 0; i < n_sets; i++)
>      {
> @@ -2493,13 +2512,17 @@ lto_wpa_write_files (void)
>       }
>        gcc_checking_assert (lto_symtab_encoder_size (part->encoder) || !i);
>  
> -      stream_out (temp_filename, part->encoder, i == n_sets - 1, i);
> -
> -      part->encoder = NULL;
> -
>        temp_priority.safe_push (part->insns);
>        temp_filenames.safe_push (xstrdup (temp_filename));
>      }
> +
> +  for (int set = 0; set < MAX (lto_parallelism, 1); set++)
> +    {
> +      stream_out_partitions (temp_filename, blen, set * sets_per_worker,
> +                          MIN ((set + 1) * sets_per_worker, n_sets),
> +                          set == MAX (lto_parallelism, 1) - 1);
> +    }
> +
>    ltrans_output_list_stream = fopen (ltrans_output_list, "w");
>    if (ltrans_output_list_stream == NULL)
>      fatal_error (input_location,
> @@ -3113,14 +3136,16 @@ do_whole_program_analysis (void)
>  
>    lto_parallelism = 1;
>  
> -  /* TODO: jobserver communicatoin is not supported, yet.  */
> +  /* TODO: jobserver communication is not supported, yet.  */
>    if (!strcmp (flag_wpa, "jobserver"))
> -    lto_parallelism = -1;
> +    lto_parallelism = PARAM_VALUE (PARAM_MAX_LTO_STREAMING_PARALLELISM);
>    else
>      {
>        lto_parallelism = atoi (flag_wpa);
>        if (lto_parallelism <= 0)
>       lto_parallelism = 0;
> +      if (lto_parallelism >= PARAM_VALUE 
> (PARAM_MAX_LTO_STREAMING_PARALLELISM))
> +     lto_parallelism = PARAM_VALUE (PARAM_MAX_LTO_STREAMING_PARALLELISM);
>      }
>  
>    timevar_start (TV_PHASE_OPT_GEN);
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Linux GmbH, Maxfeldstrasse 5, 90409 Nuernberg, Germany;
GF: Felix Imendörffer, Mary Higgins, Sri Rasiah; HRB 21284 (AG Nürnberg)

Reply via email to