Hi, the LTO streaming forks for every partition. With the number of partitions increased to 128 and the relatively large memory usage (around 5GB) needed to WPA Firefox, this causes the kernel to spend a lot of time, probably copying the page tables.
This patch makes the streamer fork only lto_parallelism times and stream num_partitions/lto_parallelism partitions in each thread. I have also added a parameter, because currently -flto=jobserver leads to unlimited parallelism. This should be fixed by connecting to Make's jobserver and building our own mini jobserver to distribute partitions between worker threads, but this seems a bit too involved for a last-minute change in stage4. I plan to work on this and hopefully backport it to the .2 release. I have tested the performance on my 32-CPU, 64-thread box and got the best wall time with 32 partitions streamed in parallel, and therefore I set it as the default. I get:

--param max-lto-streaming-parallelism=1
Time variable usr sys wall GGC
phase stream out : 50.65 ( 30%) 20.66 ( 61%) 71.38 ( 35%) 921 kB ( 0%)
TOTAL : 170.73 33.69 204.64 7459610 kB

--param max-lto-streaming-parallelism=4
phase stream out : 13.79 ( 11%) 6.80 ( 35%) 20.94 ( 14%) 155 kB ( 0%)
TOTAL : 130.26 19.68 150.46 7458844 kB

--param max-lto-streaming-parallelism=8
phase stream out : 8.94 ( 7%) 5.21 ( 29%) 14.15 ( 10%) 83 kB ( 0%)
TOTAL : 125.28 18.09 143.54 7458773 kB

--param max-lto-streaming-parallelism=16
phase stream out : 4.56 ( 4%) 4.34 ( 25%) 9.46 ( 7%) 35 kB ( 0%)
TOTAL : 122.60 17.21 140.56 7458725 kB

--param max-lto-streaming-parallelism=32
phase stream out : 2.34 ( 2%) 5.69 ( 31%) 8.03 ( 6%) 15 kB ( 0%)
TOTAL : 118.53 18.36 137.08 7458705 kB

--param max-lto-streaming-parallelism=64
phase stream out : 1.63 ( 1%) 15.76 ( 55%) 17.40 ( 12%) 13 kB ( 0%)
TOTAL : 122.17 28.66 151.00 7458702 kB

--param max-lto-streaming-parallelism=256
phase stream out : 1.28 ( 1%) 9.24 ( 41%) 10.53 ( 8%) 13 kB ( 0%)
TOTAL : 116.78 22.56 139.53 7458702 kB

Note that it is a bit odd that 64 leads to worse results than full parallelism, but it seems to reproduce relatively well. Also, the usr/sys times for streaming are not representative since they do not account for the sys time of the forked threads. I am not sure where the fork time is accounted.
Generally it seems that the forking performance is not all that bad and scales reasonably, but I still think we should limit the default to something less than the 128 we use now. There are definitely diminishing returns after increasing beyond 16 or 32, and memory use goes up noticeably. With current trunk, memory use also does not seem terribly bad (less global stream streaming makes the workers cheaper), and in all memory traces I collected it is dominated by the compilation stage during the full rebuild. I did similar tests for the cc1 binary. There the relative time spent in streaming is lower, so it goes from 17% to 1% (for parallelism 1 and 32 respectively).

Bootstrapped/regtested x86_64-linux, OK?

* params.def (PARAM_MAX_LTO_STREAMING_PARALLELISM): New parameter.
* lto.c (do_stream_out): rename to ...
(stream_out): ... this one; move original code to ...
(stream_out_partitions_1, stream_out_partitions): ... these new functions.
(lto_wpa_write_files): Honor lto_parallelism.

Index: params.def =================================================================== --- params.def (revision 270143) +++ params.def (working copy) @@ -1146,6 +1146,11 @@ DEFPARAM (MAX_PARTITION_SIZE, "Maximal size of a partition for LTO (in estimated instructions).", 1000000, 0, INT_MAX) +DEFPARAM (PARAM_MAX_LTO_STREAMING_PARALLELISM, + "max-lto-streaming-parallelism", + "maximal number of LTO partitions streamed in parallel.", + 32, 1, 0) + /* Diagnostic parameters. */ DEFPARAM (CXX_MAX_NAMESPACES_FOR_DIAGNOSTIC_HELP, Index: lto/lto.c =================================================================== --- lto/lto.c (revision 270143) +++ lto/lto.c (working copy) @@ -2304,7 +2304,7 @@ static lto_file *current_lto_file; /* Actually stream out ENCODER into TEMP_FILENAME. 
*/ static void -do_stream_out (char *temp_filename, lto_symtab_encoder_t encoder, int part) +stream_out (char *temp_filename, lto_symtab_encoder_t encoder, int part) { lto_file *file = lto_obj_file_open (temp_filename, true); if (!file) @@ -2352,19 +2352,31 @@ wait_for_child () } #endif +static void +stream_out_partitions_1 (char *temp_filename, int blen, int min, int max) +{ + /* Write all the nodes in SET. */ + for (int p = min; p < max; p ++) + { + sprintf (temp_filename + blen, "%u.o", p); + stream_out (temp_filename, ltrans_partitions[p]->encoder, p); + ltrans_partitions[p]->encoder = NULL; + } +} + /* Stream out ENCODER into TEMP_FILENAME Fork if that seems to help. */ static void -stream_out (char *temp_filename, lto_symtab_encoder_t encoder, - bool ARG_UNUSED (last), int part) +stream_out_partitions (char *temp_filename, int blen, int min, int max, + bool ARG_UNUSED (last)) { #ifdef HAVE_WORKING_FORK static int nruns; if (lto_parallelism <= 1) { - do_stream_out (temp_filename, encoder, part); + stream_out_partitions_1 (temp_filename, blen, min, max); return; } @@ -2384,12 +2396,12 @@ stream_out (char *temp_filename, lto_sym if (!cpid) { setproctitle ("lto1-wpa-streaming"); - do_stream_out (temp_filename, encoder, part); + stream_out_partitions_1 (temp_filename, blen, min, max); exit (0); } /* Fork failed; lets do the job ourseleves. 
*/ else if (cpid == -1) - do_stream_out (temp_filename, encoder, part); + stream_out_partitions_1 (temp_filename, blen, min, max); else nruns++; } @@ -2397,13 +2409,13 @@ stream_out (char *temp_filename, lto_sym else { int i; - do_stream_out (temp_filename, encoder, part); + stream_out_partitions_1 (temp_filename, blen, min, max); for (i = 0; i < nruns; i++) wait_for_child (); } asm_nodes_output = true; #else - do_stream_out (temp_filename, encoder, part); + stream_out_partitions_1 (temp_filename, blen, min, max); #endif } @@ -2445,6 +2457,13 @@ lto_wpa_write_files (void) blen = strlen (temp_filename); n_sets = ltrans_partitions.length (); + unsigned sets_per_worker = n_sets; + if (lto_parallelism > 1) + { + if (lto_parallelism > (int)n_sets) + lto_parallelism = n_sets; + sets_per_worker = (n_sets + lto_parallelism - 1) / lto_parallelism; + } for (i = 0; i < n_sets; i++) { @@ -2493,13 +2512,17 @@ lto_wpa_write_files (void) } gcc_checking_assert (lto_symtab_encoder_size (part->encoder) || !i); - stream_out (temp_filename, part->encoder, i == n_sets - 1, i); - - part->encoder = NULL; - temp_priority.safe_push (part->insns); temp_filenames.safe_push (xstrdup (temp_filename)); } + + for (int set = 0; set < MAX (lto_parallelism, 1); set++) + { + stream_out_partitions (temp_filename, blen, set * sets_per_worker, + MIN ((set + 1) * sets_per_worker, n_sets), + set == MAX (lto_parallelism, 1) - 1); + } + ltrans_output_list_stream = fopen (ltrans_output_list, "w"); if (ltrans_output_list_stream == NULL) fatal_error (input_location, @@ -3113,14 +3136,16 @@ do_whole_program_analysis (void) lto_parallelism = 1; - /* TODO: jobserver communicatoin is not supported, yet. */ + /* TODO: jobserver communication is not supported, yet. 
*/ if (!strcmp (flag_wpa, "jobserver")) - lto_parallelism = -1; + lto_parallelism = PARAM_VALUE (PARAM_MAX_LTO_STREAMING_PARALLELISM); else { lto_parallelism = atoi (flag_wpa); if (lto_parallelism <= 0) lto_parallelism = 0; + if (lto_parallelism >= PARAM_VALUE (PARAM_MAX_LTO_STREAMING_PARALLELISM)) + lto_parallelism = PARAM_VALUE (PARAM_MAX_LTO_STREAMING_PARALLELISM); } timevar_start (TV_PHASE_OPT_GEN);