Hi Vlad, Thanks for the testing and the logs. You must have good hardware; your timings are all ~3 times faster than mine :-)
On Sat, Sep 29, 2012 at 3:01 AM, Vladimir Makarov <vmaka...@redhat.com> wrote: > ----------------------------------32-bit------------------------------------ > Reload: > 581.85user 29.91system 27:15.18elapsed 37%CPU (0avgtext+0avgdata > LRA: > 629.67user 24.16system 24:31.08elapsed 44%CPU (0avgtext+0avgdata This is a ~8% slowdown. > ----------------------------------64-bit:----------------------------------- > Reload: > 503.26user 36.54system 30:16.62elapsed 29%CPU (0avgtext+0avgdata > LRA: > 598.70user 30.90system 27:26.92elapsed 38%CPU (0avgtext+0avgdata This is a ~19% slowdown. > Here is the numbers for PR54146 on the same machine with -O1 only for > 64-bit (compiler reports error for -m32). Right, the test case is for 64-bit only; I think it's preprocessed code for AMD64. > Reload: > 350.40user 21.59system 17:09.75elapsed 36%CPU (0avgtext+0avgdata > LRA: > 468.29user 21.35system 15:47.76elapsed 51%CPU (0avgtext+0avgdata This is a ~34% slowdown. To put it in another perspective, here are my timings of trunk vs lra (both checkouts done today): trunk: integrated RA : 181.68 (24%) usr 1.68 (11%) sys 183.43 (24%) wall 643564 kB (20%) ggc reload : 11.00 ( 1%) usr 0.18 ( 1%) sys 11.17 ( 1%) wall 32394 kB ( 1%) ggc TOTAL : 741.64 14.76 756.41 3216164 kB lra branch: integrated RA : 174.65 (16%) usr 1.33 ( 8%) sys 176.33 (16%) wall 643560 kB (20%) ggc reload : 399.69 (36%) usr 2.48 (15%) sys 402.69 (36%) wall 41852 kB ( 1%) ggc TOTAL :1102.06 16.05 1120.83 3231738 kB That's a 49% slowdown. The difference is completely accounted for by the timing difference between reload and LRA. (Timings done on gcc17, which is AMD Opteron(tm) Processor 8354 with 15GB ram, so swapping is no issue.) It looks like the reload timevar is used for LRA. Why not have multiple timevars, one per phase of LRA? Something like the patch below would be nice. 
This gives me the following timings: integrated RA : 189.34 (16%) usr 1.84 (11%) sys 191.18 (16%) wall 643560 kB (20%) ggc LRA non-specific : 59.82 ( 5%) usr 0.22 ( 1%) sys 60.12 ( 5%) wall 18202 kB ( 1%) ggc LRA virtuals eliminatenon: 56.79 ( 5%) usr 0.03 ( 0%) sys 56.80 ( 5%) wall 19223 kB ( 1%) ggc LRA reload inheritance : 6.41 ( 1%) usr 0.01 ( 0%) sys 6.42 ( 1%) wall 1665 kB ( 0%) ggc LRA create live ranges : 175.30 (15%) usr 2.14 (13%) sys 177.44 (15%) wall 2761 kB ( 0%) ggc LRA hard reg assignment : 130.85 (11%) usr 0.20 ( 1%) sys 131.17 (11%) wall 0 kB ( 0%) ggc LRA coalesce pseudo regs: 2.54 ( 0%) usr 0.00 ( 0%) sys 2.55 ( 0%) wall 0 kB ( 0%) ggc reload : 6.73 ( 1%) usr 0.20 ( 1%) sys 6.92 ( 1%) wall 0 kB ( 0%) ggc so the LRA "slowness" (for lack of a better word) appears to be due to scalability problems in all sub-passes. The code size changes are impressive, but I think that this kind of slowdown should be addressed before making LRA the default for any target. Ciao! Steven Index: lra-assigns.c =================================================================== --- lra-assigns.c (revision 191858) +++ lra-assigns.c (working copy) @@ -1261,6 +1261,8 @@ lra_assign (void) bitmap_head insns_to_process; bool no_spills_p; + timevar_push (TV_LRA_ASSIGN); + init_lives (); sorted_pseudos = (int *) xmalloc (sizeof (int) * max_reg_num ()); sorted_reload_pseudos = (int *) xmalloc (sizeof (int) * max_reg_num ()); @@ -1312,5 +1314,6 @@ lra_assign (void) free (sorted_pseudos); free (sorted_reload_pseudos); finish_lives (); + timevar_pop (TV_LRA_ASSIGN); return no_spills_p; } Index: lra.c =================================================================== --- lra.c (revision 191858) +++ lra.c (working copy) @@ -2193,6 +2193,7 @@ lra (FILE *f) lra_dump_file = f; + timevar_push (TV_LRA); init_insn_recog_data (); @@ -2271,6 +2272,7 @@ lra (FILE *f) to use a constant pool. */ lra_eliminate (false); lra_inheritance (); + /* We need live ranges for lra_assign -- so build them. 
*/ lra_create_live_ranges (true); live_p = true; @@ -2343,6 +2345,8 @@ lra (FILE *f) #ifdef ENABLE_CHECKING check_rtl (true); #endif + + timevar_pop (TV_LRA); } /* Called once per compiler to initialize LRA data once. */ Index: lra-eliminations.c =================================================================== --- lra-eliminations.c (revision 191858) +++ lra-eliminations.c (working copy) @@ -1297,6 +1297,8 @@ lra_eliminate (bool final_p) struct elim_table *ep; int regs_num = max_reg_num (); + timevar_push (TV_LRA_ELIMINATE); + bitmap_initialize (&insns_with_changed_offsets, &reg_obstack); if (final_p) { @@ -1317,7 +1319,7 @@ lra_eliminate (bool final_p) { update_reg_eliminate (&insns_with_changed_offsets); if (bitmap_empty_p (&insns_with_changed_offsets)) - return; + goto lra_eliminate_done; } if (lra_dump_file != NULL) { @@ -1349,4 +1351,7 @@ lra_eliminate (bool final_p) process_insn_for_elimination (insn, final_p); } bitmap_clear (&insns_with_changed_offsets); + +lra_eliminate_done: + timevar_pop (TV_LRA_ELIMINATE); } Index: lra-lives.c =================================================================== --- lra-lives.c (revision 191858) +++ lra-lives.c (working copy) @@ -962,6 +962,8 @@ lra_create_live_ranges (bool all_p) basic_block bb; int i, hard_regno, max_regno = max_reg_num (); + timevar_push (TV_LRA_CREATE_LIVE_RANGES); + complete_info_p = all_p; if (lra_dump_file != NULL) fprintf (lra_dump_file, @@ -1016,6 +1018,7 @@ lra_create_live_ranges (bool all_p) sparseset_free (pseudos_live_through_setjumps); sparseset_free (pseudos_live); compress_live_ranges (); + timevar_pop (TV_LRA_CREATE_LIVE_RANGES); } /* Finish all live ranges. 
*/ Index: timevar.def =================================================================== --- timevar.def (revision 191858) +++ timevar.def (working copy) @@ -223,10 +223,16 @@ DEFTIMEVAR (TV_REGMOVE , " DEFTIMEVAR (TV_MODE_SWITCH , "mode switching") DEFTIMEVAR (TV_SMS , "sms modulo scheduling") DEFTIMEVAR (TV_SCHED , "scheduling") -DEFTIMEVAR (TV_IRA , "integrated RA") -DEFTIMEVAR (TV_RELOAD , "reload") +DEFTIMEVAR (TV_IRA , "integrated RA") +DEFTIMEVAR (TV_LRA , "LRA non-specific") +DEFTIMEVAR (TV_LRA_ELIMINATE , "LRA virtuals eliminatenon") +DEFTIMEVAR (TV_LRA_INHERITANCE , "LRA reload inheritance") +DEFTIMEVAR (TV_LRA_CREATE_LIVE_RANGES, "LRA create live ranges") +DEFTIMEVAR (TV_LRA_ASSIGN , "LRA hard reg assignment") +DEFTIMEVAR (TV_LRA_COALESCE , "LRA coalesce pseudo regs") +DEFTIMEVAR (TV_RELOAD , "reload") DEFTIMEVAR (TV_RELOAD_CSE_REGS , "reload CSE regs") -DEFTIMEVAR (TV_GCSE_AFTER_RELOAD , "load CSE after reload") +DEFTIMEVAR (TV_GCSE_AFTER_RELOAD , "load CSE after reload") DEFTIMEVAR (TV_REE , "ree") DEFTIMEVAR (TV_THREAD_PROLOGUE_AND_EPILOGUE, "thread pro- & epilogue") DEFTIMEVAR (TV_IFCVT2 , "if-conversion 2") Index: lra-coalesce.c =================================================================== --- lra-coalesce.c (revision 191858) +++ lra-coalesce.c (working copy) @@ -221,6 +221,8 @@ lra_coalesce (void) bitmap_head involved_insns_bitmap, split_origin_bitmap; bitmap_iterator bi; + timevar_push (TV_LRA_COALESCE); + if (lra_dump_file != NULL) fprintf (lra_dump_file, "\n********** Pseudos coalescing #%d: **********\n\n", @@ -371,5 +373,6 @@ lra_coalesce (void) free (sorted_moves); free (next_coalesced_pseudo); free (first_coalesced_pseudo); + timevar_pop (TV_LRA_COALESCE); return coalesced_moves != 0; } Index: lra-constraints.c =================================================================== --- lra-constraints.c (revision 191858) +++ lra-constraints.c (working copy) @@ -4859,6 +4859,8 @@ lra_inheritance (void) basic_block bb, start_bb; edge e; + 
timevar_push (TV_LRA_INHERITANCE); + lra_inheritance_iter++; if (lra_dump_file != NULL) fprintf (lra_dump_file, "\n********** Inheritance #%d: **********\n\n", @@ -4907,6 +4909,8 @@ lra_inheritance (void) bitmap_clear (&live_regs); bitmap_clear (&check_only_regs); free (usage_insns); + + timevar_pop (TV_LRA_INHERITANCE); } ^L