Hi, I benchmarked the patch moving loop header copying and it is quite noticeable win.
Some testsuite updating is needed. In many cases it is just because the optimizations are now happening earlier. There are however few testusite failures I have torubles to deal with: ./testsuite/gcc/gcc.sum:FAIL: gcc.dg/tree-ssa/pr21559.c scan-tree-dump-times vrp1 "Threaded jump" 3 ./testsuite/gcc/gcc.sum:FAIL: gcc.dg/tree-ssa/ssa-dom-thread-2.c scan-tree-dump-times vrp1 "Jumps threaded: 1" 1 ./testsuite/gcc/gcc.sum:FAIL: gcc.dg/vect/O3-slp-reduc-10.c scan-tree-dump-times vect "vectorized 1 loops" 2 ./testsuite/g++/g++.sum:FAIL: g++.dg/tree-ssa/pr18178.C -std=gnu++98 scan-tree-dump-times vrp1 "if " 1 ./testsuite/g++/g++.sum:FAIL: g++.dg/tree-ssa/pr18178.C -std=gnu++11 scan-tree-dump-times vrp1 "if " 1 This is mostly about VRP losing its ability to thread some jumps from the duplicated loop header out of the loop across the loopback edge. This seems to be due to loop updating logic. Do we care about these? Honza Index: tree-ssa-threadupdate.c =================================================================== *** tree-ssa-threadupdate.c (revision 192123) --- tree-ssa-threadupdate.c (working copy) *************** static bool *** 846,854 **** def_split_header_continue_p (const_basic_block bb, const void *data) { const_basic_block new_header = (const_basic_block) data; ! return (bb != new_header ! && (loop_depth (bb->loop_father) ! >= loop_depth (new_header->loop_father))); } /* Thread jumps through the header of LOOP. Returns true if cfg changes. --- 846,860 ---- def_split_header_continue_p (const_basic_block bb, const void *data) { const_basic_block new_header = (const_basic_block) data; ! const struct loop *l; ! ! if (bb == new_header ! || loop_depth (bb->loop_father) < loop_depth (new_header->loop_father)) ! return false; ! for (l = bb->loop_father; l; l = loop_outer (l)) ! if (l == new_header->loop_father) ! return true; ! return false; } /* Thread jumps through the header of LOOP. Returns true if cfg changes. Index: testsuite/gcc.dg/unroll_2.c =================================================================== *** testsuite/gcc.dg/unroll_2.c (revision 192123) --- testsuite/gcc.dg/unroll_2.c (working copy) *************** *** 1,5 **** /* { dg-do compile { target i?86-*-linux* x86_64-*-linux* } } */ ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll=foo -fdisable-tree-cunrolli=foo -fenable-rtl-loop2_unroll" } */ unsigned a[100], b[100]; inline void bar() --- 1,5 ---- /* { dg-do compile { target i?86-*-linux* x86_64-*-linux* } } */ ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll=foo -fdisable-tree-cunrolli=foo -fenable-rtl-loop2_unroll -fno-tree-dominator-opts" } */ unsigned a[100], b[100]; inline void bar() Index: testsuite/gcc.dg/unroll_3.c =================================================================== *** testsuite/gcc.dg/unroll_3.c (revision 192123) --- testsuite/gcc.dg/unroll_3.c (working copy) *************** *** 1,5 **** /* { dg-do compile { target i?86-*-linux* x86_64-*-linux* } } */ ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo" } */ unsigned a[100], b[100]; inline void bar() --- 1,5 ---- /* { dg-do compile { target i?86-*-linux* x86_64-*-linux* } } */ ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo -fno-tree-dominator-opts" } */ unsigned a[100], b[100]; inline void bar() Index: testsuite/gcc.dg/torture/pr23821.c =================================================================== *** testsuite/gcc.dg/torture/pr23821.c (revision 192123) --- testsuite/gcc.dg/torture/pr23821.c (working copy) *************** *** 1,9 **** /* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-O0" "-fno-fat-lto-objects" } { "" } } */ ! /* At -O1 DOM threads a jump in a non-optimal way which leads to the bogus propagation. */ ! /* { dg-skip-if "" { *-*-* } { "-O1" } { "" } } */ ! /* { dg-options "-fdump-tree-ivcanon-details" } */ int a[199]; --- 1,8 ---- /* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-O0" "-fno-fat-lto-objects" } { "" } } */ ! /* DOM threads a jump in a non-optimal way which leads to the bogus propagation. */ ! /* { dg-options "-fdump-tree-ivcanon-details -fno-tree-dominator-opts" } */ int a[199]; Index: testsuite/gcc.dg/tree-ssa/ivopt_1.c =================================================================== *** testsuite/gcc.dg/tree-ssa/ivopt_1.c (revision 192123) --- testsuite/gcc.dg/tree-ssa/ivopt_1.c (working copy) *************** void foo (int i_width, TYPE dst, TYPE sr *** 14,18 **** } ! /* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */ /* { dg-final { cleanup-tree-dump "ivopts" } } */ --- 14,18 ---- } ! /* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts"} } */ /* { dg-final { cleanup-tree-dump "ivopts" } } */ Index: testsuite/gcc.dg/tree-ssa/ivopt_2.c =================================================================== *** testsuite/gcc.dg/tree-ssa/ivopt_2.c (revision 192123) --- testsuite/gcc.dg/tree-ssa/ivopt_2.c (working copy) *************** void foo (int i_width, TYPE dst, TYPE sr *** 13,17 **** } } ! /* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */ /* { dg-final { cleanup-tree-dump "ivopts" } } */ --- 13,17 ---- } } ! /* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts"} } */ /* { dg-final { cleanup-tree-dump "ivopts" } } */ Index: testsuite/gcc.dg/tree-ssa/ivopt_3.c =================================================================== *** testsuite/gcc.dg/tree-ssa/ivopt_3.c (revision 192123) --- testsuite/gcc.dg/tree-ssa/ivopt_3.c (working copy) *************** void foo (int i_width, char* dst, char* *** 16,20 **** } } ! /* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */ /* { dg-final { cleanup-tree-dump "ivopts" } } */ --- 16,20 ---- } } ! /* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts"} } */ /* { dg-final { cleanup-tree-dump "ivopts" } } */ Index: testsuite/gcc.dg/tree-ssa/ivopt_4.c =================================================================== *** testsuite/gcc.dg/tree-ssa/ivopt_4.c (revision 192123) --- testsuite/gcc.dg/tree-ssa/ivopt_4.c (working copy) *************** void foo (int i_width, TYPE dst, TYPE sr *** 15,19 **** } } ! /* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */ /* { dg-final { cleanup-tree-dump "ivopts" } } */ --- 15,19 ---- } } ! /* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts"} } */ /* { dg-final { cleanup-tree-dump "ivopts" } } */ Index: testsuite/gcc.dg/unroll_4.c =================================================================== *** testsuite/gcc.dg/unroll_4.c (revision 192123) --- testsuite/gcc.dg/unroll_4.c (working copy) *************** *** 1,5 **** /* { dg-do compile { target i?86-*-linux* x86_64-*-linux* } } */ ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo2" } */ unsigned a[100], b[100]; inline void bar() --- 1,5 ---- /* { dg-do compile { target i?86-*-linux* x86_64-*-linux* } } */ ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo2 -fno-tree-dominator-opts" } */ unsigned a[100], b[100]; inline void bar() Index: testsuite/gcc.dg/tree-prof/update-loopch.c =================================================================== *** testsuite/gcc.dg/tree-prof/update-loopch.c (revision 192123) --- testsuite/gcc.dg/tree-prof/update-loopch.c (working copy) *************** main () *** 11,20 **** } return 0; } ! /* Loop header copying will peel away the initial conditional, so the loop body ! is once reached directly from entry point of function, rest via loopback ! edge. */ ! /* { dg-final-use { scan-ipa-dump "loop depth 0, count 33334" "profile"} } */ /* { dg-final-use { scan-tree-dump "loop depth 1, count 33332" "optimized"} } */ /* { dg-final-use { scan-tree-dump-not "Invalid sum" "optimized"} } */ /* { dg-final-use { cleanup-ipa-dump "profile" } } */ --- 11,20 ---- } return 0; } ! /* Loop header copying, now happening before profiling, will peel away the ! initial conditional, so the loop body is once reached directly from entry ! point of function, rest via loopback edge. */ ! /* { dg-final-use { scan-ipa-dump "loop depth 0, count 33332" "profile"} } */ /* { dg-final-use { scan-tree-dump "loop depth 1, count 33332" "optimized"} } */ /* { dg-final-use { scan-tree-dump-not "Invalid sum" "optimized"} } */ /* { dg-final-use { cleanup-ipa-dump "profile" } } */ Index: testsuite/gcc.dg/unroll_1.c =================================================================== *** testsuite/gcc.dg/unroll_1.c (revision 192123) --- testsuite/gcc.dg/unroll_1.c (working copy) *************** *** 1,5 **** /* { dg-do compile } */ ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll" } */ unsigned a[100], b[100]; inline void bar() --- 1,5 ---- /* { dg-do compile } */ ! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll -fno-tree-dominator-opts" } */ unsigned a[100], b[100]; inline void bar() Index: passes.c =================================================================== *** passes.c (revision 192123) --- passes.c (working copy) *************** init_optimization_passes (void) *** 1330,1335 **** --- 1330,1340 ---- NEXT_PASS (pass_convert_switch); NEXT_PASS (pass_cleanup_eh); NEXT_PASS (pass_profile); + /* Scheduling header copying before pass_ipa_tree_profile is important + to get loop iteration counts estimated right. + Scheduling it after pass_profile prevents it to copy loop headers + in cold functions declares by the user. */ + NEXT_PASS (pass_ch); NEXT_PASS (pass_local_pure_const); /* Split functions creates parts that are not run through early optimizations again. It is thus good idea to do this *************** init_optimization_passes (void) *** 1406,1412 **** NEXT_PASS (pass_tree_ifcombine); NEXT_PASS (pass_phiopt); NEXT_PASS (pass_tail_recursion); - NEXT_PASS (pass_ch); NEXT_PASS (pass_stdarg); NEXT_PASS (pass_lower_complex); NEXT_PASS (pass_sra); --- 1411,1416 ----