Hi,
I benchmarked the patch moving loop header copying and it is quite noticeable 
win.

Some testsuite updating is needed. In many cases it is just because the
optimizations are now happening earlier.
There are however few testusite failures I have torubles to deal with:
./testsuite/gcc/gcc.sum:FAIL: gcc.dg/tree-ssa/pr21559.c scan-tree-dump-times 
vrp1 "Threaded jump" 3
./testsuite/gcc/gcc.sum:FAIL: gcc.dg/tree-ssa/ssa-dom-thread-2.c 
scan-tree-dump-times vrp1 "Jumps threaded: 1" 1
./testsuite/gcc/gcc.sum:FAIL: gcc.dg/vect/O3-slp-reduc-10.c 
scan-tree-dump-times vect "vectorized 1 loops" 2
./testsuite/g++/g++.sum:FAIL: g++.dg/tree-ssa/pr18178.C -std=gnu++98  
scan-tree-dump-times vrp1 "if " 1
./testsuite/g++/g++.sum:FAIL: g++.dg/tree-ssa/pr18178.C -std=gnu++11  
scan-tree-dump-times vrp1 "if " 1

This is mostly about VRP losing its ability to thread some jumps from the
duplicated loop header out of the loop across the loopback edge.  This seems to
be due to loop updating logic.  Do we care about these?

Honza

Index: tree-ssa-threadupdate.c
===================================================================
*** tree-ssa-threadupdate.c     (revision 192123)
--- tree-ssa-threadupdate.c     (working copy)
*************** static bool
*** 846,854 ****
  def_split_header_continue_p (const_basic_block bb, const void *data)
  {
    const_basic_block new_header = (const_basic_block) data;
!   return (bb != new_header
!         && (loop_depth (bb->loop_father)
!             >= loop_depth (new_header->loop_father)));
  }
  
  /* Thread jumps through the header of LOOP.  Returns true if cfg changes.
--- 846,860 ----
  def_split_header_continue_p (const_basic_block bb, const void *data)
  {
    const_basic_block new_header = (const_basic_block) data;
!   const struct loop *l;
! 
!   if (bb == new_header
!       || loop_depth (bb->loop_father) < loop_depth (new_header->loop_father))
!     return false;
!   for (l = bb->loop_father; l; l = loop_outer (l))
!     if (l == new_header->loop_father)
!       return true;
!   return false;
  }
  
  /* Thread jumps through the header of LOOP.  Returns true if cfg changes.
Index: testsuite/gcc.dg/unroll_2.c
===================================================================
*** testsuite/gcc.dg/unroll_2.c (revision 192123)
--- testsuite/gcc.dg/unroll_2.c (working copy)
***************
*** 1,5 ****
  /* { dg-do compile  { target i?86-*-linux* x86_64-*-linux* } } */
! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops 
-fdisable-tree-cunroll=foo -fdisable-tree-cunrolli=foo 
-fenable-rtl-loop2_unroll" } */
  
  unsigned a[100], b[100];
  inline void bar()
--- 1,5 ----
  /* { dg-do compile  { target i?86-*-linux* x86_64-*-linux* } } */
! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops 
-fdisable-tree-cunroll=foo -fdisable-tree-cunrolli=foo 
-fenable-rtl-loop2_unroll -fno-tree-dominator-opts" } */
  
  unsigned a[100], b[100];
  inline void bar()
Index: testsuite/gcc.dg/unroll_3.c
===================================================================
*** testsuite/gcc.dg/unroll_3.c (revision 192123)
--- testsuite/gcc.dg/unroll_3.c (working copy)
***************
*** 1,5 ****
  /* { dg-do compile  { target i?86-*-linux* x86_64-*-linux* } } */
! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops 
-fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo" } 
*/
  
  unsigned a[100], b[100];
  inline void bar()
--- 1,5 ----
  /* { dg-do compile  { target i?86-*-linux* x86_64-*-linux* } } */
! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops 
-fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo 
-fno-tree-dominator-opts" } */
  
  unsigned a[100], b[100];
  inline void bar()
Index: testsuite/gcc.dg/torture/pr23821.c
===================================================================
*** testsuite/gcc.dg/torture/pr23821.c  (revision 192123)
--- testsuite/gcc.dg/torture/pr23821.c  (working copy)
***************
*** 1,9 ****
  /* { dg-do compile } */
  /* { dg-skip-if "" { *-*-* } { "-O0" "-fno-fat-lto-objects" } { "" } } */
! /* At -O1 DOM threads a jump in a non-optimal way which leads to
     the bogus propagation.  */
! /* { dg-skip-if "" { *-*-* } { "-O1" } { "" } } */
! /* { dg-options "-fdump-tree-ivcanon-details" } */
  
  int a[199];
  
--- 1,8 ----
  /* { dg-do compile } */
  /* { dg-skip-if "" { *-*-* } { "-O0" "-fno-fat-lto-objects" } { "" } } */
! /* DOM threads a jump in a non-optimal way which leads to
     the bogus propagation.  */
! /* { dg-options "-fdump-tree-ivcanon-details -fno-tree-dominator-opts" } */
  
  int a[199];
  
Index: testsuite/gcc.dg/tree-ssa/ivopt_1.c
===================================================================
*** testsuite/gcc.dg/tree-ssa/ivopt_1.c (revision 192123)
--- testsuite/gcc.dg/tree-ssa/ivopt_1.c (working copy)
*************** void foo (int i_width, TYPE dst, TYPE sr
*** 14,18 ****
  }
  
  
! /* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
  /* { dg-final { cleanup-tree-dump "ivopts" } } */
--- 14,18 ----
  }
  
  
! /* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts"} } */
  /* { dg-final { cleanup-tree-dump "ivopts" } } */
Index: testsuite/gcc.dg/tree-ssa/ivopt_2.c
===================================================================
*** testsuite/gcc.dg/tree-ssa/ivopt_2.c (revision 192123)
--- testsuite/gcc.dg/tree-ssa/ivopt_2.c (working copy)
*************** void foo (int i_width, TYPE dst, TYPE sr
*** 13,17 ****
         }
  }
  
! /* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
  /* { dg-final { cleanup-tree-dump "ivopts" } } */
--- 13,17 ----
         }
  }
  
! /* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts"} } */
  /* { dg-final { cleanup-tree-dump "ivopts" } } */
Index: testsuite/gcc.dg/tree-ssa/ivopt_3.c
===================================================================
*** testsuite/gcc.dg/tree-ssa/ivopt_3.c (revision 192123)
--- testsuite/gcc.dg/tree-ssa/ivopt_3.c (working copy)
*************** void foo (int i_width, char* dst, char* 
*** 16,20 ****
         }
  } 
  
! /* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
  /* { dg-final { cleanup-tree-dump "ivopts" } } */
--- 16,20 ----
         }
  } 
  
! /* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts"} } */
  /* { dg-final { cleanup-tree-dump "ivopts" } } */
Index: testsuite/gcc.dg/tree-ssa/ivopt_4.c
===================================================================
*** testsuite/gcc.dg/tree-ssa/ivopt_4.c (revision 192123)
--- testsuite/gcc.dg/tree-ssa/ivopt_4.c (working copy)
*************** void foo (int i_width, TYPE dst, TYPE sr
*** 15,19 ****
         }
  }
  
! /* { dg-final { scan-tree-dump-times "PHI <ivtmp" 1 "ivopts"} } */
  /* { dg-final { cleanup-tree-dump "ivopts" } } */
--- 15,19 ----
         }
  }
  
! /* { dg-final { scan-tree-dump-times "PHI" 1 "ivopts"} } */
  /* { dg-final { cleanup-tree-dump "ivopts" } } */
Index: testsuite/gcc.dg/unroll_4.c
===================================================================
*** testsuite/gcc.dg/unroll_4.c (revision 192123)
--- testsuite/gcc.dg/unroll_4.c (working copy)
***************
*** 1,5 ****
  /* { dg-do compile  { target i?86-*-linux* x86_64-*-linux* } } */
! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops 
-fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo2" 
} */
  
  unsigned a[100], b[100];
  inline void bar()
--- 1,5 ----
  /* { dg-do compile  { target i?86-*-linux* x86_64-*-linux* } } */
! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops 
-fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo2 
-fno-tree-dominator-opts" } */
  
  unsigned a[100], b[100];
  inline void bar()
Index: testsuite/gcc.dg/tree-prof/update-loopch.c
===================================================================
*** testsuite/gcc.dg/tree-prof/update-loopch.c  (revision 192123)
--- testsuite/gcc.dg/tree-prof/update-loopch.c  (working copy)
*************** main ()
*** 11,20 ****
      }
    return 0;
  }
! /* Loop header copying will peel away the initial conditional, so the loop 
body
!    is once reached directly from entry point of function, rest via loopback
!    edge.  */
! /* { dg-final-use { scan-ipa-dump "loop depth 0, count 33334" "profile"} } */
  /* { dg-final-use { scan-tree-dump "loop depth 1, count 33332" "optimized"} } 
*/
  /* { dg-final-use { scan-tree-dump-not "Invalid sum" "optimized"} } */
  /* { dg-final-use { cleanup-ipa-dump "profile" } } */
--- 11,20 ----
      }
    return 0;
  }
! /* Loop header copying, now happening before profiling, will peel away the
!    initial conditional, so the loop body is once reached directly from entry
!    point of function, rest via loopback edge.  */
! /* { dg-final-use { scan-ipa-dump "loop depth 0, count 33332" "profile"} } */
  /* { dg-final-use { scan-tree-dump "loop depth 1, count 33332" "optimized"} } 
*/
  /* { dg-final-use { scan-tree-dump-not "Invalid sum" "optimized"} } */
  /* { dg-final-use { cleanup-ipa-dump "profile" } } */
Index: testsuite/gcc.dg/unroll_1.c
===================================================================
*** testsuite/gcc.dg/unroll_1.c (revision 192123)
--- testsuite/gcc.dg/unroll_1.c (working copy)
***************
*** 1,5 ****
  /* { dg-do compile } */
! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops 
-fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll" } */
  
  unsigned a[100], b[100];
  inline void bar()
--- 1,5 ----
  /* { dg-do compile } */
! /* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops 
-fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll 
-fno-tree-dominator-opts" } */
  
  unsigned a[100], b[100];
  inline void bar()
Index: passes.c
===================================================================
*** passes.c    (revision 192123)
--- passes.c    (working copy)
*************** init_optimization_passes (void)
*** 1330,1335 ****
--- 1330,1340 ----
          NEXT_PASS (pass_convert_switch);
            NEXT_PASS (pass_cleanup_eh);
            NEXT_PASS (pass_profile);
+         /* Scheduling header copying before pass_ipa_tree_profile is important
+            to get loop iteration counts estimated right.
+            Scheduling it after pass_profile prevents it to copy loop headers
+            in cold functions declares by the user.  */
+           NEXT_PASS (pass_ch);
            NEXT_PASS (pass_local_pure_const);
          /* Split functions creates parts that are not run through
             early optimizations again.  It is thus good idea to do this
*************** init_optimization_passes (void)
*** 1406,1412 ****
        NEXT_PASS (pass_tree_ifcombine);
        NEXT_PASS (pass_phiopt);
        NEXT_PASS (pass_tail_recursion);
-       NEXT_PASS (pass_ch);
        NEXT_PASS (pass_stdarg);
        NEXT_PASS (pass_lower_complex);
        NEXT_PASS (pass_sra);
--- 1411,1416 ----

Reply via email to