On Tue, 14 Oct 2014, Jan Hubicka wrote: > Hi, > this is update of my 2013 update to 2012 patch to move rtl loop peeling > to tree level. This is to expose optimization oppurtunities earlier. > Incrementally I think I can also improve profiling to provide a histogram > on loop iterations and get more sensible peeling decisions. > > profiled-bootstrapped/regtested x86_64-linux, OK?
Ok. Thanks, Richard. > Honza > > * loop-unroll.c: (decide_unrolling_and_peeling): Rename to > (decide_unrolling): ... this one. > (peel_loops_completely): Remove. > (decide_peel_simple): Remove. > (decide_peel_once_rolling): Remove. > (decide_peel_completely): Remove. > (peel_loop_simple): Remove. > (peel_loop_completely): Remove. > (unroll_and_peel_loops): Rename to ... > (unroll_loops): ... this one; handle only unrolling. > * cfgloop.h (lpt_dec): Remove LPT_PEEL_COMPLETELY and > LPT_PEEL_SIMPLE. > (UAP_PEEL): Remove. > (unroll_and_peel_loops): Remove. > (unroll_loops): New. > * passes.def: Replace > pass_rtl_unroll_and_peel_loops by pass_rtl_unroll_loops. > * loop-init.c (gate_rtl_unroll_and_peel_loops, > rtl_unroll_and_peel_loops): Rename to ... > (gate_rtl_unroll_loops, rtl_unroll_loops): ... these; update. > (pass_rtl_unroll_and_peel_loops): Rename to ... > (pass_rtl_unroll_loops): ... this one. > * tree-pass.h (make_pass_rtl_unroll_and_peel_loops): Remove. > (make_pass_rtl_unroll_loops): New. > * tree-ssa-loop-ivcanon.c: (estimated_peeled_sequence_size, > try_peel_loop): New. > (canonicalize_loop_induction_variables): Update. > > * gcc.dg/tree-prof/peel-1.c: Update. > * gcc.dg/tree-prof/unroll-1.c: Update. > * gcc.dg/gcc.dg/unroll_1.c: Update. > * gcc.dg/gcc.dg/unroll_2.c: Update. > * gcc.dg/gcc.dg/unroll_3.c: Update. > * gcc.dg/gcc.dg/unroll_4.c: Update. > Index: tree-pass.h > =================================================================== > --- tree-pass.h (revision 216145) > +++ tree-pass.h (working copy) > @@ -504,7 +504,7 @@ extern rtl_opt_pass *make_pass_outof_cfg > extern rtl_opt_pass *make_pass_loop2 (gcc::context *ctxt); > extern rtl_opt_pass *make_pass_rtl_loop_init (gcc::context *ctxt); > extern rtl_opt_pass *make_pass_rtl_move_loop_invariants (gcc::context *ctxt); > -extern rtl_opt_pass *make_pass_rtl_unroll_and_peel_loops (gcc::context > *ctxt); > +extern rtl_opt_pass *make_pass_rtl_unroll_loops (gcc::context *ctxt); > extern rtl_opt_pass *make_pass_rtl_doloop (gcc::context *ctxt); > extern rtl_opt_pass *make_pass_rtl_loop_done (gcc::context *ctxt); > > Index: tree-ssa-loop-ivcanon.c > =================================================================== > --- tree-ssa-loop-ivcanon.c (revision 216145) > +++ tree-ssa-loop-ivcanon.c (working copy) > @@ -28,9 +28,12 @@ along with GCC; see the file COPYING3. > variables. In that case the created optimization possibilities are likely > to pay up. > > - Additionally in case we detect that it is beneficial to unroll the > - loop completely, we do it right here to expose the optimization > - possibilities to the following passes. */ > + We also perform > + - complette unrolling (or peeling) when the loops is rolling few enough > + times > + - simple peeling (i.e. copying few initial iterations prior the loop) > + when number of iteration estimate is known (typically by the profile > + info). */ > > #include "config.h" > #include "system.h" > @@ -657,11 +660,12 @@ try_unroll_loop_completely (struct loop > HOST_WIDE_INT maxiter, > location_t locus) > { > - unsigned HOST_WIDE_INT n_unroll, ninsns, max_unroll, unr_insns; > + unsigned HOST_WIDE_INT n_unroll = 0, ninsns, max_unroll, unr_insns; > gimple cond; > struct loop_size size; > bool n_unroll_found = false; > edge edge_to_cancel = NULL; > + int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS; > > /* See if we proved number of iterations to be low constant. > > @@ -821,6 +825,8 @@ try_unroll_loop_completely (struct loop > loop->num); > return false; > } > + dump_printf_loc (report_flags, locus, > + "loop turned into non-loop; it never loops.\n"); > > initialize_original_copy_tables (); > wont_exit = sbitmap_alloc (n_unroll + 1); > @@ -902,6 +908,133 @@ try_unroll_loop_completely (struct loop > return true; > } > > +/* Return number of instructions after peeling. */ > +static unsigned HOST_WIDE_INT > +estimated_peeled_sequence_size (struct loop_size *size, > + unsigned HOST_WIDE_INT npeel) > +{ > + return MAX (npeel * (HOST_WIDE_INT) (size->overall > + - size->eliminated_by_peeling), 1); > +} > + > +/* If the loop is expected to iterate N times and is > + small enough, duplicate the loop body N+1 times before > + the loop itself. This way the hot path will never > + enter the loop. > + Parameters are the same as for try_unroll_loops_completely */ > + > +static bool > +try_peel_loop (struct loop *loop, > + edge exit, tree niter, > + HOST_WIDE_INT maxiter) > +{ > + int npeel; > + struct loop_size size; > + int peeled_size; > + sbitmap wont_exit; > + unsigned i; > + vec<edge> to_remove = vNULL; > + edge e; > + > + /* If the iteration bound is known and large, then we can safely eliminate > + the check in peeled copies. */ > + if (TREE_CODE (niter) != INTEGER_CST) > + exit = NULL; > + > + if (!flag_peel_loops || PARAM_VALUE (PARAM_MAX_PEEL_TIMES) <= 0) > + return false; > + > + /* Peel only innermost loops. */ > + if (loop->inner) > + { > + if (dump_file) > + fprintf (dump_file, "Not peeling: outer loop\n"); > + return false; > + } > + > + if (!optimize_loop_for_speed_p (loop)) > + { > + if (dump_file) > + fprintf (dump_file, "Not peeling: cold loop\n"); > + return false; > + } > + > + /* Check if there is an estimate on the number of iterations. */ > + npeel = estimated_loop_iterations_int (loop); > + if (npeel < 0) > + { > + if (dump_file) > + fprintf (dump_file, "Not peeling: number of iterations is not " > + "estimated\n"); > + return false; > + } > + if (maxiter >= 0 && maxiter <= npeel) > + { > + if (dump_file) > + fprintf (dump_file, "Not peeling: upper bound is known so can " > + "unroll complettely\n"); > + return false; > + } > + > + /* We want to peel estimated number of iterations + 1 (so we never > + enter the loop on quick path). Check against PARAM_MAX_PEEL_TIMES > + and be sure to avoid overflows. */ > + if (npeel > PARAM_VALUE (PARAM_MAX_PEEL_TIMES) - 1) > + { > + if (dump_file) > + fprintf (dump_file, "Not peeling: rolls too much " > + "(%i + 1 > --param max-peel-times)\n", npeel); > + return false; > + } > + npeel++; > + > + /* Check peeled loops size. */ > + tree_estimate_loop_size (loop, exit, NULL, &size, > + PARAM_VALUE (PARAM_MAX_PEELED_INSNS)); > + if ((peeled_size = estimated_peeled_sequence_size (&size, npeel)) > + > PARAM_VALUE (PARAM_MAX_PEELED_INSNS)) > + { > + if (dump_file) > + fprintf (dump_file, "Not peeling: peeled sequence size is too large " > + "(%i insns > --param max-peel-insns)", peeled_size); > + return false; > + } > + > + /* Duplicate possibly eliminating the exits. */ > + initialize_original_copy_tables (); > + wont_exit = sbitmap_alloc (npeel + 1); > + bitmap_ones (wont_exit); > + bitmap_clear_bit (wont_exit, 0); > + if (!gimple_duplicate_loop_to_header_edge (loop, loop_preheader_edge > (loop), > + npeel, wont_exit, > + exit, &to_remove, > + DLTHE_FLAG_UPDATE_FREQ > + | DLTHE_FLAG_COMPLETTE_PEEL)) > + { > + free_original_copy_tables (); > + free (wont_exit); > + return false; > + } > + FOR_EACH_VEC_ELT (to_remove, i, e) > + { > + bool ok = remove_path (e); > + gcc_assert (ok); > + } > + free (wont_exit); > + free_original_copy_tables (); > + if (dump_file && (dump_flags & TDF_DETAILS)) > + { > + fprintf (dump_file, "Peeled loop %d, %i times.\n", > + loop->num, npeel); > + } > + if (loop->any_upper_bound) > + loop->nb_iterations_upper_bound -= npeel; > + loop->nb_iterations_estimate = 0; > + /* Make sure to mark loop cold so we do not try to peel it more. */ > + scale_loop_profile (loop, 1, 0); > + loop->header->count = 0; > + return true; > +} > /* Adds a canonical induction variable to LOOP if suitable. > CREATE_IV is true if we may create a new iv. UL determines > which loops we are allowed to completely unroll. If TRY_EVAL is true, we > try > @@ -981,6 +1114,9 @@ canonicalize_loop_induction_variables (s > && exit && just_once_each_iteration_p (loop, exit->src)) > create_canonical_iv (loop, exit, niter); > > + if (ul == UL_ALL) > + modified |= try_peel_loop (loop, exit, niter, maxiter); > + > return modified; > } > > Index: loop-init.c > =================================================================== > --- loop-init.c (revision 216145) > +++ loop-init.c (working copy) > @@ -357,7 +357,6 @@ pass_loop2::gate (function *fun) > if (optimize > 0 > && (flag_move_loop_invariants > || flag_unswitch_loops > - || flag_peel_loops > || flag_unroll_loops > #ifdef HAVE_doloop_end > || (flag_branch_on_count_reg && HAVE_doloop_end) > @@ -537,7 +536,7 @@ make_pass_rtl_move_loop_invariants (gcc: > > namespace { > > -const pass_data pass_data_rtl_unroll_and_peel_loops = > +const pass_data pass_data_rtl_unroll_loops = > { > RTL_PASS, /* type */ > "loop2_unroll", /* name */ > @@ -550,11 +549,11 @@ const pass_data pass_data_rtl_unroll_and > 0, /* todo_flags_finish */ > }; > > -class pass_rtl_unroll_and_peel_loops : public rtl_opt_pass > +class pass_rtl_unroll_loops : public rtl_opt_pass > { > public: > - pass_rtl_unroll_and_peel_loops (gcc::context *ctxt) > - : rtl_opt_pass (pass_data_rtl_unroll_and_peel_loops, ctxt) > + pass_rtl_unroll_loops (gcc::context *ctxt) > + : rtl_opt_pass (pass_data_rtl_unroll_loops, ctxt) > {} > > /* opt_pass methods: */ > @@ -565,10 +564,10 @@ public: > > virtual unsigned int execute (function *); > > -}; // class pass_rtl_unroll_and_peel_loops > +}; // class pass_rtl_unroll_loops > > unsigned int > -pass_rtl_unroll_and_peel_loops::execute (function *fun) > +pass_rtl_unroll_loops::execute (function *fun) > { > if (number_of_loops (fun) > 1) > { > @@ -576,14 +575,12 @@ pass_rtl_unroll_and_peel_loops::execute > if (dump_file) > df_dump (dump_file); > > - if (flag_peel_loops) > - flags |= UAP_PEEL; > if (flag_unroll_loops) > flags |= UAP_UNROLL; > if (flag_unroll_all_loops) > flags |= UAP_UNROLL_ALL; > > - unroll_and_peel_loops (flags); > + unroll_loops (flags); > } > return 0; > } > @@ -591,9 +588,9 @@ pass_rtl_unroll_and_peel_loops::execute > } // anon namespace > > rtl_opt_pass * > -make_pass_rtl_unroll_and_peel_loops (gcc::context *ctxt) > +make_pass_rtl_unroll_loops (gcc::context *ctxt) > { > - return new pass_rtl_unroll_and_peel_loops (ctxt); > + return new pass_rtl_unroll_loops (ctxt); > } > > > Index: cfgloop.h > =================================================================== > --- cfgloop.h (revision 216145) > +++ cfgloop.h (working copy) > @@ -30,8 +30,6 @@ along with GCC; see the file COPYING3. > enum lpt_dec > { > LPT_NONE, > - LPT_PEEL_COMPLETELY, > - LPT_PEEL_SIMPLE, > LPT_UNROLL_CONSTANT, > LPT_UNROLL_RUNTIME, > LPT_UNROLL_STUPID > @@ -731,12 +729,11 @@ extern void loop_optimizer_finalize (voi > /* Optimization passes. */ > enum > { > - UAP_PEEL = 1, /* Enables loop peeling. */ > - UAP_UNROLL = 2, /* Enables unrolling of loops if it seems profitable. > */ > - UAP_UNROLL_ALL = 4 /* Enables unrolling of all loops. */ > + UAP_UNROLL = 1, /* Enables unrolling of loops if it seems profitable. > */ > + UAP_UNROLL_ALL = 2 /* Enables unrolling of all loops. */ > }; > > -extern void unroll_and_peel_loops (int); > +extern void unroll_loops (int); > extern void doloop_optimize_loops (void); > extern void move_loop_invariants (void); > extern void scale_loop_profile (struct loop *loop, int scale, gcov_type > iteration_bound); > Index: passes.def > =================================================================== > --- passes.def (revision 216145) > +++ passes.def (working copy) > @@ -359,7 +359,7 @@ along with GCC; see the file COPYING3. > PUSH_INSERT_PASSES_WITHIN (pass_loop2) > NEXT_PASS (pass_rtl_loop_init); > NEXT_PASS (pass_rtl_move_loop_invariants); > - NEXT_PASS (pass_rtl_unroll_and_peel_loops); > + NEXT_PASS (pass_rtl_unroll_loops); > NEXT_PASS (pass_rtl_doloop); > NEXT_PASS (pass_rtl_loop_done); > TERMINATE_PASS_LIST () > Index: loop-unroll.c > =================================================================== > --- loop-unroll.c (revision 216145) > +++ loop-unroll.c (working copy) > @@ -1,4 +1,4 @@ > -/* Loop unrolling and peeling. > +/* Loop unrolling. > Copyright (C) 2002-2014 Free Software Foundation, Inc. > > This file is part of GCC. > @@ -34,8 +34,8 @@ along with GCC; see the file COPYING3. > #include "target.h" > #include "dumpfile.h" > > -/* This pass performs loop unrolling and peeling. We only perform these > - optimizations on innermost loops (with single exception) because > +/* This pass performs loop unrolling. We only perform this > + optimization on innermost loops (with single exception) because > the impact on performance is greatest here, and we want to avoid > unnecessary code size growth. The gain is caused by greater sequentiality > of code, better code to optimize for further passes and in some cases > @@ -44,12 +44,6 @@ along with GCC; see the file COPYING3. > > What we do: > > - -- complete peeling of once-rolling loops; this is the above mentioned > - exception, as this causes loop to be cancelled completely and > - does not cause code growth > - -- complete peeling of loops that roll (small) constant times. > - -- simple peeling of first iterations of loops that do not roll much > - (according to profile feedback) > -- unrolling of loops that roll constant times; this is almost always > win, as we get rid of exit condition tests. > -- unrolling of loops that roll number of times that we can compute > @@ -62,7 +56,7 @@ along with GCC; see the file COPYING3. > appropriate function below. > > There is a lot of parameters (defined and described in params.def) that > - control how much we unroll/peel. > + control how much we unroll. > > ??? A great problem is that we don't have a good way how to determine > how many times we should unroll the loop; the experiments I have made > @@ -170,17 +164,11 @@ struct opt_info > basic_block loop_preheader; /* The loop preheader basic block. */ > }; > > -static void decide_unrolling_and_peeling (int); > -static void peel_loops_completely (int); > -static void decide_peel_simple (struct loop *, int); > -static void decide_peel_once_rolling (struct loop *, int); > -static void decide_peel_completely (struct loop *, int); > static void decide_unroll_stupid (struct loop *, int); > static void decide_unroll_constant_iterations (struct loop *, int); > static void decide_unroll_runtime_iterations (struct loop *, int); > -static void peel_loop_simple (struct loop *); > -static void peel_loop_completely (struct loop *); > static void unroll_loop_stupid (struct loop *); > +static void decide_unrolling (int); > static void unroll_loop_constant_iterations (struct loop *); > static void unroll_loop_runtime_iterations (struct loop *); > static struct opt_info *analyze_insns_in_loop (struct loop *); > @@ -197,15 +185,13 @@ static void combine_var_copies_in_loop_e > basic_block); > static rtx get_expansion (struct var_to_expand *); > > -/* Emit a message summarizing the unroll or peel that will be > +/* Emit a message summarizing the unroll that will be > performed for LOOP, along with the loop's location LOCUS, if > appropriate given the dump or -fopt-info settings. */ > > static void > -report_unroll_peel (struct loop *loop, location_t locus) > +report_unroll (struct loop *loop, location_t locus) > { > - struct niter_desc *desc; > - int niters = 0; > int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS; > > if (loop->lpt_decision.decision == LPT_NONE) > @@ -214,169 +200,20 @@ report_unroll_peel (struct loop *loop, l > if (!dump_enabled_p ()) > return; > > - /* In the special case where the loop never iterated, emit > - a different message so that we don't report an unroll by 0. > - This matches the equivalent message emitted during tree unrolling. */ > - if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY > - && !loop->lpt_decision.times) > - { > - dump_printf_loc (report_flags, locus, > - "loop turned into non-loop; it never loops.\n"); > - return; > - } > - > - desc = get_simple_loop_desc (loop); > - > - if (desc->const_iter) > - niters = desc->niter; > - else if (loop->header->count) > - niters = expected_loop_iterations (loop); > - > - if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY) > - dump_printf_loc (report_flags, locus, > - "loop with %d iterations completely unrolled", > - loop->lpt_decision.times + 1); > - else > - dump_printf_loc (report_flags, locus, > - "loop %s %d times", > - (loop->lpt_decision.decision == LPT_PEEL_SIMPLE > - ? "peeled" : "unrolled"), > - loop->lpt_decision.times); > + dump_printf_loc (report_flags, locus, > + "loop unrolled %d times", > + loop->lpt_decision.times); > if (profile_info) > dump_printf (report_flags, > - " (header execution count %d", > + " (header execution count %d)", > (int)loop->header->count); > - if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY) > - dump_printf (report_flags, > - "%s%s iterations %d)", > - profile_info ? ", " : " (", > - desc->const_iter ? "const" : "average", > - niters); > - else if (profile_info) > - dump_printf (report_flags, ")"); > > dump_printf (report_flags, "\n"); > } > > -/* Unroll and/or peel (depending on FLAGS) LOOPS. */ > -void > -unroll_and_peel_loops (int flags) > -{ > - struct loop *loop; > - bool changed = false; > - > - /* First perform complete loop peeling (it is almost surely a win, > - and affects parameters for further decision a lot). */ > - peel_loops_completely (flags); > - > - /* Now decide rest of unrolling and peeling. */ > - decide_unrolling_and_peeling (flags); > - > - /* Scan the loops, inner ones first. */ > - FOR_EACH_LOOP (loop, LI_FROM_INNERMOST) > - { > - /* And perform the appropriate transformations. */ > - switch (loop->lpt_decision.decision) > - { > - case LPT_PEEL_COMPLETELY: > - /* Already done. */ > - gcc_unreachable (); > - case LPT_PEEL_SIMPLE: > - peel_loop_simple (loop); > - changed = true; > - break; > - case LPT_UNROLL_CONSTANT: > - unroll_loop_constant_iterations (loop); > - changed = true; > - break; > - case LPT_UNROLL_RUNTIME: > - unroll_loop_runtime_iterations (loop); > - changed = true; > - break; > - case LPT_UNROLL_STUPID: > - unroll_loop_stupid (loop); > - changed = true; > - break; > - case LPT_NONE: > - break; > - default: > - gcc_unreachable (); > - } > - } > - > - if (changed) > - { > - calculate_dominance_info (CDI_DOMINATORS); > - fix_loop_structure (NULL); > - } > - > - iv_analysis_done (); > -} > - > -/* Check whether exit of the LOOP is at the end of loop body. */ > - > -static bool > -loop_exit_at_end_p (struct loop *loop) > -{ > - struct niter_desc *desc = get_simple_loop_desc (loop); > - rtx_insn *insn; > - > - if (desc->in_edge->dest != loop->latch) > - return false; > - > - /* Check that the latch is empty. */ > - FOR_BB_INSNS (loop->latch, insn) > - { > - if (NONDEBUG_INSN_P (insn)) > - return false; > - } > - > - return true; > -} > - > -/* Depending on FLAGS, check whether to peel loops completely and do so. */ > -static void > -peel_loops_completely (int flags) > -{ > - struct loop *loop; > - bool changed = false; > - > - /* Scan the loops, the inner ones first. */ > - FOR_EACH_LOOP (loop, LI_FROM_INNERMOST) > - { > - loop->lpt_decision.decision = LPT_NONE; > - location_t locus = get_loop_location (loop); > - > - if (dump_enabled_p ()) > - dump_printf_loc (TDF_RTL, locus, > - ";; *** Considering loop %d at BB %d for " > - "complete peeling ***\n", > - loop->num, loop->header->index); > - > - loop->ninsns = num_loop_insns (loop); > - > - decide_peel_once_rolling (loop, flags); > - if (loop->lpt_decision.decision == LPT_NONE) > - decide_peel_completely (loop, flags); > - > - if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY) > - { > - report_unroll_peel (loop, locus); > - peel_loop_completely (loop); > - changed = true; > - } > - } > - > - if (changed) > - { > - calculate_dominance_info (CDI_DOMINATORS); > - fix_loop_structure (NULL); > - } > -} > - > -/* Decide whether unroll or peel loops (depending on FLAGS) and how much. */ > +/* Decide whether unroll loops and how much. */ > static void > -decide_unrolling_and_peeling (int flags) > +decide_unrolling (int flags) > { > struct loop *loop; > > @@ -389,7 +226,7 @@ decide_unrolling_and_peeling (int flags) > if (dump_enabled_p ()) > dump_printf_loc (TDF_RTL, locus, > ";; *** Considering loop %d at BB %d for " > - "unrolling and peeling ***\n", > + "unrolling ***\n", > loop->num, loop->header->index); > > /* Do not peel cold areas. */ > @@ -428,204 +265,77 @@ decide_unrolling_and_peeling (int flags) > decide_unroll_runtime_iterations (loop, flags); > if (loop->lpt_decision.decision == LPT_NONE) > decide_unroll_stupid (loop, flags); > - if (loop->lpt_decision.decision == LPT_NONE) > - decide_peel_simple (loop, flags); > - > - report_unroll_peel (loop, locus); > - } > -} > - > -/* Decide whether the LOOP is once rolling and suitable for complete > - peeling. */ > -static void > -decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED) > -{ > - struct niter_desc *desc; > - > - if (dump_file) > - fprintf (dump_file, "\n;; Considering peeling once rolling loop\n"); > - > - /* Is the loop small enough? */ > - if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns) > - { > - if (dump_file) > - fprintf (dump_file, ";; Not considering loop, is too big\n"); > - return; > - } > > - /* Check for simple loops. */ > - desc = get_simple_loop_desc (loop); > - > - /* Check number of iterations. */ > - if (!desc->simple_p > - || desc->assumptions > - || desc->infinite > - || !desc->const_iter > - || (desc->niter != 0 > - && get_max_loop_iterations_int (loop) != 0)) > - { > - if (dump_file) > - fprintf (dump_file, > - ";; Unable to prove that the loop rolls exactly once\n"); > - return; > + report_unroll (loop, locus); > } > - > - /* Success. */ > - loop->lpt_decision.decision = LPT_PEEL_COMPLETELY; > } > > -/* Decide whether the LOOP is suitable for complete peeling. */ > -static void > -decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED) > +/* Unroll LOOPS. */ > +void > +unroll_loops (int flags) > { > - unsigned npeel; > - struct niter_desc *desc; > - > - if (dump_file) > - fprintf (dump_file, "\n;; Considering peeling completely\n"); > - > - /* Skip non-innermost loops. */ > - if (loop->inner) > - { > - if (dump_file) > - fprintf (dump_file, ";; Not considering loop, is not innermost\n"); > - return; > - } > - > - /* Do not peel cold areas. */ > - if (optimize_loop_for_size_p (loop)) > - { > - if (dump_file) > - fprintf (dump_file, ";; Not considering loop, cold area\n"); > - return; > - } > - > - /* Can the loop be manipulated? */ > - if (!can_duplicate_loop_p (loop)) > - { > - if (dump_file) > - fprintf (dump_file, > - ";; Not considering loop, cannot duplicate\n"); > - return; > - } > - > - /* npeel = number of iterations to peel. */ > - npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns; > - if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES)) > - npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES); > - > - /* Is the loop small enough? */ > - if (!npeel) > - { > - if (dump_file) > - fprintf (dump_file, ";; Not considering loop, is too big\n"); > - return; > - } > - > - /* Check for simple loops. */ > - desc = get_simple_loop_desc (loop); > + struct loop *loop; > + bool changed = false; > > - /* Check number of iterations. */ > - if (!desc->simple_p > - || desc->assumptions > - || !desc->const_iter > - || desc->infinite) > - { > - if (dump_file) > - fprintf (dump_file, > - ";; Unable to prove that the loop iterates constant times\n"); > - return; > - } > + /* Now decide rest of unrolling. */ > + decide_unrolling (flags); > > - if (desc->niter > npeel - 1) > + /* Scan the loops, inner ones first. */ > + FOR_EACH_LOOP (loop, LI_FROM_INNERMOST) > { > - if (dump_file) > + /* And perform the appropriate transformations. */ > + switch (loop->lpt_decision.decision) > { > - fprintf (dump_file, > - ";; Not peeling loop completely, rolls too much ("); > - fprintf (dump_file, "%"PRId64, desc->niter); > - fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel); > + case LPT_UNROLL_CONSTANT: > + unroll_loop_constant_iterations (loop); > + changed = true; > + break; > + case LPT_UNROLL_RUNTIME: > + unroll_loop_runtime_iterations (loop); > + changed = true; > + break; > + case LPT_UNROLL_STUPID: > + unroll_loop_stupid (loop); > + changed = true; > + break; > + case LPT_NONE: > + break; > + default: > + gcc_unreachable (); > } > - return; > } > > - /* Success. */ > - loop->lpt_decision.decision = LPT_PEEL_COMPLETELY; > -} > - > -/* Peel all iterations of LOOP, remove exit edges and cancel the loop > - completely. The transformation done: > + if (changed) > + { > + calculate_dominance_info (CDI_DOMINATORS); > + fix_loop_structure (NULL); > + } > > - for (i = 0; i < 4; i++) > - body; > + iv_analysis_done (); > +} > > - ==> > +/* Check whether exit of the LOOP is at the end of loop body. */ > > - i = 0; > - body; i++; > - body; i++; > - body; i++; > - body; i++; > - */ > -static void > -peel_loop_completely (struct loop *loop) > +static bool > +loop_exit_at_end_p (struct loop *loop) > { > - sbitmap wont_exit; > - unsigned HOST_WIDE_INT npeel; > - unsigned i; > - edge ein; > struct niter_desc *desc = get_simple_loop_desc (loop); > - struct opt_info *opt_info = NULL; > - > - npeel = desc->niter; > - > - if (npeel) > - { > - bool ok; > - > - wont_exit = sbitmap_alloc (npeel + 1); > - bitmap_ones (wont_exit); > - bitmap_clear_bit (wont_exit, 0); > - if (desc->noloop_assumptions) > - bitmap_clear_bit (wont_exit, 1); > - > - auto_vec<edge> remove_edges; > - if (flag_split_ivs_in_unroller) > - opt_info = analyze_insns_in_loop (loop); > - > - opt_info_start_duplication (opt_info); > - ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), > - npeel, > - wont_exit, desc->out_edge, > - &remove_edges, > - DLTHE_FLAG_UPDATE_FREQ > - | DLTHE_FLAG_COMPLETTE_PEEL > - | (opt_info > - ? DLTHE_RECORD_COPY_NUMBER : 0)); > - gcc_assert (ok); > + rtx_insn *insn; > > - free (wont_exit); > + /* We should never have conditional in latch block. */ > + gcc_assert (desc->in_edge->dest != loop->header); > > - if (opt_info) > - { > - apply_opt_in_copies (opt_info, npeel, false, true); > - free_opt_info (opt_info); > - } > + if (desc->in_edge->dest != loop->latch) > + return false; > > - /* Remove the exit edges. */ > - FOR_EACH_VEC_ELT (remove_edges, i, ein) > - remove_path (ein); > + /* Check that the latch is empty. */ > + FOR_BB_INSNS (loop->latch, insn) > + { > + if (INSN_P (insn) && active_insn_p (insn)) > + return false; > } > > - ein = desc->in_edge; > - free_simple_loop_desc (loop); > - > - /* Now remove the unreachable part of the last iteration and cancel > - the loop. */ > - remove_path (ein); > - > - if (dump_file) > - fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) > npeel); > + return true; > } > > /* Decide whether to unroll LOOP iterating constant number of times > @@ -1372,160 +1082,6 @@ unroll_loop_runtime_iterations (struct l > max_unroll, num_loop_insns (loop)); > } > > -/* Decide whether to simply peel LOOP and how much. */ > -static void > -decide_peel_simple (struct loop *loop, int flags) > -{ > - unsigned npeel; > - widest_int iterations; > - > - if (!(flags & UAP_PEEL)) > - { > - /* We were not asked to, just return back silently. */ > - return; > - } > - > - if (dump_file) > - fprintf (dump_file, "\n;; Considering simply peeling loop\n"); > - > - /* npeel = number of iterations to peel. */ > - npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns; > - if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES)) > - npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES); > - > - /* Skip big loops. */ > - if (!npeel) > - { > - if (dump_file) > - fprintf (dump_file, ";; Not considering loop, is too big\n"); > - return; > - } > - > - /* Do not simply peel loops with branches inside -- it increases number > - of mispredicts. > - Exception is when we do have profile and we however have good chance > - to peel proper number of iterations loop will iterate in practice. > - TODO: this heuristic needs tunning; while for complette unrolling > - the branch inside loop mostly eliminates any improvements, for > - peeling it is not the case. Also a function call inside loop is > - also branch from branch prediction POV (and probably better reason > - to not unroll/peel). */ > - if (num_loop_branches (loop) > 1 > - && profile_status_for_fn (cfun) != PROFILE_READ) > - { > - if (dump_file) > - fprintf (dump_file, ";; Not peeling, contains branches\n"); > - return; > - } > - > - /* If we have realistic estimate on number of iterations, use it. */ > - if (get_estimated_loop_iterations (loop, &iterations)) > - { > - if (wi::leu_p (npeel, iterations)) > - { > - if (dump_file) > - { > - fprintf (dump_file, ";; Not peeling loop, rolls too much ("); > - fprintf (dump_file, "%"PRId64, > - (int64_t) (iterations.to_shwi () + 1)); > - fprintf (dump_file, " iterations > %d [maximum peelings])\n", > - npeel); > - } > - return; > - } > - npeel = iterations.to_shwi () + 1; > - } > - /* If we have small enough bound on iterations, we can still peel > (completely > - unroll). */ > - else if (get_max_loop_iterations (loop, &iterations) > - && wi::ltu_p (iterations, npeel)) > - npeel = iterations.to_shwi () + 1; > - else > - { > - /* For now we have no good heuristics to decide whether loop peeling > - will be effective, so disable it. */ > - if (dump_file) > - fprintf (dump_file, > - ";; Not peeling loop, no evidence it will be profitable\n"); > - return; > - } > - > - /* Success. */ > - loop->lpt_decision.decision = LPT_PEEL_SIMPLE; > - loop->lpt_decision.times = npeel; > -} > - > -/* Peel a LOOP LOOP->LPT_DECISION.TIMES times. The transformation does this: > - > - while (cond) > - body; > - > - ==> (LOOP->LPT_DECISION.TIMES == 3) > - > - if (!cond) goto end; > - body; > - if (!cond) goto end; > - body; > - if (!cond) goto end; > - body; > - while (cond) > - body; > - end: ; > - */ > -static void > -peel_loop_simple (struct loop *loop) > -{ > - sbitmap wont_exit; > - unsigned npeel = loop->lpt_decision.times; > - struct niter_desc *desc = get_simple_loop_desc (loop); > - struct opt_info *opt_info = NULL; > - bool ok; > - > - if (flag_split_ivs_in_unroller && npeel > 1) > - opt_info = analyze_insns_in_loop (loop); > - > - wont_exit = sbitmap_alloc (npeel + 1); > - bitmap_clear (wont_exit); > - > - opt_info_start_duplication (opt_info); > - > - ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), > - npeel, wont_exit, NULL, > - NULL, DLTHE_FLAG_UPDATE_FREQ > - | (opt_info > - ? DLTHE_RECORD_COPY_NUMBER > - : 0)); > - gcc_assert (ok); > - > - free (wont_exit); > - > - if (opt_info) > - { > - apply_opt_in_copies (opt_info, npeel, false, false); > - free_opt_info (opt_info); > - } > - > - if (desc->simple_p) > - { > - if (desc->const_iter) > - { > - desc->niter -= npeel; > - desc->niter_expr = GEN_INT (desc->niter); > - desc->noloop_assumptions = NULL_RTX; > - } > - else > - { > - /* We cannot just update niter_expr, as its value might be clobbered > - inside loop. We could handle this by counting the number into > - temporary just like we do in runtime unrolling, but it does not > - seem worthwhile. */ > - free_simple_loop_desc (loop); > - } > - } > - if (dump_file) > - fprintf (dump_file, ";; Peeling loop %d times\n", npeel); > -} > - > /* Decide whether to unroll LOOP stupidly and how much. */ > static void > decide_unroll_stupid (struct loop *loop, int flags) > Index: testsuite/gcc.dg/unroll_3.c > =================================================================== > --- testsuite/gcc.dg/unroll_3.c (revision 216145) > +++ testsuite/gcc.dg/unroll_3.c (working copy) > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fno-tree-vrp > -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo" > } */ > +/* { dg-options "-O2 -fdump-tree-cunrolli-details -fno-peel-loops > -fno-tree-vrp -fdisable-tree-cunroll -fenable-tree-cunrolli=foo > -fdisable-tree-cunrolli=foo2" } */ > > unsigned a[100], b[100]; > inline void bar() > @@ -28,5 +28,5 @@ int foo2(void) > return 1; > } > > -/* { dg-final { scan-rtl-dump-times "loop turned into non-loop; it never > loops" 1 "loop2_unroll" } } */ > -/* { dg-final { cleanup-rtl-dump "loop2_unroll" } } */ > +/* { dg-final { scan-tree-dump-times "loop with 3 iterations completely > unrolled" 1 "cunrolli" } } */ > +/* { dg-final { cleanup-tree-dump "cunrolli" } } */ > Index: testsuite/gcc.dg/tree-prof/peel-1.c > =================================================================== > --- testsuite/gcc.dg/tree-prof/peel-1.c (revision 216145) > +++ testsuite/gcc.dg/tree-prof/peel-1.c (working copy) > @@ -1,4 +1,4 @@ > -/* { dg-options "-O3 -fdump-rtl-loop2_unroll -fno-unroll-loops -fpeel-loops" > } */ > +/* { dg-options "-O3 -fdump-tree-cunroll-details -fno-unroll-loops > -fpeel-loops" } */ > void abort(); > > int a[1000]; > Index: testsuite/gcc.dg/unroll_4.c > =================================================================== > --- testsuite/gcc.dg/unroll_4.c (revision 216145) > +++ testsuite/gcc.dg/unroll_4.c (working copy) > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fno-tree-vrp > -fdisable-tree-cunroll -fdisable-tree-cunrolli > -fenable-rtl-loop2_unroll=foo2" } */ > +/* { dg-options "-O2 -fdump-tree-cunrolli-details -fno-peel-loops > -fno-tree-vrp -fdisable-tree-cunroll -fenable-tree-cunrolli=foo2 > -fdisable-tree-cunrolli=foo" } */ > > unsigned a[100], b[100]; > inline void bar() > @@ -28,5 +28,5 @@ int foo2(void) > return 1; > } > > -/* { dg-final { scan-rtl-dump-times "loop turned into non-loop; it never > loops" 1 "loop2_unroll" } } */ > -/* { dg-final { cleanup-rtl-dump "loop2_unroll" } } */ > +/* { dg-final { scan-tree-dump-times "loop with 3 iterations completely > unrolled" 1 "cunrolli" } } */ > +/* { dg-final { cleanup-tree-dump "cunrolli" } } */ > Index: testsuite/gcc.dg/unroll_1.c > =================================================================== > --- testsuite/gcc.dg/unroll_1.c (revision 216145) > +++ testsuite/gcc.dg/unroll_1.c (working copy) > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-O2 -fdump-rtl-loop2_unroll=stderr -fno-peel-loops > -fno-tree-vrp -fdisable-tree-cunroll -fdisable-tree-cunrolli > -fenable-rtl-loop2 -fenable-rtl-loop2_unroll" } */ > +/* { dg-options "-O2 -fdump-tree-cunrolli-details=stderr -fno-peel-loops > -fno-tree-vrp -fdisable-tree-cunroll -fenable-tree-cunrolli" } */ > > unsigned a[100], b[100]; > inline void bar() > @@ -11,7 +11,7 @@ int foo(void) > { > int i; > bar(); > - for (i = 0; i < 2; i++) /* { dg-message "note: loop turned into non-loop; > it never loops" } */ > + for (i = 0; i < 2; i++) /* { dg-message "note: loop with 3 iterations > completely unrolled" } */ > { > a[i]= b[i] + 1; > } > @@ -21,7 +21,7 @@ int foo(void) > int foo2(void) > { > int i; > - for (i = 0; i < 2; i++) /* { dg-message "note: loop turned into non-loop; > it never loops" } */ > + for (i = 0; i < 2; i++) /* { dg-message "note: loop with 3 iterations > completely unrolled" } */ > { > a[i]= b[i] + 1; > } > Index: testsuite/gcc.dg/unroll_2.c > =================================================================== > --- testsuite/gcc.dg/unroll_2.c (revision 216145) > +++ testsuite/gcc.dg/unroll_2.c (working copy) > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fno-tree-vrp > -fdisable-tree-cunroll=foo -fdisable-tree-cunrolli=foo > -fenable-rtl-loop2_unroll" } */ > +/* { dg-options "-O2 -fdump-tree-cunrolli-details -fno-peel-loops > -fno-tree-vrp -fdisable-tree-cunrolli=foo -fenable-tree-cunrolli=foo" } */ > > unsigned a[100], b[100]; > inline void bar() > @@ -28,5 +28,5 @@ int foo2(void) > return 1; > } > > -/* { dg-final { scan-rtl-dump-times "loop turned into non-loop; it never > loops" 1 "loop2_unroll" } } */ > -/* { dg-final { cleanup-rtl-dump "loop2_unroll" } } */ > +/* { dg-final { scan-tree-dump-times "loop with 3 iterations completely > unrolled" 1 "cunrolli" } } */ > +/* { dg-final { cleanup-tree-dump "cunrolli" } } */ > > -- Richard Biener <rguent...@suse.de> SUSE / SUSE Labs SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746 GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer