Re: Move loop peeling from RTL to gimple

Richard Biener Tue, 14 Oct 2014 00:41:22 -0700

On Tue, 14 Oct 2014, Jan Hubicka wrote:

> Hi,
> this is update of my 2013 update to 2012 patch to move rtl loop peeling
> to tree level. This is to expose optimization oppurtunities earlier.
> Incrementally I think I can also improve profiling to provide a histogram
> on loop iterations and get more sensible peeling decisions.
> 
> profiled-bootstrapped/regtested x86_64-linux, OK?


Ok.

Thanks,
Richard.

> Honza
> 
>       * loop-unroll.c: (decide_unrolling_and_peeling): Rename to
>       (decide_unrolling): ... this one.
>       (peel_loops_completely): Remove.
>       (decide_peel_simple): Remove.
>       (decide_peel_once_rolling): Remove.
>       (decide_peel_completely): Remove.
>       (peel_loop_simple): Remove.
>       (peel_loop_completely): Remove.
>       (unroll_and_peel_loops): Rename to ...
>       (unroll_loops): ... this one; handle only unrolling.
>       * cfgloop.h (lpt_dec): Remove LPT_PEEL_COMPLETELY and
>       LPT_PEEL_SIMPLE.
>       (UAP_PEEL): Remove.
>       (unroll_and_peel_loops): Remove.
>       (unroll_loops): New.
>       * passes.def: Replace
>       pass_rtl_unroll_and_peel_loops by pass_rtl_unroll_loops.
>       * loop-init.c (gate_rtl_unroll_and_peel_loops,
>       rtl_unroll_and_peel_loops): Rename to ...
>       (gate_rtl_unroll_loops, rtl_unroll_loops): ... these; update.
>       (pass_rtl_unroll_and_peel_loops): Rename to ...
>       (pass_rtl_unroll_loops): ... this one.
>       * tree-pass.h (make_pass_rtl_unroll_and_peel_loops): Remove.
>       (make_pass_rtl_unroll_loops): New.
>       * tree-ssa-loop-ivcanon.c: (estimated_peeled_sequence_size, 
> try_peel_loop): New.
>       (canonicalize_loop_induction_variables): Update.
> 
>       * gcc.dg/tree-prof/peel-1.c: Update.
>       * gcc.dg/tree-prof/unroll-1.c: Update.
>       * gcc.dg/gcc.dg/unroll_1.c: Update.
>       * gcc.dg/gcc.dg/unroll_2.c: Update.
>       * gcc.dg/gcc.dg/unroll_3.c: Update.
>       * gcc.dg/gcc.dg/unroll_4.c: Update.
> Index: tree-pass.h
> ===================================================================
> --- tree-pass.h       (revision 216145)
> +++ tree-pass.h       (working copy)
> @@ -504,7 +504,7 @@ extern rtl_opt_pass *make_pass_outof_cfg
>  extern rtl_opt_pass *make_pass_loop2 (gcc::context *ctxt);
>  extern rtl_opt_pass *make_pass_rtl_loop_init (gcc::context *ctxt);
>  extern rtl_opt_pass *make_pass_rtl_move_loop_invariants (gcc::context *ctxt);
> -extern rtl_opt_pass *make_pass_rtl_unroll_and_peel_loops (gcc::context 
> *ctxt);
> +extern rtl_opt_pass *make_pass_rtl_unroll_loops (gcc::context *ctxt);
>  extern rtl_opt_pass *make_pass_rtl_doloop (gcc::context *ctxt);
>  extern rtl_opt_pass *make_pass_rtl_loop_done (gcc::context *ctxt);
>  
> Index: tree-ssa-loop-ivcanon.c
> ===================================================================
> --- tree-ssa-loop-ivcanon.c   (revision 216145)
> +++ tree-ssa-loop-ivcanon.c   (working copy)
> @@ -28,9 +28,12 @@ along with GCC; see the file COPYING3.
>     variables.  In that case the created optimization possibilities are likely
>     to pay up.
>  
> -   Additionally in case we detect that it is beneficial to unroll the
> -   loop completely, we do it right here to expose the optimization
> -   possibilities to the following passes.  */
> +   We also perform
> +     - complette unrolling (or peeling) when the loops is rolling few enough
> +       times
> +     - simple peeling (i.e. copying few initial iterations prior the loop)
> +       when number of iteration estimate is known (typically by the profile
> +       info).  */
>  
>  #include "config.h"
>  #include "system.h"
> @@ -657,11 +660,12 @@ try_unroll_loop_completely (struct loop
>                           HOST_WIDE_INT maxiter,
>                           location_t locus)
>  {
> -  unsigned HOST_WIDE_INT n_unroll, ninsns, max_unroll, unr_insns;
> +  unsigned HOST_WIDE_INT n_unroll = 0, ninsns, max_unroll, unr_insns;
>    gimple cond;
>    struct loop_size size;
>    bool n_unroll_found = false;
>    edge edge_to_cancel = NULL;
> +  int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
>  
>    /* See if we proved number of iterations to be low constant.
>  
> @@ -821,6 +825,8 @@ try_unroll_loop_completely (struct loop
>                    loop->num);
>         return false;
>       }
> +      dump_printf_loc (report_flags, locus,
> +                       "loop turned into non-loop; it never loops.\n");
>  
>        initialize_original_copy_tables ();
>        wont_exit = sbitmap_alloc (n_unroll + 1);
> @@ -902,6 +908,133 @@ try_unroll_loop_completely (struct loop
>    return true;
>  }
>  
> +/* Return number of instructions after peeling.  */
> +static unsigned HOST_WIDE_INT
> +estimated_peeled_sequence_size (struct loop_size *size,
> +                             unsigned HOST_WIDE_INT npeel)
> +{
> +  return MAX (npeel * (HOST_WIDE_INT) (size->overall
> +                                    - size->eliminated_by_peeling), 1);
> +}
> +
> +/* If the loop is expected to iterate N times and is
> +   small enough, duplicate the loop body N+1 times before
> +   the loop itself.  This way the hot path will never
> +   enter the loop.  
> +   Parameters are the same as for try_unroll_loops_completely */
> +
> +static bool
> +try_peel_loop (struct loop *loop,
> +            edge exit, tree niter,
> +            HOST_WIDE_INT maxiter)
> +{
> +  int npeel;
> +  struct loop_size size;
> +  int peeled_size;
> +  sbitmap wont_exit;
> +  unsigned i;
> +  vec<edge> to_remove = vNULL;
> +  edge e;
> +
> +  /* If the iteration bound is known and large, then we can safely eliminate
> +     the check in peeled copies.  */
> +  if (TREE_CODE (niter) != INTEGER_CST)
> +    exit = NULL;
> +
> +  if (!flag_peel_loops || PARAM_VALUE (PARAM_MAX_PEEL_TIMES) <= 0)
> +    return false;
> +
> +  /* Peel only innermost loops.  */
> +  if (loop->inner)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, "Not peeling: outer loop\n");
> +      return false;
> +    }
> +
> +  if (!optimize_loop_for_speed_p (loop))
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, "Not peeling: cold loop\n");
> +      return false;
> +    }
> +
> +  /* Check if there is an estimate on the number of iterations.  */
> +  npeel = estimated_loop_iterations_int (loop);
> +  if (npeel < 0)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, "Not peeling: number of iterations is not "
> +              "estimated\n");
> +      return false;
> +    }
> +  if (maxiter >= 0 && maxiter <= npeel)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, "Not peeling: upper bound is known so can "
> +              "unroll complettely\n");
> +      return false;
> +    }
> +
> +  /* We want to peel estimated number of iterations + 1 (so we never
> +     enter the loop on quick path).  Check against PARAM_MAX_PEEL_TIMES
> +     and be sure to avoid overflows.  */
> +  if (npeel > PARAM_VALUE (PARAM_MAX_PEEL_TIMES) - 1)
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, "Not peeling: rolls too much "
> +              "(%i + 1 > --param max-peel-times)\n", npeel);
> +      return false;
> +    }
> +  npeel++;
> +
> +  /* Check peeled loops size.  */
> +  tree_estimate_loop_size (loop, exit, NULL, &size,
> +                        PARAM_VALUE (PARAM_MAX_PEELED_INSNS));
> +  if ((peeled_size = estimated_peeled_sequence_size (&size, npeel))
> +      > PARAM_VALUE (PARAM_MAX_PEELED_INSNS))
> +    {
> +      if (dump_file)
> +        fprintf (dump_file, "Not peeling: peeled sequence size is too large "
> +              "(%i insns > --param max-peel-insns)", peeled_size);
> +      return false;
> +    }
> +
> +  /* Duplicate possibly eliminating the exits.  */
> +  initialize_original_copy_tables ();
> +  wont_exit = sbitmap_alloc (npeel + 1);
> +  bitmap_ones (wont_exit);
> +  bitmap_clear_bit (wont_exit, 0);
> +  if (!gimple_duplicate_loop_to_header_edge (loop, loop_preheader_edge 
> (loop),
> +                                          npeel, wont_exit,
> +                                          exit, &to_remove,
> +                                          DLTHE_FLAG_UPDATE_FREQ
> +                                          | DLTHE_FLAG_COMPLETTE_PEEL))
> +    {
> +      free_original_copy_tables ();
> +      free (wont_exit);
> +      return false;
> +    }
> +  FOR_EACH_VEC_ELT (to_remove, i, e)
> +    {
> +      bool ok = remove_path (e);
> +      gcc_assert (ok);
> +    }
> +  free (wont_exit);
> +  free_original_copy_tables ();
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    {
> +      fprintf (dump_file, "Peeled loop %d, %i times.\n",
> +            loop->num, npeel);
> +    }
> +  if (loop->any_upper_bound)
> +    loop->nb_iterations_upper_bound -= npeel;
> +  loop->nb_iterations_estimate = 0;
> +  /* Make sure to mark loop cold so we do not try to peel it more.  */
> +  scale_loop_profile (loop, 1, 0);
> +  loop->header->count = 0;
> +  return true;
> +}
>  /* Adds a canonical induction variable to LOOP if suitable.
>     CREATE_IV is true if we may create a new iv.  UL determines
>     which loops we are allowed to completely unroll.  If TRY_EVAL is true, we 
> try
> @@ -981,6 +1114,9 @@ canonicalize_loop_induction_variables (s
>        && exit && just_once_each_iteration_p (loop, exit->src))
>      create_canonical_iv (loop, exit, niter);
>  
> +  if (ul == UL_ALL)
> +    modified |= try_peel_loop (loop, exit, niter, maxiter);
> +
>    return modified;
>  }
>  
> Index: loop-init.c
> ===================================================================
> --- loop-init.c       (revision 216145)
> +++ loop-init.c       (working copy)
> @@ -357,7 +357,6 @@ pass_loop2::gate (function *fun)
>    if (optimize > 0
>        && (flag_move_loop_invariants
>         || flag_unswitch_loops
> -       || flag_peel_loops
>         || flag_unroll_loops
>  #ifdef HAVE_doloop_end
>         || (flag_branch_on_count_reg && HAVE_doloop_end)
> @@ -537,7 +536,7 @@ make_pass_rtl_move_loop_invariants (gcc:
>  
>  namespace {
>  
> -const pass_data pass_data_rtl_unroll_and_peel_loops =
> +const pass_data pass_data_rtl_unroll_loops =
>  {
>    RTL_PASS, /* type */
>    "loop2_unroll", /* name */
> @@ -550,11 +549,11 @@ const pass_data pass_data_rtl_unroll_and
>    0, /* todo_flags_finish */
>  };
>  
> -class pass_rtl_unroll_and_peel_loops : public rtl_opt_pass
> +class pass_rtl_unroll_loops : public rtl_opt_pass
>  {
>  public:
> -  pass_rtl_unroll_and_peel_loops (gcc::context *ctxt)
> -    : rtl_opt_pass (pass_data_rtl_unroll_and_peel_loops, ctxt)
> +  pass_rtl_unroll_loops (gcc::context *ctxt)
> +    : rtl_opt_pass (pass_data_rtl_unroll_loops, ctxt)
>    {}
>  
>    /* opt_pass methods: */
> @@ -565,10 +564,10 @@ public:
>  
>    virtual unsigned int execute (function *);
>  
> -}; // class pass_rtl_unroll_and_peel_loops
> +}; // class pass_rtl_unroll_loops
>  
>  unsigned int
> -pass_rtl_unroll_and_peel_loops::execute (function *fun)
> +pass_rtl_unroll_loops::execute (function *fun)
>  {
>    if (number_of_loops (fun) > 1)
>      {
> @@ -576,14 +575,12 @@ pass_rtl_unroll_and_peel_loops::execute
>        if (dump_file)
>       df_dump (dump_file);
>  
> -      if (flag_peel_loops)
> -     flags |= UAP_PEEL;
>        if (flag_unroll_loops)
>       flags |= UAP_UNROLL;
>        if (flag_unroll_all_loops)
>       flags |= UAP_UNROLL_ALL;
>  
> -      unroll_and_peel_loops (flags);
> +      unroll_loops (flags);
>      }
>    return 0;
>  }
> @@ -591,9 +588,9 @@ pass_rtl_unroll_and_peel_loops::execute
>  } // anon namespace
>  
>  rtl_opt_pass *
> -make_pass_rtl_unroll_and_peel_loops (gcc::context *ctxt)
> +make_pass_rtl_unroll_loops (gcc::context *ctxt)
>  {
> -  return new pass_rtl_unroll_and_peel_loops (ctxt);
> +  return new pass_rtl_unroll_loops (ctxt);
>  }
>  
>  
> Index: cfgloop.h
> ===================================================================
> --- cfgloop.h (revision 216145)
> +++ cfgloop.h (working copy)
> @@ -30,8 +30,6 @@ along with GCC; see the file COPYING3.
>  enum lpt_dec
>  {
>    LPT_NONE,
> -  LPT_PEEL_COMPLETELY,
> -  LPT_PEEL_SIMPLE,
>    LPT_UNROLL_CONSTANT,
>    LPT_UNROLL_RUNTIME,
>    LPT_UNROLL_STUPID
> @@ -731,12 +729,11 @@ extern void loop_optimizer_finalize (voi
>  /* Optimization passes.  */
>  enum
>  {
> -  UAP_PEEL = 1,              /* Enables loop peeling.  */
> -  UAP_UNROLL = 2,    /* Enables unrolling of loops if it seems profitable.  
> */
> -  UAP_UNROLL_ALL = 4 /* Enables unrolling of all loops.  */
> +  UAP_UNROLL = 1,    /* Enables unrolling of loops if it seems profitable.  
> */
> +  UAP_UNROLL_ALL = 2 /* Enables unrolling of all loops.  */
>  };
>  
> -extern void unroll_and_peel_loops (int);
> +extern void unroll_loops (int);
>  extern void doloop_optimize_loops (void);
>  extern void move_loop_invariants (void);
>  extern void scale_loop_profile (struct loop *loop, int scale, gcov_type 
> iteration_bound);
> Index: passes.def
> ===================================================================
> --- passes.def        (revision 216145)
> +++ passes.def        (working copy)
> @@ -359,7 +359,7 @@ along with GCC; see the file COPYING3.
>        PUSH_INSERT_PASSES_WITHIN (pass_loop2)
>         NEXT_PASS (pass_rtl_loop_init);
>         NEXT_PASS (pass_rtl_move_loop_invariants);
> -       NEXT_PASS (pass_rtl_unroll_and_peel_loops);
> +       NEXT_PASS (pass_rtl_unroll_loops);
>         NEXT_PASS (pass_rtl_doloop);
>         NEXT_PASS (pass_rtl_loop_done);
>         TERMINATE_PASS_LIST ()
> Index: loop-unroll.c
> ===================================================================
> --- loop-unroll.c     (revision 216145)
> +++ loop-unroll.c     (working copy)
> @@ -1,4 +1,4 @@
> -/* Loop unrolling and peeling.
> +/* Loop unrolling.
>     Copyright (C) 2002-2014 Free Software Foundation, Inc.
>  
>  This file is part of GCC.
> @@ -34,8 +34,8 @@ along with GCC; see the file COPYING3.
>  #include "target.h"
>  #include "dumpfile.h"
>  
> -/* This pass performs loop unrolling and peeling.  We only perform these
> -   optimizations on innermost loops (with single exception) because
> +/* This pass performs loop unrolling.  We only perform this
> +   optimization on innermost loops (with single exception) because
>     the impact on performance is greatest here, and we want to avoid
>     unnecessary code size growth.  The gain is caused by greater sequentiality
>     of code, better code to optimize for further passes and in some cases
> @@ -44,12 +44,6 @@ along with GCC; see the file COPYING3.
>  
>     What we do:
>  
> -   -- complete peeling of once-rolling loops; this is the above mentioned
> -      exception, as this causes loop to be cancelled completely and
> -      does not cause code growth
> -   -- complete peeling of loops that roll (small) constant times.
> -   -- simple peeling of first iterations of loops that do not roll much
> -      (according to profile feedback)
>     -- unrolling of loops that roll constant times; this is almost always
>        win, as we get rid of exit condition tests.
>     -- unrolling of loops that roll number of times that we can compute
> @@ -62,7 +56,7 @@ along with GCC; see the file COPYING3.
>     appropriate function below.
>  
>     There is a lot of parameters (defined and described in params.def) that
> -   control how much we unroll/peel.
> +   control how much we unroll.
>  
>     ??? A great problem is that we don't have a good way how to determine
>     how many times we should unroll the loop; the experiments I have made
> @@ -170,17 +164,11 @@ struct opt_info
>    basic_block loop_preheader;      /* The loop preheader basic block.  */
>  };
>  
> -static void decide_unrolling_and_peeling (int);
> -static void peel_loops_completely (int);
> -static void decide_peel_simple (struct loop *, int);
> -static void decide_peel_once_rolling (struct loop *, int);
> -static void decide_peel_completely (struct loop *, int);
>  static void decide_unroll_stupid (struct loop *, int);
>  static void decide_unroll_constant_iterations (struct loop *, int);
>  static void decide_unroll_runtime_iterations (struct loop *, int);
> -static void peel_loop_simple (struct loop *);
> -static void peel_loop_completely (struct loop *);
>  static void unroll_loop_stupid (struct loop *);
> +static void decide_unrolling (int);
>  static void unroll_loop_constant_iterations (struct loop *);
>  static void unroll_loop_runtime_iterations (struct loop *);
>  static struct opt_info *analyze_insns_in_loop (struct loop *);
> @@ -197,15 +185,13 @@ static void combine_var_copies_in_loop_e
>                                            basic_block);
>  static rtx get_expansion (struct var_to_expand *);
>  
> -/* Emit a message summarizing the unroll or peel that will be
> +/* Emit a message summarizing the unroll that will be
>     performed for LOOP, along with the loop's location LOCUS, if
>     appropriate given the dump or -fopt-info settings.  */
>  
>  static void
> -report_unroll_peel (struct loop *loop, location_t locus)
> +report_unroll (struct loop *loop, location_t locus)
>  {
> -  struct niter_desc *desc;
> -  int niters = 0;
>    int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
>  
>    if (loop->lpt_decision.decision == LPT_NONE)
> @@ -214,169 +200,20 @@ report_unroll_peel (struct loop *loop, l
>    if (!dump_enabled_p ())
>      return;
>  
> -  /* In the special case where the loop never iterated, emit
> -     a different message so that we don't report an unroll by 0.
> -     This matches the equivalent message emitted during tree unrolling.  */
> -  if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
> -      && !loop->lpt_decision.times)
> -    {
> -      dump_printf_loc (report_flags, locus,
> -                       "loop turned into non-loop; it never loops.\n");
> -      return;
> -    }
> -
> -  desc = get_simple_loop_desc (loop);
> -
> -  if (desc->const_iter)
> -    niters = desc->niter;
> -  else if (loop->header->count)
> -    niters = expected_loop_iterations (loop);
> -
> -  if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
> -    dump_printf_loc (report_flags, locus,
> -                     "loop with %d iterations completely unrolled",
> -                  loop->lpt_decision.times + 1);
> -  else
> -    dump_printf_loc (report_flags, locus,
> -                     "loop %s %d times",
> -                     (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
> -                       ? "peeled" : "unrolled"),
> -                     loop->lpt_decision.times);
> +  dump_printf_loc (report_flags, locus,
> +                   "loop unrolled %d times",
> +                   loop->lpt_decision.times);
>    if (profile_info)
>      dump_printf (report_flags,
> -                 " (header execution count %d",
> +                 " (header execution count %d)",
>                   (int)loop->header->count);
> -  if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
> -    dump_printf (report_flags,
> -                 "%s%s iterations %d)",
> -                 profile_info ? ", " : " (",
> -                 desc->const_iter ? "const" : "average",
> -                 niters);
> -  else if (profile_info)
> -    dump_printf (report_flags, ")");
>  
>    dump_printf (report_flags, "\n");
>  }
>  
> -/* Unroll and/or peel (depending on FLAGS) LOOPS.  */
> -void
> -unroll_and_peel_loops (int flags)
> -{
> -  struct loop *loop;
> -  bool changed = false;
> -
> -  /* First perform complete loop peeling (it is almost surely a win,
> -     and affects parameters for further decision a lot).  */
> -  peel_loops_completely (flags);
> -
> -  /* Now decide rest of unrolling and peeling.  */
> -  decide_unrolling_and_peeling (flags);
> -
> -  /* Scan the loops, inner ones first.  */
> -  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
> -    {
> -      /* And perform the appropriate transformations.  */
> -      switch (loop->lpt_decision.decision)
> -     {
> -     case LPT_PEEL_COMPLETELY:
> -       /* Already done.  */
> -       gcc_unreachable ();
> -     case LPT_PEEL_SIMPLE:
> -       peel_loop_simple (loop);
> -       changed = true;
> -       break;
> -     case LPT_UNROLL_CONSTANT:
> -       unroll_loop_constant_iterations (loop);
> -       changed = true;
> -       break;
> -     case LPT_UNROLL_RUNTIME:
> -       unroll_loop_runtime_iterations (loop);
> -       changed = true;
> -       break;
> -     case LPT_UNROLL_STUPID:
> -       unroll_loop_stupid (loop);
> -       changed = true;
> -       break;
> -     case LPT_NONE:
> -       break;
> -     default:
> -       gcc_unreachable ();
> -     }
> -    }
> -
> -    if (changed)
> -      {
> -     calculate_dominance_info (CDI_DOMINATORS);
> -     fix_loop_structure (NULL);
> -      }
> -
> -  iv_analysis_done ();
> -}
> -
> -/* Check whether exit of the LOOP is at the end of loop body.  */
> -
> -static bool
> -loop_exit_at_end_p (struct loop *loop)
> -{
> -  struct niter_desc *desc = get_simple_loop_desc (loop);
> -  rtx_insn *insn;
> -
> -  if (desc->in_edge->dest != loop->latch)
> -    return false;
> -
> -  /* Check that the latch is empty.  */
> -  FOR_BB_INSNS (loop->latch, insn)
> -    {
> -      if (NONDEBUG_INSN_P (insn))
> -     return false;
> -    }
> -
> -  return true;
> -}
> -
> -/* Depending on FLAGS, check whether to peel loops completely and do so.  */
> -static void
> -peel_loops_completely (int flags)
> -{
> -  struct loop *loop;
> -  bool changed = false;
> -
> -  /* Scan the loops, the inner ones first.  */
> -  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
> -    {
> -      loop->lpt_decision.decision = LPT_NONE;
> -      location_t locus = get_loop_location (loop);
> -
> -      if (dump_enabled_p ())
> -     dump_printf_loc (TDF_RTL, locus,
> -                         ";; *** Considering loop %d at BB %d for "
> -                         "complete peeling ***\n",
> -                         loop->num, loop->header->index);
> -
> -      loop->ninsns = num_loop_insns (loop);
> -
> -      decide_peel_once_rolling (loop, flags);
> -      if (loop->lpt_decision.decision == LPT_NONE)
> -     decide_peel_completely (loop, flags);
> -
> -      if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
> -     {
> -       report_unroll_peel (loop, locus);
> -       peel_loop_completely (loop);
> -       changed = true;
> -     }
> -    }
> -
> -    if (changed)
> -      {
> -     calculate_dominance_info (CDI_DOMINATORS);
> -     fix_loop_structure (NULL);
> -      }
> -}
> -
> -/* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
> +/* Decide whether unroll loops and how much.  */
>  static void
> -decide_unrolling_and_peeling (int flags)
> +decide_unrolling (int flags)
>  {
>    struct loop *loop;
>  
> @@ -389,7 +226,7 @@ decide_unrolling_and_peeling (int flags)
>        if (dump_enabled_p ())
>       dump_printf_loc (TDF_RTL, locus,
>                           ";; *** Considering loop %d at BB %d for "
> -                         "unrolling and peeling ***\n",
> +                         "unrolling ***\n",
>                           loop->num, loop->header->index);
>  
>        /* Do not peel cold areas.  */
> @@ -428,204 +265,77 @@ decide_unrolling_and_peeling (int flags)
>       decide_unroll_runtime_iterations (loop, flags);
>        if (loop->lpt_decision.decision == LPT_NONE)
>       decide_unroll_stupid (loop, flags);
> -      if (loop->lpt_decision.decision == LPT_NONE)
> -     decide_peel_simple (loop, flags);
> -
> -      report_unroll_peel (loop, locus);
> -    }
> -}
> -
> -/* Decide whether the LOOP is once rolling and suitable for complete
> -   peeling.  */
> -static void
> -decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
> -{
> -  struct niter_desc *desc;
> -
> -  if (dump_file)
> -    fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
> -
> -  /* Is the loop small enough?  */
> -  if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
> -    {
> -      if (dump_file)
> -     fprintf (dump_file, ";; Not considering loop, is too big\n");
> -      return;
> -    }
>  
> -  /* Check for simple loops.  */
> -  desc = get_simple_loop_desc (loop);
> -
> -  /* Check number of iterations.  */
> -  if (!desc->simple_p
> -      || desc->assumptions
> -      || desc->infinite
> -      || !desc->const_iter
> -      || (desc->niter != 0
> -       && get_max_loop_iterations_int (loop) != 0))
> -    {
> -      if (dump_file)
> -     fprintf (dump_file,
> -              ";; Unable to prove that the loop rolls exactly once\n");
> -      return;
> +      report_unroll (loop, locus);
>      }
> -
> -  /* Success.  */
> -  loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
>  }
>  
> -/* Decide whether the LOOP is suitable for complete peeling.  */
> -static void
> -decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
> +/* Unroll LOOPS.  */
> +void
> +unroll_loops (int flags)
>  {
> -  unsigned npeel;
> -  struct niter_desc *desc;
> -
> -  if (dump_file)
> -    fprintf (dump_file, "\n;; Considering peeling completely\n");
> -
> -  /* Skip non-innermost loops.  */
> -  if (loop->inner)
> -    {
> -      if (dump_file)
> -     fprintf (dump_file, ";; Not considering loop, is not innermost\n");
> -      return;
> -    }
> -
> -  /* Do not peel cold areas.  */
> -  if (optimize_loop_for_size_p (loop))
> -    {
> -      if (dump_file)
> -     fprintf (dump_file, ";; Not considering loop, cold area\n");
> -      return;
> -    }
> -
> -  /* Can the loop be manipulated?  */
> -  if (!can_duplicate_loop_p (loop))
> -    {
> -      if (dump_file)
> -     fprintf (dump_file,
> -              ";; Not considering loop, cannot duplicate\n");
> -      return;
> -    }
> -
> -  /* npeel = number of iterations to peel.  */
> -  npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
> -  if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
> -    npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
> -
> -  /* Is the loop small enough?  */
> -  if (!npeel)
> -    {
> -      if (dump_file)
> -     fprintf (dump_file, ";; Not considering loop, is too big\n");
> -      return;
> -    }
> -
> -  /* Check for simple loops.  */
> -  desc = get_simple_loop_desc (loop);
> +  struct loop *loop;
> +  bool changed = false;
>  
> -  /* Check number of iterations.  */
> -  if (!desc->simple_p
> -      || desc->assumptions
> -      || !desc->const_iter
> -      || desc->infinite)
> -    {
> -      if (dump_file)
> -     fprintf (dump_file,
> -              ";; Unable to prove that the loop iterates constant times\n");
> -      return;
> -    }
> +  /* Now decide rest of unrolling.  */
> +  decide_unrolling (flags);
>  
> -  if (desc->niter > npeel - 1)
> +  /* Scan the loops, inner ones first.  */
> +  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
>      {
> -      if (dump_file)
> +      /* And perform the appropriate transformations.  */
> +      switch (loop->lpt_decision.decision)
>       {
> -       fprintf (dump_file,
> -                ";; Not peeling loop completely, rolls too much (");
> -       fprintf (dump_file, "%"PRId64, desc->niter);
> -       fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
> +     case LPT_UNROLL_CONSTANT:
> +       unroll_loop_constant_iterations (loop);
> +       changed = true;
> +       break;
> +     case LPT_UNROLL_RUNTIME:
> +       unroll_loop_runtime_iterations (loop);
> +       changed = true;
> +       break;
> +     case LPT_UNROLL_STUPID:
> +       unroll_loop_stupid (loop);
> +       changed = true;
> +       break;
> +     case LPT_NONE:
> +       break;
> +     default:
> +       gcc_unreachable ();
>       }
> -      return;
>      }
>  
> -  /* Success.  */
> -  loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
> -}
> -
> -/* Peel all iterations of LOOP, remove exit edges and cancel the loop
> -   completely.  The transformation done:
> +    if (changed)
> +      {
> +     calculate_dominance_info (CDI_DOMINATORS);
> +     fix_loop_structure (NULL);
> +      }
>  
> -   for (i = 0; i < 4; i++)
> -     body;
> +  iv_analysis_done ();
> +}
>  
> -   ==>
> +/* Check whether exit of the LOOP is at the end of loop body.  */
>  
> -   i = 0;
> -   body; i++;
> -   body; i++;
> -   body; i++;
> -   body; i++;
> -   */
> -static void
> -peel_loop_completely (struct loop *loop)
> +static bool
> +loop_exit_at_end_p (struct loop *loop)
>  {
> -  sbitmap wont_exit;
> -  unsigned HOST_WIDE_INT npeel;
> -  unsigned i;
> -  edge ein;
>    struct niter_desc *desc = get_simple_loop_desc (loop);
> -  struct opt_info *opt_info = NULL;
> -
> -  npeel = desc->niter;
> -
> -  if (npeel)
> -    {
> -      bool ok;
> -
> -      wont_exit = sbitmap_alloc (npeel + 1);
> -      bitmap_ones (wont_exit);
> -      bitmap_clear_bit (wont_exit, 0);
> -      if (desc->noloop_assumptions)
> -     bitmap_clear_bit (wont_exit, 1);
> -
> -      auto_vec<edge> remove_edges;
> -      if (flag_split_ivs_in_unroller)
> -        opt_info = analyze_insns_in_loop (loop);
> -
> -      opt_info_start_duplication (opt_info);
> -      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
> -                                       npeel,
> -                                       wont_exit, desc->out_edge,
> -                                       &remove_edges,
> -                                       DLTHE_FLAG_UPDATE_FREQ
> -                                       | DLTHE_FLAG_COMPLETTE_PEEL
> -                                       | (opt_info
> -                                          ? DLTHE_RECORD_COPY_NUMBER : 0));
> -      gcc_assert (ok);
> +  rtx_insn *insn;
>  
> -      free (wont_exit);
> +  /* We should never have conditional in latch block.  */
> +  gcc_assert (desc->in_edge->dest != loop->header);
>  
> -      if (opt_info)
> -     {
> -       apply_opt_in_copies (opt_info, npeel, false, true);
> -       free_opt_info (opt_info);
> -     }
> +  if (desc->in_edge->dest != loop->latch)
> +    return false;
>  
> -      /* Remove the exit edges.  */
> -      FOR_EACH_VEC_ELT (remove_edges, i, ein)
> -     remove_path (ein);
> +  /* Check that the latch is empty.  */
> +  FOR_BB_INSNS (loop->latch, insn)
> +    {
> +      if (INSN_P (insn) && active_insn_p (insn))
> +     return false;
>      }
>  
> -  ein = desc->in_edge;
> -  free_simple_loop_desc (loop);
> -
> -  /* Now remove the unreachable part of the last iteration and cancel
> -     the loop.  */
> -  remove_path (ein);
> -
> -  if (dump_file)
> -    fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) 
> npeel);
> +  return true;
>  }
>  
>  /* Decide whether to unroll LOOP iterating constant number of times
> @@ -1372,160 +1082,6 @@ unroll_loop_runtime_iterations (struct l
>            max_unroll, num_loop_insns (loop));
>  }
>  
> -/* Decide whether to simply peel LOOP and how much.  */
> -static void
> -decide_peel_simple (struct loop *loop, int flags)
> -{
> -  unsigned npeel;
> -  widest_int iterations;
> -
> -  if (!(flags & UAP_PEEL))
> -    {
> -      /* We were not asked to, just return back silently.  */
> -      return;
> -    }
> -
> -  if (dump_file)
> -    fprintf (dump_file, "\n;; Considering simply peeling loop\n");
> -
> -  /* npeel = number of iterations to peel.  */
> -  npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
> -  if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
> -    npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
> -
> -  /* Skip big loops.  */
> -  if (!npeel)
> -    {
> -      if (dump_file)
> -     fprintf (dump_file, ";; Not considering loop, is too big\n");
> -      return;
> -    }
> -
> -  /* Do not simply peel loops with branches inside -- it increases number
> -     of mispredicts.  
> -     Exception is when we do have profile and we however have good chance
> -     to peel proper number of iterations loop will iterate in practice.
> -     TODO: this heuristic needs tunning; while for complette unrolling
> -     the branch inside loop mostly eliminates any improvements, for
> -     peeling it is not the case.  Also a function call inside loop is
> -     also branch from branch prediction POV (and probably better reason
> -     to not unroll/peel).  */
> -  if (num_loop_branches (loop) > 1
> -      && profile_status_for_fn (cfun) != PROFILE_READ)
> -    {
> -      if (dump_file)
> -     fprintf (dump_file, ";; Not peeling, contains branches\n");
> -      return;
> -    }
> -
> -  /* If we have realistic estimate on number of iterations, use it.  */
> -  if (get_estimated_loop_iterations (loop, &iterations))
> -    {
> -      if (wi::leu_p (npeel, iterations))
> -     {
> -       if (dump_file)
> -         {
> -           fprintf (dump_file, ";; Not peeling loop, rolls too much (");
> -           fprintf (dump_file, "%"PRId64,
> -                    (int64_t) (iterations.to_shwi () + 1));
> -           fprintf (dump_file, " iterations > %d [maximum peelings])\n",
> -                    npeel);
> -         }
> -       return;
> -     }
> -      npeel = iterations.to_shwi () + 1;
> -    }
> -  /* If we have small enough bound on iterations, we can still peel 
> (completely
> -     unroll).  */
> -  else if (get_max_loop_iterations (loop, &iterations)
> -           && wi::ltu_p (iterations, npeel))
> -    npeel = iterations.to_shwi () + 1;
> -  else
> -    {
> -      /* For now we have no good heuristics to decide whether loop peeling
> -         will be effective, so disable it.  */
> -      if (dump_file)
> -     fprintf (dump_file,
> -              ";; Not peeling loop, no evidence it will be profitable\n");
> -      return;
> -    }
> -
> -  /* Success.  */
> -  loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
> -  loop->lpt_decision.times = npeel;
> -}
> -
> -/* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
> -
> -   while (cond)
> -     body;
> -
> -   ==>  (LOOP->LPT_DECISION.TIMES == 3)
> -
> -   if (!cond) goto end;
> -   body;
> -   if (!cond) goto end;
> -   body;
> -   if (!cond) goto end;
> -   body;
> -   while (cond)
> -     body;
> -   end: ;
> -   */
> -static void
> -peel_loop_simple (struct loop *loop)
> -{
> -  sbitmap wont_exit;
> -  unsigned npeel = loop->lpt_decision.times;
> -  struct niter_desc *desc = get_simple_loop_desc (loop);
> -  struct opt_info *opt_info = NULL;
> -  bool ok;
> -
> -  if (flag_split_ivs_in_unroller && npeel > 1)
> -    opt_info = analyze_insns_in_loop (loop);
> -
> -  wont_exit = sbitmap_alloc (npeel + 1);
> -  bitmap_clear (wont_exit);
> -
> -  opt_info_start_duplication (opt_info);
> -
> -  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
> -                                   npeel, wont_exit, NULL,
> -                                   NULL, DLTHE_FLAG_UPDATE_FREQ
> -                                   | (opt_info
> -                                      ? DLTHE_RECORD_COPY_NUMBER
> -                                        : 0));
> -  gcc_assert (ok);
> -
> -  free (wont_exit);
> -
> -  if (opt_info)
> -    {
> -      apply_opt_in_copies (opt_info, npeel, false, false);
> -      free_opt_info (opt_info);
> -    }
> -
> -  if (desc->simple_p)
> -    {
> -      if (desc->const_iter)
> -     {
> -       desc->niter -= npeel;
> -       desc->niter_expr = GEN_INT (desc->niter);
> -       desc->noloop_assumptions = NULL_RTX;
> -     }
> -      else
> -     {
> -       /* We cannot just update niter_expr, as its value might be clobbered
> -          inside loop.  We could handle this by counting the number into
> -          temporary just like we do in runtime unrolling, but it does not
> -          seem worthwhile.  */
> -       free_simple_loop_desc (loop);
> -     }
> -    }
> -  if (dump_file)
> -    fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
> -}
> -
>  /* Decide whether to unroll LOOP stupidly and how much.  */
>  static void
>  decide_unroll_stupid (struct loop *loop, int flags)
> Index: testsuite/gcc.dg/unroll_3.c
> ===================================================================
> --- testsuite/gcc.dg/unroll_3.c       (revision 216145)
> +++ testsuite/gcc.dg/unroll_3.c       (working copy)
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fno-tree-vrp 
> -fdisable-tree-cunroll -fdisable-tree-cunrolli -fenable-rtl-loop2_unroll=foo" 
> } */
> +/* { dg-options "-O2 -fdump-tree-cunrolli-details -fno-peel-loops 
> -fno-tree-vrp -fdisable-tree-cunroll -fenable-tree-cunrolli=foo 
> -fdisable-tree-cunrolli=foo2" } */
>  
>  unsigned a[100], b[100];
>  inline void bar()
> @@ -28,5 +28,5 @@ int foo2(void)
>    return 1;
>  }
>  
> -/* { dg-final { scan-rtl-dump-times "loop turned into non-loop; it never 
> loops" 1 "loop2_unroll" } } */
> -/* { dg-final { cleanup-rtl-dump "loop2_unroll" } } */
> +/* { dg-final { scan-tree-dump-times "loop with 3 iterations completely 
> unrolled" 1 "cunrolli" } } */
> +/* { dg-final { cleanup-tree-dump "cunrolli" } } */
> Index: testsuite/gcc.dg/tree-prof/peel-1.c
> ===================================================================
> --- testsuite/gcc.dg/tree-prof/peel-1.c       (revision 216145)
> +++ testsuite/gcc.dg/tree-prof/peel-1.c       (working copy)
> @@ -1,4 +1,4 @@
> -/* { dg-options "-O3 -fdump-rtl-loop2_unroll -fno-unroll-loops -fpeel-loops" 
> } */
> +/* { dg-options "-O3 -fdump-tree-cunroll-details -fno-unroll-loops 
> -fpeel-loops" } */
>  void abort();
>  
>  int a[1000];
> Index: testsuite/gcc.dg/unroll_4.c
> ===================================================================
> --- testsuite/gcc.dg/unroll_4.c       (revision 216145)
> +++ testsuite/gcc.dg/unroll_4.c       (working copy)
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fno-tree-vrp 
> -fdisable-tree-cunroll -fdisable-tree-cunrolli 
> -fenable-rtl-loop2_unroll=foo2" } */
> +/* { dg-options "-O2 -fdump-tree-cunrolli-details -fno-peel-loops 
> -fno-tree-vrp -fdisable-tree-cunroll -fenable-tree-cunrolli=foo2 
> -fdisable-tree-cunrolli=foo" } */
>  
>  unsigned a[100], b[100];
>  inline void bar()
> @@ -28,5 +28,5 @@ int foo2(void)
>    return 1;
>  }
>  
> -/* { dg-final { scan-rtl-dump-times "loop turned into non-loop; it never 
> loops" 1 "loop2_unroll" } } */
> -/* { dg-final { cleanup-rtl-dump "loop2_unroll" } } */
> +/* { dg-final { scan-tree-dump-times "loop with 3 iterations completely 
> unrolled" 1 "cunrolli" } } */
> +/* { dg-final { cleanup-tree-dump "cunrolli" } } */
> Index: testsuite/gcc.dg/unroll_1.c
> ===================================================================
> --- testsuite/gcc.dg/unroll_1.c       (revision 216145)
> +++ testsuite/gcc.dg/unroll_1.c       (working copy)
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -fdump-rtl-loop2_unroll=stderr -fno-peel-loops 
> -fno-tree-vrp -fdisable-tree-cunroll -fdisable-tree-cunrolli 
> -fenable-rtl-loop2 -fenable-rtl-loop2_unroll" } */
> +/* { dg-options "-O2 -fdump-tree-cunrolli-details=stderr -fno-peel-loops 
> -fno-tree-vrp  -fdisable-tree-cunroll -fenable-tree-cunrolli" } */
>  
>  unsigned a[100], b[100];
>  inline void bar()
> @@ -11,7 +11,7 @@ int foo(void)
>  {
>    int i;
>    bar();
> -  for (i = 0; i < 2; i++) /* { dg-message "note: loop turned into non-loop; 
> it never loops" } */
> +  for (i = 0; i < 2; i++) /* { dg-message "note: loop with 3 iterations 
> completely unrolled" } */
>    {
>       a[i]= b[i] + 1;
>    }
> @@ -21,7 +21,7 @@ int foo(void)
>  int foo2(void)
>  {
>    int i;
> -  for (i = 0; i < 2; i++) /* { dg-message "note: loop turned into non-loop; 
> it never loops" } */
> +  for (i = 0; i < 2; i++) /* { dg-message "note: loop with 3 iterations 
> completely unrolled" } */
>    {
>       a[i]= b[i] + 1;
>    }
> Index: testsuite/gcc.dg/unroll_2.c
> ===================================================================
> --- testsuite/gcc.dg/unroll_2.c       (revision 216145)
> +++ testsuite/gcc.dg/unroll_2.c       (working copy)
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -fdump-rtl-loop2_unroll -fno-peel-loops -fno-tree-vrp 
> -fdisable-tree-cunroll=foo -fdisable-tree-cunrolli=foo 
> -fenable-rtl-loop2_unroll" } */
> +/* { dg-options "-O2 -fdump-tree-cunrolli-details -fno-peel-loops 
> -fno-tree-vrp -fdisable-tree-cunrolli=foo -fenable-tree-cunrolli=foo" } */
>  
>  unsigned a[100], b[100];
>  inline void bar()
> @@ -28,5 +28,5 @@ int foo2(void)
>    return 1;
>  }
>  
> -/* { dg-final { scan-rtl-dump-times "loop turned into non-loop; it never 
> loops" 1 "loop2_unroll" } } */
> -/* { dg-final { cleanup-rtl-dump "loop2_unroll" } } */
> +/* { dg-final { scan-tree-dump-times "loop with 3 iterations completely 
> unrolled" 1 "cunrolli" } } */
> +/* { dg-final { cleanup-tree-dump "cunrolli" } } */
> 
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer

Re: Move loop peeling from RTL to gimple

Reply via email to