On Tue, Aug 30, 2011 at 2:29 PM, Richard Sandiford
<richard.sandif...@linaro.org> wrote:
> This is the move-scheduling patch itself.  It should be fairly
> self-explanatory.  Let me know if it isn't, and I'll try to improve
> the commentary.

Can you add some testcases?

Thanks,
Richard.

> One potentially controversial change is to the way we handle moves
> in the prologue and epilogue.  The current code uses a conservative
> check to decide which stages need which moves.  This check copes
> with values that are live before the loop, and emits some moves
> that aren't actually needed.  The code also emits chains of moves
> that can be trivially optimised through propagation.  We rely on
> later patches to clean this up for us (and they do).
>
> So, rather than keep a rather complicated check here, I've simply
> emitted the moves for all stages.  In principle, that will generate
> a little more dead code than now in cases where chains of two moves
> are needed, but that seems to be very rare (when moves are scheduled).
>
> Richard
>
>
> gcc/
>        * modulo-sched.c (extend_node_sched_params): New function.
>        (print_node_sched_params): Print the stage.
>        (set_columns_for_row, schedule_reg_move): New functions.
>        (set_columns_for_ps): Move up file and use set_columns_for_row.
>        (schedule_reg_moves): Call extend_node_sched_params and
>        schedule_reg_move.  Extend size of uses bitmap.  Return false
>        if a move could not be scheduled.
>        (apply_reg_moves): Don't emit moves here.
>        (permute_partial_schedule): Handle register moves.
>        (duplicate_insns_of_cycles): Remove for_prolog.  Always emit moves.
>        (generate_prolog_epilog): Update calls accordingly.
>
> Index: gcc/modulo-sched.c
> ===================================================================
> --- gcc/modulo-sched.c  2011-08-30 13:06:36.528669762 +0100
> +++ gcc/modulo-sched.c  2011-08-30 13:22:08.029124537 +0100
> @@ -220,8 +220,6 @@ static partial_schedule_ptr sms_schedule
>  static void permute_partial_schedule (partial_schedule_ptr, rtx);
>  static void generate_prolog_epilog (partial_schedule_ptr, struct loop *,
>                                     rtx, rtx);
> -static void duplicate_insns_of_cycles (partial_schedule_ptr,
> -                                      int, int, int, rtx);
>  static int calculate_stage_count (partial_schedule_ptr, int);
>  static void calculate_must_precede_follow (ddg_node_ptr, int, int,
>                                           int, int, sbitmap, sbitmap, 
> sbitmap);
> @@ -458,6 +456,15 @@ set_node_sched_params (ddg_ptr g)
>                         node_sched_param_vec, g->num_nodes);
>  }
>
> +/* Make sure that node_sched_param_vec has an entry for every move in PS.  */
> +static void
> +extend_node_sched_params (partial_schedule_ptr ps)
> +{
> +  VEC_safe_grow_cleared (node_sched_params, heap, node_sched_param_vec,
> +                        ps->g->num_nodes + VEC_length (ps_reg_move_info,
> +                                                       ps->reg_moves));
> +}
> +
>  static void
>  print_node_sched_params (FILE *file, int num_nodes, partial_schedule_ptr ps)
>  {
> @@ -474,6 +481,7 @@ print_node_sched_params (FILE *file, int
>               INSN_UID (ps_rtl_insn (ps, i)));
>       fprintf (file, " asap = %d:\n", NODE_ASAP (&ps->g->nodes[i]));
>       fprintf (file, " time = %d:\n", nsp->time);
> +      fprintf (file, " stage = %d:\n", nsp->stage);
>       fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
>       for (j = 0; j < nsp->nreg_moves; j++)
>        {
> @@ -485,6 +493,168 @@ print_node_sched_params (FILE *file, int
>     }
>  }
>
> +/* Set SCHED_COLUMN for each instruction in row ROW of PS.  */
> +static void
> +set_columns_for_row (partial_schedule_ptr ps, int row)
> +{
> +  ps_insn_ptr cur_insn;
> +  int column;
> +
> +  column = 0;
> +  for (cur_insn = ps->rows[row]; cur_insn; cur_insn = cur_insn->next_in_row)
> +    SCHED_COLUMN (cur_insn->id) = column++;
> +}
> +
> +/* Set SCHED_COLUMN for each instruction in PS.  */
> +static void
> +set_columns_for_ps (partial_schedule_ptr ps)
> +{
> +  int row;
> +
> +  for (row = 0; row < ps->ii; row++)
> +    set_columns_for_row (ps, row);
> +}
> +
> +/* Try to schedule the move with ps_insn identifier I_REG_MOVE in PS.
> +   The source of the move is provided by I_MUST_FOLLOW, which has
> +   already been scheduled.  MUST_FOLLOW is a scratch bitmap that
> +   is big enough to hold I_MUST_FOLLOW.
> +
> +   Return true on success.  */
> +static bool
> +schedule_reg_move (partial_schedule_ptr ps, int i_reg_move,
> +                  sbitmap must_follow, int i_must_follow)
> +{
> +  unsigned int u;
> +  int i, start_row, end_row, this_start, this_end, this_row, latency;
> +  int origin_row, origin_column;
> +  sbitmap_iterator sbi;
> +  ps_reg_move_info *move;
> +
> +  move = ps_reg_move (ps, i_reg_move);
> +
> +  /* The cyclic lifetime of move->new_reg starts and ends at move->def
> +     (the instruction that defines move->old_reg).  We need to decide
> +     where in that cyclic lifetime the move should go.  The position
> +     is limited by I_MUST_FOLLOW (which defines the source of the move)
> +     and the nodes in move->uses (which consume the result).
> +
> +     Specifically, the lowest possible row (relative to move->def)
> +     is the maximum of:
> +
> +       a) the row in which the result of the previous iteration's
> +         I_MUST_FOLLOW becomes available
> +       b) the first row in which every use of the previous iteration's
> +         move->new_reg has completed.
> +
> +     The highest possible row (again relative to move->def) is
> +     the minimum of:
> +
> +       c) the row in which the current iteration's I_MUST_FOLLOW
> +         executes (and thus makes the source of the move unavailable)
> +       d) the last row in which every use of this iteration's
> +         move->new_reg could be satisfied without delay.
> +
> +     Because all positions are relative to move->def, this function
> +     uses ROW - ps->ii to represent positions that come after move->def.
> +     The range includes origin_row twice: "origin_row - ps->ii" is for
> +     the positions in origin_row after move->def, while "origin_row"
> +     itself is for the positions before move->def.  */
> +  origin_row = SCHED_ROW (move->def);
> +  origin_column = SCHED_COLUMN (move->def);
> +  start_row = origin_row - ps->ii;
> +  end_row = origin_row;
> +
> +  if (dump_file)
> +    fprintf (dump_file, "Scheduling register move INSN %d with cyclic lifetime"
> +            " [%d, %d]\n", INSN_UID (move->insn), start_row, end_row);
> +
> +  /* Limit the range to [a, c].  */
> +  this_end = SCHED_ROW (i_must_follow);
> +  if (this_end > origin_row
> +      || (this_end == origin_row
> +         && SCHED_COLUMN (i_must_follow) > origin_column))
> +    this_end -= ps->ii;
> +  latency = insn_latency (ps_rtl_insn (ps, i_must_follow), move->insn);
> +  this_start = this_end - ps->ii + latency;
> +
> +  if (dump_file)
> +    fprintf (dump_file, "  -- source produced by INSN %d with latency %d,"
> +            " range [%d, %d]\n", INSN_UID (ps_rtl_insn (ps, i_must_follow)),
> +            latency, this_start, this_end);
> +
> +  if (start_row < this_start)
> +    start_row = this_start;
> +  if (end_row > this_end)
> +    end_row = this_end;
> +
> +  /* Limit the range to [b, d].  */
> +  EXECUTE_IF_SET_IN_SBITMAP (move->uses, 0, u, sbi)
> +    {
> +      latency = insn_latency (move->insn, ps_rtl_insn (ps, u));
> +      this_start = SCHED_ROW (u);
> +      if (this_start > origin_row
> +         || (this_start == origin_row
> +             && SCHED_COLUMN (u) > origin_column))
> +       this_start -= ps->ii;
> +      this_end = this_start + ps->ii - latency;
> +
> +      if (dump_file)
> +       fprintf (dump_file, "  -- destination used by INSN %d with latency"
> +                " %d, range [%d, %d]\n", INSN_UID (ps_rtl_insn (ps, u)),
> +                latency, this_start, this_end);
> +
> +      if (start_row < this_start)
> +       start_row = this_start;
> +      if (end_row > this_end)
> +       end_row = this_end;
> +    }
> +
> +  sbitmap_zero (must_follow);
> +  SET_BIT (must_follow, i_must_follow);
> +
> +  if (dump_file)
> +    fprintf (dump_file, "  -- trying to schedule in rows [%d, %d]\n",
> +            start_row, end_row);
> +
> +  for (i = end_row; i >= start_row; i--)
> +    {
> +      ps_insn_ptr psi;
> +
> +      this_row = i < 0 ? i + ps->ii : i;
> +      /* origin_row - ps->ii represents the part of row origin_row after
> +        move->def.  In this case the move must come after both move->uses
> +        and move->def.  */
> +      if (i == origin_row - ps->ii)
> +       {
> +         gcc_checking_assert (i == start_row);
> +         gcc_checking_assert (!TEST_BIT (move->uses, move->def));
> +         SET_BIT (move->uses, move->def);
> +         psi = ps_add_node_check_conflicts (ps, i_reg_move, this_row,
> +                                            move->uses, NULL);
> +         RESET_BIT (move->uses, move->def);
> +       }
> +      else
> +       psi = ps_add_node_check_conflicts (ps, i_reg_move, this_row,
> +                                          move->uses, must_follow);
> +
> +      if (psi)
> +       {
> +         SCHED_ROW (i_reg_move) = this_row;
> +         if (dump_file)
> +           fprintf (dump_file, "  -- scheduled in row %d (%d)\n",
> +                    i, this_row);
> +         set_columns_for_row (ps, this_row);
> +         return true;
> +       }
> +    }
> +
> +  if (dump_file)
> +    fprintf (dump_file, "  -- no available slot\n");
> +
> +  return false;
> +}
> +
>  /*
>    Breaking intra-loop register anti-dependences:
>    Each intra-loop register anti-dependence implies a cross-iteration true
> @@ -507,9 +677,10 @@ schedule_reg_moves (partial_schedule_ptr
>     {
>       ddg_node_ptr u = &g->nodes[i];
>       ddg_edge_ptr e;
> -      int nreg_moves = 0, i_reg_move;
> +      int nreg_moves = 0, i_reg_move, i_must_follow;
>       rtx prev_reg, old_reg;
>       int first_move;
> +      sbitmap must_follow;
>
>       /* Compute the number of reg_moves needed for u, by looking at life
>         ranges started at u (excluding self-loops).  */
> @@ -539,6 +710,7 @@ schedule_reg_moves (partial_schedule_ptr
>       first_move = VEC_length (ps_reg_move_info, ps->reg_moves);
>       VEC_safe_grow_cleared (ps_reg_move_info, heap, ps->reg_moves,
>                             first_move + nreg_moves);
> +      extend_node_sched_params (ps);
>
>       /* Record the moves associated with this node.  */
>       first_move += ps->g->num_nodes;
> @@ -552,7 +724,7 @@ schedule_reg_moves (partial_schedule_ptr
>          ps_reg_move_info *move = ps_reg_move (ps, first_move + i_reg_move);
>
>          move->def = i;
> -         move->uses = sbitmap_alloc (g->num_nodes);
> +         move->uses = sbitmap_alloc (first_move + nreg_moves);
>          move->old_reg = old_reg;
>          move->new_reg = gen_reg_rtx (GET_MODE (prev_reg));
>          move->insn = gen_move_insn (move->new_reg, copy_rtx (prev_reg));
> @@ -586,6 +758,19 @@ schedule_reg_moves (partial_schedule_ptr
>                SET_BIT (move->uses, e->dest->cuid);
>              }
>          }
> +
> +      must_follow = sbitmap_alloc (first_move + nreg_moves);
> +      i_must_follow = i;
> +      for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
> +       {
> +         if (!schedule_reg_move (ps, first_move + i_reg_move,
> +                                 must_follow, i_must_follow))
> +           break;
> +         i_must_follow = first_move + i_reg_move;
> +       }
> +      sbitmap_free (must_follow);
> +      if (i_reg_move < nreg_moves)
> +       return false;
>     }
>   return true;
>  }
> @@ -609,9 +794,6 @@ apply_reg_moves (partial_schedule_ptr ps
>          df_insn_rescan (ps->g->nodes[i_use].insn);
>        }
>     }
> -
> -  FOR_EACH_VEC_ELT_REVERSE (ps_reg_move_info, ps->reg_moves, i, move)
> -    add_insn_before (move->insn, ps_first_note (ps, move->def), NULL);
>  }
>
>  /* Update the sched_params (time, row and stage) for node U using the II,
> @@ -682,22 +864,6 @@ reset_sched_times (partial_schedule_ptr
>       }
>  }
>
> -/* Set SCHED_COLUMN of each node according to its position in PS.  */
> -static void
> -set_columns_for_ps (partial_schedule_ptr ps)
> -{
> -  int row;
> -
> -  for (row = 0; row < ps->ii; row++)
> -    {
> -      ps_insn_ptr cur_insn = ps->rows[row];
> -      int column = 0;
> -
> -      for (; cur_insn; cur_insn = cur_insn->next_in_row)
> -       SCHED_COLUMN (cur_insn->id) = column++;
> -    }
> -}
> -
>  /* Permute the insns according to their order in PS, from row 0 to
>    row ii-1, and position them right before LAST.  This schedules
>    the insns of the loop kernel.  */
> @@ -714,8 +880,13 @@ permute_partial_schedule (partial_schedu
>        rtx insn = ps_rtl_insn (ps, ps_ij->id);
>
>        if (PREV_INSN (last) != insn)
> -         reorder_insns_nobb (ps_first_note (ps, ps_ij->id), insn,
> -                             PREV_INSN (last));
> +         {
> +           if (ps_ij->id < ps->g->num_nodes)
> +             reorder_insns_nobb (ps_first_note (ps, ps_ij->id), insn,
> +                                 PREV_INSN (last));
> +           else
> +             add_insn_before (insn, last, NULL);
> +         }
>       }
>  }
>
> @@ -919,7 +1090,7 @@ optimize_sc (partial_schedule_ptr ps, dd
>
>  static void
>  duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
> -                          int to_stage, int for_prolog, rtx count_reg)
> +                          int to_stage, rtx count_reg)
>  {
>   int row;
>   ps_insn_ptr ps_ij;
> @@ -928,7 +1099,6 @@ duplicate_insns_of_cycles (partial_sched
>     for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
>       {
>        int u = ps_ij->id;
> -       int j, i_reg_moves, i_reg_move;
>        rtx u_insn;
>
>         /* Do not duplicate any insn which refers to count_reg as it
> @@ -942,42 +1112,18 @@ duplicate_insns_of_cycles (partial_sched
>             || JUMP_P (u_insn))
>           continue;
>
> -       if (for_prolog)
> -         {
> -           /* SCHED_STAGE (u) >= from_stage == 0.  Generate increasing
> -              number of reg_moves starting with the second occurrence of
> -              u, which is generated if its SCHED_STAGE <= to_stage.  */
> -           i_reg_moves = to_stage - SCHED_STAGE (u) + 1;
> -           i_reg_moves = MAX (i_reg_moves, 0);
> -           i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u));
> -
> -           /* The reg_moves start from the *first* reg_move backwards.  */
> -           i_reg_move = SCHED_FIRST_REG_MOVE (u) + (i_reg_moves - 1);
> -         }
> -       else /* It's for the epilog.  */
> -         {
> -           /* SCHED_STAGE (u) <= to_stage.  Generate all reg_moves,
> -              starting to decrease one stage after u no longer occurs;
> -              that is, generate all reg_moves until
> -              SCHED_STAGE (u) == from_stage - 1.  */
> -           i_reg_moves = (SCHED_NREG_MOVES (u)
> -                          - (from_stage - SCHED_STAGE (u) - 1));
> -           i_reg_moves = MAX (i_reg_moves, 0);
> -           i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u));
> -
> -           /* The reg_moves start from the *last* reg_move forwards.  */
> -           i_reg_move = SCHED_FIRST_REG_MOVE (u) + (SCHED_NREG_MOVES (u) - 1);
> -         }
> -
> -       for (j = 0; j < i_reg_moves; j++)
> +       if (u < ps->g->num_nodes)
>          {
> -           ps_reg_move_info *move = ps_reg_move (ps, i_reg_move - j);
> -
> -           emit_insn (copy_rtx (PATTERN (move->insn)));
> +           if (SCHED_STAGE (u) >= from_stage
> +               && SCHED_STAGE (u) <= to_stage)
> +             duplicate_insn_chain (ps_first_note (ps, u), u_insn);
>          }
> -       if (SCHED_STAGE (u) >= from_stage
> -           && SCHED_STAGE (u) <= to_stage)
> -         duplicate_insn_chain (ps_first_note (ps, u), u_insn);
> +       else
> +         /* For simplicity, we generate all moves for every stage,
> +            even though some of them will be dead.  Later optimizations
> +            will remove the dead instructions and undo some of the
> +            unnecessary renaming that moves would otherwise produce.  */
> +         emit_insn (copy_rtx (PATTERN (u_insn)));
>       }
>  }
>
> @@ -1011,7 +1157,7 @@ generate_prolog_epilog (partial_schedule
>     }
>
>   for (i = 0; i < last_stage; i++)
> -    duplicate_insns_of_cycles (ps, 0, i, 1, count_reg);
> +    duplicate_insns_of_cycles (ps, 0, i, count_reg);
>
>   /* Put the prolog on the entry edge.  */
>   e = loop_preheader_edge (loop);
> @@ -1023,7 +1169,7 @@ generate_prolog_epilog (partial_schedule
>   start_sequence ();
>
>   for (i = 0; i < last_stage; i++)
> -    duplicate_insns_of_cycles (ps, i + 1, last_stage, 0, count_reg);
> +    duplicate_insns_of_cycles (ps, i + 1, last_stage, count_reg);
>
>   /* Put the epilogue on the exit edge.  */
>   gcc_assert (single_exit (loop));
>

Reply via email to