Hi Ayal, Thanks to you and Revital for the replies. The reason I asked is that I wanted to rewrite gen_sched_window so that it has only one loop over the PSPs and one loop over the PSSs. I have a follow-up patch to use iv analysis to reduce the number of memory dependencies (or at least increase the distance between them), and it was easier to do that after this change.
Ayal Zaks <ayal.z...@gmail.com> writes: >>> I ask because in the final range: >>> >>> start = early_start; >>> end = MIN (end, early_start + ii); >>> /* Schedule the node close to it's predecessors. */ >>> step = 1; >>> >>> END is an exclusive bound. It seems like we might be double-counting > here, >>> and effectively limiting the schedule to SCHED_TIME (v_node) + ii - 2. >> >> >>Yes, I think it indeed should be fixed. Thanks for reporting on this. >> >>Revital > > Agreed; > > if (e->data_type == MEM_DEP) > end = MIN (end, SCHED_TIME (v_node) + ii - 1); > > should be replaced with > > if (e->data_type == MEM_DEP) > end = MIN (end, p_st + ii); > > also for the (3rd) case when there are both previously-scheduled > predessors and previously-scheduled successors. The range is inclusive > of start and exclusive of end: for (c = start; c != end; c += step)... OK. For the follow-on iv patch, it seemed easier to keep both bounds inclusive for the loop, then make the "end" exclusive when setting the out parameters. (Note that there shouldn't be any problem with overflow when making the bound exclusive, because the size of the range has been restricted to "ii" by that stage. The follow-on patch does use saturating maths for all ops though.) >>This doesn't seem to be in the paper, and the comment suggests >>"count_succs > count_preds" rather than "count_succs >= count_preds". >>Is the ">=" vs ">" important? > > I think not: in both cases you'll be trying to minimize the same > number of live ranges. But indeed it's good to be consistent with the > comment. OK. For no particular reason other than cowardice, I ended up keeping this as: ! if (count_succs && count_succs >= count_preds) The reason for asking was that: ! if (count_succs > count_preds) seemed more natural, and would match the existing comment. I'm happy to test that instead if you prefer. I don't have powerpc hardware that I can do meaningful performance testing on, but I did run it through a Popular* Embedded Benchmark on an ARM Cortex-A8 board with -O3 -fmodulo-sched -fmodulo-sched-allow-regmoves. There were no changes. (And this is a benchmark that does benefit from modulo scheduling, in some cases by a significant amount.) Bootstrapped & regression-tested on powerpc-ibm-aix5.3 with the additional patch: Index: gcc/opts.c =================================================================== --- gcc/opts.c 2011-08-03 10:56:50.000000000 +0100 +++ gcc/opts.c 2011-08-03 10:56:50.000000000 +0100 @@ -449,6 +449,8 @@ static const struct default_options defa { OPT_LEVELS_1_PLUS, OPT_ftree_ch, NULL, 1 }, { OPT_LEVELS_1_PLUS, OPT_fcombine_stack_adjustments, NULL, 1 }, { OPT_LEVELS_1_PLUS, OPT_fcompare_elim, NULL, 1 }, + { OPT_LEVELS_1_PLUS, OPT_fmodulo_sched, NULL, 1 }, + { OPT_LEVELS_1_PLUS, OPT_fmodulo_sched_allow_regmoves, NULL, 1 }, /* -O2 optimizations. */ { OPT_LEVELS_2_PLUS, OPT_finline_small_functions, NULL, 1 }, applied for both the "before" and "after" runs. OK to install? Thanks, Richard gcc/ * modulo-sched.c (get_sched_window): Use just one loop for predecessors and one loop for successors. Fix upper bound of memory range. Index: gcc/modulo-sched.c =================================================================== *** gcc/modulo-sched.c 2011-08-04 09:09:29.000000000 +0100 --- gcc/modulo-sched.c 2011-08-04 09:49:16.000000000 +0100 *************** #define MAX_SPLIT_NUM 10 *** 1630,1638 **** static int get_sched_window (partial_schedule_ptr ps, ddg_node_ptr u_node, ! sbitmap sched_nodes, int ii, int *start_p, int *step_p, int *end_p) { int start, step, end; ddg_edge_ptr e; sbitmap psp = sbitmap_alloc (ps->g->num_nodes); sbitmap pss = sbitmap_alloc (ps->g->num_nodes); --- 1630,1640 ---- static int get_sched_window (partial_schedule_ptr ps, ddg_node_ptr u_node, ! sbitmap sched_nodes, int ii, int *start_p, int *step_p, ! int *end_p) { int start, step, end; + int early_start, late_start; ddg_edge_ptr e; sbitmap psp = sbitmap_alloc (ps->g->num_nodes); sbitmap pss = sbitmap_alloc (ps->g->num_nodes); *************** get_sched_window (partial_schedule_ptr p *** 1640,1645 **** --- 1642,1649 ---- sbitmap u_node_succs = NODE_SUCCESSORS (u_node); int psp_not_empty; int pss_not_empty; + int count_preds; + int count_succs; /* 1. compute sched window for u (start, end, step). */ sbitmap_zero (psp); *************** get_sched_window (partial_schedule_ptr p *** 1647,1861 **** psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes); pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes); ! if (psp_not_empty && !pss_not_empty) ! { ! int early_start = INT_MIN; ! ! end = INT_MAX; ! for (e = u_node->in; e != 0; e = e->next_in) ! { ! ddg_node_ptr v_node = e->src; ! ! if (dump_file) ! { ! fprintf (dump_file, "\nProcessing edge: "); ! print_ddg_edge (dump_file, e); ! fprintf (dump_file, ! "\nScheduling %d (%d) in psp_not_empty," ! " checking p %d (%d): ", u_node->cuid, ! INSN_UID (u_node->insn), v_node->cuid, INSN_UID ! (v_node->insn)); ! } ! ! if (TEST_BIT (sched_nodes, v_node->cuid)) ! { ! int p_st = SCHED_TIME (v_node); ! ! early_start = ! MAX (early_start, p_st + e->latency - (e->distance * ii)); ! ! if (dump_file) ! fprintf (dump_file, ! "pred st = %d; early_start = %d; latency: %d", ! p_st, early_start, e->latency); ! ! if (e->data_type == MEM_DEP) ! end = MIN (end, SCHED_TIME (v_node) + ii - 1); ! } ! else if (dump_file) ! fprintf (dump_file, "the node is not scheduled\n"); ! } ! start = early_start; ! end = MIN (end, early_start + ii); ! /* Schedule the node close to it's predecessors. */ ! step = 1; ! ! if (dump_file) ! fprintf (dump_file, ! "\nScheduling %d (%d) in a window (%d..%d) with step %d\n", ! u_node->cuid, INSN_UID (u_node->insn), start, end, step); ! } ! ! else if (!psp_not_empty && pss_not_empty) ! { ! int late_start = INT_MAX; ! ! end = INT_MIN; ! for (e = u_node->out; e != 0; e = e->next_out) ! { ! ddg_node_ptr v_node = e->dest; ! ! if (dump_file) ! { ! fprintf (dump_file, "\nProcessing edge:"); ! print_ddg_edge (dump_file, e); ! fprintf (dump_file, ! "\nScheduling %d (%d) in pss_not_empty," ! " checking s %d (%d): ", u_node->cuid, ! INSN_UID (u_node->insn), v_node->cuid, INSN_UID ! (v_node->insn)); ! } ! ! if (TEST_BIT (sched_nodes, v_node->cuid)) ! { ! int s_st = SCHED_TIME (v_node); ! ! late_start = MIN (late_start, ! s_st - e->latency + (e->distance * ii)); ! ! if (dump_file) ! fprintf (dump_file, ! "succ st = %d; late_start = %d; latency = %d", ! s_st, late_start, e->latency); ! ! if (e->data_type == MEM_DEP) ! end = MAX (end, SCHED_TIME (v_node) - ii + 1); ! if (dump_file) ! fprintf (dump_file, "end = %d\n", end); ! ! } ! else if (dump_file) ! fprintf (dump_file, "the node is not scheduled\n"); ! } ! start = late_start; ! end = MAX (end, late_start - ii); ! /* Schedule the node close to it's successors. */ ! step = -1; ! if (dump_file) ! fprintf (dump_file, ! "\nScheduling %d (%d) in a window (%d..%d) with step %d\n", ! u_node->cuid, INSN_UID (u_node->insn), start, end, step); ! } ! else if (psp_not_empty && pss_not_empty) ! { ! int early_start = INT_MIN; ! int late_start = INT_MAX; ! int count_preds = 0; ! int count_succs = 0; ! start = INT_MIN; ! end = INT_MAX; ! for (e = u_node->in; e != 0; e = e->next_in) ! { ! ddg_node_ptr v_node = e->src; ! if (dump_file) ! { ! fprintf (dump_file, "\nProcessing edge:"); ! print_ddg_edge (dump_file, e); fprintf (dump_file, ! "\nScheduling %d (%d) in psp_pss_not_empty," ! " checking p %d (%d): ", u_node->cuid, INSN_UID ! (u_node->insn), v_node->cuid, INSN_UID ! (v_node->insn)); ! } ! ! if (TEST_BIT (sched_nodes, v_node->cuid)) ! { ! int p_st = SCHED_TIME (v_node); ! ! early_start = MAX (early_start, ! p_st + e->latency ! - (e->distance * ii)); ! ! if (dump_file) ! fprintf (dump_file, ! "pred st = %d; early_start = %d; latency = %d", ! p_st, early_start, e->latency); ! ! if (e->type == TRUE_DEP && e->data_type == REG_DEP) ! count_preds++; ! if (e->data_type == MEM_DEP) ! end = MIN (end, SCHED_TIME (v_node) + ii - 1); ! } ! else if (dump_file) ! fprintf (dump_file, "the node is not scheduled\n"); ! } ! for (e = u_node->out; e != 0; e = e->next_out) ! { ! ddg_node_ptr v_node = e->dest; ! if (dump_file) ! { ! fprintf (dump_file, "\nProcessing edge:"); ! print_ddg_edge (dump_file, e); ! fprintf (dump_file, ! "\nScheduling %d (%d) in psp_pss_not_empty," ! " checking s %d (%d): ", u_node->cuid, INSN_UID ! (u_node->insn), v_node->cuid, INSN_UID ! (v_node->insn)); ! } ! if (TEST_BIT (sched_nodes, v_node->cuid)) ! { ! int s_st = SCHED_TIME (v_node); ! late_start = MIN (late_start, ! s_st - e->latency ! + (e->distance * ii)); ! if (dump_file) ! fprintf (dump_file, ! "succ st = %d; late_start = %d; latency = %d", ! s_st, late_start, e->latency); ! if (e->type == TRUE_DEP && e->data_type == REG_DEP) ! count_succs++; ! if (e->data_type == MEM_DEP) ! start = MAX (start, SCHED_TIME (v_node) - ii + 1); ! } ! else if (dump_file) ! fprintf (dump_file, "the node is not scheduled\n"); ! } ! start = MAX (start, early_start); ! end = MIN (end, MIN (early_start + ii, late_start + 1)); ! step = 1; ! /* If there are more successors than predecessors schedule the ! node close to it's successors. */ ! if (count_succs >= count_preds) ! { ! int old_start = start; ! start = end - 1; ! end = old_start - 1; ! step = -1; ! } ! } ! else /* psp is empty && pss is empty. */ ! { ! start = SCHED_ASAP (u_node); ! end = start + ii; ! step = 1; } *start_p = start; *step_p = step; *end_p = end; --- 1651,1769 ---- psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes); pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes); ! early_start = INT_MIN; ! late_start = INT_MAX; ! start = INT_MIN; ! end = INT_MAX; ! count_preds = psp_not_empty; ! count_succs = pss_not_empty; ! ! /* Calculate early_start and limit end. Both bounds are inclusive. */ ! if (psp_not_empty) ! for (e = u_node->in; e != 0; e = e->next_in) ! { ! ddg_node_ptr v_node = e->src; ! if (dump_file) ! { ! fprintf (dump_file, "\nProcessing edge: "); ! print_ddg_edge (dump_file, e); ! fprintf (dump_file, ! "\nScheduling %d (%d) in psp_not_empty," ! " checking p %d (%d): ", u_node->cuid, ! INSN_UID (u_node->insn), v_node->cuid, INSN_UID ! (v_node->insn)); ! } ! if (TEST_BIT (sched_nodes, v_node->cuid)) ! { ! int p_st = SCHED_TIME (v_node); ! early_start = MAX (early_start, ! p_st + e->latency - (e->distance * ii)); ! if (e->data_type == MEM_DEP) ! end = MIN (end, p_st + ii - 1); ! if (e->type == TRUE_DEP && e->data_type == REG_DEP) ! count_preds++; ! if (dump_file) fprintf (dump_file, ! "pred st = %d; early_start = %d; latency: %d;" ! " end: %d\n", p_st, early_start, e->latency, end); ! } ! else if (dump_file) ! fprintf (dump_file, "the node is not scheduled\n"); ! } ! /* Calculate late_start and limit start. Both bounds are inclusive. */ ! if (pss_not_empty) ! for (e = u_node->out; e != 0; e = e->next_out) ! { ! ddg_node_ptr v_node = e->dest; ! if (dump_file) ! { ! fprintf (dump_file, "\nProcessing edge:"); ! print_ddg_edge (dump_file, e); ! fprintf (dump_file, ! "\nScheduling %d (%d) in pss_not_empty," ! " checking s %d (%d): ", u_node->cuid, ! INSN_UID (u_node->insn), v_node->cuid, INSN_UID ! (v_node->insn)); ! } ! if (TEST_BIT (sched_nodes, v_node->cuid)) ! { ! int s_st = SCHED_TIME (v_node); ! late_start = MIN (late_start, ! s_st - e->latency + (e->distance * ii)); ! if (e->data_type == MEM_DEP) ! start = MAX (start, s_st - ii + 1); ! if (e->type == TRUE_DEP && e->data_type == REG_DEP) ! count_succs++; ! if (dump_file) ! fprintf (dump_file, ! "succ st = %d; late_start = %d; latency = %d;" ! " start=%d", s_st, late_start, e->latency, start); ! } ! else if (dump_file) ! fprintf (dump_file, "the node is not scheduled\n"); ! } ! /* Get a target scheduling window no bigger than ii. */ ! if (early_start == INT_MIN && late_start == INT_MAX) ! early_start = SCHED_ASAP (u_node); ! else if (early_start == INT_MIN) ! early_start = late_start - (ii - 1); ! late_start = MIN (early_start + (ii - 1), late_start); ! ! /* Apply memory dependence limits. */ ! start = MAX (start, early_start); ! end = MIN (end, late_start); ! step = 1; ! ! /* If there are at least as many successors as predecessors, schedule the ! node close to its successors. */ ! if (count_succs && count_succs >= count_preds) ! { ! int tmp = end; ! end = start; ! start = tmp; ! step = -1; } + /* Now that we've finalized the window, make END an exclusive rather + than an inclusive bound. */ + end += step; + *start_p = start; *step_p = step; *end_p = end; *************** get_sched_window (partial_schedule_ptr p *** 1867,1876 **** if (dump_file) fprintf (dump_file, "\nEmpty window: start=%d, end=%d, step=%d\n", start, end, step); ! return -1; } ! return 0; } /* Calculate MUST_PRECEDE/MUST_FOLLOW bitmaps of U_NODE; which is the --- 1775,1784 ---- if (dump_file) fprintf (dump_file, "\nEmpty window: start=%d, end=%d, step=%d\n", start, end, step); ! return -1; } ! return 0; } /* Calculate MUST_PRECEDE/MUST_FOLLOW bitmaps of U_NODE; which is the