[PATCH] modulo-sched: skip loops with strange register defs [PR100225]
Hi all! Situation from PR was already caught earlier locally. So, I've just extracted appropriate part, it also slightly modifies loop checks related to non-single-set instructions. Patch (attached) was successfully bootstrapped/regtested on aarch64-linux on all active branches (8-12) with modulo-sched enabled by default, usual amd64-linux reg-strap is also ok. Pushing to trunk after 24h if no objections, backporting to 8-11 branches next week. Roman -- modulo-sched: skip loops with strange register defs [PR100225] PR84878 fix adds an assertion which can fail, e.g. when stack pointer is adjusted inside the loop. We have to prevent it and search earlier for any 'strange' instruction. The solution is to skip the whole loop if using 'note_stores' we found that one of hard registers is in 'df->regular_block_artificial_uses' set. Also patch properly prohibit not single-set instruction in loop body. gcc/ChangeLog: PR rtl-optimization/100225 PR rtl-optimization/84878 * modulo-sched.c (sms_schedule): Use note_stores to skip loops where we have an instruction which touches (writes) any hard register from df->regular_block_artificial_uses set. Allow not-single-set instruction only right before basic block tail. gcc/testsuite/ChangeLog: PR rtl-optimization/100225 PR rtl-optimization/84878 * gcc.dg/pr100225.c: New test. libgomp/ChangeLog: * testsuite/libgomp.oacc-c-c++-common/atomic_capture-3.c: New test. -- modulo-sched: skip loops with strange register defs [PR100225] PR84878 fix adds an assertion which can fail, e.g. when stack pointer is adjusted inside the loop. We have to prevent it and search earlier for any 'strange' instruction. The solution is to skip the whole loop if using 'note_stores' we found that one of hard registers is in 'df->regular_block_artificial_uses' set. Also patch properly prohibit not single-set instruction in loop body. gcc/ChangeLog: PR rtl-optimization/100225 PR rtl-optimization/84878 * modulo-sched.c (sms_schedule): Use note_stores to skip loops where we have an instruction which touches (writes) any hard register from df->regular_block_artificial_uses set. Allow not-single-set instruction only right before basic block tail. gcc/testsuite/ChangeLog: PR rtl-optimization/100225 PR rtl-optimization/84878 * gcc.dg/pr100225.c: New test. libgomp/ChangeLog: * testsuite/libgomp.oacc-c-c++-common/atomic_capture-3.c: New test. diff --git a/gcc/modulo-sched.c b/gcc/modulo-sched.c index 6ad960ec1f..e72e46db38 100644 --- a/gcc/modulo-sched.c +++ b/gcc/modulo-sched.c @@ -43,6 +43,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-pass.h" #include "dbgcnt.h" #include "loop-unroll.h" +#include "hard-reg-set.h" #ifdef INSN_SCHEDULING @@ -1356,6 +1357,7 @@ sms_schedule (void) basic_block condition_bb = NULL; edge latch_edge; HOST_WIDE_INT trip_count, max_trip_count; + HARD_REG_SET prohibited_regs; loop_optimizer_init (LOOPS_HAVE_PREHEADERS | LOOPS_HAVE_RECORDED_EXITS); @@ -1385,6 +1387,8 @@ sms_schedule (void) We use loop->num as index into this array. */ g_arr = XCNEWVEC (ddg_ptr, number_of_loops (cfun)); + REG_SET_TO_HARD_REG_SET (prohibited_regs, &df->regular_block_artificial_uses); + if (dump_file) { fprintf (dump_file, "\n\nSMS analysis phase\n"); @@ -1469,23 +1473,31 @@ sms_schedule (void) } /* Don't handle BBs with calls or barriers - or !single_set with the exception of instructions that include - count_reg---these instructions are part of the control part - that do-loop recognizes. + or !single_set with the exception of do-loop control part insns. ??? Should handle insns defining subregs. */ - for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn)) - { - rtx set; - -if (CALL_P (insn) -|| BARRIER_P (insn) -|| (NONDEBUG_INSN_P (insn) && !JUMP_P (insn) -&& !single_set (insn) && GET_CODE (PATTERN (insn)) != USE -&& !reg_mentioned_p (count_reg, insn)) -|| (INSN_P (insn) && (set = single_set (insn)) -&& GET_CODE (SET_DEST (set)) == SUBREG)) -break; - } + for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn)) + { + if (INSN_P (insn)) + { + HARD_REG_SET regs; + CLEAR_HARD_REG_SET (regs); + note_stores (insn, record_hard_reg_sets, ®s); + if (hard_reg_set_intersect_p (regs, prohibited_regs)) + break; + } + + if (CALL_P (insn) + || BARRIER_P (insn) + || (INSN_P (insn) && single_set (insn) + && GET_CODE (SET_DEST (single_set (insn))) == SUBREG) + /* Not a single set. */ + |
[committed] minor: fix indentation in ddg.c
This obvious patch fixes indentation in PR90001-related code. Committed as r10-7106. Roman -- gcc/ChangeLog: * ddg.c (create_ddg): Fix intendation. (set_recurrence_length): Likewise. (create_ddg_all_sccs): Likewise. diff --git a/gcc/ddg.c b/gcc/ddg.c index aae92adf89a..ca8cb74823d 100644 --- a/gcc/ddg.c +++ b/gcc/ddg.c @@ -633,7 +633,7 @@ create_ddg (basic_block bb, int closing_branch_deps) g->nodes[i].aux.count = -1; g->nodes[i].max_dist = XCNEWVEC (int, num_nodes); for (j = 0; j < num_nodes; j++) - g->nodes[i].max_dist[j] = -1; + g->nodes[i].max_dist[j] = -1; g->nodes[i++].insn = insn; first_note = NULL; @@ -838,7 +838,7 @@ set_recurrence_length (ddg_scc_ptr scc) int length = src->max_dist[dest->cuid]; if (length < 0) -continue; + continue; length += backarc->latency; result = MAX (result, (length / distance)); @@ -1069,8 +1069,8 @@ create_ddg_all_sccs (ddg_ptr g) n->max_dist[k] = 0; for (e = n->out; e; e = e->next_out) -if (e->distance == 0 && g->nodes[e->dest->cuid].aux.count == n->aux.count) - n->max_dist[e->dest->cuid] = e->latency; + if (e->distance == 0 && g->nodes[e->dest->cuid].aux.count == n->aux.count) + n->max_dist[e->dest->cuid] = e->latency; } /* Run main Floid-Warshall loop. We use only non-backarc edges @@ -1079,19 +1079,19 @@ create_ddg_all_sccs (ddg_ptr g) { scc = g->nodes[k].aux.count; if (scc != -1) -{ - for (i = 0; i < num_nodes; i++) -if (g->nodes[i].aux.count == scc) - for (j = 0; j < num_nodes; j++) -if (g->nodes[j].aux.count == scc -&& g->nodes[i].max_dist[k] >= 0 -&& g->nodes[k].max_dist[j] >= 0) - { -way = g->nodes[i].max_dist[k] + g->nodes[k].max_dist[j]; -if (g->nodes[i].max_dist[j] < way) - g->nodes[i].max_dist[j] = way; - } -} + { + for (i = 0; i < num_nodes; i++) + if (g->nodes[i].aux.count == scc) + for (j = 0; j < num_nodes; j++) + if (g->nodes[j].aux.count == scc + && g->nodes[i].max_dist[k] >= 0 + && g->nodes[k].max_dist[j] >= 0) + { + way = g->nodes[i].max_dist[k] + g->nodes[k].max_dist[j]; + if (g->nodes[i].max_dist[j] < way) + g->nodes[i].max_dist[j] = way; + } + } } /* Calculate recurrence_length using max_dist info. */
[committed] loop-iv: make find_simple_exit static
This patch marks find_simple_exit function as static. Committed as r10-7107 after successful bootstrap on x86_64 and powerpc64le. Roman -- Function 'find_simple_exit' is used only from loop-iv.c In 2004-2006 it was also used in predict.c, but since r118694 (992c31e62304ed5d34247dbdef2db276d08fac05) it does not. gcc/ChangeLog: * loop-iv.c (find_simple_exit): Make it static. * cfgloop.h: Remove the corresponding prototype. diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h index 11378cadd41..1c49a8b8c2d 100644 --- a/gcc/cfgloop.h +++ b/gcc/cfgloop.h @@ -499,7 +499,6 @@ extern bool iv_analyze_expr (rtx_insn *, scalar_int_mode, rtx, class rtx_iv *); extern rtx get_iv_value (class rtx_iv *, rtx); extern bool biv_p (rtx_insn *, scalar_int_mode, rtx); -extern void find_simple_exit (class loop *, class niter_desc *); extern void iv_analysis_done (void); extern class niter_desc *get_simple_loop_desc (class loop *loop); diff --git a/gcc/loop-iv.c b/gcc/loop-iv.c index 6a59954f60a..d7b3d90f047 100644 --- a/gcc/loop-iv.c +++ b/gcc/loop-iv.c @@ -2915,7 +2915,7 @@ check_simple_exit (class loop *loop, edge e, class niter_desc *desc) /* Finds a simple exit of LOOP and stores its description into DESC. */ -void +static void find_simple_exit (class loop *loop, class niter_desc *desc) { unsigned i;
Re: Fix modulo-scheduler -fcompare-debug issues
Hi all! 12.03.2020 6:17, Jeff Law wrote: >> Current modulo-sched implementation is a bit faulty from -fcompile-debug >> perspective. >> >> But right now I see that when I enable -fmodulo-sched by default, powerpc64le >> bootstrap give comparison failure as of r10-7056. >> >> Comparing stages 2 and 3 >> Bootstrap comparison failure! >> gcc/ggc-page.o differs >> >> That doesn't happen on released branches, so it is a kind of "regression" >> (certainly, nobody runs bootstrap with -fmodulo-sched). > Even though we don't have a BZ, I think a bootstrap failure, even one with > modulo-scheduling enabled is severe enough that we should try to fix it. > > OK for the trunk. > > jeff 11.03.2020 11:14, Richard Biener wrote: > > Yes from a RM perspective, no comments about the technical details of > the patch. > > Richard. > Haven't committed that patch yet but made some additional investigation. (0) Running tests with -fcompare-debug (without -fmodulo-sched) shows a lot of different failures, I've created few PRs already and they were addressed (Thanks to Jakub!). There are three kinds of issues. (0a) Technical ones, I've not reported them. Many checking techniques are not ready for the compiler to run twice for one driver invocation. E.g. FAIL: c-c++-common/diagnostic-format-json-2.c -Wc++-compat (test for excess errors) fails because second cc1 run prints additional '[]' to the output. All tests in gcc.dg/pch directory have a similar issue. (0b) When only dumps differ, but not the code, e.g. PR94223 and PR94167. (0c) When it is a real "mistake", like PR94166 or PR94211. Or -w difference like in PR94189. Now, speaking about SMS, enabling it by default catches three (again!) kinds of separate issues with debug instructions. And two first are observable for many years, while the third one is a bit more tricky and I caught it only in 2019. My previous patch targeted (2) and (3) together, and now I have to split it. (1) The first issue is that reordering doesn't touch debug instructions at all. When we have found a proper schedule permute_partial_schedule function is called. It constructs new order in reverse, first we move the instruction that should be prev_insn (jump) to its place, than the previous one, etc. It doesn't consider moving debug instructions, so all of them are crowded before any non-debug instruction. This certainly doesn't help much for debug information and on ppc64le causes FAIL: gcc.dg/guality/loop-1.c -O2 -DPREVENT_OPTIMIZATION line 20 i == 1 and few other failures. I never actually tried to fix those, maybe the best solution here is to drop all debug inside the loop when SMS succeeded. But in this case we will also lose something - at least for now we have correct debug info when the loop was finished. (2) The second issue is a "simple" -fcompare-debug failure, which actually can not lead to different code in .text (and can not cause bootstrap failure). As I mentioned in previous mail it can be observed in pr42631.c example for ~10 years. My previous patch and small attached patch2.txt (can be applied only after patch3.txt) solves this. Note instructions "stick" to nearest following instruction and they are moved all together. But when one compares -g0 and -g, the note can instead stick to a debug instruction, and according to (1) it will eventually "move up" together with that debug instruction instead of "sinking" with the following non-debug instruction. An obvious solution is to extend the "sticking" approach to debug instructions themselves. I've used that solution ("previous" patch) locally for ~3 years. This change also affects the approach described in (1) but unfortunately doesn't fix any testcase and make something even worse: FAIL: gcc.dg/guality/pr90716.c -O1 -DPREVENT_OPTIMIZATION line 23 j + 1 == 9 So, IMHO we should not apply this in stage4. (3) And the last one we have to fix now if we bother about -fmodulo-sched ppc64le bootstrap. Failing examples are "gcc -fcompare-debug -O2 -fmodulo-sched --param sms-min-sc=1 gcc/testsuite/gcc.dg/torture/pr87197.c" on ppc64le and "gcc -fcompare-debug -O2 gcc.c-torture/execute/pr70127.c" on aarch64. Contrary to scheduler itself, when building DDG graph we consider all debug instructions and their dependencies. This may sometimes lead to a different result in sms_order_nodes function, as it depends on SCCs in DDG. My previous ad-hoc solution was just to remove any "debug->non-debug" edges in DDG. Now I reimplemented that by fully removing debug instructions nodes. I've tested patch3.txt a lot last week in different scenarios and will commit it in a week if no objection. Roman Author: Roman Zhuykov Date: Thu Mar 12 19:36:22 2020 +0300 modulo-sched: fix compare-debug issue with flying note insns This patch fixes -fcompare-debug failures in testsuite when it runs with -fmodulo-sched enabled. The approach is to move debug
Re: Fix modulo-scheduler -fcompare-debug issues
Hi, Christophe Lyon wrote 2020-03-27 19:53: Hi, On Fri, 20 Mar 2020 at 11:56, Roman Zhuykov via Gcc-patches wrote: Hi all! 12.03.2020 6:17, Jeff Law wrote: >> Current modulo-sched implementation is a bit faulty from -fcompile-debug >> perspective. >> >> But right now I see that when I enable -fmodulo-sched by default, powerpc64le >> bootstrap give comparison failure as of r10-7056. >> >> Comparing stages 2 and 3 >> Bootstrap comparison failure! >> gcc/ggc-page.o differs >> >> That doesn't happen on released branches, so it is a kind of "regression" >> (certainly, nobody runs bootstrap with -fmodulo-sched). > Even though we don't have a BZ, I think a bootstrap failure, even one with > modulo-scheduling enabled is severe enough that we should try to fix it. > > OK for the trunk. > > jeff 11.03.2020 11:14, Richard Biener wrote: > > Yes from a RM perspective, no comments about the technical details of > the patch. > > Richard. > Haven't committed that patch yet but made some additional investigation. (0) Running tests with -fcompare-debug (without -fmodulo-sched) shows a lot of different failures, I've created few PRs already and they were addressed (Thanks to Jakub!). There are three kinds of issues. (0a) Technical ones, I've not reported them. Many checking techniques are not ready for the compiler to run twice for one driver invocation. E.g. FAIL: c-c++-common/diagnostic-format-json-2.c -Wc++-compat (test for excess errors) fails because second cc1 run prints additional '[]' to the output. All tests in gcc.dg/pch directory have a similar issue. (0b) When only dumps differ, but not the code, e.g. PR94223 and PR94167. (0c) When it is a real "mistake", like PR94166 or PR94211. Or -w difference like in PR94189. Now, speaking about SMS, enabling it by default catches three (again!) kinds of separate issues with debug instructions. And two first are observable for many years, while the third one is a bit more tricky and I caught it only in 2019. My previous patch targeted (2) and (3) together, and now I have to split it. (1) The first issue is that reordering doesn't touch debug instructions at all. When we have found a proper schedule permute_partial_schedule function is called. It constructs new order in reverse, first we move the instruction that should be prev_insn (jump) to its place, than the previous one, etc. It doesn't consider moving debug instructions, so all of them are crowded before any non-debug instruction. This certainly doesn't help much for debug information and on ppc64le causes FAIL: gcc.dg/guality/loop-1.c -O2 -DPREVENT_OPTIMIZATION line 20 i == 1 and few other failures. I never actually tried to fix those, maybe the best solution here is to drop all debug inside the loop when SMS succeeded. But in this case we will also lose something - at least for now we have correct debug info when the loop was finished. (2) The second issue is a "simple" -fcompare-debug failure, which actually can not lead to different code in .text (and can not cause bootstrap failure). As I mentioned in previous mail it can be observed in pr42631.c example for ~10 years. My previous patch and small attached patch2.txt (can be applied only after patch3.txt) solves this. Note instructions "stick" to nearest following instruction and they are moved all together. But when one compares -g0 and -g, the note can instead stick to a debug instruction, and according to (1) it will eventually "move up" together with that debug instruction instead of "sinking" with the following non-debug instruction. An obvious solution is to extend the "sticking" approach to debug instructions themselves. I've used that solution ("previous" patch) locally for ~3 years. This change also affects the approach described in (1) but unfortunately doesn't fix any testcase and make something even worse: FAIL: gcc.dg/guality/pr90716.c -O1 -DPREVENT_OPTIMIZATION line 23 j + 1 == 9 So, IMHO we should not apply this in stage4. (3) And the last one we have to fix now if we bother about -fmodulo-sched ppc64le bootstrap. Failing examples are "gcc -fcompare-debug -O2 -fmodulo-sched --param sms-min-sc=1 gcc/testsuite/gcc.dg/torture/pr87197.c" on ppc64le and "gcc -fcompare-debug -O2 gcc.c-torture/execute/pr70127.c" on aarch64. Contrary to scheduler itself, when building DDG graph we consider all debug instructions and their dependencies. This may sometimes lead to a different result in sms_order_nodes function, as it depends on SCCs in DDG. My previous ad-hoc solution was just to remove any "debug->non-debug" edges in DDG. Now I reimplemented that by fully removin
[PATCH] modulo-sched: Carefully process loop counter initialization [PR97421]
Hi all! Same patch attached with commit message and inlined below. It was successfully reg-strapped on aarch64-linux. Planning also to briefly check amd64 build before push. Pushing in a few days if no objections. Any opinion about backports? Roman -- modulo-sched: Carefully process loop counter initialization [PR97421] Do not allow direct adjustment of pre-header initialization instruction for count register if is read in some instruction below in that basic block. gcc/ChangeLog: PR rtl-optimization/97421 * modulo-sched.c (generate_prolog_epilog): Remove forward declaration, adjust last argument name and type. (const_iteration_count): Add bool pointer parameter to return whether count register is read in pre-header after its initialization. (sms_schedule): Fix count register initialization adjustment procedure according to what const_iteration_count said. gcc/testsuite/ChangeLog: PR rtl-optimization/97421 * gcc.c-torture/execute/pr97421-1.c: New test. * gcc.c-torture/execute/pr97421-2.c: New test. * gcc.c-torture/execute/pr97421-3.c: New test. diff --git a/gcc/modulo-sched.c b/gcc/modulo-sched.c index 6f699a874e..4568674aa6 100644 --- a/gcc/modulo-sched.c +++ b/gcc/modulo-sched.c @@ -210,8 +210,6 @@ static int sms_order_nodes (ddg_ptr, int, int *, int *); static void set_node_sched_params (ddg_ptr); static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *); static void permute_partial_schedule (partial_schedule_ptr, rtx_insn *); -static void generate_prolog_epilog (partial_schedule_ptr, class loop *, -rtx, rtx); static int calculate_stage_count (partial_schedule_ptr, int); static void calculate_must_precede_follow (ddg_node_ptr, int, int, int, int, sbitmap, sbitmap, sbitmap); @@ -391,30 +389,40 @@ doloop_register_get (rtx_insn *head, rtx_insn *tail) this constant. Otherwise return 0. */ static rtx_insn * const_iteration_count (rtx count_reg, basic_block pre_header, - int64_t * count) + int64_t *count, bool* adjust_inplace) { rtx_insn *insn; rtx_insn *head, *tail; + *adjust_inplace = false; + bool read_after = false; + if (! pre_header) return NULL; get_ebb_head_tail (pre_header, pre_header, &head, &tail); for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn)) -if (NONDEBUG_INSN_P (insn) && single_set (insn) && - rtx_equal_p (count_reg, SET_DEST (single_set (insn +if (single_set (insn) && rtx_equal_p (count_reg, + SET_DEST (single_set (insn { rtx pat = single_set (insn); if (CONST_INT_P (SET_SRC (pat))) { *count = INTVAL (SET_SRC (pat)); + *adjust_inplace = !read_after; return insn; } return NULL; } +else if (NONDEBUG_INSN_P (insn) && reg_mentioned_p (count_reg, insn)) + { + read_after = true; + if (reg_set_p (count_reg, insn)) + break; + } return NULL; } @@ -1126,7 +1134,7 @@ duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage, /* Generate the instructions (including reg_moves) for prolog & epilog. */ static void generate_prolog_epilog (partial_schedule_ptr ps, class loop *loop, -rtx count_reg, rtx count_init) + rtx count_reg, bool adjust_init) { int i; int last_stage = PS_STAGE_COUNT (ps) - 1; @@ -1135,12 +1143,12 @@ generate_prolog_epilog (partial_schedule_ptr ps, class loop *loop, /* Generate the prolog, inserting its insns on the loop-entry edge. */ start_sequence (); - if (!count_init) + if (adjust_init) { /* Generate instructions at the beginning of the prolog to - adjust the loop count by STAGE_COUNT. If loop count is constant - (count_init), this constant is adjusted by STAGE_COUNT in - generate_prolog_epilog function. */ +adjust the loop count by STAGE_COUNT. If loop count is constant +and it not used anywhere in prologue, this constant is adjusted by +STAGE_COUNT outside of generate_prolog_epilog function. */ rtx sub_reg = NULL_RTX; sub_reg = expand_simple_binop (GET_MODE (count_reg), MINUS, count_reg, @@ -1528,7 +1536,8 @@ sms_schedule (void) rtx_insn *count_init; int mii, rec_mii, stage_count, min_cycle; int64_t loop_count = 0; - bool opt_sc_p; + bool opt_sc_p, adjust_inplace = false; + basic_block pre_header; if (! (g = g_arr[loop->num])) continue; @@ -1569,19 +1578,13 @@ sms_schedule (void) } - /* In case of th loop have doloop register it gets special -handling. */ - count_init = NULL; - if ((count_reg = do