The following fixes PR90402 by avoiding value-numbering bring the loop header PHIs of if-converted and non-if-converted loop copy out-of-sync for the vectorizer.
This is done by teaching region VN to handle multiple entries into the entry block which allows us to value-number a cycles body without considering the backedge (just forcing all the entry PHIs varying). Single BB cycles probably still cannot be handled since they'd have entry->dest == exit_bb, but I didn't actually try... Bootstrap & regtest running on x86_64-unknown-linux-gnu. Richard. 2019-05-09 Richard Biener <rguent...@suse.de> PR tree-optimization/90402 * tree-if-conv.c (tree_if_conversion): Value number only the loop body by making the latch an exit of the region as well. * tree-ssa-sccvn.c (process_bb): Add flag whether to skip processing PHIs. (do_rpo_vn): Deal with multiple edges into the entry block that are not backedges inside the region by skipping PHIs of the entry block. * gcc.dg/torture/pr90402-1.c: New testcase. Index: gcc/tree-if-conv.c =================================================================== --- gcc/tree-if-conv.c (revision 271030) +++ gcc/tree-if-conv.c (working copy) @@ -3066,10 +3066,12 @@ tree_if_conversion (struct loop *loop, v ifcvt_local_dce (loop->header); /* Perform local CSE, this esp. helps the vectorizer analysis if loads - and stores are involved. + and stores are involved. CSE only the loop body, not the entry + PHIs, those are to be kept in sync with the non-if-converted copy. ??? We'll still keep dead stores though. */ exit_bbs = BITMAP_ALLOC (NULL); bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index); + bitmap_set_bit (exit_bbs, loop->latch->index); todo |= do_rpo_vn (cfun, loop_preheader_edge (loop), exit_bbs); BITMAP_FREE (exit_bbs); Index: gcc/tree-ssa-sccvn.c =================================================================== --- gcc/tree-ssa-sccvn.c (revision 271030) +++ gcc/tree-ssa-sccvn.c (working copy) @@ -5978,7 +5978,7 @@ insert_related_predicates_on_edge (enum static unsigned process_bb (rpo_elim &avail, basic_block bb, bool bb_visited, bool iterate_phis, bool iterate, bool eliminate, - bool do_region, bitmap exit_bbs) + bool do_region, bitmap exit_bbs, bool skip_phis) { unsigned todo = 0; edge_iterator ei; @@ -5989,7 +5989,8 @@ process_bb (rpo_elim &avail, basic_block /* If we are in loop-closed SSA preserve this state. This is relevant when called on regions from outside of FRE/PRE. */ bool lc_phi_nodes = false; - if (loops_state_satisfies_p (LOOP_CLOSED_SSA)) + if (!skip_phis + && loops_state_satisfies_p (LOOP_CLOSED_SSA)) FOR_EACH_EDGE (e, ei, bb->preds) if (e->src->loop_father != e->dest->loop_father && flow_loop_nested_p (e->dest->loop_father, @@ -6010,67 +6011,68 @@ process_bb (rpo_elim &avail, basic_block } /* Value-number all defs in the basic-block. */ - for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi); - gsi_next (&gsi)) - { - gphi *phi = gsi.phi (); - tree res = PHI_RESULT (phi); - vn_ssa_aux_t res_info = VN_INFO (res); - if (!bb_visited) - { - gcc_assert (!res_info->visited); - res_info->valnum = VN_TOP; - res_info->visited = true; - } - - /* When not iterating force backedge values to varying. */ - visit_stmt (phi, !iterate_phis); - if (virtual_operand_p (res)) - continue; - - /* Eliminate */ - /* The interesting case is gcc.dg/tree-ssa/pr22230.c for correctness - how we handle backedges and availability. - And gcc.dg/tree-ssa/ssa-sccvn-2.c for optimization. */ - tree val = res_info->valnum; - if (res != val && !iterate && eliminate) - { - if (tree leader = avail.eliminate_avail (bb, res)) - { - if (leader != res - /* Preserve loop-closed SSA form. */ - && (! lc_phi_nodes - || is_gimple_min_invariant (leader))) - { - if (dump_file && (dump_flags & TDF_DETAILS)) - { - fprintf (dump_file, "Replaced redundant PHI node " - "defining "); - print_generic_expr (dump_file, res); - fprintf (dump_file, " with "); - print_generic_expr (dump_file, leader); - fprintf (dump_file, "\n"); - } - avail.eliminations++; + if (!skip_phis) + for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gphi *phi = gsi.phi (); + tree res = PHI_RESULT (phi); + vn_ssa_aux_t res_info = VN_INFO (res); + if (!bb_visited) + { + gcc_assert (!res_info->visited); + res_info->valnum = VN_TOP; + res_info->visited = true; + } - if (may_propagate_copy (res, leader)) - { - /* Schedule for removal. */ - avail.to_remove.safe_push (phi); - continue; - } - /* ??? Else generate a copy stmt. */ - } - } - } - /* Only make defs available that not already are. But make - sure loop-closed SSA PHI node defs are picked up for - downstream uses. */ - if (lc_phi_nodes - || res == val - || ! avail.eliminate_avail (bb, res)) - avail.eliminate_push_avail (bb, res); - } + /* When not iterating force backedge values to varying. */ + visit_stmt (phi, !iterate_phis); + if (virtual_operand_p (res)) + continue; + + /* Eliminate */ + /* The interesting case is gcc.dg/tree-ssa/pr22230.c for correctness + how we handle backedges and availability. + And gcc.dg/tree-ssa/ssa-sccvn-2.c for optimization. */ + tree val = res_info->valnum; + if (res != val && !iterate && eliminate) + { + if (tree leader = avail.eliminate_avail (bb, res)) + { + if (leader != res + /* Preserve loop-closed SSA form. */ + && (! lc_phi_nodes + || is_gimple_min_invariant (leader))) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Replaced redundant PHI node " + "defining "); + print_generic_expr (dump_file, res); + fprintf (dump_file, " with "); + print_generic_expr (dump_file, leader); + fprintf (dump_file, "\n"); + } + avail.eliminations++; + + if (may_propagate_copy (res, leader)) + { + /* Schedule for removal. */ + avail.to_remove.safe_push (phi); + continue; + } + /* ??? Else generate a copy stmt. */ + } + } + } + /* Only make defs available that not already are. But make + sure loop-closed SSA PHI node defs are picked up for + downstream uses. */ + if (lc_phi_nodes + || res == val + || ! avail.eliminate_avail (bb, res)) + avail.eliminate_push_avail (bb, res); + } /* For empty BBs mark outgoing edges executable. For non-empty BBs we do this when processing the last stmt as we have to do this @@ -6414,6 +6416,13 @@ do_rpo_vn (function *fn, edge entry, bit bitmap_set_bit (exit_bbs, EXIT_BLOCK); } + /* Clear EDGE_DFS_BACK on "all" entry edges, RPO order compute will + re-mark those that are contained in the region. */ + edge_iterator ei; + edge e; + FOR_EACH_EDGE (e, ei, entry->dest->preds) + e->flags &= ~EDGE_DFS_BACK; + int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fn) - NUM_FIXED_BLOCKS); int n = rev_post_order_and_mark_dfs_back_seme (fn, entry, exit_bbs, !loops_state_satisfies_p (LOOPS_NEED_FIXUP), rpo); @@ -6424,6 +6433,18 @@ do_rpo_vn (function *fn, edge entry, bit if (!do_region) BITMAP_FREE (exit_bbs); + /* If there are any non-DFS_BACK edges into entry->dest skip + processing PHI nodes for that block. This supports + value-numbering loop bodies w/o the actual loop. */ + FOR_EACH_EDGE (e, ei, entry->dest->preds) + if (e != entry + && !(e->flags & EDGE_DFS_BACK)) + break; + bool skip_entry_phis = e != NULL; + if (skip_entry_phis && dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Region does not contain all edges into " + "the entry block, skipping its PHIs.\n"); + int *bb_to_rpo = XNEWVEC (int, last_basic_block_for_fn (fn)); for (int i = 0; i < n; ++i) bb_to_rpo[rpo[i]] = i; @@ -6453,7 +6474,9 @@ do_rpo_vn (function *fn, edge entry, bit edge e; edge_iterator ei; FOR_EACH_EDGE (e, ei, bb->preds) - gcc_assert (e == entry || (e->src->flags & bb_in_region)); + gcc_assert (e == entry + || (skip_entry_phis && bb == entry->dest) + || (e->src->flags & bb_in_region)); } for (int i = 0; i < n; ++i) { @@ -6497,7 +6520,7 @@ do_rpo_vn (function *fn, edge entry, bit if (e->flags & EDGE_DFS_BACK) has_backedges = true; e->flags &= ~EDGE_EXECUTABLE; - if (iterate || e == entry) + if (iterate || e == entry || (skip_entry_phis && bb == entry->dest)) continue; if (bb_to_rpo[e->src->index] > i) { @@ -6530,7 +6553,7 @@ do_rpo_vn (function *fn, edge entry, bit edge_iterator ei; FOR_EACH_EDGE (e, ei, bb->preds) { - if (e == entry) + if (e == entry || (skip_entry_phis && bb == entry->dest)) continue; int max_rpo = MAX (rpo_state[i].max_rpo, rpo_state[bb_to_rpo[e->src->index]].max_rpo); @@ -6619,7 +6642,7 @@ do_rpo_vn (function *fn, edge entry, bit todo |= process_bb (avail, bb, rpo_state[idx].visited != 0, rpo_state[idx].iterate, - iterate, eliminate, do_region, exit_bbs); + iterate, eliminate, do_region, exit_bbs, false); rpo_state[idx].visited++; /* Verify if changed values flow over executable outgoing backedges @@ -6717,8 +6740,10 @@ do_rpo_vn (function *fn, edge entry, bit edge e; FOR_EACH_EDGE (e, ei, bb->preds) if (!(e->flags & EDGE_EXECUTABLE) - && !rpo_state[bb_to_rpo[e->src->index]].visited - && rpo_state[bb_to_rpo[e->src->index]].max_rpo >= (int)idx) + && (bb == entry->dest + || (!rpo_state[bb_to_rpo[e->src->index]].visited + && (rpo_state[bb_to_rpo[e->src->index]].max_rpo + >= (int)idx)))) { if (dump_file && (dump_flags & TDF_DETAILS)) fprintf (dump_file, "Cannot trust state of predecessor " @@ -6729,7 +6754,8 @@ do_rpo_vn (function *fn, edge entry, bit nblk++; todo |= process_bb (avail, bb, false, false, false, eliminate, - do_region, exit_bbs); + do_region, exit_bbs, + skip_entry_phis && bb == entry->dest); rpo_state[idx].visited++; FOR_EACH_EDGE (e, ei, bb->succs) @@ -6811,7 +6837,9 @@ do_rpo_vn (function *fn, edge entry, bit } /* Region-based entry for RPO VN. Performs value-numbering and elimination - on the SEME region specified by ENTRY and EXIT_BBS. */ + on the SEME region specified by ENTRY and EXIT_BBS. If ENTRY is not + the only edge into the region at ENTRY->dest PHI nodes in ENTRY->dest + are not considered. */ unsigned do_rpo_vn (function *fn, edge entry, bitmap exit_bbs) Index: gcc/testsuite/gcc.dg/torture/pr90402-1.c =================================================================== --- gcc/testsuite/gcc.dg/torture/pr90402-1.c (nonexistent) +++ gcc/testsuite/gcc.dg/torture/pr90402-1.c (working copy) @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-mavx" { target x86_64-*-* i?86-*-* } } */ + +int kn, ha; + +int +c7 (void) +{ +} + +void +ul (int w3) +{ + kn = c7 (); + + while (w3 < 1) + { + ha += !!kn ? 1 : w3; + + for (kn = 0; kn < 2; ++kn) + { + } + + ++w3; + } +}