Hi Jakub,

> -----Original Message-----
> From: Jakub Jelinek <ja...@redhat.com>
> Sent: Tuesday, March 27, 2018 2:40 PM
> To: Richard Biener <rguent...@suse.de>
> Cc: gcc-patches@gcc.gnu.org; Kumar, Venkataramanan <venkataramanan.ku...@amd.com>
> Subject: [PATCH] Improve TRUTH_{AND,OR}IF_EXPR expansion (PR rtl-optimization/78200)
>
> Hi!
>
> The following patch attempts to improve expansion, if we have code like:
>   <bb 16> [local count: 102513059]:
>   if_conversion_var_52 = MEM[base: st1_22, offset: 0B];
>   if (if_conversion_var_52 < 0)
>     goto <bb 17>; [41.00%]
>   else
>     goto <bb 18>; [59.00%]
>
> ...
>
>   <bb 18> [local count: 60482706]:
>   _81 = _11 == 2;
>   _82 = if_conversion_var_52 > 0;
>   _79 = _81 & _82;
>   if (_79 != 0)
>     goto <bb 19>; [29.26%]
>   else
>     goto <bb 20>; [70.74%]
>
> Here, the single pred of the bb performed a similar comparison to what one
> of the & (or |) operands does, and as & (or |) is not ordered, we can choose
> which operand we'll expand first.  If we expand if_conversion_var_52 > 0
> first, there is a chance that we can reuse flags from the previous comparison.
> The patch does it only if there are no (non-virtual) phis on the current bb
> and all stmts before the current condition are TERable, so there is nothing
> that would clobber the flags in between.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk if it
> helps for SPEC?  Venkat, do you think you could benchmark it in the setup
> where you've measured the slowdown to see if it helps?  I see the patch
> changes the loop:

Sure, I will benchmark it and get back to you.
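For a quick local sanity check before the full SPEC run, I plan to start from
a reduced loop roughly like the one below.  This is only my own sketch of the
shape visible in your GIMPLE dump (struct rec, count_hits and the field names
are made up by me, they are not from the patch or from the benchmark sources);
the real hot loop certainly differs.

/* The predecessor block tests r[i].val < 0, and the following block tests
   (mode == 2 && r[i].val > 0), i.e. a second comparison of the same
   variable feeding a TRUTH_ANDIF-style condition, which as far as I
   understand is the case the new heuristic tries to catch.  */

struct rec { long val; int key; };

int
count_hits (struct rec *r, long n, int mode)
{
  int hits = 0;
  for (long i = 0; i < n; i++)
    {
      if (r[i].val < 0)
	hits -= r[i].key;
      else if (mode == 2 && r[i].val > 0)
	hits += r[i].key;
    }
  return hits;
}

Coming back to the quoted assembly: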
>  	.p2align 4,,10
>  	.p2align 3
>  .L11:
> +	jle	.L10
>  	cmpl	$2, %eax
> -	jne	.L10
> -	testq	%r8, %r8
> -	jg	.L12
> +	je	.L12
>  	.p2align 4,,10
>  	.p2align 3
>  .L10:
>  	addq	%r9, %rsi
>  	cmpq	%rsi, %rdx
>  	jbe	.L35
>  .L13:
>  	movl	24(%rsi), %eax
>  	testl	%eax, %eax
>  	jle	.L10
>  	movq	(%rsi), %r8
>  	testq	%r8, %r8
>  	jns	.L11
>  	cmpl	$1, %eax
>  	jne	.L10
>  .L12:
>  	addq	$1, %r10
>  	movl	$1, %r11d
>  	movq	st5(,%r10,8), %rax
>  	movq	%rsi, (%rax)
>  	addq	%r9, %rsi
>  	movq	%r8, 8(%rax)
>  	movq	st5(,%rdi,8), %rax
>  	movq	%r8, 16(%rax)
>  	cmpq	%rsi, %rdx
>  	ja	.L13
> which I assume shall be an improvement, since we can save one extra
> comparison.
>
> 2018-03-27  Jakub Jelinek  <ja...@redhat.com>
> 	    Richard Biener  <rgue...@suse.de>
>
> 	PR rtl-optimization/78200
> 	* cfgexpand.c (gimple_bb_info_for_bb): New variable.
> 	(expand_bb_seq, expand_phi_nodes): New functions.
> 	(expand_gimple_cond): Use heuristics if it is desirable to
> 	swap TRUTH_{AND,OR}IF_EXPR operands.
> 	(expand_gimple_basic_block): Remember GIMPLE il for bbs
> 	being expanded or already expanded.
> 	(pass_expand::execute): Initialize and then free the
> 	gimple_bb_info_for_bb vector.
>
> --- gcc/cfgexpand.c.jj	2018-02-09 06:44:36.413805556 +0100
> +++ gcc/cfgexpand.c	2018-03-26 13:35:57.536509844 +0200
> @@ -2357,6 +2357,34 @@ label_rtx_for_bb (basic_block bb ATTRIBU
>    return l;
>  }
>
> +/* Maps blocks to their GIMPLE IL.  */
> +static vec<gimple_bb_info, va_heap, vl_embed> *gimple_bb_info_for_bb;
> +
> +/* Like bb_seq, except that during expansion returns the GIMPLE seq
> +   even for the blocks that have been already expanded or are being
> +   currently expanded.  */
> +
> +static gimple_seq
> +expand_bb_seq (basic_block bb)
> +{
> +  if ((bb->flags & BB_RTL)
> +      && (unsigned) bb->index < vec_safe_length (gimple_bb_info_for_bb))
> +    return (*gimple_bb_info_for_bb)[bb->index].seq;
> +  return bb_seq (bb);
> +}
> +
> +/* Like phi_nodes, except that during expansion returns the GIMPLE PHIs
> +   even for the blocks that have been already expanded or are being
> +   currently expanded.  */
> +
> +static gimple_seq
> +expand_phi_nodes (basic_block bb)
> +{
> +  if ((bb->flags & BB_RTL)
> +      && (unsigned) bb->index < vec_safe_length (gimple_bb_info_for_bb))
> +    return (*gimple_bb_info_for_bb)[bb->index].phi_nodes;
> +  return phi_nodes (bb);
> +}
>
>  /* A subroutine of expand_gimple_cond.  Given E, a fallthrough edge
>     of a basic block where we just expanded the conditional at the end,
> @@ -2475,6 +2503,65 @@ expand_gimple_cond (basic_block bb, gcon
> 		  op0 = gimple_assign_rhs1 (second);
> 		  op1 = gimple_assign_rhs2 (second);
> 		}
> +
> +	      /* We'll expand RTL for op0 first, see if we'd better expand RTL
> +		 for op1 first.  Do that if the previous bb ends with
> +		 if (x op cst), op1's def_stmt rhs is x op2 cst and there are
> +		 no non-virtual PHIs nor non-TERed stmts in BB before STMT.  */
> +	      while (TREE_CODE (op1) == SSA_NAME
> +		     && (code == TRUTH_ANDIF_EXPR || code == TRUTH_ORIF_EXPR)
> +		     && single_pred_p (bb))
> +		{
> +		  gimple *def1 = SSA_NAME_DEF_STMT (op1);
> +		  if (!is_gimple_assign (def1)
> +		      || (TREE_CODE_CLASS (gimple_assign_rhs_code (def1))
> +			  != tcc_comparison))
> +		    break;
> +
> +		  basic_block pred = single_pred (bb);
> +		  gimple_seq pred_seq = expand_bb_seq (pred);
> +		  gimple_stmt_iterator i = gsi_last (pred_seq);
> +		  if (!gsi_end_p (i) && is_gimple_debug (gsi_stmt (i)))
> +		    gsi_prev_nondebug (&i);
> +
> +		  gimple *last = gsi_stmt (i);
> +		  if (!last
> +		      || gimple_code (last) != GIMPLE_COND
> +		      || gimple_assign_rhs1 (def1) != gimple_cond_lhs (last)
> +		      || !operand_equal_p (gimple_assign_rhs2 (def1),
> +					   gimple_cond_rhs (last), 0))
> +		    break;
> +
> +		  gimple_seq phi_seq = expand_phi_nodes (bb);
> +		  i = gsi_start (phi_seq);
> +		  if (!gsi_end_p (i)
> +		      && !virtual_operand_p (gimple_phi_result (gsi_stmt (i))))
> +		    break;
> +
> +		  /* Check if all stmts before STMT are TERed.  */
> +		  gimple_seq cur_seq = expand_bb_seq (bb);
> +		  i = gsi_start_nondebug (cur_seq);
> +		  int cnt = 100;
> +		  while (!gsi_end_p (i) && gsi_stmt (i) != stmt)
> +		    {
> +		      gimple *g = gsi_stmt (i);
> +		      if (!is_gimple_assign (g))
> +			break;
> +		      if (--cnt == 0)
> +			break;
> +		      tree lhs = gimple_assign_lhs (g);
> +		      if (TREE_CODE (lhs) != SSA_NAME
> +			  || !get_gimple_for_ssa_name (lhs))
> +			break;
> +		      gsi_next_nondebug (&i);
> +		    }
> +
> +		  if (gsi_stmt (i) != stmt)
> +		    break;
> +
> +		  std::swap (op0, op1);
> +		  break;
> +		}
> 	    }
> 	}
>      }
> @@ -5501,6 +5588,10 @@ expand_gimple_basic_block (basic_block b
>    if (optimize)
>      reorder_operands (bb);
>    stmts = bb_seq (bb);
> +  if ((unsigned) bb->index >= vec_safe_length (gimple_bb_info_for_bb))
> +    vec_safe_grow_cleared (gimple_bb_info_for_bb, bb->index + 1);
> +  (*gimple_bb_info_for_bb)[bb->index].seq = stmts;
> +  (*gimple_bb_info_for_bb)[bb->index].phi_nodes = phi_nodes (bb);
>    bb->il.gimple.seq = NULL;
>    bb->il.gimple.phi_nodes = NULL;
>    rtl_profile_for_bb (bb);
> @@ -6419,6 +6510,8 @@ pass_expand::execute (function *fun)
> 	  >= PARAM_VALUE (PARAM_MAX_DEBUG_MARKER_COUNT))
>      cfun->debug_nonbind_markers = false;
>
> +  vec_safe_grow_cleared (gimple_bb_info_for_bb, n_basic_blocks_for_fn (cfun));
> +
>    lab_rtx_for_bb = new hash_map<basic_block, rtx_code_label *>;
>    FOR_BB_BETWEEN (bb, init_block->next_bb, EXIT_BLOCK_PTR_FOR_FN (fun),
> 		  next_bb)
> @@ -6433,6 +6526,8 @@ pass_expand::execute (function *fun)
>        deep_ter_debug_map = NULL;
>      }
>
> +  vec_free (gimple_bb_info_for_bb);
> +
>    /* Free stuff we no longer need after GIMPLE optimizations.  */
>    free_dominance_info (CDI_DOMINATORS);
>    free_dominance_info (CDI_POST_DOMINATORS);
>
> 	Jakub

Regards,
Venkat.
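P.S.  In case it is useful, below is roughly the throwaway driver I intend to
use for timing the reduced loop sketched earlier in this mail.  It is my own
scratch code, not part of the patch, and the array size, iteration count and
value distribution are arbitrary.  The plan is simply to build it together
with the reduced loop once with the unpatched compiler and once with the
patched one, then compare run times and the assembly generated for count_hits.

#include <stdio.h>
#include <stdlib.h>

struct rec { long val; int key; };

/* Defined in the reduced testcase sketched earlier in this mail.  */
int count_hits (struct rec *r, long n, int mode);

int
main (void)
{
  long n = 1 << 20;
  struct rec *r = malloc (n * sizeof *r);
  if (!r)
    return 1;
  srand (1);
  for (long i = 0; i < n; i++)
    {
      r[i].val = (rand () % 7) - 3;	/* mix of negative, zero and positive */
      r[i].key = rand () % 5;
    }
  long sum = 0;
  for (int iter = 0; iter < 2000; iter++)
    sum += count_hits (r, n, 2);
  /* Print the result so the loops cannot be optimized away.  */
  printf ("%ld\n", sum);
  free (r);
  return 0;
}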