Resending my response in plain text so it will go through to gcc-patches...
On Tue, Apr 24, 2012 at 2:36 PM, Teresa Johnson <[email protected]> wrote: > > > > On Tue, Apr 24, 2012 at 2:30 PM, Andrew Pinski <[email protected]> wrote: >> >> On Tue, Apr 24, 2012 at 2:26 PM, Teresa Johnson <[email protected]> wrote: >> > This patch adds heuristics to limit unrolling in loops with branches that >> > may increase >> > branch mispredictions. It affects loops that are not frequently iterated, >> > and that are >> > nested within a hot region of code that already contains many branch >> > instructions. >> > >> > Performance tested with both internal benchmarks and with SPEC 2000/2006 >> > on a variety >> > of Intel systems (Core2, Corei7, SandyBridge) and a couple of different >> > AMD Opteron systems. >> > This improves performance of an internal search indexing benchmark by >> > close to 2% on >> > all the tested Intel platforms. It also consistently improves 445.gobmk >> > (with FDO feedback >> > where unrolling kicks in) by close to 1% on AMD Opteron. Other performance >> > effects are >> > neutral. >> > >> > Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk? >> > >> > Thanks, >> > Teresa >> > >> > 2012-04-24 Teresa Johnson <[email protected]> >> > >> > * loop-unroll.c (loop_has_call): New function. >> > (loop_has_FP_comp): Ditto. >> > (compute_weighted_branches): Ditto. >> > (max_unroll_with_branches): Ditto. >> > (decide_unroll_constant_iterations): Add heuristic to avoid >> > increasing branch mispredicts when unrolling. >> > (decide_unroll_runtime_iterations): Ditto. >> > * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param. >> > (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto. >> > >> > Index: loop-unroll.c >> > =================================================================== >> > --- loop-unroll.c (revision 186783) >> > +++ loop-unroll.c (working copy) >> > @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc >> > basic_block); >> > static rtx get_expansion (struct var_to_expand *); >> > >> > +/* Determine whether LOOP contains call. */ >> > +static bool >> > +loop_has_call(struct loop *loop) >> > +{ >> > + basic_block *body, bb; >> > + unsigned i; >> > + rtx insn; >> > + >> > + body = get_loop_body (loop); >> > + for (i = 0; i < loop->num_nodes; i++) >> > + { >> > + bb = body[i]; >> > + >> > + FOR_BB_INSNS (bb, insn) >> > + { >> > + if (CALL_P (insn)) >> > + { >> > + free (body); >> > + return true; >> > + } >> > + } >> > + } >> > + free (body); >> > + return false; >> > +} >> > + >> > +/* Determine whether LOOP contains floating-point computation. */ >> > +static bool >> > +loop_has_FP_comp(struct loop *loop) >> > +{ >> > + rtx set, dest; >> > + basic_block *body, bb; >> > + unsigned i; >> > + rtx insn; >> > + >> > + body = get_loop_body (loop); >> > + for (i = 0; i < loop->num_nodes; i++) >> > + { >> > + bb = body[i]; >> > + >> > + FOR_BB_INSNS (bb, insn) >> > + { >> > + set = single_set (insn); >> > + if (!set) >> > + continue; >> > + >> > + dest = SET_DEST (set); >> > + if (FLOAT_MODE_P (GET_MODE (dest))) >> > + { >> > + free (body); >> > + return true; >> > + } >> > + } >> > + } >> > + free (body); >> > + return false; >> > +} >> > + >> > +/* Compute the number of branches in LOOP, weighted by execution counts. >> > */ >> > +static float >> > +compute_weighted_branches(struct loop *loop) >> > +{ >> > + int header_count = loop->header->count; >> > + unsigned i; >> > + float n; >> > + basic_block * body; >> > + >> > + /* If no profile feedback data exists, don't limit unrolling */ >> > + if (header_count == 0) >> > + return 0.0; >> > + >> > + gcc_assert (loop->latch != EXIT_BLOCK_PTR); >> > + >> > + body = get_loop_body (loop); >> > + n = 0.0; >> > + for (i = 0; i < loop->num_nodes; i++) >> > + { >> > + if (EDGE_COUNT (body[i]->succs) >= 2) >> > + { >> > + /* If this block is executed less frequently than the header >> > (loop >> > + entry), then it is weighted based on the ratio of times it is >> > + executed compared to the header. */ >> > + if (body[i]->count < header_count) >> > + n += ((float)body[i]->count)/header_count; >> >> Please don't introduce more floating point usage into the compiler >> since it could change between different hosts (sse vs x87 for an >> example). >> Maybe use a fixed point multiply of 1000 (note use a macro for this >> special value though) like what is used in the rest of the predict >> code. > > > Ok, got it. I will address this in the next version of the patch. > > Thanks, > Teresa > >> >> >> >> Thanks, >> Andrew Pinski >> >> >> > + >> > + /* When it is executed more frequently than the header (i.e. it >> > is >> > + in a nested inner loop), simply weight the branch at 1.0. */ >> > + else >> > + n += 1.0; >> > + } >> > + } >> > + free (body); >> > + >> > + return n; >> > +} >> > + >> > +/* Compute the maximum number of times LOOP can be unrolled without >> > exceeding >> > + a branch budget, which can increase branch mispredictions. The number >> > of >> > + branches is computed by weighting each branch with its expected >> > execution >> > + probability through the loop based on profile data. If no profile >> > feedback >> > + data exists, simply return the current NUNROLL factor. */ >> > +static unsigned >> > +max_unroll_with_branches(struct loop *loop, unsigned nunroll) >> > +{ >> > + struct loop *outer; >> > + struct niter_desc *outer_desc; >> > + int outer_niters = 1; >> > + float weighted_outer_branches = 0.0; >> > + float weighted_num_branches = compute_weighted_branches (loop); >> > + >> > + /* If there was no profile feedback data, weighted_num_branches will be >> > 0.0 >> > + and we won't limit unrolling. If the weighted_num_branches is at >> > most 1.0, >> > + also don't limit unrolling as the back-edge branch will not be >> > duplicated. */ >> > + if (weighted_num_branches <= 1.0) >> > + return nunroll; >> > + >> > + /* Walk up the loop tree until we find a hot outer loop in which the >> > current >> > + loop is nested. At that point we will compute the number of times the >> > + current loop can be unrolled based on the number of branches in the >> > hot >> > + outer loop. */ >> > + outer = loop_outer(loop); >> > + /* The loop structure contains a fake outermost loop, so this should >> > always >> > + be non-NULL for our current loop. */ >> > + gcc_assert (outer); >> > + /* Detect if this is the fake outermost loop (at which point we are >> > done) >> > + by checking its outer loop. */ >> > + while (loop_outer(outer)) >> > + { >> > + outer_desc = get_simple_loop_desc (outer); >> > + >> > + if (outer_desc->const_iter) >> > + outer_niters *= outer_desc->niter; >> > + else if (outer->header->count) >> > + outer_niters *= expected_loop_iterations (outer); >> > + >> > + weighted_outer_branches = compute_weighted_branches (outer); >> > + >> > + /* Should have been checked by caller. */ >> > + gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1); >> > + >> > + /* If the outer loop has enough iterations to be considered hot, >> > then >> > + we can stop our upwards loop tree traversal and examine the >> > current >> > + outer loop. */ >> > + if (outer_niters >= PARAM_VALUE >> > (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) >> > + { >> > + /* Assume that any call will cause the branch budget to be >> > exceeded, >> > + and that we can't unroll the current loop without increasing >> > + mispredicts. */ >> > + if (loop_has_call(outer)) >> > + return 0; >> > + >> > + /* Otherwise, compute the maximum number of times current loop >> > can be >> > + unrolled without exceeding our branch budget. First we >> > subtract >> > + off the outer loop's weighted branch count from the budget. >> > Note >> > + that this includes the branches in the current loop. This >> > yields >> > + the number of branches left in the budget for the unrolled >> > copies. >> > + We divide this by the number of branches in the current loop >> > that >> > + must be duplicated when we unroll, which is the total >> > weighted >> > + number of branches minus the back-edge branch. This yields >> > the >> > + number of new loop body copies that can be created by >> > unrolling >> > + without exceeding the budget, to which we add 1 to get the >> > unroll >> > + factor. */ >> > + return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) - >> > + weighted_outer_branches)/(weighted_num_branches - 1) + >> > 1; >> > + } >> > + outer = loop_outer(outer); >> > + } >> > + >> > + /* The current loop is not enclosed by a hot enough outer loop in this >> > + procedure, since the hot outer loop is inter-procedural, assume that >> > + it already contains a significant number of branches, so don't >> > unroll. */ >> > + return 0; >> > +} >> > + >> > /* Unroll and/or peel (depending on FLAGS) LOOPS. */ >> > void >> > unroll_and_peel_loops (int flags) >> > @@ -522,6 +696,7 @@ static void >> > decide_unroll_constant_iterations (struct loop *loop, int flags) >> > { >> > unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, >> > i; >> > + unsigned nunroll_branches; >> > struct niter_desc *desc; >> > >> > if (!(flags & UAP_UNROLL)) >> > @@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo >> > return; >> > } >> > >> > + /* Be careful when unrolling loops with branches inside -- it can >> > increase >> > + the number of mispredicts. Ignore loops with FP computation as these >> > + tend to benefit much more consistently from unrolling. */ >> > + if (num_loop_branches (loop) > 1 >> > + && loop_has_FP_comp(loop) >> > + && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1 >> > + && desc->niter < (unsigned) PARAM_VALUE >> > (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) >> > + { >> > + nunroll_branches = max_unroll_with_branches(loop, nunroll); >> > + if (nunroll > nunroll_branches) >> > + nunroll = nunroll_branches; >> > + if (nunroll <= 1) >> > + { >> > + if (dump_file) >> > + fprintf (dump_file, ";; Not unrolling, contains branches\n"); >> > + return; >> > + } >> > + } >> > + >> > /* Check whether the loop rolls enough to consider. */ >> > if (desc->niter < 2 * nunroll) >> > { >> > @@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop >> > static void >> > decide_unroll_runtime_iterations (struct loop *loop, int flags) >> > { >> > - unsigned nunroll, nunroll_by_av, i; >> > + unsigned nunroll, nunroll_by_av, nunroll_branches, i; >> > struct niter_desc *desc; >> > >> > if (!(flags & UAP_UNROLL)) >> > @@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo >> > return; >> > } >> > >> > + /* Be careful when unrolling loops with branches inside -- it can >> > increase >> > + the number of mispredicts. Ignore loops with FP computation as these >> > + tend to benefit much more consistently from unrolling. */ >> > + if (num_loop_branches (loop) > 1 >> > + && loop_has_FP_comp(loop) >> > + && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1 >> > + && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE >> > (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES)) >> > + { >> > + nunroll_branches = max_unroll_with_branches(loop, nunroll); >> > + if (nunroll > nunroll_branches) >> > + nunroll = nunroll_branches; >> > + if (nunroll <= 1) >> > + { >> > + if (dump_file) >> > + fprintf (dump_file, ";; Not unrolling, contains branches\n"); >> > + return; >> > + } >> > + } >> > + >> > /* If we have profile feedback, check whether the loop rolls. */ >> > if ((loop->header->count >> > && expected_loop_iterations (loop) < 2 * nunroll) >> > Index: params.def >> > =================================================================== >> > --- params.def (revision 186783) >> > +++ params.def (working copy) >> > @@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS, >> > "The maximum depth of a loop nest we completely peel", >> > 8, 0, 0) >> > >> > +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES, >> > + "min-iter-unroll-with-branches", >> > + "Minimum iteration count to ignore branch effects when unrolling", >> > + 50, 0, 0) >> > + >> > +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET, >> > + "unroll-outer-loop-branch-budget", >> > + "Maximum number of branches allowed in hot outer loop region after >> > unroll", >> > + 25, 0, 0) >> > + >> > /* The maximum number of insns of an unswitched loop. */ >> > DEFPARAM(PARAM_MAX_UNSWITCH_INSNS, >> > "max-unswitch-insns", >> > >> > -- >> > This patch is available for review at http://codereview.appspot.com/6099055 > > > > > -- > Teresa Johnson | Software Engineer | [email protected] | 408-460-2413 > -- Teresa Johnson | Software Engineer | [email protected] | 408-460-2413
