On Thu, 29 Apr 2021, Jiufu Guo wrote:
> When there is the possibility that overflow may happen on the loop index,
> a few optimizations would not happen. For example code:
>
> foo (int *a, int *b, unsigned k, unsigned n)
> {
> while (++k != n)
> a[k] = b[k] + 1;
> }
>
> For this code, if "l > n", overflow may happen. if "l < n" at begining,
> it could be optimized (e.g. vectorization).
>
> We can split the loop into two loops:
>
> while (++k > n)
> a[k] = b[k] + 1;
> while (l++ < n)
> a[k] = b[k] + 1;
>
> then for the second loop, it could be optimized.
>
> This patch is spltting this kind of small loop to achieve better performance.
>
> Bootstrap and regtest pass on ppc64le. Is this ok for trunk?
Do you have any statistics on how often this splits a loop during
bootstrap (use --with-build-config=bootstrap-O3)? Or alternatively
on SPEC?
Actual comments on the patch inline.
> Thanks!
>
> Jiufu Guo.
>
> gcc/ChangeLog:
>
> 2021-04-29 Jiufu Guo <[email protected]>
>
> * params.opt (max-insns-ne-cond-split): New.
> * tree-ssa-loop-split.c (connect_loop_phis): Add new param.
> (get_ne_cond_branch): New function.
> (split_ne_loop): New function.
> (split_loop_on_ne_cond): New function.
> (tree_ssa_split_loops): Use split_loop_on_ne_cond.
>
> gcc/testsuite/ChangeLog:
> 2021-04-29 Jiufu Guo <[email protected]>
>
> * gcc.dg/loop-split1.c: New test.
>
> ---
> gcc/params.opt | 4 +
> gcc/testsuite/gcc.dg/loop-split1.c | 28 ++++
> gcc/tree-ssa-loop-split.c | 219 ++++++++++++++++++++++++++++-
> 3 files changed, 247 insertions(+), 4 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/loop-split1.c
>
> diff --git a/gcc/params.opt b/gcc/params.opt
> index 2e4cbdd7a71..900b59b5136 100644
> --- a/gcc/params.opt
> +++ b/gcc/params.opt
> @@ -766,6 +766,10 @@ Min. ratio of insns to prefetches to enable prefetching
> for a loop with an unkno
> Common Joined UInteger Var(param_min_loop_cond_split_prob) Init(30)
> IntegerRange(0, 100) Param Optimization
> The minimum threshold for probability of semi-invariant condition statement
> to trigger loop split.
>
> +-param=max-insns-ne-cond-split=
> +Common Joined UInteger Var(param_max_insn_ne_cond_split) Init(64) Param
> Optimization
> +The maximum threshold for insnstructions number of a loop with ne condition
> to split.
> +
> -param=min-nondebug-insn-uid=
> Common Joined UInteger Var(param_min_nondebug_insn_uid) Param
> The minimum UID to be used for a nondebug insn.
> diff --git a/gcc/testsuite/gcc.dg/loop-split1.c
> b/gcc/testsuite/gcc.dg/loop-split1.c
> new file mode 100644
> index 00000000000..4c466aa9f54
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-split1.c
> @@ -0,0 +1,28 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
> +
> +void
> +foo (int *a, int *b, unsigned l, unsigned n)
> +{
> + while (++l != n)
> + a[l] = b[l] + 1;
> +}
> +
> +void
> +foo1 (int *a, int *b, unsigned l, unsigned n)
> +{
> + while (l++ != n)
> + a[l] = b[l] + 1;
> +}
> +
> +unsigned
> +foo2 (char *a, char *b, unsigned l, unsigned n)
> +{
> + while (++l != n)
> + if (a[l] != b[l])
> + break;
> +
> + return l;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "Loop split" 3 "lsplit" } } */
> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
> index b80b6a75e62..a6d28078e5e 100644
> --- a/gcc/tree-ssa-loop-split.c
> +++ b/gcc/tree-ssa-loop-split.c
> @@ -41,6 +41,7 @@ along with GCC; see the file COPYING3. If not see
> #include "cfghooks.h"
> #include "gimple-fold.h"
> #include "gimplify-me.h"
> +#include "tree-ssa-loop-ivopts.h"
>
> /* This file implements two kinds of loop splitting.
>
> @@ -233,7 +234,8 @@ easy_exit_values (class loop *loop)
> this. The loops need to fulfill easy_exit_values(). */
>
> static void
> -connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
> +connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e,
> + bool use_prev = false)
> {
> basic_block rest = loop_preheader_edge (loop2)->src;
> gcc_assert (new_e->dest == rest);
> @@ -248,13 +250,14 @@ connect_loop_phis (class loop *loop1, class loop
> *loop2, edge new_e)
> !gsi_end_p (psi_first);
> gsi_next (&psi_first), gsi_next (&psi_second))
> {
> - tree init, next, new_init;
> + tree init, next, new_init, prev;
> use_operand_p op;
> gphi *phi_first = psi_first.phi ();
> gphi *phi_second = psi_second.phi ();
>
> init = PHI_ARG_DEF_FROM_EDGE (phi_first, firste);
> next = PHI_ARG_DEF_FROM_EDGE (phi_first, firstn);
> + prev = PHI_RESULT (phi_first);
> op = PHI_ARG_DEF_PTR_FROM_EDGE (phi_second, seconde);
> gcc_assert (operand_equal_for_phi_arg_p (init, USE_FROM_PTR (op)));
>
> @@ -279,7 +282,7 @@ connect_loop_phis (class loop *loop1, class loop *loop2,
> edge new_e)
>
> gphi * newphi = create_phi_node (new_init, rest);
> add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION);
> - add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION);
> + add_phi_arg (newphi, use_prev ? prev : next, new_e, UNKNOWN_LOCATION);
> SET_USE (op, new_init);
> }
> }
> @@ -1599,6 +1602,213 @@ split_loop_on_cond (struct loop *loop)
> return do_split;
> }
>
> +/* Check if the LOOP exit branch likes "if (idx != bound)".
> + if INV is not NULL and the branch is "if (bound != idx)", set *INV to
> true.
If
> + return the branch edge which exit loop. */
Return
> +
> +static edge
> +get_ne_cond_branch (struct loop *loop, bool *inv)
> +{
> + int i;
> + edge e;
> +
> + auto_vec<edge> edges = get_loop_exit_edges (loop);
> + FOR_EACH_VEC_ELT (edges, i, e)
> + {
> + basic_block bb = e->src;
> +
> + /* Check gcond. */
> + gimple *last = last_stmt (bb);
> + if (!last || gimple_code (last) != GIMPLE_COND)
> + continue;
> + gcond *cond = as_a<gcond *> (last);
> + enum tree_code code = gimple_cond_code (cond);
> + if (code != NE_EXPR)
> + continue;
I'm not sure we canonicalize the case with code == EQ_EXPR,
at least for
void bar();
void foo(unsigned n)
{
unsigned i = 0;
do
{
if (i == n)
return;
bar();
++i;
}
while (1);
}
we don't. Since you return the exit edge can this case be
handled transparently?
> +
> + /* Make sure idx and bound. */
> + tree idx = gimple_cond_lhs (cond);
> + tree bnd = gimple_cond_rhs (cond);
> + if (expr_invariant_in_loop_p (loop, idx))
> + {
> + std::swap (idx, bnd);
> + if (inv)
> + *inv = true;
> + }
> + else if (!expr_invariant_in_loop_p (loop, bnd))
> + continue;
We canonicalize i < UINT_MAX to i != UINT_MAX so you want to
detect that and not split the loop if 'bnd' is the maximum
or minimum value of the type I think.
> + /* Extract conversion. */
> + if (TREE_CODE (idx) == SSA_NAME)
> + {
> + gimple *stmt = SSA_NAME_DEF_STMT (idx);
> + if (is_gimple_assign (stmt)
> + && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
> + && flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
> + idx = gimple_assign_rhs1 (stmt);
> + }
This skips arbitrary extensions and truncations - is that intended?
> + /* Make sure idx is iv. */
> + class loop *useloop = loop_containing_stmt (cond);
> + affine_iv iv;
> + if (!simple_iv (loop, useloop, idx, &iv, false))
> + continue;
> +
> + /* No need to split loop, if base is know value.
> + Or check range info. */
> + if (TREE_CODE (iv.base) == INTEGER_CST)
> + continue;
I think it would be better to check iv.no_overflow? Also looking
it might be possible to use simple_iv_with_niters with IV_NITERS
not NULL for most of this analysis?
> + /* There is type conversion on idx(or rhs of idx's def).
> + And there is converting shorter to longer type. */
> + tree type = TREE_TYPE (idx);
> + if (!INTEGRAL_TYPE_P (type) || TREE_CODE (idx) != SSA_NAME
> + || !TYPE_UNSIGNED (type)
> + || TYPE_PRECISION (type) == TYPE_PRECISION (sizetype))
> + continue;
That check can be done before the (expensive) simple_iv check.
I wonder what the 'sizetype' precision check is about? The
function level comment should probably clarify what kind of
conversions we handle (and why).
> + /* Check loop is simple to split. */
> + gcc_assert (bb != loop->latch);
> +
> + if (single_pred_p (loop->latch)
> + && single_pred_edge (loop->latch)->src == bb
> + && empty_block_p (loop->latch))
> + return e;
> +
> + /* Splitting is cheap for idx increase header. */
> + if (bb == loop->header)
> + {
> + if (get_virtual_phi (loop->header))
> + continue;
> +
> + /* In loop header: i++ or ++i. */
> + gimple_stmt_iterator gsi = gsi_start_bb (bb);
> + if (gsi_end_p (gsi))
> + return e;
> +
> + gimple *s1 = gsi_stmt (gsi);
> + if (!(is_gimple_assign (s1)
> + && (idx == gimple_assign_lhs (s1)
> + || idx == gimple_assign_rhs1 (s1))))
> + continue;
> +
> + gsi_next (&gsi);
> + if (!gsi_end_p (gsi) && gsi_stmt (gsi) == cond)
> + return e;
> + }
I wonder if these "cheapness" heuristics should simply fold
into the cost of the extra duplication of the header/tail
in the overall stmt limit?
> + }
> +
> + return NULL;
> +}
> +
> +/* Split the LOOP with NE_EXPR into two loops with GT_EXPR and LT_EXPR. */
> +
> +static bool
> +split_ne_loop (struct loop *loop, edge cond_e)
> +{
> + initialize_original_copy_tables ();
> +
> + struct loop *loop2 = loop_version (loop, boolean_true_node, NULL,
> + profile_probability::always (),
> + profile_probability::never (),
> + profile_probability::always (),
> + profile_probability::always (), true);
> +
> + gcc_assert (loop2);
> + update_ssa (TODO_update_ssa);
> +
> + free_original_copy_tables ();
> +
> + /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
> + bool inv = false;
> + edge dup_cond = get_ne_cond_branch (loop2, &inv);
I don't think you should rely in pattern-matching to detect the same
condition in the versioned loop - instead you can use the copy
tables, do
2nd_loop_exit_block = get_bb_copy (cond_e->src);
to get to the block with the COND_EXPR (before free_original_copy_tables
obviously).
> + enum tree_code up_code = inv ? LT_EXPR : GT_EXPR;
> + enum tree_code down_code = inv ? GT_EXPR : LT_EXPR;
> +
> + gcond *gc = as_a<gcond *> (last_stmt (cond_e->src));
> + gimple_cond_set_code (gc, up_code);
> +
> + gcond *dup_gc = as_a<gcond *> (last_stmt (dup_cond->src));
> + gimple_cond_set_code (dup_gc, down_code);
> +
> + /* Link the exit cond edge to new loop. */
> + gcond *break_cond = as_a<gcond *> (gimple_copy (gc));
> + edge pred_e = single_pred_edge (loop->latch);
> + gcc_assert (pred_e);
> + bool simple_loop = pred_e->src == cond_e->src && empty_block_p
> (loop->latch);
> + if (simple_loop)
> + gimple_cond_set_code (break_cond, down_code);
> + else
> + gimple_cond_make_true (break_cond);
> +
> + basic_block break_bb = split_edge (cond_e);
> + gimple_stmt_iterator gsi = gsi_last_bb (break_bb);
> + gsi_insert_after (&gsi, break_cond, GSI_NEW_STMT);
> +
> + edge to_exit = single_succ_edge (break_bb);
> + edge to_new_loop = make_edge (break_bb, loop_preheader_edge (loop2)->src,
> 0);
> + to_new_loop->flags |= EDGE_TRUE_VALUE;
> + to_exit->flags |= EDGE_FALSE_VALUE;
> + to_exit->flags &= ~EDGE_FALLTHRU;
> + to_exit->probability = cond_e->probability;
> + to_new_loop->probability = to_exit->probability.invert ();
> +
> + update_ssa (TODO_update_ssa);
> +
> + connect_loop_phis (loop, loop2, to_new_loop, !simple_loop);
> +
> + rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
> + if (dump_file && (dump_flags & TDF_DETAILS))
> + fprintf (dump_file, ";; Loop split.\n");
Maybe ";; Loop split on != condition.\n"?
> +
> + return true;
> +}
> +
> +/* Checks if LOOP contains a suitable NE_EXPR conditional block to split.
> +L_H:
> + if (i!=N)
> + S;
> + i++;
> + goto L_H;
> +
> +The "i!=N" is like "i>N || i<N", then it could be transform to:
> +
> +L_H:
> + if (i>N)
> + S;
> + i++;
> + goto L_H;
> +L1_H:
> + if (i<N)
> + S;
> + i++;
> + goto L1_H;
> +
> +The loop with "i<N" is in favor both GIMPLE and RTL passes. */
> +
> +static bool
> +split_loop_on_ne_cond (class loop *loop)
> +{
> + if (!can_duplicate_loop_p (loop))
> + return false;
> +
> + int num = 0;
> + basic_block *bbs = get_loop_body (loop);
To avoid repeated DFS walks of the loop body do the can_duplicate_loop_p
check here as
if (!can_copy_bbs_p (bbs, loop->num_nodes))
{
free (bbs);
return false;
}
(see split_loop)
> + for (unsigned i = 0; i < loop->num_nodes; i++)
> + num += estimate_num_insns_seq (bb_seq (bbs[i]), &eni_size_weights);
> + free (bbs);
So with this and the suggestion above it is maybe possible to re-use
compute_added_num_insns? That already seems to handle splitting at
aribtrary branches (but maybe not loops with multiple exits?).
The code using this computation uses param_max_peeled_insns - is
that limit not sufficient for your case (to avoid another param?)?
It's default is 100, a bit higher than yours (64).
I'd really like to see some numbers on how much this triggers since
the splitting itself is O (number-of-split-loops * function-size)
complexity and thus worse than quadratic as we do update_ssa for each
split loop. You can experience this by simply concating one of your
testcase loops N times in a function ...
> + if (num > param_max_insn_ne_cond_split)
> + return false;
> +
> + edge branch_edge = get_ne_cond_branch (loop, NULL);
> + if (branch_edge && split_ne_loop (loop, branch_edge))
> + return true;
> +
> + return false;
> +}
> +
> /* Main entry point. Perform loop splitting on all suitable loops. */
>
> static unsigned int
> @@ -1628,7 +1838,8 @@ tree_ssa_split_loops (void)
> if (optimize_loop_for_size_p (loop))
> continue;
>
> - if (split_loop (loop) || split_loop_on_cond (loop))
> + if (split_loop (loop) || split_loop_on_cond (loop)
> + || split_loop_on_ne_cond (loop))
> {
> /* Mark our containing loop as having had some split inner loops. */
> loop_outer (loop)->aux = loop;
>