On Fri, 4 Jun 2021, Jiufu Guo wrote:
> Update the patch since v2:
> . Check index and bound from gcond before checking if wrap.
> . Update test case, and add an executable case.
> . Refine code comments.
> . Enhance the checking for i++/++i in the loop header.
> . Enhance code to handle equal condition on exit
>
> Bootstrap and regtest pass on powerpc64le, and also pass regtest
> on bootstrap-O3. Is this ok for trunk?
>
> BR.
> Jiufu Guo.
>
>
> When there is the possibility that wrap may happen on the loop index,
> a few optimizations would not happen. For example code:
>
> foo (int *a, int *b, unsigned k, unsigned n)
> {
> while (++k != n)
> a[k] = b[k] + 1;
> }
>
> For this code, if "k > n", k would wrap. if "k < n" at begining,
> it could be optimized (e.g. vectorization).
>
> We can split the loop into two loops:
>
> while (++k > n)
> a[k] = b[k] + 1;
> while (k++ < n)
> a[k] = b[k] + 1;
>
> This patch splits this kind of loop to achieve better performance.
>
> gcc/ChangeLog:
>
> 2021-06-04 Jiufu Guo <[email protected]>
>
> * tree-ssa-loop-split.c (connect_loop_phis): Add new param.
> (get_ne_cond_branch): New function.
> (split_ne_loop): New function.
> (split_loop_on_ne_cond): New function.
> (tree_ssa_split_loops): Use split_loop_on_ne_cond.
>
> gcc/testsuite/ChangeLog:
>
> 2021-06-04 Jiufu Guo <[email protected]>
>
> * gcc.dg/loop-split1.c: New test.
> * gcc.dg/loop-split2.c: New test.
> * g++.dg/vect/pr98064.cc: Suppress warning.
>
> ---
> gcc/testsuite/g++.dg/vect/pr98064.cc | 4 +-
> gcc/testsuite/gcc.dg/loop-split1.c | 101 +++++++++++
> gcc/testsuite/gcc.dg/loop-split2.c | 54 ++++++
> gcc/tree-ssa-loop-split.c | 251 ++++++++++++++++++++++++++-
> 4 files changed, 404 insertions(+), 6 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/loop-split1.c
> create mode 100644 gcc/testsuite/gcc.dg/loop-split2.c
>
> diff --git a/gcc/testsuite/g++.dg/vect/pr98064.cc
> b/gcc/testsuite/g++.dg/vect/pr98064.cc
> index 74043ce7725..dcb2985d05a 100644
> --- a/gcc/testsuite/g++.dg/vect/pr98064.cc
> +++ b/gcc/testsuite/g++.dg/vect/pr98064.cc
> @@ -1,5 +1,7 @@
> // { dg-do compile }
> -// { dg-additional-options "-O3" }
> +// { dg-additional-options "-O3 -Wno-stringop-overflow" }
> +/* There is warning message when "short g = var_8; g; g++"
> + is optimized/analyzed as string operation,e.g. memset. */
>
> const long long &min(const long long &__a, long long &__b) {
> if (__b < __a)
> diff --git a/gcc/testsuite/gcc.dg/loop-split1.c
> b/gcc/testsuite/gcc.dg/loop-split1.c
> new file mode 100644
> index 00000000000..dd2d03a7b96
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-split1.c
> @@ -0,0 +1,101 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fsplit-loops -fdump-tree-lsplit-details" } */
> +
> +void
> +foo (int *a, int *b, unsigned l, unsigned n)
> +{
> + while (++l != n)
> + a[l] = b[l] + 1;
> +}
> +void
> +foo_1 (int *a, int *b, unsigned n)
> +{
> + unsigned l = 0;
> + while (++l != n)
> + a[l] = b[l] + 1;
> +}
> +
> +void
> +foo1 (int *a, int *b, unsigned l, unsigned n)
> +{
> + while (l++ != n)
> + a[l] = b[l] + 1;
> +}
> +
> +/* No wrap. */
> +void
> +foo1_1 (int *a, int *b, unsigned n)
> +{
> + unsigned l = 0;
> + while (l++ != n)
> + a[l] = b[l] + 1;
> +}
> +
> +unsigned
> +foo2 (char *a, char *b, unsigned l, unsigned n)
> +{
> + while (++l != n)
> + if (a[l] != b[l])
> + break;
> +
> + return l;
> +}
> +
> +unsigned
> +foo2_1 (char *a, char *b, unsigned l, unsigned n)
> +{
> + l = 0;
> + while (++l != n)
> + if (a[l] != b[l])
> + break;
> +
> + return l;
> +}
> +
> +unsigned
> +foo3 (char *a, char *b, unsigned l, unsigned n)
> +{
> + while (l++ != n)
> + if (a[l] != b[l])
> + break;
> +
> + return l;
> +}
> +
> +/* No wrap. */
> +unsigned
> +foo3_1 (char *a, char *b, unsigned l, unsigned n)
> +{
> + l = 0;
> + while (l++ != n)
> + if (a[l] != b[l])
> + break;
> +
> + return l;
> +}
> +
> +void
> +bar ();
> +void
> +foo4 (unsigned n, unsigned i)
> +{
> + do
> + {
> + if (i == n)
> + return;
> + bar ();
> + ++i;
> + }
> + while (1);
> +}
> +
> +unsigned
> +find_skip_diff (char *p, char *q, unsigned n, unsigned i)
> +{
> + while (p[i] == q[i] && ++i != n)
> + p++, q++;
> +
> + return i;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "Loop split" 8 "lsplit" } } */
> diff --git a/gcc/testsuite/gcc.dg/loop-split2.c
> b/gcc/testsuite/gcc.dg/loop-split2.c
> new file mode 100644
> index 00000000000..0d3fded3f61
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/loop-split2.c
> @@ -0,0 +1,54 @@
> +/* { dg-do run } */
> +/* { dg-options "-O3" } */
> +
> +extern void abort (void);
> +extern void exit (int);
> +
> +#define NI __attribute__ ((noinline))
> +
> +void NI
> +foo (int *a, int *b, unsigned char l, unsigned char n)
> +{
> + while (++l != n)
> + a[l] = b[l] + 1;
> +}
> +
> +unsigned NI
> +bar (int *a, int *b, unsigned char l, unsigned char n)
> +{
> + while (l++ != n)
> + if (a[l] != b[l])
> + break;
> +
> + return l;
> +}
> +
> +int a[258];
> +int b[258];
> +
> +int main()
> +{
> + __builtin_memcpy (b, a, sizeof (a));
> +
> + if (bar (a, b, 3, 8) != 9)
> + abort ();
> +
> + if (bar (a, b, 8, 3) != 4)
> + abort ();
> +
> + b[100] += 1;
> + if (bar (a, b, 90, 110) != 100)
> + abort ();
> +
> + if (bar (a, b, 110, 105) != 100)
> + abort ();
> +
> + foo (a, b, 99, 99);
> + a[99] = b[99] + 1;
> + for (int i = 0; i < 256; i++)
> + if (a[i] != b[i] + 1)
> + abort();
> +
> + exit (0);
> +}
> +
> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
> index 3a09bbc39e5..9c0de66e288 100644
> --- a/gcc/tree-ssa-loop-split.c
> +++ b/gcc/tree-ssa-loop-split.c
> @@ -41,6 +41,7 @@ along with GCC; see the file COPYING3. If not see
> #include "cfghooks.h"
> #include "gimple-fold.h"
> #include "gimplify-me.h"
> +#include "tree-ssa-loop-ivopts.h"
>
> /* This file implements two kinds of loop splitting.
>
> @@ -229,11 +230,14 @@ easy_exit_values (class loop *loop)
> conditional). I.e. the second loop can now be entered either
> via the original entry or via NEW_E, so the entry values of LOOP2
> phi nodes are either the original ones or those at the exit
> - of LOOP1. Insert new phi nodes in LOOP2 pre-header reflecting
> - this. The loops need to fulfill easy_exit_values(). */
> + of LOOP1. Selecting the previous value instead next value as the
> + exit value of LOOP1 if USE_PREV is true. Insert new phi nodes in
> + LOOP2 pre-header reflecting this. The loops need to fulfill
> + easy_exit_values(). */
>
> static void
> -connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e)
> +connect_loop_phis (class loop *loop1, class loop *loop2, edge new_e,
> + bool use_prev = false)
> {
> basic_block rest = loop_preheader_edge (loop2)->src;
> gcc_assert (new_e->dest == rest);
> @@ -279,7 +283,8 @@ connect_loop_phis (class loop *loop1, class loop *loop2,
> edge new_e)
>
> gphi * newphi = create_phi_node (new_init, rest);
> add_phi_arg (newphi, init, skip_first, UNKNOWN_LOCATION);
> - add_phi_arg (newphi, next, new_e, UNKNOWN_LOCATION);
> + add_phi_arg (newphi, use_prev ? PHI_RESULT (phi_first) : next, new_e,
> + UNKNOWN_LOCATION);
> SET_USE (op, new_init);
> }
> }
> @@ -1593,6 +1598,241 @@ split_loop_on_cond (struct loop *loop)
> return do_split;
> }
>
> +/* Check if the LOOP exit branch is like "if (idx != bound)",
> + Return the branch edge which exit loop, if wrap may happen on "idx". */
> +
> +static edge
> +get_ne_cond_branch (struct loop *loop)
> +{
> + int i;
> + edge e;
> +
> + auto_vec<edge> edges = get_loop_exit_edges (loop);
> + FOR_EACH_VEC_ELT (edges, i, e)
> + {
> + basic_block bb = e->src;
> +
> + /* Check if exit at gcond. */
> + gimple *last = last_stmt (bb);
> + if (!last || gimple_code (last) != GIMPLE_COND)
> + continue;
> + gcond *cond = as_a<gcond *> (last);
> + enum tree_code code = gimple_cond_code (cond);
> + if (!(code == NE_EXPR
> + || (code == EQ_EXPR && (e->flags & EDGE_TRUE_VALUE))))
The NE_EXPR check misses a corresponding && (e->flags & EDGE_FALSE_VALUE)
check.
> + continue;
> +
> + /* Check if bound is invarant. */
> + tree idx = gimple_cond_lhs (cond);
> + tree bnd = gimple_cond_rhs (cond);
> + if (expr_invariant_in_loop_p (loop, idx))
> + std::swap (idx, bnd);
> + else if (!expr_invariant_in_loop_p (loop, bnd))
> + continue;
> +
> + /* Only unsigned type conversion could cause wrap. */
> + tree type = TREE_TYPE (idx);
> + if (!INTEGRAL_TYPE_P (type) || TREE_CODE (idx) != SSA_NAME
> + || !TYPE_UNSIGNED (type))
> + continue;
> +
> + /* Avoid to split if bound is MAX/MIN val. */
> + tree bound_type = TREE_TYPE (bnd);
> + if (TREE_CODE (bnd) == INTEGER_CST && INTEGRAL_TYPE_P (bound_type)
> + && (tree_int_cst_equal (bnd, TYPE_MAX_VALUE (bound_type))
> + || tree_int_cst_equal (bnd, TYPE_MIN_VALUE (bound_type))))
> + continue;
Note you do not require 'bnd' to be constant and thus at runtime those
cases still need to be handled correctly.
> + /* Check if there is possible wrap. */
> + class tree_niter_desc niter;
> + if (!number_of_iterations_exit (loop, e, &niter, false, false))
> + continue;
> + if (niter.control.no_overflow)
> + return NULL;
> + if (niter.cmp != NE_EXPR)
> + continue;
> +
> + /* If exit edge is just before the empty latch, it is easy to link
> + the split loops: just jump from the exit edge of one loop to the
> + header of new loop. */
> + if (single_pred_p (loop->latch)
> + && single_pred_edge (loop->latch)->src == bb
> + && empty_block_p (loop->latch))
> + return e;
> +
> + /* If exit edge is at end of header, and header contains i++ or ++i,
> + only, it is simple to link the split loops: jump from the end of
> + one loop header to the new loop header, and use unchanged PHI
> + result of first loop as the entry PHI value of the second loop. */
> + if (bb == loop->header)
> + {
> + /* Only one phi. */
> + gphi_iterator psi = gsi_start_phis (bb);
> + if (gsi_end_p (psi))
> + continue;
> + gphi *phi = psi.phi ();
> + gsi_next (&psi);
> + if (!gsi_end_p (psi))
> + continue;
> +
> + /* Check it is ++i or ++i */
> + tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
> + tree prev = PHI_RESULT (phi);
> + if (idx != prev && idx != next)
> + continue;
> +
> + gimple_stmt_iterator gsi = gsi_start_nondebug_after_labels_bb (bb);
> + if (gsi_end_p (gsi))
> + continue;
> + gimple *s1 = gsi_stmt (gsi);
> + if (!is_gimple_assign (s1) || gimple_assign_lhs (s1) != next
> + || gimple_assign_rhs1 (s1) != prev)
> + continue;
> +
> + gsi_next_nondebug (&gsi);
> + if (!gsi_end_p (gsi) && gsi_stmt (gsi) == cond)
> + return e;
> + }
> + }
> +
> + return NULL;
> +}
> +
> +/* Split the LOOP with NE_EXPR into two loops with GT_EXPR and LT_EXPR. */
> +
> +static bool
> +split_ne_loop (struct loop *loop, edge cond_e)
> +{
> + initialize_original_copy_tables ();
> +
> + struct loop *loop2 = loop_version (loop, boolean_true_node, NULL,
> + profile_probability::always (),
> + profile_probability::never (),
> + profile_probability::always (),
> + profile_probability::always (), true);
> +
> + gcc_assert (loop2);
> + update_ssa (TODO_update_ssa);
> +
> + basic_block loop2_cond_exit_bb = get_bb_copy (cond_e->src);
> + free_original_copy_tables ();
> +
> + gcond *gc = as_a<gcond *> (last_stmt (cond_e->src));
> + gcond *dup_gc = as_a<gcond *> (last_stmt (loop2_cond_exit_bb));
> +
> + /* Invert edges for gcond. */
> + if (gimple_cond_code (gc) == EQ_EXPR)
> + {
> + auto invert_edge = [](basic_block bb) {
> + edge out = EDGE_SUCC (bb, 0);
> + edge in = EDGE_SUCC (bb, 1);
> + if (in->flags & EDGE_TRUE_VALUE)
> + std::swap (in, out);
> + in->flags |= EDGE_TRUE_VALUE;
> + in->flags &= ~EDGE_FALSE_VALUE;
> + out->flags |= EDGE_FALSE_VALUE;
> + out->flags &= ~EDGE_TRUE_VALUE;
> + };
> +
> + invert_edge (gimple_bb (gc));
> + invert_edge (gimple_bb (dup_gc));
> + }
> +
> + /* Change if (i != n) to LOOP1:if (i > n) and LOOP2:if (i < n) */
It now occurs to me that we nowhere check the evolution of IDX
(split_at_bb_p uses simple_iv for this for example). The transform
assumes that we will actually hit i == n and that i increments, but
while you check the control IV from number_of_iterations_exit
for NE_EXPR that does not guarantee a positive evolution.
Your testcases do not include any negative step examples, but I guess
the conditions need to be swapped in this case?
I think you also have to consider the order we split, say with
for (i = start; i != end; ++i)
{
push (i);
if (a[i] != b[i])
break;
}
push (i) calls need to be in the same order for all cases of
start < end, start == end and start > end (and also cover
runtime testcases with end == 0 or end == UINT_MAX, likewise
for start).
> + bool inv = expr_invariant_in_loop_p (loop, gimple_cond_lhs (gc));
> + enum tree_code up_code = inv ? LT_EXPR : GT_EXPR;
> + enum tree_code down_code = inv ? GT_EXPR : LT_EXPR;
> +
> + gimple_cond_set_code (gc, up_code);
> + gimple_cond_set_code (dup_gc, down_code);
> +
> + /* Link the exit cond edge to new loop. */
> + gcond *break_cond = as_a<gcond *> (gimple_copy (gc));
> + edge pred_e = single_pred_edge (loop->latch);
> + bool simple_loop
> + = pred_e && pred_e->src == cond_e->src && empty_block_p (loop->latch);
> + if (simple_loop)
> + gimple_cond_set_code (break_cond, down_code);
> + else
> + gimple_cond_make_true (break_cond);
> +
> + basic_block break_bb = split_edge (cond_e);
> + gimple_stmt_iterator gsi = gsi_last_bb (break_bb);
> + gsi_insert_after (&gsi, break_cond, GSI_NEW_STMT);
> +
> + edge to_exit = single_succ_edge (break_bb);
> + edge to_new_loop = make_edge (break_bb, loop_preheader_edge (loop2)->src,
> 0);
> + to_new_loop->flags |= EDGE_TRUE_VALUE;
> + to_exit->flags |= EDGE_FALSE_VALUE;
> + to_exit->flags &= ~EDGE_FALLTHRU;
> + to_exit->probability = cond_e->probability;
> + to_new_loop->probability = to_exit->probability.invert ();
> +
> + update_ssa (TODO_update_ssa);
> +
> + connect_loop_phis (loop, loop2, to_new_loop, !simple_loop);
> +
> + rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
> + if (dump_file && (dump_flags & TDF_DETAILS))
> + fprintf (dump_file, ";; Loop split on wrap index.\n");
> +
> + return true;
> +}
> +
> +/* Checks if LOOP contains a suitable NE_EXPR conditional block to split.
> +L_H:
> + if (i!=N)
> + S;
> + else
> + goto EXIT;
> + i++;
> + goto L_H;
> +
> +The "i!=N" is like "i>N || i<N", then it could be transform to:
> +
> +L_H:
> + if (i>N)
> + S;
> + else
> + goto EXIT;
> + i++;
> + goto L_H;
> +L1_H:
> + if (i<N)
> + S;
> + else
> + goto EXIT;
> + i++;
> + goto L1_H;
> +
> +The loop with "i<N" is in favor both GIMPLE and RTL passes. */
> +
> +static bool
> +split_loop_on_ne_cond (class loop *loop)
> +{
> + int num = 0;
> + basic_block *bbs = get_loop_body (loop);
> +
> + if (!can_copy_bbs_p (bbs, loop->num_nodes))
> + {
> + free (bbs);
> + return false;
> + }
> +
> + for (unsigned i = 0; i < loop->num_nodes; i++)
> + num += estimate_num_insns_seq (bb_seq (bbs[i]), &eni_size_weights);
> + free (bbs);
> +
> + if (num > param_max_peeled_insns)
> + return false;
> +
> + edge branch_edge = get_ne_cond_branch (loop);
> + if (branch_edge && split_ne_loop (loop, branch_edge))
please check get_ne_cond_branch first, the can_copy_bbs_p and size
estimates are of linear complexity in the loop size. Please also
test the size estimate before can_copy_bbs and terminate the loop
when you reach param_max_peeled_insns.
> + return true;
> +
> + return false;
> +}
> +
> /* Main entry point. Perform loop splitting on all suitable loops. */
>
> static unsigned int
> @@ -1622,7 +1862,8 @@ tree_ssa_split_loops (void)
> if (optimize_loop_for_size_p (loop))
> continue;
>
> - if (split_loop (loop) || split_loop_on_cond (loop))
> + if (split_loop (loop) || split_loop_on_cond (loop)
> + || split_loop_on_ne_cond (loop))
> {
> /* Mark our containing loop as having had some split inner loops. */
> loop_outer (loop)->aux = loop;
>
--
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)