On Mon, 9 Dec 2024, liuhongt wrote:
> > Please pass 'sbitmap' instead of auto_sbitmap&, it should properly
> > decay to that. Applies everywhere I think.
> >
>
> Changed.
>
> > In fact I wonder whether we should simply populate the bitmap
> > from a
> >
> >   for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
> >     bitmap_set_bit (original_innermost, loop->num);
> >
> > loop instead and pass pass-is-cunrolli && bitmap_bit_p (..) as
> > cunrolli down to try_unroll_loop_completely?
> Changed, but the sbitmap still needs to be passed from
> tree_unroll_loops_completely down to canonicalize_induction_variables and
> tree_unroll_loops_completely_1, since the "outer loop" could already have
> been changed in the latter two.
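>
> In sketch form (mirroring the patch below), the bitmap is built once in
> tree_unroll_loops_completely and then threaded down through the recursion:
>
>   auto_sbitmap innermost (number_of_loops (cfun));
>   bitmap_clear (innermost);
>   for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
>     if (!loop->inner)
>       bitmap_set_bit (innermost, loop->num);
>
>   tree_unroll_loops_completely_1 (may_increase_size, unroll_outer,
>                                   father_bbs, current_loops->tree_root,
>                                   (const_sbitmap) innermost, cunrolli);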
>
> BTW the arm CI reported 2 regressed testcases, so I added
> * gcc.dg/tree-ssa/pr83403-1.c: Add --param max-completely-peeled-insns=300
> for arm*-*-*.
> * gcc.dg/tree-ssa/pr83403-2.c: Ditto.
>
> For 32-bit arm there are more stmts in the innermost loop, and the removal
> of the size reduction prevents complete unrolling for them.
> For aarch64, it looks fine.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
OK.
Thanks,
Richard.
> r15-919-gef27b91b62c3aa removed the 1/3 size reduction for the innermost
> loop, but it doesn't accurately remember what's "innermost" for the 2
> testcases in PR117888.
>
> 1) For pass_cunroll, the "innermost" loop could be an originally outer
> loop whose inner loop was completely unrolled by cunrolli. The patch moves
> the local variable cunrolli to a parameter of tree_unroll_loops_completely
> and passes it directly from the execute function of the pass.
>
> 2) For pass_cunrolli, cunrolli is set to false when the sibling loop
> of an innermost loop is completely unrolled, and it inaccurately
> treats the innermost loop as an "outer" loop. The patch adds another
> parameter, innermost, to help recognize the "original" innermost loop.
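>
> In essence, canonicalize_loop_induction_variables then only treats a loop
> as a cunrolli innermost candidate when it was innermost at the start of the
> pass, roughly (as in the patch below):
>
>   bool innermost_cunrolli_p
>     = cunrolli
>       && (unsigned) loop->num < SBITMAP_SIZE (innermost)
>       && bitmap_bit_p (innermost, loop->num);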
>
> gcc/ChangeLog:
>
> PR tree-optimization/117888
> * tree-ssa-loop-ivcanon.cc (try_unroll_loop_completely): Use
> cunrolli instead of cunrolli && !loop->inner to check if it's
> innermost loop.
> (canonicalize_loop_induction_variables): Add new parameter
> const_sbitmap innermost, and pass
> cunrolli && (unsigned) loop->num < SBITMAP_SIZE (innermost)
> && bitmap_bit_p (innermost, loop->num) as "cunrolli" to
> try_unroll_loop_completely.
> (canonicalize_induction_variables): Pass innermost to
> canonicalize_loop_induction_variables.
> (tree_unroll_loops_completely_1): Add new parameter
> const_sbitmap innermost.
> (tree_unroll_loops_completely): Move local variable cunrolli
> to a parameter to indicate it's from pass cunrolli, also track
> all "original" innermost loops at the beginning.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/pr117888-2.c: New test.
> * gcc.dg/vect/pr117888-1.c: Ditto.
> * gcc.dg/tree-ssa/pr83403-1.c: Add
> --param max-completely-peeled-insns=300 for arm*-*-*.
> * gcc.dg/tree-ssa/pr83403-2.c: Ditto.
> ---
> gcc/testsuite/gcc.dg/pr117888-2.c | 37 ++++++++++++
> gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c | 1 +
> gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c | 1 +
> gcc/testsuite/gcc.dg/vect/pr117888-1.c | 71 +++++++++++++++++++++++
> gcc/tree-ssa-loop-ivcanon.cc | 60 +++++++++++++------
> 5 files changed, 151 insertions(+), 19 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/pr117888-2.c
> create mode 100644 gcc/testsuite/gcc.dg/vect/pr117888-1.c
>
> diff --git a/gcc/testsuite/gcc.dg/pr117888-2.c b/gcc/testsuite/gcc.dg/pr117888-2.c
> new file mode 100644
> index 00000000000..97aa93d8ace
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/pr117888-2.c
> @@ -0,0 +1,37 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize -fdump-tree-cunroll-details" } */
> +
> +typedef struct {
> + double real;
> + double imag;
> +} complex;
> +
> +typedef struct { complex e[3][3]; } su3_matrix;
> +
> +void mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c )
> +{
> + int i,j;
> + double t,ar,ai,br,bi,cr,ci;
> + for(i=0;i<3;i++)
> + for(j=0;j<3;j++){
> +
> + ar=a->e[i][0].real; ai=a->e[i][0].imag;
> + br=b->e[0][j].real; bi=b->e[0][j].imag;
> + cr=ar*br; t=ai*bi; cr -= t;
> + ci=ar*bi; t=ai*br; ci += t;
> +
> + ar=a->e[i][1].real; ai=a->e[i][1].imag;
> + br=b->e[1][j].real; bi=b->e[1][j].imag;
> + t=ar*br; cr += t; t=ai*bi; cr -= t;
> + t=ar*bi; ci += t; t=ai*br; ci += t;
> +
> + ar=a->e[i][2].real; ai=a->e[i][2].imag;
> + br=b->e[2][j].real; bi=b->e[2][j].imag;
> + t=ar*br; cr += t; t=ai*bi; cr -= t;
> + t=ar*bi; ci += t; t=ai*br; ci += t;
> +
> + c->e[i][j].real=cr;
> + c->e[i][j].imag=ci;
> + }
> +}
> +/* { dg-final { scan-tree-dump-times "optimized: loop with 2 iterations completely unrolled" 1 "cunroll" { target i?86-*-* x86_64-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
> index bfc703d1aa6..293fd2dbd97 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
> @@ -1,6 +1,7 @@
> /* { dg-do compile } */
> /* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
> /* { dg-additional-options "--param max-completely-peeled-insns=200" { target { s390*-*-* } } } */
> +/* { dg-additional-options "--param max-completely-peeled-insns=300" { target { arm*-*-* } } } */
>
> #define TYPE unsigned int
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
> index 9130d9bd583..b421b387bca 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
> @@ -1,6 +1,7 @@
> /* { dg-do compile } */
> /* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
> /* { dg-additional-options "--param max-completely-peeled-insns=200" { target { s390*-*-* } } } */
> +/* { dg-additional-options "--param max-completely-peeled-insns=300" { target { arm*-*-* } } } */
>
> #define TYPE int
>
> diff --git a/gcc/testsuite/gcc.dg/vect/pr117888-1.c b/gcc/testsuite/gcc.dg/vect/pr117888-1.c
> new file mode 100644
> index 00000000000..4796a7c83c1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr117888-1.c
> @@ -0,0 +1,71 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -funroll-loops -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target vect_shift } */
> +/* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */
> +/* { dg-additional-options "--param max-completely-peeled-insns=200" { target powerpc64*-*-* } } */
> +
> +typedef unsigned short ggml_fp16_t;
> +static float table_f32_f16[1 << 16];
> +
> +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
> + unsigned short s;
> + __builtin_memcpy(&s, &f, sizeof(unsigned short));
> + return table_f32_f16[s];
> +}
> +
> +typedef struct {
> + ggml_fp16_t d;
> + ggml_fp16_t m;
> + unsigned char qh[4];
> + unsigned char qs[32 / 2];
> +} block_q5_1;
> +
> +typedef struct {
> + float d;
> + float s;
> + char qs[32];
> +} block_q8_1;
> +
> +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
> + const int qk = 32;
> + const int nb = n / qk;
> +
> + const block_q5_1 * restrict x = vx;
> + const block_q8_1 * restrict y = vy;
> +
> + float sumf = 0.0;
> +
> + for (int i = 0; i < nb; i++) {
> + unsigned qh;
> + __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
> +
> + int sumi = 0;
> +
> + if (qh) {
> + for (int j = 0; j < qk/2; ++j) {
> + const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
> + const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
> +
> + const int x0 = (x[i].qs[j] & 0xF) | xh_0;
> + const int x1 = (x[i].qs[j] >> 4) | xh_1;
> +
> + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
> + }
> + }
> + else {
> + for (int j = 0; j < qk/2; ++j) {
> + const int x0 = (x[i].qs[j] & 0xF);
> + const int x1 = (x[i].qs[j] >> 4);
> +
> + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
> + }
> + }
> +
> + sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
> + }
> +
> + *s = sumf;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> index 9a94d82fc4e..451b09adc26 100644
> --- a/gcc/tree-ssa-loop-ivcanon.cc
> +++ b/gcc/tree-ssa-loop-ivcanon.cc
> @@ -939,8 +939,7 @@ try_unroll_loop_completely (class loop *loop,
> 1) It could increase register pressure.
> 2) Big loop after completely unroll may not be vectorized
> by BB vectorizer. */
> - else if ((cunrolli && !loop->inner
> - ? unr_insns : unr_insns - est_eliminated)
> + else if ((cunrolli ? unr_insns : unr_insns - est_eliminated)
> > (unsigned) param_max_completely_peeled_insns)
> {
> if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1248,7 +1247,9 @@ try_peel_loop (class loop *loop,
> static bool
> canonicalize_loop_induction_variables (class loop *loop,
> bool create_iv, enum unroll_level ul,
> - bool try_eval, bool allow_peel, bool cunrolli)
> + bool try_eval, bool allow_peel,
> + const_sbitmap innermost,
> + bool cunrolli)
> {
> edge exit = NULL;
> tree niter;
> @@ -1334,8 +1335,15 @@ canonicalize_loop_induction_variables (class loop *loop,
> modified |= remove_redundant_iv_tests (loop);
>
> dump_user_location_t locus = find_loop_location (loop);
> +
> + bool innermost_cunrolli_p
> + = cunrolli
> + && (unsigned) loop->num < SBITMAP_SIZE (innermost)
> + && bitmap_bit_p (innermost, loop->num);
> +
> if (try_unroll_loop_completely (loop, exit, niter, may_be_zero, ul,
> - maxiter, locus, allow_peel, cunrolli))
> + maxiter, locus, allow_peel,
> + innermost_cunrolli_p))
> return true;
>
> if (create_iv
> @@ -1372,14 +1380,19 @@ canonicalize_induction_variables (void)
> bool changed = false;
> bool irred_invalidated = false;
> bitmap loop_closed_ssa_invalidated = BITMAP_ALLOC (NULL);
> + auto_sbitmap innermost (number_of_loops (cfun));
> + bitmap_clear (innermost);
>
> estimate_numbers_of_iterations (cfun);
>
> for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
> {
> - changed |= canonicalize_loop_induction_variables (loop,
> - true, UL_SINGLE_ITER,
> - true, false, false);
> + changed
> + |= canonicalize_loop_induction_variables (loop,
> + true, UL_SINGLE_ITER,
> + true, false,
> + (const_sbitmap) innermost,
> + false);
> }
> gcc_assert (!need_ssa_update_p (cfun));
>
> @@ -1413,7 +1426,8 @@ canonicalize_induction_variables (void)
>
> static bool
> tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
> - bitmap father_bbs, class loop *loop, bool cunrolli)
> + bitmap father_bbs, class loop *loop,
> + const_sbitmap innermost, bool cunrolli)
> {
> class loop *loop_father;
> bool changed = false;
> @@ -1431,7 +1445,8 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
> if (!child_father_bbs)
> child_father_bbs = BITMAP_ALLOC (NULL);
> if (tree_unroll_loops_completely_1 (may_increase_size, unroll_outer,
> - child_father_bbs, inner, cunrolli))
> + child_father_bbs, inner,
> + innermost, cunrolli))
> {
> bitmap_ior_into (father_bbs, child_father_bbs);
> bitmap_clear (child_father_bbs);
> @@ -1477,7 +1492,8 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
> ul = UL_NO_GROWTH;
>
> if (canonicalize_loop_induction_variables
> - (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer, cunrolli))
> + (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer,
> + innermost, cunrolli))
> {
> /* If we'll continue unrolling, we need to propagate constants
> within the new basic blocks to fold away induction variable
> @@ -1503,19 +1519,28 @@ tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
>
> /* Unroll LOOPS completely if they iterate just few times. Unless
> MAY_INCREASE_SIZE is true, perform the unrolling only if the
> - size of the code does not increase. */
> + size of the code does not increase.
> + CUNROLLI is true when the pass is cunrolli. */
>
> static unsigned int
> -tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
> +tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer, bool cunrolli)
> {
> bitmap father_bbs = BITMAP_ALLOC (NULL);
> bool changed;
> int iteration = 0;
> bool irred_invalidated = false;
> - bool cunrolli = true;
> + auto_sbitmap innermost (number_of_loops (cfun));
> + bitmap_clear (innermost);
>
> estimate_numbers_of_iterations (cfun);
>
> + /* Mark all innermost loops at the beginning. */
> + for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
> + {
> + if (!loop->inner)
> + bitmap_set_bit (innermost, loop->num);
> + }
> +
> do
> {
> changed = false;
> @@ -1530,14 +1555,11 @@ tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
> changed = tree_unroll_loops_completely_1 (may_increase_size,
> unroll_outer, father_bbs,
> current_loops->tree_root,
> + (const_sbitmap) innermost,
> cunrolli);
> if (changed)
> {
> unsigned i;
> - /* For the outer loop, considering that the inner loop is completely
> - unrolled, it would expose more optimization opportunities, so it's
> - better to keep 2/3 reduction of estimated unrolled size. */
> - cunrolli = false;
>
> unloop_loops (loops_to_unloop, loops_to_unloop_nunroll,
> edges_to_remove, loop_closed_ssa_invalidated,
> @@ -1697,7 +1719,7 @@ pass_complete_unroll::execute (function *fun)
> re-peeling the same loop multiple times. */
> if (flag_peel_loops)
> peeled_loops = BITMAP_ALLOC (NULL);
> - unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size, true);
> + unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size, true, false);
> if (peeled_loops)
> {
> BITMAP_FREE (peeled_loops);
> @@ -1753,7 +1775,7 @@ pass_complete_unrolli::execute (function *fun)
> if (number_of_loops (fun) > 1)
> {
> scev_initialize ();
> - ret = tree_unroll_loops_completely (optimize >= 3, false);
> + ret = tree_unroll_loops_completely (optimize >= 3, false, true);
> scev_finalize ();
> }
> loop_optimizer_finalize ();
>
--
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)