On Thu, 5 Dec 2024, liuhongt wrote: > r15-919-gef27b91b62c3aa removed 1 / 3 size reduction for innermost > loop, but it doesn't accurately remember what's "innermost" for 2 > testcases in PR117888. > > 1) For pass_cunroll, the "innermost" loop could be an originally outer > loop with inner loop completely unrolled by cunrolli. The patch moves > local variable cunrolli to parameter of tree_unroll_loops_completely > and passes it directly from execute of the pass. > > 2) For pass_cunrolli, cunrolli is set to false when the sibling loop > of a innermost loop is completely unrolled, and it inaccurately > takes the innermost loop as an "outer" loop. The patch add another > paramter visited_innermost to helps recognizing the innermost loop. > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ok for trunk? > > gcc/ChangeLog: > > PR tree-optimization/117888 > * tree-ssa-loop-ivcanon.cc (try_unroll_loop_completely): Add > a new parameter visited_innermost to avoid inaccurately taking > innermost as an outer loop after it's sibling loop is > completed unrolled. > (canonicalize_loop_induction_variables): Ditto. > (canonicalize_induction_variables): Ditto. > (tree_unroll_loops_completely_1): Ditto. > (tree_unroll_loops_completely): Move local variable cunrolli > to parameter to indicate it's from pass cunrolli. > > gcc/testsuite/ChangeLog: > > * gcc.dg/pr117888-2.c: New test. > * gcc.dg/vect/pr117888-1.c: Ditto. > --- > gcc/testsuite/gcc.dg/pr117888-2.c | 38 ++++++++++++++ > gcc/testsuite/gcc.dg/vect/pr117888-1.c | 71 ++++++++++++++++++++++++++ > gcc/tree-ssa-loop-ivcanon.cc | 50 +++++++++++++----- > 3 files changed, 145 insertions(+), 14 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/pr117888-2.c > create mode 100644 gcc/testsuite/gcc.dg/vect/pr117888-1.c > > diff --git a/gcc/testsuite/gcc.dg/pr117888-2.c > b/gcc/testsuite/gcc.dg/pr117888-2.c > new file mode 100644 > index 00000000000..1749f1509a6 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/pr117888-2.c > @@ -0,0 +1,38 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize > -fdump-tree-cunroll-details" } */ > +/* { dg-additional-options "--param max-completely-peeled-insns=200" { > target powerpc64*-*-* } } */ > + > +typedef struct { > + double real; > + double imag; > +} complex; > + > +typedef struct { complex e[3][3]; } su3_matrix; > + > +void mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c ) > +{ > + int i,j; > + double t,ar,ai,br,bi,cr,ci; > + for(i=0;i<3;i++) > + for(j=0;j<3;j++){ > + > + ar=a->e[i][0].real; ai=a->e[i][0].imag; > + br=b->e[0][j].real; bi=b->e[0][j].imag; > + cr=ar*br; t=ai*bi; cr -= t; > + ci=ar*bi; t=ai*br; ci += t; > + > + ar=a->e[i][1].real; ai=a->e[i][1].imag; > + br=b->e[1][j].real; bi=b->e[1][j].imag; > + t=ar*br; cr += t; t=ai*bi; cr -= t; > + t=ar*bi; ci += t; t=ai*br; ci += t; > + > + ar=a->e[i][2].real; ai=a->e[i][2].imag; > + br=b->e[2][j].real; bi=b->e[2][j].imag; > + t=ar*br; cr += t; t=ai*bi; cr -= t; > + t=ar*bi; ci += t; t=ai*br; ci += t; > + > + c->e[i][j].real=cr; > + c->e[i][j].imag=ci; > + } > +} > +/* { dg-final { scan-tree-dump-times "optimized: loop with 2 iterations > completely unrolled" 1 "cunroll" } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/pr117888-1.c > b/gcc/testsuite/gcc.dg/vect/pr117888-1.c > new file mode 100644 > index 00000000000..4796a7c83c1 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/pr117888-1.c > @@ -0,0 +1,71 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -funroll-loops -fdump-tree-vect-details" } */ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-require-effective-target vect_shift } */ > +/* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */ > +/* { dg-additional-options "--param max-completely-peeled-insns=200" { > target powerpc64*-*-* } } */ > + > +typedef unsigned short ggml_fp16_t; > +static float table_f32_f16[1 << 16]; > + > +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { > + unsigned short s; > + __builtin_memcpy(&s, &f, sizeof(unsigned short)); > + return table_f32_f16[s]; > +} > + > +typedef struct { > + ggml_fp16_t d; > + ggml_fp16_t m; > + unsigned char qh[4]; > + unsigned char qs[32 / 2]; > +} block_q5_1; > + > +typedef struct { > + float d; > + float s; > + char qs[32]; > +} block_q8_1; > + > +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * > restrict vx, const void * restrict vy) { > + const int qk = 32; > + const int nb = n / qk; > + > + const block_q5_1 * restrict x = vx; > + const block_q8_1 * restrict y = vy; > + > + float sumf = 0.0; > + > + for (int i = 0; i < nb; i++) { > + unsigned qh; > + __builtin_memcpy(&qh, x[i].qh, sizeof(qh)); > + > + int sumi = 0; > + > + if (qh) { > + for (int j = 0; j < qk/2; ++j) { > + const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10; > + const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10; > + > + const int x0 = (x[i].qs[j] & 0xF) | xh_0; > + const int x1 = (x[i].qs[j] >> 4) | xh_1; > + > + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); > + } > + } > + else { > + for (int j = 0; j < qk/2; ++j) { > + const int x0 = (x[i].qs[j] & 0xF); > + const int x1 = (x[i].qs[j] >> 4); > + > + sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); > + } > + } > + > + sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + > ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s; > + } > + > + *s = sumf; > +} > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ > diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc > index 9a94d82fc4e..f3ed3f9dc51 100644 > --- a/gcc/tree-ssa-loop-ivcanon.cc > +++ b/gcc/tree-ssa-loop-ivcanon.cc > @@ -747,7 +747,7 @@ try_unroll_loop_completely (class loop *loop, > enum unroll_level ul, > HOST_WIDE_INT maxiter, > dump_user_location_t locus, bool allow_peel, > - bool cunrolli) > + auto_sbitmap& visited_innermost, bool cunrolli)
Please pass 'sbitmap' instead of auto_sbitmap&, it should properly decay to that. Applies everywhere I think. > { > unsigned HOST_WIDE_INT n_unroll = 0; > bool n_unroll_found = false; > @@ -939,7 +939,15 @@ try_unroll_loop_completely (class loop *loop, > 1) It could increase register pressure. > 2) Big loop after completely unroll may not be vectorized > by BB vectorizer. */ > - else if ((cunrolli && !loop->inner > + else if ((!loop->inner > + /* It's a visited innermost, not a outer loop whose > + inner loops is completely unrolled or loops added > + by the recursive calls because SSA form is not > + up-to-date. */ > + && (unsigned) loop->num > + < SBITMAP_SIZE ((sbitmap) visited_innermost) It seems this check would belong next to the bitmap_bit_p check instead? > + && (cunrolli > + || bitmap_bit_p (visited_innermost, loop->num)) > ? unr_insns : unr_insns - est_eliminated) > > (unsigned) param_max_completely_peeled_insns) > { > @@ -948,6 +956,9 @@ try_unroll_loop_completely (class loop *loop, > "number of insns in the unrolled sequence reaches " > "--param max-completely-peeled-insns limit.\n", > loop->num); > + if ((unsigned) loop->num > + < SBITMAP_SIZE ((sbitmap) visited_innermost)) > + bitmap_set_bit (visited_innermost, loop->num); so visited_innermost marks loops we don't unroll because of param_max_completely_peeled_insns. But it marks non-innermost loops as well? So I don't quite understand what it marks - in fact I'd have expected the bitmap (which sounds loop tree related) to be populated in the caller of try_unroll_loop_completely which could then pass an adjusted 'cunrolli' instead? In fact I wonder whether we should simply populate the bitmap from a for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST)) bitmap_set_bit (original_innermost, loop->num); loop instead and pass pass-is-cunrolli && bitmap_bit_p (..) as cunrolli down to try_unroll_loop_completely? Thanks, Richard. > return false; > } > } > @@ -1248,7 +1259,9 @@ try_peel_loop (class loop *loop, > static bool > canonicalize_loop_induction_variables (class loop *loop, > bool create_iv, enum unroll_level ul, > - bool try_eval, bool allow_peel, bool > cunrolli) > + bool try_eval, bool allow_peel, > + auto_sbitmap& visited_innermost, > + bool cunrolli) > { > edge exit = NULL; > tree niter; > @@ -1335,7 +1348,8 @@ canonicalize_loop_induction_variables (class loop *loop, > > dump_user_location_t locus = find_loop_location (loop); > if (try_unroll_loop_completely (loop, exit, niter, may_be_zero, ul, > - maxiter, locus, allow_peel, cunrolli)) > + maxiter, locus, allow_peel, > + visited_innermost, cunrolli)) > return true; > > if (create_iv > @@ -1372,6 +1386,8 @@ canonicalize_induction_variables (void) > bool changed = false; > bool irred_invalidated = false; > bitmap loop_closed_ssa_invalidated = BITMAP_ALLOC (NULL); > + auto_sbitmap visited_innermost (number_of_loops (cfun)); > + bitmap_clear (visited_innermost); > > estimate_numbers_of_iterations (cfun); > > @@ -1379,7 +1395,8 @@ canonicalize_induction_variables (void) > { > changed |= canonicalize_loop_induction_variables (loop, > true, UL_SINGLE_ITER, > - true, false, false); > + true, false, > + visited_innermost, > false); > } > gcc_assert (!need_ssa_update_p (cfun)); > > @@ -1413,7 +1430,8 @@ canonicalize_induction_variables (void) > > static bool > tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer, > - bitmap father_bbs, class loop *loop, bool > cunrolli) > + bitmap father_bbs, class loop *loop, > + auto_sbitmap& visited_innermost, bool cunrolli) > { > class loop *loop_father; > bool changed = false; > @@ -1431,7 +1449,8 @@ tree_unroll_loops_completely_1 (bool may_increase_size, > bool unroll_outer, > if (!child_father_bbs) > child_father_bbs = BITMAP_ALLOC (NULL); > if (tree_unroll_loops_completely_1 (may_increase_size, unroll_outer, > - child_father_bbs, inner, cunrolli)) > + child_father_bbs, inner, > + visited_innermost, cunrolli)) > { > bitmap_ior_into (father_bbs, child_father_bbs); > bitmap_clear (child_father_bbs); > @@ -1477,7 +1496,8 @@ tree_unroll_loops_completely_1 (bool may_increase_size, > bool unroll_outer, > ul = UL_NO_GROWTH; > > if (canonicalize_loop_induction_variables > - (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer, cunrolli)) > + (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer, > + visited_innermost, cunrolli)) > { > /* If we'll continue unrolling, we need to propagate constants > within the new basic blocks to fold away induction variable > @@ -1503,16 +1523,18 @@ tree_unroll_loops_completely_1 (bool > may_increase_size, bool unroll_outer, > > /* Unroll LOOPS completely if they iterate just few times. Unless > MAY_INCREASE_SIZE is true, perform the unrolling only if the > - size of the code does not increase. */ > + size of the code does not increase. > + cunrolli is true when passs is cunrolli. */ > > static unsigned int > -tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer) > +tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer, > bool cunrolli) > { > bitmap father_bbs = BITMAP_ALLOC (NULL); > bool changed; > int iteration = 0; > bool irred_invalidated = false; > - bool cunrolli = true; > + auto_sbitmap visited_innermost (number_of_loops (cfun)); > + bitmap_clear (visited_innermost); > > estimate_numbers_of_iterations (cfun); > > @@ -1530,7 +1552,7 @@ tree_unroll_loops_completely (bool may_increase_size, > bool unroll_outer) > changed = tree_unroll_loops_completely_1 (may_increase_size, > unroll_outer, father_bbs, > current_loops->tree_root, > - cunrolli); > + visited_innermost, cunrolli); > if (changed) > { > unsigned i; > @@ -1697,7 +1719,7 @@ pass_complete_unroll::execute (function *fun) > re-peeling the same loop multiple times. */ > if (flag_peel_loops) > peeled_loops = BITMAP_ALLOC (NULL); > - unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size, > true); > + unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size, > true, false); > if (peeled_loops) > { > BITMAP_FREE (peeled_loops); > @@ -1753,7 +1775,7 @@ pass_complete_unrolli::execute (function *fun) > if (number_of_loops (fun) > 1) > { > scev_initialize (); > - ret = tree_unroll_loops_completely (optimize >= 3, false); > + ret = tree_unroll_loops_completely (optimize >= 3, false, true); > scev_finalize (); > } > loop_optimizer_finalize (); > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)