On Mon, 9 Jan 2017, Jakub Jelinek wrote: > On Mon, Jan 09, 2017 at 11:08:08AM +0100, Richard Biener wrote: > > > > > There is one thing my patch doesn't do but should for efficiency, if > > > > > loop1 > > > > > (outer loop) is not successfully outer-loop vectorized, then we > > > > > should mark > > > > > loop2 (its inner loop) as dont_vectorize if the outer loop has been > > > > > LOOP_VECTORIZED guarded. Then the gcc.dg/gomp/pr68128-1.c change > > > > > wouldn't be actually needed. > > > > (*) > > Ok, I don't have too many spare cycles either so can you fix (*)? Then > > we can go with the extra versionings for GCC 7 for the moment and if any > > of us has enough time to revisit this soon we can. > > Here is untested (except for the affected testcases) patch to do that, > ok if it passes bootstrap/regtest? I'll file [8 Regression] with details > what we want to undo and do for GCC 8.
Ok. Thanks, Richard. > 2017-01-09 Jakub Jelinek <ja...@redhat.com> > > PR tree-optimization/78899 > * tree-if-conv.c (version_loop_for_if_conversion): Instead of > returning bool return struct loop *, NULL for failure and the new > loop on success. > (versionable_outer_loop_p): Don't version outer loop if it has > dont_vectorize bit set. > (tree_if_conversion): When versioning outer loop, ensure > tree_if_conversion is performed also on the inner loop of the > non-vectorizable outer loop copy. > * tree-vectorizer.c (set_uid_loop_bbs): Formatting fix. Fold > LOOP_VECTORIZED in inner loop of the scalar outer loop and > prevent vectorization of it. > (vectorize_loops): For outer + inner LOOP_VECTORIZED, ensure > the outer loop vectorization of the non-scalar version is attempted > before vectorization of the inner loop in scalar version. If > outer LOOP_VECTORIZED guarded loop is not vectorized, prevent > vectorization of its inner loop. > * tree-vect-loop-manip.c (rename_variables_in_bb): If outer_loop > has 2 inner loops, rename also on edges from bb whose single pred > is outer_loop->header. Fix typo in function comment. > > * gcc.target/i386/pr78899.c: New test. > * gcc.dg/pr71077.c: New test. > > --- gcc/tree-if-conv.c.jj 2017-01-06 19:34:04.052560851 +0100 > +++ gcc/tree-if-conv.c 2017-01-09 11:52:59.154806369 +0100 > @@ -2535,7 +2535,7 @@ combine_blocks (struct loop *loop) > loop to execute. The vectorizer pass will fold this > internal call into either true or false. 
*/ > > -static bool > +static struct loop * > version_loop_for_if_conversion (struct loop *loop) > { > basic_block cond_bb; > @@ -2566,7 +2566,7 @@ version_loop_for_if_conversion (struct l > ifc_bbs[i]->aux = saved_preds[i]; > > if (new_loop == NULL) > - return false; > + return NULL; > > new_loop->dont_vectorize = true; > new_loop->force_vectorize = false; > @@ -2574,7 +2574,7 @@ version_loop_for_if_conversion (struct l > gimple_call_set_arg (g, 1, build_int_cst (integer_type_node, > new_loop->num)); > gsi_insert_before (&gsi, g, GSI_SAME_STMT); > update_ssa (TODO_update_ssa); > - return true; > + return new_loop; > } > > /* Return true when LOOP satisfies the follow conditions that will > @@ -2594,6 +2594,7 @@ static bool > versionable_outer_loop_p (struct loop *loop) > { > if (!loop_outer (loop) > + || loop->dont_vectorize > || !loop->inner > || loop->inner->next > || !single_exit (loop) > @@ -2602,7 +2603,7 @@ versionable_outer_loop_p (struct loop *l > || !single_pred_p (loop->latch) > || !single_pred_p (loop->inner->latch)) > return false; > - > + > basic_block outer_exit = single_pred (loop->latch); > basic_block inner_exit = single_pred (loop->inner->latch); > > @@ -2789,7 +2790,10 @@ tree_if_conversion (struct loop *loop) > { > unsigned int todo = 0; > bool aggressive_if_conv; > + struct loop *rloop; > > + again: > + rloop = NULL; > ifc_bbs = NULL; > any_pred_load_store = false; > any_complicated_phi = false; > @@ -2829,8 +2833,31 @@ tree_if_conversion (struct loop *loop) > struct loop *vloop > = (versionable_outer_loop_p (loop_outer (loop)) > ? loop_outer (loop) : loop); > - if (!version_loop_for_if_conversion (vloop)) > + struct loop *nloop = version_loop_for_if_conversion (vloop); > + if (nloop == NULL) > goto cleanup; > + if (vloop != loop) > + { > + /* If versionable_outer_loop_p decided to version the > + outer loop, version also the inner loop of the non-vectorized > + loop copy. 
So we transform: > + loop1 > + loop2 > + into: > + if (LOOP_VECTORIZED (1, 3)) > + { > + loop1 > + loop2 > + } > + else > + loop3 (copy of loop1) > + if (LOOP_VECTORIZED (4, 5)) > + loop4 (copy of loop2) > + else > + loop5 (copy of loop4) */ > + gcc_assert (nloop->inner && nloop->inner->next == NULL); > + rloop = nloop->inner; > + } > } > > /* Now all statements are if-convertible. Combine all the basic > @@ -2854,6 +2881,11 @@ tree_if_conversion (struct loop *loop) > free (ifc_bbs); > ifc_bbs = NULL; > } > + if (rloop != NULL) > + { > + loop = rloop; > + goto again; > + } > > return todo; > } > --- gcc/tree-vectorizer.c.jj 2017-01-06 19:34:04.064560697 +0100 > +++ gcc/tree-vectorizer.c 2017-01-09 12:04:08.721053174 +0100 > @@ -465,6 +465,7 @@ fold_loop_vectorized_call (gimple *g, tr > update_stmt (use_stmt); > } > } > + > /* Set the uids of all the statements in basic blocks inside loop > represented by LOOP_VINFO. LOOP_VECTORIZED_CALL is the internal > call guarding the loop which has been if converted. */ > @@ -477,9 +478,22 @@ set_uid_loop_bbs (loop_vec_info loop_vin > struct loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg)); > > LOOP_VINFO_SCALAR_LOOP (loop_vinfo) = scalar_loop; > - gcc_checking_assert (vect_loop_vectorized_call > - (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) > + gcc_checking_assert (vect_loop_vectorized_call (scalar_loop) > == loop_vectorized_call); > + /* If we are going to vectorize outer loop, prevent vectorization > + of the inner loop in the scalar loop - either the scalar loop is > + thrown away, so it is a wasted work, or is used only for > + a few iterations. 
*/ > + if (scalar_loop->inner) > + { > + gimple *g = vect_loop_vectorized_call (scalar_loop->inner); > + if (g) > + { > + arg = gimple_call_arg (g, 0); > + get_loop (cfun, tree_to_shwi (arg))->dont_vectorize = true; > + fold_loop_vectorized_call (g, boolean_false_node); > + } > + } > bbs = get_loop_body (scalar_loop); > for (i = 0; i < scalar_loop->num_nodes; i++) > { > @@ -534,14 +548,59 @@ vectorize_loops (void) > only over initial loops skipping newly generated ones. */ > FOR_EACH_LOOP (loop, 0) > if (loop->dont_vectorize) > - any_ifcvt_loops = true; > - else if ((flag_tree_loop_vectorize > - && optimize_loop_nest_for_speed_p (loop)) > - || loop->force_vectorize) > { > - loop_vec_info loop_vinfo, orig_loop_vinfo = NULL; > - gimple *loop_vectorized_call = vect_loop_vectorized_call (loop); > -vectorize_epilogue: > + any_ifcvt_loops = true; > + /* If-conversion sometimes versions both the outer loop > + (for the case when outer loop vectorization might be > + desirable) as well as the inner loop in the scalar version > + of the loop. So we have: > + if (LOOP_VECTORIZED (1, 3)) > + { > + loop1 > + loop2 > + } > + else > + loop3 (copy of loop1) > + if (LOOP_VECTORIZED (4, 5)) > + loop4 (copy of loop2) > + else > + loop5 (copy of loop4) > + If FOR_EACH_LOOP gives us loop3 first (which has > + dont_vectorize set), make sure to process loop1 before loop4; > + so that we can prevent vectorization of loop4 if loop1 > + is successfully vectorized. */ > + if (loop->inner) > + { > + gimple *loop_vectorized_call > + = vect_loop_vectorized_call (loop); > + if (loop_vectorized_call > + && vect_loop_vectorized_call (loop->inner)) > + { > + tree arg = gimple_call_arg (loop_vectorized_call, 0); > + struct loop *vector_loop > + = get_loop (cfun, tree_to_shwi (arg)); > + if (vector_loop && vector_loop != loop) > + { > + loop = vector_loop; > + /* Make sure we don't vectorize it twice. 
*/ > + loop->dont_vectorize = true; > + goto try_vectorize; > + } > + } > + } > + } > + else > + { > + loop_vec_info loop_vinfo, orig_loop_vinfo; > + gimple *loop_vectorized_call; > + try_vectorize: > + if (!((flag_tree_loop_vectorize > + && optimize_loop_nest_for_speed_p (loop)) > + || loop->force_vectorize)) > + continue; > + orig_loop_vinfo = NULL; > + loop_vectorized_call = vect_loop_vectorized_call (loop); > + vectorize_epilogue: > vect_location = find_loop_location (loop); > if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION > && dump_enabled_p ()) > @@ -595,6 +654,12 @@ vectorize_epilogue: > ret |= TODO_cleanup_cfg; > } > } > + /* If outer loop vectorization fails for LOOP_VECTORIZED guarded > + loop, don't vectorize its inner loop; we'll attempt to > + vectorize LOOP_VECTORIZED guarded inner loop of the scalar > + loop version. */ > + if (loop_vectorized_call && loop->inner) > + loop->inner->dont_vectorize = true; > continue; > } > > --- gcc/tree-vect-loop-manip.c.jj 2017-01-06 19:34:04.040561006 +0100 > +++ gcc/tree-vect-loop-manip.c 2017-01-09 11:52:59.155806356 +0100 > @@ -71,7 +71,7 @@ rename_use_op (use_operand_p op_p) > } > > > -/* Renames the variables in basic block BB. Allow renaming of PHI argumnets > +/* Renames the variables in basic block BB. Allow renaming of PHI arguments > on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is > true. */ > > @@ -102,9 +102,25 @@ rename_variables_in_bb (basic_block bb, > > FOR_EACH_EDGE (e, ei, bb->preds) > { > - if (!flow_bb_inside_loop_p (loop, e->src) > - && (!rename_from_outer_loop || e->src != outer_loop->header)) > - continue; > + if (!flow_bb_inside_loop_p (loop, e->src)) > + { > + if (!rename_from_outer_loop) > + continue; > + if (e->src != outer_loop->header) > + { > + if (outer_loop->inner->next) > + { > + /* If outer_loop has 2 inner loops, allow there to > + be an extra basic block which decides which of the > + two loops to use using LOOP_VECTORIZED. 
*/ > + if (!single_pred_p (e->src) > + || single_pred (e->src) != outer_loop->header) > + continue; > + } > + else > + continue; > + } > + } > for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi); > gsi_next (&gsi)) > rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e)); > --- gcc/testsuite/gcc.target/i386/pr78899.c.jj 2017-01-09 > 11:52:59.156806343 +0100 > +++ gcc/testsuite/gcc.target/i386/pr78899.c 2017-01-09 11:52:59.156806343 > +0100 > @@ -0,0 +1,27 @@ > +/* PR tree-optimization/78899 */ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast -fopenmp-simd -mavx2 -mno-avx512f" } */ > + > +#define N 1024 > +#define M 4 > +int p1[N], p2[N], p3[N], c[N]; > + > +void > +foo (int n) > +{ > + int i, k; > + for (k = 0; k < n / M; k++) > + { > + #pragma omp simd > + for (i = 0; i < M; i++) > + if (c[k * M + i]) > + { > + p1[k * M + i] += 1; > + p2[k * M + i] = p3[k * M + i] + 2; > + } > + } > +} > + > +/* Ensure the loop is vectorized. */ > +/* { dg-final { scan-assembler "vpmaskmov" } } */ > +/* { dg-final { scan-assembler "vpadd" } } */ > --- gcc/testsuite/gcc.dg/pr71077.c.jj 2017-01-09 11:52:59.179806042 +0100 > +++ gcc/testsuite/gcc.dg/pr71077.c 2017-01-09 11:52:59.179806042 +0100 > @@ -0,0 +1,14 @@ > +/* PR c++/71077 */ > +/* { dg-do compile } */ > +/* { dg-options "-O3" } */ > +/* { dg-additional-options "-mavx2" { target { i?86-*-* x86_64-*-* } } } */ > + > +void > +foo (int *a, int n) > +{ > + int b, c; > + for (b = 0; b < n; b++) > + for (c = 0; c < 32; c++) > + if ((b & 1U) << c) > + a[b + c] = 0; > +} > > > Jakub > > -- Richard Biener <rguent...@suse.de> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)