On Mon, 9 Jan 2017, Jakub Jelinek wrote:

> On Mon, Jan 09, 2017 at 11:08:08AM +0100, Richard Biener wrote:
> > > > > There is one thing my patch doesn't do but should for efficiency, if 
> > > > > loop1
> > > > > (outer loop) is not successfully outer-loop vectorized, then we 
> > > > > should mark
> > > > > loop2 (its inner loop) as dont_vectorize if the outer loop has been
> > > > > LOOP_VECTORIZED guarded.  Then the gcc.dg/gomp/pr68128-1.c change
> > > > > wouldn't be actually needed.
> > 
> > (*)
> > Ok, I don't have too many spare cycles either so can you fix (*)?  Then
> > we can go with the extra versionings for GCC 7 for the moment and if any
> > of us has enough time to revisit this soon we can.
> 
> Here is untested (except for the affected testcases) patch to do that,
> ok if it passes bootstrap/regtest?  I'll file [8 Regression] with details
> what we want to undo and do for GCC 8.

Ok.

Thanks,
Richard.

> 2017-01-09  Jakub Jelinek  <ja...@redhat.com>
> 
>       PR tree-optimization/78899
>       * tree-if-conv.c (version_loop_for_if_conversion): Instead of
>       returning bool return struct loop *, NULL for failure and the new
>       loop on success.
>       (versionable_outer_loop_p): Don't version outer loop if it has
>       dont_vectorized bit set.
>       (tree_if_conversion): When versioning outer loop, ensure
>       tree_if_conversion is performed also on the inner loop of the
>       non-vectorizable outer loop copy.
>       * tree-vectorizer.c (set_uid_loop_bbs): Formatting fix.  Fold
>       LOOP_VECTORIZED in inner loop of the scalar outer loop and
>       prevent vectorization of it.
>       (vectorize_loops): For outer + inner LOOP_VECTORIZED, ensure
>       the outer loop vectorization of the non-scalar version is attempted
>       before vectorization of the inner loop in scalar version.  If
>       outer LOOP_VECTORIZED guarded loop is not vectorized, prevent
>       vectorization of its inner loop.
>       * tree-vect-loop-manip.c (rename_variables_in_bb): If outer_loop
>       has 2 inner loops, rename also on edges from bb whose single pred
>       is outer_loop->header.  Fix typo in function comment.
> 
>       * gcc.target/i386/pr78899.c: New test.
>       * gcc.dg/pr71077.c: New test.
> 
> --- gcc/tree-if-conv.c.jj     2017-01-06 19:34:04.052560851 +0100
> +++ gcc/tree-if-conv.c        2017-01-09 11:52:59.154806369 +0100
> @@ -2535,7 +2535,7 @@ combine_blocks (struct loop *loop)
>     loop to execute.  The vectorizer pass will fold this
>     internal call into either true or false.  */
>  
> -static bool
> +static struct loop *
>  version_loop_for_if_conversion (struct loop *loop)
>  {
>    basic_block cond_bb;
> @@ -2566,7 +2566,7 @@ version_loop_for_if_conversion (struct l
>      ifc_bbs[i]->aux = saved_preds[i];
>  
>    if (new_loop == NULL)
> -    return false;
> +    return NULL;
>  
>    new_loop->dont_vectorize = true;
>    new_loop->force_vectorize = false;
> @@ -2574,7 +2574,7 @@ version_loop_for_if_conversion (struct l
>    gimple_call_set_arg (g, 1, build_int_cst (integer_type_node, 
> new_loop->num));
>    gsi_insert_before (&gsi, g, GSI_SAME_STMT);
>    update_ssa (TODO_update_ssa);
> -  return true;
> +  return new_loop;
>  }
>  
>  /* Return true when LOOP satisfies the follow conditions that will
> @@ -2594,6 +2594,7 @@ static bool
>  versionable_outer_loop_p (struct loop *loop)
>  {
>    if (!loop_outer (loop)
> +      || loop->dont_vectorize
>        || !loop->inner
>        || loop->inner->next
>        || !single_exit (loop)
> @@ -2602,7 +2603,7 @@ versionable_outer_loop_p (struct loop *l
>        || !single_pred_p (loop->latch)
>        || !single_pred_p (loop->inner->latch))
>      return false;
> -  
> +
>    basic_block outer_exit = single_pred (loop->latch);
>    basic_block inner_exit = single_pred (loop->inner->latch);
>  
> @@ -2789,7 +2790,10 @@ tree_if_conversion (struct loop *loop)
>  {
>    unsigned int todo = 0;
>    bool aggressive_if_conv;
> +  struct loop *rloop;
>  
> + again:
> +  rloop = NULL;
>    ifc_bbs = NULL;
>    any_pred_load_store = false;
>    any_complicated_phi = false;
> @@ -2829,8 +2833,31 @@ tree_if_conversion (struct loop *loop)
>        struct loop *vloop
>       = (versionable_outer_loop_p (loop_outer (loop))
>          ? loop_outer (loop) : loop);
> -      if (!version_loop_for_if_conversion (vloop))
> +      struct loop *nloop = version_loop_for_if_conversion (vloop);
> +      if (nloop == NULL)
>       goto cleanup;
> +      if (vloop != loop)
> +     {
> +       /* If versionable_outer_loop_p decided to version the
> +          outer loop, version also the inner loop of the non-vectorized
> +          loop copy.  So we transform:
> +           loop1
> +             loop2
> +          into:
> +           if (LOOP_VECTORIZED (1, 3))
> +             {
> +               loop1
> +                 loop2
> +             }
> +           else
> +             loop3 (copy of loop1)
> +               if (LOOP_VECTORIZED (4, 5))
> +                 loop4 (copy of loop2)
> +               else
> +                 loop5 (copy of loop4)  */
> +       gcc_assert (nloop->inner && nloop->inner->next == NULL);
> +       rloop = nloop->inner;
> +     }
>      }
>  
>    /* Now all statements are if-convertible.  Combine all the basic
> @@ -2854,6 +2881,11 @@ tree_if_conversion (struct loop *loop)
>        free (ifc_bbs);
>        ifc_bbs = NULL;
>      }
> +  if (rloop != NULL)
> +    {
> +      loop = rloop;
> +      goto again;
> +    }
>  
>    return todo;
>  }
> --- gcc/tree-vectorizer.c.jj  2017-01-06 19:34:04.064560697 +0100
> +++ gcc/tree-vectorizer.c     2017-01-09 12:04:08.721053174 +0100
> @@ -465,6 +465,7 @@ fold_loop_vectorized_call (gimple *g, tr
>        update_stmt (use_stmt);
>      }
>  }
> +
>  /* Set the uids of all the statements in basic blocks inside loop
>     represented by LOOP_VINFO. LOOP_VECTORIZED_CALL is the internal
>     call guarding the loop which has been if converted.  */
> @@ -477,9 +478,22 @@ set_uid_loop_bbs (loop_vec_info loop_vin
>    struct loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
>  
>    LOOP_VINFO_SCALAR_LOOP (loop_vinfo) = scalar_loop;
> -  gcc_checking_assert (vect_loop_vectorized_call
> -                    (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
> +  gcc_checking_assert (vect_loop_vectorized_call (scalar_loop)
>                      == loop_vectorized_call);
> +  /* If we are going to vectorize outer loop, prevent vectorization
> +     of the inner loop in the scalar loop - either the scalar loop is
> +     thrown away, so it is a wasted work, or is used only for
> +     a few iterations.  */
> +  if (scalar_loop->inner)
> +    {
> +      gimple *g = vect_loop_vectorized_call (scalar_loop->inner);
> +      if (g)
> +     {
> +       arg = gimple_call_arg (g, 0);
> +       get_loop (cfun, tree_to_shwi (arg))->dont_vectorize = true;
> +       fold_loop_vectorized_call (g, boolean_false_node);
> +     }
> +    }
>    bbs = get_loop_body (scalar_loop);
>    for (i = 0; i < scalar_loop->num_nodes; i++)
>      {
> @@ -534,14 +548,59 @@ vectorize_loops (void)
>       only over initial loops skipping newly generated ones.  */
>    FOR_EACH_LOOP (loop, 0)
>      if (loop->dont_vectorize)
> -      any_ifcvt_loops = true;
> -    else if ((flag_tree_loop_vectorize
> -           && optimize_loop_nest_for_speed_p (loop))
> -          || loop->force_vectorize)
>        {
> -     loop_vec_info loop_vinfo, orig_loop_vinfo = NULL;
> -     gimple *loop_vectorized_call = vect_loop_vectorized_call (loop);
> -vectorize_epilogue:
> +     any_ifcvt_loops = true;
> +     /* If-conversion sometimes versions both the outer loop
> +        (for the case when outer loop vectorization might be
> +        desirable) as well as the inner loop in the scalar version
> +        of the loop.  So we have:
> +         if (LOOP_VECTORIZED (1, 3))
> +           {
> +             loop1
> +               loop2
> +           }
> +         else
> +           loop3 (copy of loop1)
> +             if (LOOP_VECTORIZED (4, 5))
> +               loop4 (copy of loop2)
> +             else
> +               loop5 (copy of loop4)
> +        If FOR_EACH_LOOP gives us loop3 first (which has
> +        dont_vectorize set), make sure to process loop1 before loop4;
> +        so that we can prevent vectorization of loop4 if loop1
> +        is successfully vectorized.  */
> +     if (loop->inner)
> +       {
> +         gimple *loop_vectorized_call
> +           = vect_loop_vectorized_call (loop);
> +         if (loop_vectorized_call
> +             && vect_loop_vectorized_call (loop->inner))
> +           {
> +             tree arg = gimple_call_arg (loop_vectorized_call, 0);
> +             struct loop *vector_loop
> +               = get_loop (cfun, tree_to_shwi (arg));
> +             if (vector_loop && vector_loop != loop)
> +               {
> +                 loop = vector_loop;
> +                 /* Make sure we don't vectorize it twice.  */
> +                 loop->dont_vectorize = true;
> +                 goto try_vectorize;
> +               }
> +           }
> +       }
> +      }
> +    else
> +      {
> +     loop_vec_info loop_vinfo, orig_loop_vinfo;
> +     gimple *loop_vectorized_call;
> +       try_vectorize:
> +     if (!((flag_tree_loop_vectorize
> +            && optimize_loop_nest_for_speed_p (loop))
> +           || loop->force_vectorize))
> +       continue;
> +     orig_loop_vinfo = NULL;
> +     loop_vectorized_call = vect_loop_vectorized_call (loop);
> +       vectorize_epilogue:
>       vect_location = find_loop_location (loop);
>          if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
>           && dump_enabled_p ())
> @@ -595,6 +654,12 @@ vectorize_epilogue:
>                   ret |= TODO_cleanup_cfg;
>                 }
>             }
> +         /* If outer loop vectorization fails for LOOP_VECTORIZED guarded
> +            loop, don't vectorize its inner loop; we'll attempt to
> +            vectorize LOOP_VECTORIZED guarded inner loop of the scalar
> +            loop version.  */
> +         if (loop_vectorized_call && loop->inner)
> +           loop->inner->dont_vectorize = true;
>           continue;
>         }
>  
> --- gcc/tree-vect-loop-manip.c.jj     2017-01-06 19:34:04.040561006 +0100
> +++ gcc/tree-vect-loop-manip.c        2017-01-09 11:52:59.155806356 +0100
> @@ -71,7 +71,7 @@ rename_use_op (use_operand_p op_p)
>  }
>  
>  
> -/* Renames the variables in basic block BB.  Allow renaming  of PHI argumnets
> +/* Renames the variables in basic block BB.  Allow renaming  of PHI arguments
>     on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
>     true.  */
>  
> @@ -102,9 +102,25 @@ rename_variables_in_bb (basic_block bb,
>  
>    FOR_EACH_EDGE (e, ei, bb->preds)
>      {
> -      if (!flow_bb_inside_loop_p (loop, e->src)
> -       && (!rename_from_outer_loop || e->src != outer_loop->header))
> -     continue;
> +      if (!flow_bb_inside_loop_p (loop, e->src))
> +     {
> +       if (!rename_from_outer_loop)
> +         continue;
> +       if (e->src != outer_loop->header)
> +         {
> +           if (outer_loop->inner->next)
> +             {
> +               /* If outer_loop has 2 inner loops, allow there to
> +                  be an extra basic block which decides which of the
> +                  two loops to use using LOOP_VECTORIZED.  */
> +               if (!single_pred_p (e->src)
> +                   || single_pred (e->src) != outer_loop->header)
> +                 continue;
> +             }
> +           else
> +             continue;
> +         }
> +     }
>        for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
>          gsi_next (&gsi))
>          rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
> --- gcc/testsuite/gcc.target/i386/pr78899.c.jj        2017-01-09 
> 11:52:59.156806343 +0100
> +++ gcc/testsuite/gcc.target/i386/pr78899.c   2017-01-09 11:52:59.156806343 
> +0100
> @@ -0,0 +1,27 @@
> +/* PR tree-optimization/78899 */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -fopenmp-simd -mavx2 -mno-avx512f" } */
> +
> +#define N 1024
> +#define M 4
> +int p1[N], p2[N], p3[N], c[N];
> +
> +void
> +foo (int n)
> +{
> +  int i, k;
> +  for (k = 0; k < n / M; k++)
> +    {
> +    #pragma omp simd
> +      for (i = 0; i < M; i++)
> +     if (c[k * M + i])
> +       {
> +         p1[k * M + i] += 1;
> +         p2[k * M + i] = p3[k * M + i] + 2;
> +       }
> +    }
> +}
> +
> +/* Ensure the loop is vectorized.  */
> +/* { dg-final { scan-assembler "vpmaskmov" } } */
> +/* { dg-final { scan-assembler "vpadd" } } */
> --- gcc/testsuite/gcc.dg/pr71077.c.jj 2017-01-09 11:52:59.179806042 +0100
> +++ gcc/testsuite/gcc.dg/pr71077.c    2017-01-09 11:52:59.179806042 +0100
> @@ -0,0 +1,14 @@
> +/* PR c++/71077  */
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-additional-options "-mavx2" { target { i?86-*-* x86_64-*-* } } } */
> +
> +void
> +foo (int *a, int n)
> +{
> +  int b, c;
> +  for (b = 0; b < n; b++)
> +    for (c = 0; c < 32; c++)
> +      if ((b & 1U) << c)
> +     a[b + c] = 0;
> +}
> 
> 
>       Jakub
> 
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 
21284 (AG Nuernberg)

Reply via email to