On Fri, Jul 20, 2018 at 12:22 PM Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> We couldn't vectorise:
>
>   for (int j = 0; j < n; ++j)
>     {
>       for (int i = 0; i < 16; ++i)
>         a[i] = (b[i] + c[i]) >> 1;
>       a += step;
>       b += step;
>       c += step;
>     }
>
> at -O3 because cunrolli unrolled the inner loop and SLP couldn't handle
> AVG_FLOOR patterns (see also PR86504).  The problem was some overly
> strict checking of pattern statements compared to normal statements
> in vect_get_and_check_slp_defs:
>
>           switch (gimple_code (def_stmt))
>             {
>             case GIMPLE_PHI:
>             case GIMPLE_ASSIGN:
>               break;
>
>             default:
>               if (dump_enabled_p ())
>                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>                                  "unsupported defining stmt:\n");
>               return -1;
>             }
>
> The easy fix would have been to add GIMPLE_CALL to the switch,
> but I don't think the switch is doing anything useful.  We only create
> pattern statements that the rest of the vectoriser can handle, and code
> in this function and elsewhere check whether SLP is possible.
>
> I'm also not sure why:
>
>           if (!first && !oprnd_info->first_pattern
>               /* Allow different pattern state for the defs of the
>                  first stmt in reduction chains.  */
>               && (oprnd_info->first_dt != vect_reduction_def
>
> is necessary.  All that should matter is that the statements in the
> node are "similar enough".  It turned out to be quite hard to find a
> convincing example that used a mixture of pattern and non-pattern
> statements, so bb-slp-pow-1.c is the best I could come up with.
> But it does show that the combination of "xi * xi" statements and
> "pow (xj, 2) -> xj * xj" patterns are handled correctly.
>
> The patch therefore just removes the whole if block.
>
> The loop also needed commutative swapping to be extended to at least
> AVG_FLOOR.
>
> This gives +3.9% on 525.x264_r at -O3.
>
> Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf
> and x86_64-linux-gnu.  OK to install?

Always nice to see seemingly dead code go.  OK if you can still
build SPEC with this change and pass a test run.

At least I _do_ seem to remember having "issues" in this area...

Thanks,
Richard.

> Richard
>
>
> 2018-07-20  Richard Sandiford  <richard.sandif...@arm.com>
>
> gcc/
>         * internal-fn.h (first_commutative_argument): Declare.
>         * internal-fn.c (first_commutative_argument): New function.
>         * tree-vect-slp.c (vect_get_and_check_slp_defs): Remove extra
>         restrictions for pattern statements.  Use first_commutative_argument
>         to look for commutative operands in calls to internal functions.
>
> gcc/testsuite/
>         * gcc.dg/vect/bb-slp-over-widen-1.c: Expect AVG_FLOOR to be used
>         on vect_avg_qi targets.
>         * gcc.dg/vect/bb-slp-over-widen-2.c: Likewise.
>         * gcc.dg/vect/bb-slp-pow-1.c: New test.
>         * gcc.dg/vect/vect-avg-15.c: Likewise.
>
> Index: gcc/internal-fn.h
> ===================================================================
> --- gcc/internal-fn.h   2018-07-13 10:11:14.009847140 +0100
> +++ gcc/internal-fn.h   2018-07-20 11:18:58.167047743 +0100
> @@ -201,6 +201,8 @@ direct_internal_fn_supported_p (internal
>                                          opt_type);
>  }
>
> +extern int first_commutative_argument (internal_fn);
> +
>  extern bool set_edom_supported_p (void);
>
>  extern internal_fn get_conditional_internal_fn (tree_code);
> Index: gcc/internal-fn.c
> ===================================================================
> --- gcc/internal-fn.c   2018-07-13 10:11:14.009847140 +0100
> +++ gcc/internal-fn.c   2018-07-20 11:18:58.163047778 +0100
> @@ -3183,6 +3183,42 @@ direct_internal_fn_supported_p (internal
>    return direct_internal_fn_supported_p (fn, tree_pair (type, type), 
> opt_type);
>  }
>
> +/* If FN is commutative in two consecutive arguments, return the
> +   index of the first, otherwise return -1.  */
> +
> +int
> +first_commutative_argument (internal_fn fn)
> +{
> +  switch (fn)
> +    {
> +    case IFN_FMA:
> +    case IFN_FMS:
> +    case IFN_FNMA:
> +    case IFN_FNMS:
> +    case IFN_AVG_FLOOR:
> +    case IFN_AVG_CEIL:
> +    case IFN_FMIN:
> +    case IFN_FMAX:
> +      return 0;
> +
> +    case IFN_COND_ADD:
> +    case IFN_COND_MUL:
> +    case IFN_COND_MIN:
> +    case IFN_COND_MAX:
> +    case IFN_COND_AND:
> +    case IFN_COND_IOR:
> +    case IFN_COND_XOR:
> +    case IFN_COND_FMA:
> +    case IFN_COND_FMS:
> +    case IFN_COND_FNMA:
> +    case IFN_COND_FNMS:
> +      return 1;
> +
> +    default:
> +      return -1;
> +    }
> +}
> +
>  /* Return true if IFN_SET_EDOM is supported.  */
>
>  bool
> Index: gcc/tree-vect-slp.c
> ===================================================================
> --- gcc/tree-vect-slp.c 2018-07-13 10:11:15.113837768 +0100
> +++ gcc/tree-vect-slp.c 2018-07-20 11:18:58.167047743 +0100
> @@ -299,15 +299,20 @@ vect_get_and_check_slp_defs (vec_info *v
>    bool pattern = false;
>    slp_oprnd_info oprnd_info;
>    int first_op_idx = 1;
> -  bool commutative = false;
> +  unsigned int commutative_op = -1U;
>    bool first_op_cond = false;
>    bool first = stmt_num == 0;
>    bool second = stmt_num == 1;
>
> -  if (is_gimple_call (stmt))
> +  if (gcall *call = dyn_cast <gcall *> (stmt))
>      {
> -      number_of_oprnds = gimple_call_num_args (stmt);
> +      number_of_oprnds = gimple_call_num_args (call);
>        first_op_idx = 3;
> +      if (gimple_call_internal_p (call))
> +       {
> +         internal_fn ifn = gimple_call_internal_fn (call);
> +         commutative_op = first_commutative_argument (ifn);
> +       }
>      }
>    else if (is_gimple_assign (stmt))
>      {
> @@ -322,7 +327,7 @@ vect_get_and_check_slp_defs (vec_info *v
>           number_of_oprnds++;
>         }
>        else
> -       commutative = commutative_tree_code (code);
> +       commutative_op = commutative_tree_code (code) ? 0U : -1U;
>      }
>    else
>      return -1;
> @@ -361,65 +366,6 @@ vect_get_and_check_slp_defs (vec_info *v
>           return -1;
>         }
>
> -      /* Check if DEF_STMT is a part of a pattern in LOOP and get the def 
> stmt
> -         from the pattern.  Check that all the stmts of the node are in the
> -         pattern.  */
> -      if (def_stmt && gimple_bb (def_stmt)
> -         && vect_stmt_in_region_p (vinfo, def_stmt)
> -         && vinfo_for_stmt (def_stmt)
> -         && is_pattern_stmt_p (vinfo_for_stmt (def_stmt)))
> -        {
> -          pattern = true;
> -          if (!first && !oprnd_info->first_pattern
> -             /* Allow different pattern state for the defs of the
> -                first stmt in reduction chains.  */
> -             && (oprnd_info->first_dt != vect_reduction_def
> -                 || (!second && !oprnd_info->second_pattern)))
> -           {
> -             if (i == 0
> -                 && !swapped
> -                 && commutative)
> -               {
> -                 swapped = true;
> -                 goto again;
> -               }
> -
> -             if (dump_enabled_p ())
> -               {
> -                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                                  "Build SLP failed: some of the stmts"
> -                                  " are in a pattern, and others are not ");
> -                 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, 
> oprnd);
> -                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
> -               }
> -
> -             return 1;
> -            }
> -
> -          dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt));
> -
> -          if (dt == vect_unknown_def_type)
> -            {
> -              if (dump_enabled_p ())
> -                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                                "Unsupported pattern.\n");
> -              return -1;
> -            }
> -
> -          switch (gimple_code (def_stmt))
> -            {
> -            case GIMPLE_PHI:
> -            case GIMPLE_ASSIGN:
> -             break;
> -
> -           default:
> -             if (dump_enabled_p ())
> -               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                                "unsupported defining stmt:\n");
> -             return -1;
> -            }
> -        }
> -
>        if (second)
>         oprnd_info->second_pattern = pattern;
>
> @@ -447,9 +393,7 @@ vect_get_and_check_slp_defs (vec_info *v
>               || !types_compatible_p (oprnd_info->first_op_type, type))
>             {
>               /* Try swapping operands if we got a mismatch.  */
> -             if (i == 0
> -                 && !swapped
> -                 && commutative)
> +             if (i == commutative_op && !swapped)
>                 {
>                   swapped = true;
>                   goto again;
> @@ -548,8 +492,11 @@ vect_get_and_check_slp_defs (vec_info *v
>             }
>         }
>        else
> -       swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
> -                          gimple_assign_rhs2_ptr (stmt));
> +       {
> +         unsigned int op = commutative_op + first_op_idx;
> +         swap_ssa_operands (stmt, gimple_op_ptr (stmt, op),
> +                            gimple_op_ptr (stmt, op + 1));
> +       }
>        if (dump_enabled_p ())
>         {
>           dump_printf_loc (MSG_NOTE, vect_location,
> Index: gcc/testsuite/gcc.dg/vect/bb-slp-over-widen-1.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/vect/bb-slp-over-widen-1.c     2018-07-04 
> 08:17:55.913442041 +0100
> +++ gcc/testsuite/gcc.dg/vect/bb-slp-over-widen-1.c     2018-07-20 
> 11:18:58.167047743 +0100
> @@ -63,4 +63,5 @@ main (void)
>
>  /* { dg-final { scan-tree-dump "demoting int to signed short" "slp2" { 
> target { ! vect_widen_shift } } } } */
>  /* { dg-final { scan-tree-dump "demoting int to unsigned short" "slp2" { 
> target { ! vect_widen_shift } } } } */
> +/* { dg-final { scan-tree-dump {\.AVG_FLOOR} "slp2" { target vect_avg_qi } } 
> } */
>  /* { dg-final { scan-tree-dump-times "basic block vectorized" 2 "slp2" } } */
> Index: gcc/testsuite/gcc.dg/vect/bb-slp-over-widen-2.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/vect/bb-slp-over-widen-2.c     2018-07-04 
> 08:17:55.913442041 +0100
> +++ gcc/testsuite/gcc.dg/vect/bb-slp-over-widen-2.c     2018-07-20 
> 11:18:58.167047743 +0100
> @@ -62,4 +62,5 @@ main (void)
>
>  /* { dg-final { scan-tree-dump "demoting int to signed short" "slp2" { 
> target { ! vect_widen_shift } } } } */
>  /* { dg-final { scan-tree-dump "demoting int to unsigned short" "slp2" { 
> target { ! vect_widen_shift } } } } */
> +/* { dg-final { scan-tree-dump {\.AVG_FLOOR} "slp2" { target vect_avg_qi } } 
> } */
>  /* { dg-final { scan-tree-dump-times "basic block vectorized" 2 "slp2" } } */
> Index: gcc/testsuite/gcc.dg/vect/bb-slp-pow-1.c
> ===================================================================
> --- /dev/null   2018-07-09 14:52:09.234750850 +0100
> +++ gcc/testsuite/gcc.dg/vect/bb-slp-pow-1.c    2018-07-20 11:18:58.167047743 
> +0100
> @@ -0,0 +1,28 @@
> +/* { dg-additional-options "-fno-math-errno -fdisable-tree-sincos" } */
> +/* { dg-require-effective-target vect_float } */
> +
> +void __attribute__ ((noipa))
> +f (float *a)
> +{
> +  a[0] = a[0] * a[0];
> +  a[1] = __builtin_powf (a[1], 2);
> +  a[2] = a[2] * a[2];
> +  a[3] = __builtin_powf (a[3], 2);
> +}
> +
> +float a[4] = { 1, 2, 3, 4 };
> +
> +int
> +main (void)
> +{
> +  f (a);
> +  for (int i = 0; i < 4; ++i)
> +    {
> +      if (a[i] != (i + 1) * (i + 1))
> +       __builtin_abort ();
> +      asm volatile ("" ::: "memory");
> +    }
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" } } */
> Index: gcc/testsuite/gcc.dg/vect/vect-avg-15.c
> ===================================================================
> --- /dev/null   2018-07-09 14:52:09.234750850 +0100
> +++ gcc/testsuite/gcc.dg/vect/vect-avg-15.c     2018-07-20 11:18:58.167047743 
> +0100
> @@ -0,0 +1,52 @@
> +/* { dg-additional-options "-O3" } */
> +/* { dg-require-effective-target vect_int } */
> +
> +#include "tree-vect.h"
> +
> +#define N 80
> +
> +void __attribute__ ((noipa))
> +f (signed char *restrict a, signed char *restrict b,
> +   signed char *restrict c, int n, int step)
> +{
> +  for (int j = 0; j < n; ++j)
> +    {
> +      for (int i = 0; i < 16; ++i)
> +       a[i] = (b[i] + c[i]) >> 1;
> +      a += step;
> +      b += step;
> +      c += step;
> +    }
> +}
> +
> +#define BASE1 -126
> +#define BASE2 -42
> +
> +signed char a[N], b[N], c[N];
> +
> +int
> +main (void)
> +{
> +  check_vect ();
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = i;
> +      b[i] = BASE1 + i * 3;
> +      c[i] = BASE2 + i * 2;
> +      asm volatile ("" ::: "memory");
> +    }
> +  f (a, b, c, N / 20, 20);
> +  for (int i = 0; i < N; ++i)
> +    {
> +      int d = (BASE1 + BASE2 + i * 5) >> 1;
> +      if (a[i] != (i % 20 < 16 ? d : i))
> +       __builtin_abort ();
> +    }
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump "vect_recog_average_pattern: detected" "vect" 
> } } */
> +/* { dg-final { scan-tree-dump {\.AVG_FLOOR} "vect" { target vect_avg_qi } } 
> } */
> +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" "vect" { 
> target vect_avg_qi } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target 
> vect_avg_qi } } } */

Reply via email to