On Fri, Jul 20, 2018 at 12:22 PM Richard Sandiford <richard.sandif...@arm.com> wrote: > > We couldn't vectorise: > > for (int j = 0; j < n; ++j) > { > for (int i = 0; i < 16; ++i) > a[i] = (b[i] + c[i]) >> 1; > a += step; > b += step; > c += step; > } > > at -O3 because cunrolli unrolled the inner loop and SLP couldn't handle > AVG_FLOOR patterns (see also PR86504). The problem was some overly > strict checking of pattern statements compared to normal statements > in vect_get_and_check_slp_defs: > > switch (gimple_code (def_stmt)) > { > case GIMPLE_PHI: > case GIMPLE_ASSIGN: > break; > > default: > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > "unsupported defining stmt:\n"); > return -1; > } > > The easy fix would have been to add GIMPLE_CALL to the switch, > but I don't think the switch is doing anything useful. We only create > pattern statements that the rest of the vectoriser can handle, and code > in this function and elsewhere check whether SLP is possible. > > I'm also not sure why: > > if (!first && !oprnd_info->first_pattern > /* Allow different pattern state for the defs of the > first stmt in reduction chains. */ > && (oprnd_info->first_dt != vect_reduction_def > > is necessary. All that should matter is that the statements in the > node are "similar enough". It turned out to be quite hard to find a > convincing example that used a mixture of pattern and non-pattern > statements, so bb-slp-pow-1.c is the best I could come up with. > But it does show that the combination of "xi * xi" statements and > "pow (xj, 2) -> xj * xj" patterns are handled correctly. > > The patch therefore just removes the whole if block. > > The loop also needed commutative swapping to be extended to at least > AVG_FLOOR. > > This gives +3.9% on 525.x264_r at -O3. > > Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf > and x86_64-linux-gnu. OK to install?
Always nice to see seemingly dead code go. OK if you can still build SPEC with this change and pass a test run. At least I _do_ seem to remember having "issues" in this area... Thanks, Richard. > Richard > > > 2018-07-20 Richard Sandiford <richard.sandif...@arm.com> > > gcc/ > * internal-fn.h (first_commutative_argument): Declare. > * internal-fn.c (first_commutative_argument): New function. > * tree-vect-slp.c (vect_get_and_check_slp_defs): Remove extra > restrictions for pattern statements. Use first_commutative_argument > to look for commutative operands in calls to internal functions. > > gcc/testsuite/ > * gcc.dg/vect/bb-slp-over-widen-1.c: Expect AVG_FLOOR to be used > on vect_avg_qi targets. > * gcc.dg/vect/bb-slp-over-widen-2.c: Likewise. > * gcc.dg/vect/bb-slp-pow-1.c: New test. > * gcc.dg/vect/vect-avg-15.c: Likewise. > > Index: gcc/internal-fn.h > =================================================================== > --- gcc/internal-fn.h 2018-07-13 10:11:14.009847140 +0100 > +++ gcc/internal-fn.h 2018-07-20 11:18:58.167047743 +0100 > @@ -201,6 +201,8 @@ direct_internal_fn_supported_p (internal > opt_type); > } > > +extern int first_commutative_argument (internal_fn); > + > extern bool set_edom_supported_p (void); > > extern internal_fn get_conditional_internal_fn (tree_code); > Index: gcc/internal-fn.c > =================================================================== > --- gcc/internal-fn.c 2018-07-13 10:11:14.009847140 +0100 > +++ gcc/internal-fn.c 2018-07-20 11:18:58.163047778 +0100 > @@ -3183,6 +3183,42 @@ direct_internal_fn_supported_p (internal > return direct_internal_fn_supported_p (fn, tree_pair (type, type), > opt_type); > } > > +/* If FN is commutative in two consecutive arguments, return the > + index of the first, otherwise return -1. */ > + > +int > +first_commutative_argument (internal_fn fn) > +{ > + switch (fn) > + { > + case IFN_FMA: > + case IFN_FMS: > + case IFN_FNMA: > + case IFN_FNMS: > + case IFN_AVG_FLOOR: > + case IFN_AVG_CEIL: > + case IFN_FMIN: > + case IFN_FMAX: > + return 0; > + > + case IFN_COND_ADD: > + case IFN_COND_MUL: > + case IFN_COND_MIN: > + case IFN_COND_MAX: > + case IFN_COND_AND: > + case IFN_COND_IOR: > + case IFN_COND_XOR: > + case IFN_COND_FMA: > + case IFN_COND_FMS: > + case IFN_COND_FNMA: > + case IFN_COND_FNMS: > + return 1; > + > + default: > + return -1; > + } > +} > + > /* Return true if IFN_SET_EDOM is supported. */ > > bool > Index: gcc/tree-vect-slp.c > =================================================================== > --- gcc/tree-vect-slp.c 2018-07-13 10:11:15.113837768 +0100 > +++ gcc/tree-vect-slp.c 2018-07-20 11:18:58.167047743 +0100 > @@ -299,15 +299,20 @@ vect_get_and_check_slp_defs (vec_info *v > bool pattern = false; > slp_oprnd_info oprnd_info; > int first_op_idx = 1; > - bool commutative = false; > + unsigned int commutative_op = -1U; > bool first_op_cond = false; > bool first = stmt_num == 0; > bool second = stmt_num == 1; > > - if (is_gimple_call (stmt)) > + if (gcall *call = dyn_cast <gcall *> (stmt)) > { > - number_of_oprnds = gimple_call_num_args (stmt); > + number_of_oprnds = gimple_call_num_args (call); > first_op_idx = 3; > + if (gimple_call_internal_p (call)) > + { > + internal_fn ifn = gimple_call_internal_fn (call); > + commutative_op = first_commutative_argument (ifn); > + } > } > else if (is_gimple_assign (stmt)) > { > @@ -322,7 +327,7 @@ vect_get_and_check_slp_defs (vec_info *v > number_of_oprnds++; > } > else > - commutative = commutative_tree_code (code); > + commutative_op = commutative_tree_code (code) ? 0U : -1U; > } > else > return -1; > @@ -361,65 +366,6 @@ vect_get_and_check_slp_defs (vec_info *v > return -1; > } > > - /* Check if DEF_STMT is a part of a pattern in LOOP and get the def > stmt > - from the pattern. Check that all the stmts of the node are in the > - pattern. */ > - if (def_stmt && gimple_bb (def_stmt) > - && vect_stmt_in_region_p (vinfo, def_stmt) > - && vinfo_for_stmt (def_stmt) > - && is_pattern_stmt_p (vinfo_for_stmt (def_stmt))) > - { > - pattern = true; > - if (!first && !oprnd_info->first_pattern > - /* Allow different pattern state for the defs of the > - first stmt in reduction chains. */ > - && (oprnd_info->first_dt != vect_reduction_def > - || (!second && !oprnd_info->second_pattern))) > - { > - if (i == 0 > - && !swapped > - && commutative) > - { > - swapped = true; > - goto again; > - } > - > - if (dump_enabled_p ()) > - { > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "Build SLP failed: some of the stmts" > - " are in a pattern, and others are not "); > - dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, > oprnd); > - dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); > - } > - > - return 1; > - } > - > - dt = STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)); > - > - if (dt == vect_unknown_def_type) > - { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "Unsupported pattern.\n"); > - return -1; > - } > - > - switch (gimple_code (def_stmt)) > - { > - case GIMPLE_PHI: > - case GIMPLE_ASSIGN: > - break; > - > - default: > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "unsupported defining stmt:\n"); > - return -1; > - } > - } > - > if (second) > oprnd_info->second_pattern = pattern; > > @@ -447,9 +393,7 @@ vect_get_and_check_slp_defs (vec_info *v > || !types_compatible_p (oprnd_info->first_op_type, type)) > { > /* Try swapping operands if we got a mismatch. */ > - if (i == 0 > - && !swapped > - && commutative) > + if (i == commutative_op && !swapped) > { > swapped = true; > goto again; > @@ -548,8 +492,11 @@ vect_get_and_check_slp_defs (vec_info *v > } > } > else > - swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt), > - gimple_assign_rhs2_ptr (stmt)); > + { > + unsigned int op = commutative_op + first_op_idx; > + swap_ssa_operands (stmt, gimple_op_ptr (stmt, op), > + gimple_op_ptr (stmt, op + 1)); > + } > if (dump_enabled_p ()) > { > dump_printf_loc (MSG_NOTE, vect_location, > Index: gcc/testsuite/gcc.dg/vect/bb-slp-over-widen-1.c > =================================================================== > --- gcc/testsuite/gcc.dg/vect/bb-slp-over-widen-1.c 2018-07-04 > 08:17:55.913442041 +0100 > +++ gcc/testsuite/gcc.dg/vect/bb-slp-over-widen-1.c 2018-07-20 > 11:18:58.167047743 +0100 > @@ -63,4 +63,5 @@ main (void) > > /* { dg-final { scan-tree-dump "demoting int to signed short" "slp2" { > target { ! vect_widen_shift } } } } */ > /* { dg-final { scan-tree-dump "demoting int to unsigned short" "slp2" { > target { ! vect_widen_shift } } } } */ > +/* { dg-final { scan-tree-dump {\.AVG_FLOOR} "slp2" { target vect_avg_qi } } > } */ > /* { dg-final { scan-tree-dump-times "basic block vectorized" 2 "slp2" } } */ > Index: gcc/testsuite/gcc.dg/vect/bb-slp-over-widen-2.c > =================================================================== > --- gcc/testsuite/gcc.dg/vect/bb-slp-over-widen-2.c 2018-07-04 > 08:17:55.913442041 +0100 > +++ gcc/testsuite/gcc.dg/vect/bb-slp-over-widen-2.c 2018-07-20 > 11:18:58.167047743 +0100 > @@ -62,4 +62,5 @@ main (void) > > /* { dg-final { scan-tree-dump "demoting int to signed short" "slp2" { > target { ! vect_widen_shift } } } } */ > /* { dg-final { scan-tree-dump "demoting int to unsigned short" "slp2" { > target { ! vect_widen_shift } } } } */ > +/* { dg-final { scan-tree-dump {\.AVG_FLOOR} "slp2" { target vect_avg_qi } } > } */ > /* { dg-final { scan-tree-dump-times "basic block vectorized" 2 "slp2" } } */ > Index: gcc/testsuite/gcc.dg/vect/bb-slp-pow-1.c > =================================================================== > --- /dev/null 2018-07-09 14:52:09.234750850 +0100 > +++ gcc/testsuite/gcc.dg/vect/bb-slp-pow-1.c 2018-07-20 11:18:58.167047743 > +0100 > @@ -0,0 +1,28 @@ > +/* { dg-additional-options "-fno-math-errno -fdisable-tree-sincos" } */ > +/* { dg-require-effective-target vect_float } */ > + > +void __attribute__ ((noipa)) > +f (float *a) > +{ > + a[0] = a[0] * a[0]; > + a[1] = __builtin_powf (a[1], 2); > + a[2] = a[2] * a[2]; > + a[3] = __builtin_powf (a[3], 2); > +} > + > +float a[4] = { 1, 2, 3, 4 }; > + > +int > +main (void) > +{ > + f (a); > + for (int i = 0; i < 4; ++i) > + { > + if (a[i] != (i + 1) * (i + 1)) > + __builtin_abort (); > + asm volatile ("" ::: "memory"); > + } > + return 0; > +} > + > +/* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" } } */ > Index: gcc/testsuite/gcc.dg/vect/vect-avg-15.c > =================================================================== > --- /dev/null 2018-07-09 14:52:09.234750850 +0100 > +++ gcc/testsuite/gcc.dg/vect/vect-avg-15.c 2018-07-20 11:18:58.167047743 > +0100 > @@ -0,0 +1,52 @@ > +/* { dg-additional-options "-O3" } */ > +/* { dg-require-effective-target vect_int } */ > + > +#include "tree-vect.h" > + > +#define N 80 > + > +void __attribute__ ((noipa)) > +f (signed char *restrict a, signed char *restrict b, > + signed char *restrict c, int n, int step) > +{ > + for (int j = 0; j < n; ++j) > + { > + for (int i = 0; i < 16; ++i) > + a[i] = (b[i] + c[i]) >> 1; > + a += step; > + b += step; > + c += step; > + } > +} > + > +#define BASE1 -126 > +#define BASE2 -42 > + > +signed char a[N], b[N], c[N]; > + > +int > +main (void) > +{ > + check_vect (); > + > + for (int i = 0; i < N; ++i) > + { > + a[i] = i; > + b[i] = BASE1 + i * 3; > + c[i] = BASE2 + i * 2; > + asm volatile ("" ::: "memory"); > + } > + f (a, b, c, N / 20, 20); > + for (int i = 0; i < N; ++i) > + { > + int d = (BASE1 + BASE2 + i * 5) >> 1; > + if (a[i] != (i % 20 < 16 ? d : i)) > + __builtin_abort (); > + } > + return 0; > +} > + > +/* { dg-final { scan-tree-dump "vect_recog_average_pattern: detected" "vect" > } } */ > +/* { dg-final { scan-tree-dump {\.AVG_FLOOR} "vect" { target vect_avg_qi } } > } */ > +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" "vect" { > target vect_avg_qi } } } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" { target > vect_avg_qi } } } */