Jakub Jelinek <ja...@redhat.com> wrote on 16/04/2011 09:16:23 AM: > > Hi! > > As the attached testcase shows, while the current detection of what > shifts are by scalar and what shifts are by vector shift count > may work well for loop vectorizer (vect_internal_def being vector > shift, vect_external_def or vect_constant_def scalar shift), > it is incorrect for SLP, where vect_external_def and vect_constant_def > may be either scalar or vector shift (vect_external_def is e.g. > if def_stmt is in a different bb and either it can be the same > SSA_NAMEs in all shifts, or different, for vect_constant_def it > can be either the same constant in all cases, or different) > and in theory vect_internal_def could be also scalar shift (tried > to test that in fn3's first bb, though currently SLP doesn't attempt > to vectorize that, missed-optimization). > > The following patch fixes that. Bootstrapped/regtested on x86_64-linux > and i686-linux and additionally tested on the testcase with additional > -mxop. Ok for trunk/4.6? > > For better test coverage, perhaps the testcase should be duplicated > after effective target xop and xop_runtime are added and additionally > perhaps the testcase in gcc.dg/vect with -mxop testing what is SLP > vectorized from the dumps would be nice. > > 2011-04-16 Jakub Jelinek <ja...@redhat.com> > > PR tree-optimization/48616 > * tree-vect-stmts.c (vectorizable_shift): If SLP, determine > whether the shift is by scalar or vector based on whether all SLP > scalar stmts have the same rhs. > > * gcc.dg/pr48616.c: New test. > > --- gcc/tree-vect-stmts.c.jj 2011-03-14 14:12:15.000000000 +0100 > +++ gcc/tree-vect-stmts.c 2011-04-15 20:44:30.000000000 +0200 > @@ -2077,7 +2077,7 @@ vectorizable_shift (gimple stmt, gimple_ > VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL; > tree vop0, vop1; > unsigned int k; > - bool scalar_shift_arg = false; > + bool scalar_shift_arg = true; > bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); > int vf; > > @@ -2159,8 +2159,34 @@ vectorizable_shift (gimple stmt, gimple_ > /* Determine whether the shift amount is a vector, or scalar. If the > shift/rotate amount is a vector, use the vector/vector shift optabs. */ > > + if (dt[1] == vect_internal_def && !slp_node) > + scalar_shift_arg = false; > + else if (dt[1] == vect_constant_def > + || dt[1] == vect_external_def > + || dt[1] == vect_internal_def) > + { > + /* In SLP, need to check whether the shift count is the same, > + in loops if it is a constant or invariant, it is always > + a scalar shift. */ > + if (slp_node) > + { > + VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node); > + gimple slpstmt; > + > + FOR_EACH_VEC_ELT (gimple, stmts, k, slpstmt) > + if (!operand_equal_p (gimple_assign_rhs2 (slpstmt), op1, 0)) > + scalar_shift_arg = false;
We already have this check in vect_build_slp_tree(). It didn't work for the testcase because it doesn't take into account the type of definition. I agree that it's better to move it here and base the vector/scalar decision on it, but please remove the now redundant check from vect_build_slp_tree (). Otherwise, OK for trunk. Thanks, Ira > + } > + } > + else > + { > + if (vect_print_dump_info (REPORT_DETAILS)) > + fprintf (vect_dump, "operand mode requires invariant argument."); > + return false; > + } > + > /* Vector shifted by vector. */ > - if (dt[1] == vect_internal_def) > + if (!scalar_shift_arg) > { > optab = optab_for_tree_code (code, vectype, optab_vector); > if (vect_print_dump_info (REPORT_DETAILS)) > @@ -2168,13 +2194,12 @@ vectorizable_shift (gimple stmt, gimple_ > } > /* See if the machine has a vector shifted by scalar insn and if not > then see if it has a vector shifted by vector insn. */ > - else if (dt[1] == vect_constant_def || dt[1] == vect_external_def) > + else > { > optab = optab_for_tree_code (code, vectype, optab_scalar); > if (optab > && optab_handler (optab, TYPE_MODE (vectype)) != CODE_FOR_nothing) > { > - scalar_shift_arg = true; > if (vect_print_dump_info (REPORT_DETAILS)) > fprintf (vect_dump, "vector/scalar shift/rotate found."); > } > @@ -2185,6 +2210,8 @@ vectorizable_shift (gimple stmt, gimple_ > && (optab_handler (optab, TYPE_MODE (vectype)) > != CODE_FOR_nothing)) > { > + scalar_shift_arg = false; > + > if (vect_print_dump_info (REPORT_DETAILS)) > fprintf (vect_dump, "vector/vector shift/rotate found."); > > @@ -2197,12 +2224,6 @@ vectorizable_shift (gimple stmt, gimple_ > } > } > } > - else > - { > - if (vect_print_dump_info (REPORT_DETAILS)) > - fprintf (vect_dump, "operand mode requires invariant argument."); > - return false; > - } > > /* Supportable by target? */ > if (!optab) > --- gcc/testsuite/gcc.dg/pr48616.c.jj 2011-04-15 21:02:27.000000000 +0200 > +++ gcc/testsuite/gcc.dg/pr48616.c 2011-04-15 21:00:24.000000000 +0200 > @@ -0,0 +1,134 @@ > +/* PR tree-optimization/48616 */ > +/* { dg-do run } */ > +/* { dg-options "-O2 -ftree-vectorize" } */ > + > +extern void abort (void); > +int a[4] __attribute__((aligned (32))); > +int b[4] __attribute__((aligned (32))); > +int c[4] __attribute__((aligned (32))); > +int d[4] __attribute__((aligned (32))); > +int e[4] __attribute__((aligned (32))); > + > +__attribute__((noinline, noclone)) > +int > +foo (int x) > +{ > + asm ("" : "+r" (x)); > + return x; > +} > + > +__attribute__((noinline, noclone)) > +void > +fn1 (int i) > +{ > + a[0] = b[0] << c[0]; > + a[1] = b[1] << c[1]; > + a[2] = b[2] << c[2]; > + a[3] = b[3] << c[3]; > + if (i) > + { > + d[0] = e[0] >> c[0]; > + d[1] = e[1] >> c[1]; > + d[2] = e[2] >> c[2]; > + d[3] = e[3] >> c[3]; > + } > +} > + > +__attribute__((noinline, noclone)) > +void > +fn2 (int i) > +{ > + a[0] = b[0] << 1; > + a[1] = b[1] << 2; > + a[2] = b[2] << 3; > + a[3] = b[3] << 4; > + if (i) > + { > + d[0] = e[0] >> 1; > + d[1] = e[1] >> 2; > + d[2] = e[2] >> 3; > + d[3] = e[3] >> 4; > + } > +} > + > +__attribute__((noinline, noclone)) > +void > +fn3 (int i, int j) > +{ > + int x = foo (j); > + a[0] = b[0] << x; > + a[1] = b[1] << x; > + a[2] = b[2] << x; > + a[3] = b[3] << x; > + if (i) > + { > + d[0] = e[0] >> x; > + d[1] = e[1] >> x; > + d[2] = e[2] >> x; > + d[3] = e[3] >> x; > + } > +} > + > +__attribute__((noinline, noclone)) > +void > +fn4 (int i) > +{ > + a[0] = b[0] << 1; > + a[1] = b[1] << 1; > + a[2] = b[2] << 1; > + a[3] = b[3] << 1; > + if (i) > + { > + d[0] = e[0] >> 1; > + d[1] = e[1] >> 1; > + d[2] = e[2] >> 1; > + d[3] = e[3] >> 1; > + } > +} > + > +int > +main () > +{ > + int i; > + int *t; > + for (i = 0; i < 4; i++) > + { > + b[i] = 32; > + c[i] = i + 1; > + e[i] = 32; > + } > + asm volatile ("" : : "r" (b) : "memory"); > + asm volatile ("" : : "r" (c) : "memory"); > + asm volatile ("" : "=r" (t) : "0" (d) : "memory"); > + fn1 (t != 0); > + for (i = 0; i < 4; i++) > + { > + if (a[i] != (32 << (i + 1)) || d[i] != (32 >> (i + 1))) > + abort (); > + a[i] = 0; > + d[i] = 0; > + } > + fn2 (t != 0); > + for (i = 0; i < 4; i++) > + { > + if (a[i] != (32 << (i + 1)) || d[i] != (32 >> (i + 1))) > + abort (); > + a[i] = 0; > + d[i] = 0; > + } > + fn3 (t != 0, t != 0); > + for (i = 0; i < 4; i++) > + { > + if (a[i] != (32 << 1) || d[i] != (32 >> 1)) > + abort (); > + a[i] = 0; > + d[i] = 0; > + } > + fn4 (t != 0); > + for (i = 0; i < 4; i++) > + { > + if (a[i] != (32 << 1) || d[i] != (32 >> 1)) > + abort (); > + } > + return 0; > +} > > Jakub