Jakub Jelinek <ja...@redhat.com> wrote on 16/04/2011 09:16:23 AM:
>
> Hi!
>
> As the attached testcase shows, the current detection of which
> shifts are by a scalar and which are by a vector shift count
> may work well for the loop vectorizer (vect_internal_def being a
> vector shift, vect_external_def or vect_constant_def a scalar
> shift), but it is incorrect for SLP, where vect_external_def and
> vect_constant_def may be either a scalar or a vector shift:
> a vect_external_def (e.g. when def_stmt is in a different bb) can
> be the same SSA_NAME in all the shifts or a different one, and a
> vect_constant_def can be the same constant in all cases or
> different ones.  In theory vect_internal_def could also be a
> scalar shift (I tried to test that in fn3's first bb, though
> currently SLP doesn't attempt to vectorize that; a missed
> optimization).
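
To illustrate the SLP cases in plain C (a minimal sketch, not part of
the patch; the function names are made up): with one count shared by
all lanes the shift amount is scalar, with per-lane counts it has to
be a vector:

  /* Scalar shift count: every lane shifts by the same SSA_NAME.  */
  void uniform (int *a, int *b, int s)
  {
    a[0] = b[0] << s;
    a[1] = b[1] << s;
  }

  /* Vector shift count: the lanes shift by different amounts.  */
  void per_lane (int *a, int *b, int s, int t)
  {
    a[0] = b[0] << s;
    a[1] = b[1] << t;
  }

From the SLP group's point of view s and t are vect_external_def in
both functions, yet only the first group can use a vector/scalar
shift.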
>
> The following patch fixes that.  Bootstrapped/regtested on x86_64-linux
> and i686-linux, and additionally tested on the testcase with -mxop.
> Ok for trunk/4.6?
>
> For better test coverage, perhaps the testcase should be duplicated
> once the xop and xop_runtime effective targets are added, and a
> testcase in gcc.dg/vect with -mxop that checks in the dumps what
> gets SLP vectorized would also be nice.
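
A possible shape for the duplicated test, assuming the xop_runtime
effective target gets added (the directives below are only a sketch,
not part of the patch):

  /* { dg-do run { target xop_runtime } } */
  /* { dg-options "-O2 -ftree-vectorize -mxop" } */
  /* ... same body as gcc.dg/pr48616.c ... */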
>
> 2011-04-16  Jakub Jelinek  <ja...@redhat.com>
>
>    PR tree-optimization/48616
>    * tree-vect-stmts.c (vectorizable_shift): If SLP, determine
>    whether the shift is by scalar or vector based on whether all SLP
>    scalar stmts have the same rhs.
>
>    * gcc.dg/pr48616.c: New test.
>
> --- gcc/tree-vect-stmts.c.jj   2011-03-14 14:12:15.000000000 +0100
> +++ gcc/tree-vect-stmts.c   2011-04-15 20:44:30.000000000 +0200
> @@ -2077,7 +2077,7 @@ vectorizable_shift (gimple stmt, gimple_
>    VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL;
>    tree vop0, vop1;
>    unsigned int k;
> -  bool scalar_shift_arg = false;
> +  bool scalar_shift_arg = true;
>    bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
>    int vf;
>
> @@ -2159,8 +2159,34 @@ vectorizable_shift (gimple stmt, gimple_
>    /* Determine whether the shift amount is a vector, or scalar.  If the
>       shift/rotate amount is a vector, use the vector/vector shift optabs.  */
>
> +  if (dt[1] == vect_internal_def && !slp_node)
> +    scalar_shift_arg = false;
> +  else if (dt[1] == vect_constant_def
> +           || dt[1] == vect_external_def
> +           || dt[1] == vect_internal_def)
> +    {
> +      /* In SLP, we need to check whether the shift count is the
> +         same in all the statements; in loops, if it is a constant
> +         or invariant, it is always a scalar shift.  */
> +      if (slp_node)
> +        {
> +          VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
> +          gimple slpstmt;
> +
> +          FOR_EACH_VEC_ELT (gimple, stmts, k, slpstmt)
> +            if (!operand_equal_p (gimple_assign_rhs2 (slpstmt), op1, 0))
> +              scalar_shift_arg = false;

We already have this check in vect_build_slp_tree().  It didn't work
for the testcase because it doesn't take the type of definition into
account.  I agree that it's better to move it here and base the
vector/scalar decision on it, but please remove the now redundant
check from vect_build_slp_tree().
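
In essence the patch's rule is: treat an SLP shift as vector/scalar
only when every lane shifts by the same amount.  As a standalone C
model (not the GCC sources; the function name is made up):

  /* Return nonzero iff all lanes of an SLP group share one shift
     count, i.e. the vector/scalar shift optab may be used.  */
  static int
  uniform_shift_count_p (const int *counts, int lanes)
  {
    int i;
    for (i = 1; i < lanes; i++)
      if (counts[i] != counts[0])
        return 0;  /* differing counts need a vector/vector shift */
    return 1;
  }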

Otherwise, OK for trunk.

Thanks,
Ira

> +        }
> +    }
> +  else
> +    {
> +      if (vect_print_dump_info (REPORT_DETAILS))
> +        fprintf (vect_dump, "operand mode requires invariant argument.");
> +      return false;
> +    }
> +
>    /* Vector shifted by vector.  */
> -  if (dt[1] == vect_internal_def)
> +  if (!scalar_shift_arg)
>      {
>        optab = optab_for_tree_code (code, vectype, optab_vector);
>        if (vect_print_dump_info (REPORT_DETAILS))
> @@ -2168,13 +2194,12 @@ vectorizable_shift (gimple stmt, gimple_
>      }
>    /* See if the machine has a vector shifted by scalar insn and if not
>       then see if it has a vector shifted by vector insn.  */
> -  else if (dt[1] == vect_constant_def || dt[1] == vect_external_def)
> +  else
>      {
>        optab = optab_for_tree_code (code, vectype, optab_scalar);
>        if (optab
>            && optab_handler (optab, TYPE_MODE (vectype)) != CODE_FOR_nothing)
>          {
> -          scalar_shift_arg = true;
>            if (vect_print_dump_info (REPORT_DETAILS))
>              fprintf (vect_dump, "vector/scalar shift/rotate found.");
>          }
> @@ -2185,6 +2210,8 @@ vectorizable_shift (gimple stmt, gimple_
>                 && (optab_handler (optab, TYPE_MODE (vectype))
>                        != CODE_FOR_nothing))
>              {
> +              scalar_shift_arg = false;
> +
>                if (vect_print_dump_info (REPORT_DETAILS))
>                fprintf (vect_dump, "vector/vector shift/rotate found.");
>
> @@ -2197,12 +2224,6 @@ vectorizable_shift (gimple stmt, gimple_
>              }
>          }
>      }
> -  else
> -    {
> -      if (vect_print_dump_info (REPORT_DETAILS))
> -        fprintf (vect_dump, "operand mode requires invariant argument.");
> -      return false;
> -    }
>
>    /* Supportable by target?  */
>    if (!optab)
> --- gcc/testsuite/gcc.dg/pr48616.c.jj   2011-04-15 21:02:27.000000000 +0200
> +++ gcc/testsuite/gcc.dg/pr48616.c   2011-04-15 21:00:24.000000000 +0200
> @@ -0,0 +1,134 @@
> +/* PR tree-optimization/48616 */
> +/* { dg-do run } */
> +/* { dg-options "-O2 -ftree-vectorize" } */
> +
> +extern void abort (void);
> +int a[4] __attribute__((aligned (32)));
> +int b[4] __attribute__((aligned (32)));
> +int c[4] __attribute__((aligned (32)));
> +int d[4] __attribute__((aligned (32)));
> +int e[4] __attribute__((aligned (32)));
> +
> +__attribute__((noinline, noclone))
> +int
> +foo (int x)
> +{
> +  asm ("" : "+r" (x));
> +  return x;
> +}
> +
> +__attribute__((noinline, noclone))
> +void
> +fn1 (int i)
> +{
> +  a[0] = b[0] << c[0];
> +  a[1] = b[1] << c[1];
> +  a[2] = b[2] << c[2];
> +  a[3] = b[3] << c[3];
> +  if (i)
> +    {
> +      d[0] = e[0] >> c[0];
> +      d[1] = e[1] >> c[1];
> +      d[2] = e[2] >> c[2];
> +      d[3] = e[3] >> c[3];
> +    }
> +}
> +
> +__attribute__((noinline, noclone))
> +void
> +fn2 (int i)
> +{
> +  a[0] = b[0] << 1;
> +  a[1] = b[1] << 2;
> +  a[2] = b[2] << 3;
> +  a[3] = b[3] << 4;
> +  if (i)
> +    {
> +      d[0] = e[0] >> 1;
> +      d[1] = e[1] >> 2;
> +      d[2] = e[2] >> 3;
> +      d[3] = e[3] >> 4;
> +    }
> +}
> +
> +__attribute__((noinline, noclone))
> +void
> +fn3 (int i, int j)
> +{
> +  int x = foo (j);
> +  a[0] = b[0] << x;
> +  a[1] = b[1] << x;
> +  a[2] = b[2] << x;
> +  a[3] = b[3] << x;
> +  if (i)
> +    {
> +      d[0] = e[0] >> x;
> +      d[1] = e[1] >> x;
> +      d[2] = e[2] >> x;
> +      d[3] = e[3] >> x;
> +    }
> +}
> +
> +__attribute__((noinline, noclone))
> +void
> +fn4 (int i)
> +{
> +  a[0] = b[0] << 1;
> +  a[1] = b[1] << 1;
> +  a[2] = b[2] << 1;
> +  a[3] = b[3] << 1;
> +  if (i)
> +    {
> +      d[0] = e[0] >> 1;
> +      d[1] = e[1] >> 1;
> +      d[2] = e[2] >> 1;
> +      d[3] = e[3] >> 1;
> +    }
> +}
> +
> +int
> +main ()
> +{
> +  int i;
> +  int *t;
> +  for (i = 0; i < 4; i++)
> +    {
> +      b[i] = 32;
> +      c[i] = i + 1;
> +      e[i] = 32;
> +    }
> +  asm volatile ("" : : "r" (b) : "memory");
> +  asm volatile ("" : : "r" (c) : "memory");
> +  asm volatile ("" : "=r" (t) : "0" (d) : "memory");
> +  fn1 (t != 0);
> +  for (i = 0; i < 4; i++)
> +    {
> +      if (a[i] != (32 << (i + 1)) || d[i] != (32 >> (i + 1)))
> +        abort ();
> +      a[i] = 0;
> +      d[i] = 0;
> +    }
> +  fn2 (t != 0);
> +  for (i = 0; i < 4; i++)
> +    {
> +      if (a[i] != (32 << (i + 1)) || d[i] != (32 >> (i + 1)))
> +        abort ();
> +      a[i] = 0;
> +      d[i] = 0;
> +    }
> +  fn3 (t != 0, t != 0);
> +  for (i = 0; i < 4; i++)
> +    {
> +      if (a[i] != (32 << 1) || d[i] != (32 >> 1))
> +        abort ();
> +      a[i] = 0;
> +      d[i] = 0;
> +    }
> +  fn4 (t != 0);
> +  for (i = 0; i < 4; i++)
> +    {
> +      if (a[i] != (32 << 1) || d[i] != (32 >> 1))
> +        abort ();
> +    }
> +  return 0;
> +}
>
>    Jakub
