On Wed, Jan 31, 2018 at 4:06 PM, Richard Sandiford
<richard.sandif...@linaro.org> wrote:
> This patch implements the original suggestion for fixing PR 81635:
> use range info in split_constant_offset to see whether a conversion
> of a wrapping type can be split.  The range info problem described in:
>
>   https://gcc.gnu.org/ml/gcc-patches/2017-08/msg01002.html
>
> seems to have been fixed.
>
> The patch is part 1.  There needs to be a follow-on patch to handle:
>
>   for (unsigned int i = 0; i < n; i += 4)
>     {
>       ...[i + 2]...
>       ...[i + 3]...
>
> which the old SCEV test handles, but which the range check doesn't.
> At the moment we record that the low two bits of "i" are clear,
> but we still end up with a maximum range of 0xffffffff rather than
> 0xfffffffc.
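
For illustration, here is a standalone C sketch (not GCC code; the names are
made up) of why the nonzero-bits information would be enough: "i" is known to
be a multiple of 4, so its largest attainable value is 0xfffffffc rather than
0xffffffff, and from that tighter bound "i + 3" provably cannot wrap.

    #include <stdio.h>

    int
    main (void)
    {
      /* Largest unsigned int with the low two bits clear.  */
      unsigned int tight_max = 0xffffffffu & ~3u;   /* 0xfffffffc */
      /* Starting from this bound, i + 3 stays within unsigned int range.  */
      printf ("%#x %#x\n", tight_max, tight_max + 3);
      return 0;
    }
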
>
> Tested on aarch64-linux-gnu, x86_64-linux-gnu and powerpc64le-linux-gnu.
> Also tested by comparing the before and after testsuite assembly output
> for at least one target per CPU directory.  Excluding a small number
> of register renamings on some targets, there were two differences:
>
> (1) In gcc.c-torture/compile/pr55350.c:
>
>     void
>     foo (__INTPTR_TYPE__ x, __INTPTR_TYPE__ y)
>     {
>       int i;
>       void **a = (void *)  (8UL * (x / 8UL));
>       for (i = 0; i < x; i++)
>         a[i] = (void *) y;
>     }
>
>     we previously kept base "a" and offset 0, but now use
>     "(void **) _n" (where _n holds the multiplication result).
>
>     This is because the old test had the side-effect of prohibiting
>     casts from unsigned ints to pointers of the same size.
>     What we do for code like this isn't going to help much though.
>
> (2) In gcc.c-torture/execute/2003016-1.c, we unrolled:
>
>     void f (unsigned int *x)
>     {
>       unsigned char i;
>       int j;
>
>       i = 0x10;
>       for (j = 0; j < 0x10; j++)
>         {
>           i += 0xe8;
>           x[i] = 0;
>           i -= 0xe7;
>         }
>     }
>
>     and ended up with an unpropagated degenerate phi:
>
>       # i_38 = PHI <16(3)>
>       ...
>       i_40 = i_38 + 1;
>       ...
>       i_48 = i_40 + 1;
>       ... etc ...
>       i_178 = i_168 + 1;
>       i_16 = i_178 + 232;
>       _17 = (long unsigned int) i_16;
>       _18 = _17 * 4;
>       _19 = &x + _18;
>       *_19 = 0;
>
>     Calling split_constant_offset on each (long unsigned int) operand
>     gives i_38 + 0xe8, i_38 + 0xe9, ..., i_38 + 0xf7, with i_38
>     still having the range [0x10, 0x20].  We can therefore tell
>     that i_38 + 0xf0 has the range [0x00, 0x10], and similarly
>     for +0xf1...+0xf7.
>
>     We should really be folding to constants here though.
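
As a quick standalone check of the range arithmetic above (plain C, not GCC
code; only the constants come from the dump):

    #include <stdio.h>

    int
    main (void)
    {
      /* i_38 has the range [0x10, 0x20] as an unsigned char; adding 0xf0
         wraps at both ends of the range, giving [0x00, 0x10].  */
      unsigned char lo = 0x10, hi = 0x20;
      unsigned char new_lo = (unsigned char) (lo + 0xf0);
      unsigned char new_hi = (unsigned char) (hi + 0xf0);
      printf ("[%#x, %#x]\n", new_lo, new_hi);   /* prints [0, 0x10] */
      return 0;
    }
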
>
> OK to install?
>
>
> 2018-01-31  Richard Sandiford  <richard.sandif...@linaro.org>
>
> gcc/
>         PR tree-optimization/81635
>         * tree-data-ref.c (split_constant_offset_1): For types that
>         wrap on overflow, try to use range info to prove that wrapping
>         cannot occur.
>
> gcc/testsuite/
>         PR tree-optimization/81635
>         * gcc.dg/vect/bb-slp-pr81635-1.c: New test.
>         * gcc.dg/vect/bb-slp-pr81635-2.c: Likewise.
>
> Index: gcc/tree-data-ref.c
> ===================================================================
> --- gcc/tree-data-ref.c 2018-01-13 18:02:00.946360352 +0000
> +++ gcc/tree-data-ref.c 2018-01-31 13:26:13.488630604 +0000
> @@ -704,11 +704,46 @@ split_constant_offset_1 (tree type, tree
>            and the outer precision is at least as large as the inner.  */
>         tree itype = TREE_TYPE (op0);
>         if ((POINTER_TYPE_P (itype)
> -            || (INTEGRAL_TYPE_P (itype) && TYPE_OVERFLOW_UNDEFINED (itype)))
> +            || (INTEGRAL_TYPE_P (itype) && !TYPE_OVERFLOW_TRAPS (itype)))
>             && TYPE_PRECISION (type) >= TYPE_PRECISION (itype)
>             && (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type)))
>           {
> -           split_constant_offset (op0, &var0, off);
> +           if (INTEGRAL_TYPE_P (itype) && TYPE_OVERFLOW_WRAPS (itype))
> +             {
> +               /* Split the unconverted operand and try to prove that
> +                  wrapping isn't a problem.  */
> +               tree tmp_var, tmp_off;
> +               split_constant_offset (op0, &tmp_var, &tmp_off);
> +
> +               /* See whether we have an SSA_NAME whose range is known
> +                  to be [A, B].  */
> +               if (TREE_CODE (tmp_var) != SSA_NAME)
> +                 return false;
> +               wide_int var_min, var_max;
> +               if (get_range_info (tmp_var, &var_min, &var_max) != VR_RANGE)
> +                 return false;
> +
> +               /* See whether the range of OP0 (i.e. TMP_VAR + TMP_OFF)
> +                  is known to be [A + TMP_OFF, B + TMP_OFF], with all
> +                  operations done in ITYPE.  The addition must overflow
> +                  at both ends of the range or at neither.  */

Hmm, if they overflow at both ends, isn't that still an issue, given that we
effectively widen the addition?  Consider char -> short and [255U, 255U] + 1,
which when widened will be 256 but when wrapping in char is 0...
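
In concrete terms (a throwaway snippet, assuming 8-bit char and 16-bit short):

    #include <stdio.h>

    int
    main (void)
    {
      unsigned char c = 255;                              /* range [255U, 255U] */
      unsigned char wrapped = (unsigned char) (c + 1);    /* 0 when wrapping in char */
      unsigned short widened = (unsigned short) (c + 1);  /* 256 when widened first */
      printf ("%u %u\n", (unsigned) wrapped, (unsigned) widened);
      return 0;
    }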

> +               bool overflow[2];
> +               signop sgn = TYPE_SIGN (itype);
> +               unsigned int prec = TYPE_PRECISION (itype);
> +               wide_int woff = wi::to_wide (tmp_off, prec);
> +               wide_int op0_min = wi::add (var_min, woff, sgn, &overflow[0]);
> +               wi::add (var_max, woff, sgn, &overflow[1]);
> +               if (overflow[0] != overflow[1])
> +                 return false;
> +
> +               /* Calculate (ssizetype) OP0 - (ssizetype) TMP_VAR.  */
> +               widest_int diff = (widest_int::from (op0_min, sgn)
> +                                  - widest_int::from (var_min, sgn));
> +               var0 = tmp_var;
> +               *off = wide_int_to_tree (ssizetype, diff);

... or is this part trying to deal with that case?  Because I don't understand
it - isn't 'off' simply supposed to be 'tmp_off' literally?  I think this needs
more explanation...  if there's no overflow, that would be all there is to do, no?

Thanks,
Richard.

> +             }
> +           else
> +             split_constant_offset (op0, &var0, off);
>             *var = fold_convert (type, var0);
>             return true;
>           }
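
For reference, a standalone sketch (plain C on uint8_t, not the wide-int API
used in the hunk above) of the "overflow at both ends or at neither" test for
an unsigned 8-bit ITYPE:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Return true if adding OFF to the two ends of [VAR_MIN, VAR_MAX]
       wraps either for both ends or for neither of them.  */
    static bool
    wraps_consistently (uint8_t var_min, uint8_t var_max, uint8_t off)
    {
      bool wrap_min = (uint8_t) (var_min + off) < var_min;  /* unsigned wrap test */
      bool wrap_max = (uint8_t) (var_max + off) < var_max;
      return wrap_min == wrap_max;
    }

    int
    main (void)
    {
      /* The [0x10, 0x20] + 0xf0 case from (2) above: both ends wrap.  */
      printf ("%d\n", wraps_consistently (0x10, 0x20, 0xf0));  /* 1 */
      /* [0x10, 0x20] + 0xe8 wraps only at the upper end, so it would
         be rejected.  */
      printf ("%d\n", wraps_consistently (0x10, 0x20, 0xe8));  /* 0 */
      return 0;
    }
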
> Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-1.c
> ===================================================================
> --- /dev/null   2018-01-30 17:30:22.185477046 +0000
> +++ gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-1.c        2018-01-31 13:26:13.487630644 +0000
> @@ -0,0 +1,92 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-fno-tree-loop-vectorize" } */
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-require-effective-target lp64 } */
> +
> +void
> +f1 (double *p, double *q)
> +{
> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
> +  for (unsigned int i = 0; i < 1000; i += 4)
> +    {
> +      double a = q[i] + p[i];
> +      double b = q[i + 1] + p[i + 1];
> +      q[i] = a;
> +      q[i + 1] = b;
> +    }
> +}
> +
> +void
> +f2 (double *p, double *q)
> +{
> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
> +  for (unsigned int i = 2; i < ~0U - 4; i += 4)
> +    {
> +      double a = q[i] + p[i];
> +      double b = q[i + 1] + p[i + 1];
> +      q[i] = a;
> +      q[i + 1] = b;
> +    }
> +}
> +
> +void
> +f3 (double *p, double *q)
> +{
> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
> +  for (unsigned int i = 0; i < ~0U - 3; i += 4)
> +    {
> +      double a = q[i + 2] + p[i + 2];
> +      double b = q[i + 3] + p[i + 3];
> +      q[i + 2] = a;
> +      q[i + 3] = b;
> +    }
> +}
> +
> +void
> +f4 (double *p, double *q)
> +{
> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
> +  for (unsigned int i = 0; i < 500; i += 6)
> +    for (unsigned int j = 0; j < 500; j += 4)
> +      {
> +       double a = q[j] + p[i];
> +       double b = q[j + 1] + p[i + 1];
> +       q[i] = a;
> +       q[i + 1] = b;
> +      }
> +}
> +
> +void
> +f5 (double *p, double *q)
> +{
> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
> +  for (unsigned int i = 2; i < 1000; i += 4)
> +    {
> +      double a = q[i - 2] + p[i - 2];
> +      double b = q[i - 1] + p[i - 1];
> +      q[i - 2] = a;
> +      q[i - 1] = b;
> +    }
> +}
> +
> +double p[1000];
> +double q[1000];
> +
> +void
> +f6 (int n)
> +{
> +  for (unsigned int i = 0; i < n; i += 4)
> +    {
> +      double a = q[i] + p[i];
> +      double b = q[i + 1] + p[i + 1];
> +      q[i] = a;
> +      q[i + 1] = b;
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "basic block vectorized" 6 "slp1" } } */
> Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-2.c
> ===================================================================
> --- /dev/null   2018-01-30 17:30:22.185477046 +0000
> +++ gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-2.c        2018-01-31 13:26:13.487630644 +0000
> @@ -0,0 +1,64 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-fno-tree-loop-vectorize" } */
> +/* { dg-require-effective-target lp64 } */
> +
> +double p[1000];
> +double q[1000];
> +
> +void
> +f1 (double *p, double *q)
> +{
> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
> +  for (unsigned int i = 2; i < ~0U - 4; i += 4)
> +    {
> +      double a = q[i + 2] + p[i + 2];
> +      double b = q[i + 3] + p[i + 3];
> +      q[i + 2] = a;
> +      q[i + 3] = b;
> +    }
> +}
> +
> +void
> +f2 (double *p, double *q)
> +{
> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
> +  for (unsigned int i = 0; i < ~0U - 3; i += 4)
> +    {
> +      double a = q[i + 4] + p[i + 4];
> +      double b = q[i + 5] + p[i + 5];
> +      q[i + 4] = a;
> +      q[i + 5] = b;
> +    }
> +}
> +
> +void
> +f3 (double *p, double *q)
> +{
> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
> +  for (unsigned int i = 0; i < 1000; i += 4)
> +    {
> +      double a = q[i - 2] + p[i - 2];
> +      double b = q[i - 1] + p[i - 1];
> +      q[i - 2] = a;
> +      q[i - 1] = b;
> +    }
> +}
> +
> +void
> +f4 (double *p, double *q)
> +{
> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
> +  for (unsigned int i = 2; i < 1000; i += 4)
> +    {
> +      double a = q[i - 4] + p[i - 4];
> +      double b = q[i - 3] + p[i - 3];
> +      q[i - 4] = a;
> +      q[i - 3] = b;
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump-not "basic block vectorized" "slp1" } } */
