On Thu, Feb 1, 2018 at 2:18 PM, Richard Sandiford
<richard.sandif...@linaro.org> wrote:
> Richard Biener <richard.guent...@gmail.com> writes:
>> On Wed, Jan 31, 2018 at 4:06 PM, Richard Sandiford
>> <richard.sandif...@linaro.org> wrote:
>>> This patch implements the original suggestion for fixing PR 81635:
>>> use range info in split_constant_offset to see whether a conversion
>>> of a wrapping type can be split.  The range info problem described in:
>>>
>>>   https://gcc.gnu.org/ml/gcc-patches/2017-08/msg01002.html
>>>
>>> seems to have been fixed.
>>>
>>> The patch is part 1.  There needs to be a follow-on patch to handle:
>>>
>>>   for (unsigned int i = 0; i < n; i += 4)
>>>     {
>>>       ...[i + 2]...
>>>       ...[i + 3]...
>>>
>>> which the old SCEV test handles, but which the range check doesn't.
>>> At the moment we record that the low two bits of "i" are clear,
>>> but we still end up with a range maximum of 0xffffffff rather than
>>> 0xfffffffc.
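>>>
>>> For the record, one way the follow-on might tighten the recorded
>>> maximum is to consume the nonzero-bits information directly.  A rough
>>> sketch (not part of this patch; it reuses tmp_var, var_max and prec
>>> from the patch below and assumes the existing get_nonzero_bits and
>>> wide-int helpers):
>>>
>>>   /* If the low bits of TMP_VAR are known to be clear, round the
>>>      recorded maximum down to the implied alignment, e.g.
>>>      0xffffffff -> 0xfffffffc when the low two bits are zero.  */
>>>   wide_int nz = get_nonzero_bits (tmp_var);
>>>   if (wi::ne_p (nz, 0))
>>>     {
>>>       int align_bits = wi::ctz (nz);
>>>       if (align_bits > 0)
>>>         var_max = wi::bit_and_not (var_max,
>>>                                    wi::mask (align_bits, false, prec));
>>>     }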
>>>
>>> Tested on aarch64-linux-gnu, x86_64-linux-gnu and powerpc64le-linux-gnu.
>>> Also tested by comparing the before and after testsuite assembly output
>>> for at least one target per CPU directory.  Excluding a small number
>>> of register renamings on some targets, there were two differences:
>>>
>>> (1) In gcc.c-torture/compile/pr55350.c:
>>>
>>>     void
>>>     foo (__INTPTR_TYPE__ x, __INTPTR_TYPE__ y)
>>>     {
>>>       int i;
>>>       void **a = (void *)  (8UL * (x / 8UL));
>>>       for (i = 0; i < x; i++)
>>>         a[i] = (void *) y;
>>>     }
>>>
>>>     we previously kept base "a" and offset 0, but now use
>>>     "(void **) _n" (where _n holds the multiplication result).
>>>
>>>     This is because the old test had the side-effect of prohibiting
>>>     casts from unsigned ints to pointers of the same size.
>>>     What we do for code like this isn't going to help much though.
>>>
>>> (2) In gcc.c-torture/execute/2003016-1.c, we unrolled:
>>>
>>>     void f (unsigned int *x)
>>>     {
>>>       unsigned char i;
>>>       int j;
>>>
>>>       i = 0x10;
>>>       for (j = 0; j < 0x10; j++)
>>>         {
>>>           i += 0xe8;
>>>           x[i] = 0;
>>>           i -= 0xe7;
>>>         }
>>>     }
>>>
>>>     and ended up with an unpropagated degenerate phi:
>>>
>>>       # i_38 = PHI <16(3)>
>>>       ...
>>>       i_40 = i_38 + 1;
>>>       ...
>>>       i_48 = i_40 + 1;
>>>       ... etc ...
>>>       i_178 = i_168 + 1;
>>>       i_16 = i_178 + 232;
>>>       _17 = (long unsigned int) i_16;
>>>       _18 = _17 * 4;
>>>       _19 = &x + _18;
>>>       *_19 = 0;
>>>
>>>     Calling split_constant_offset on each (long unsigned int) operand
>>>     gives i_38 + 0xe8, i_38 + 0xe9, ..., i_38 + 0xf7, with i_38
>>>     still having the range [0x10, 0x20].  We can therefore tell
>>>     that i_38 + 0xf0 has the range [0x00, 0x10], and similarly
>>>     for +0xf1...+0xf7.
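>>>
>>>     For example, spelling out the wrapping arithmetic in plain C
>>>     (values taken from the ranges above):
>>>
>>>       unsigned char lo = 0x10 + 0xf0;   /* 0x100 truncates to 0x00 */
>>>       unsigned char hi = 0x20 + 0xf0;   /* 0x110 truncates to 0x10 */
>>>
>>>     Both ends of [0x10, 0x20] wrap, so the new check (which requires
>>>     overflow at both ends or at neither) still accepts it.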
>>>
>>>     We should really be folding to constants here though.
>>>
>>> OK to install?
>>>
>>>
>>> 2018-01-31  Richard Sandiford  <richard.sandif...@linaro.org>
>>>
>>> gcc/
>>>         PR tree-optimization/81635
>>>         * tree-data-ref.c (split_constant_offset_1): For types that
>>>         wrap on overflow, try to use range info to prove that wrapping
>>>         cannot occur.
>>>
>>> gcc/testsuite/
>>>         PR tree-optimization/81635
>>>         * gcc.dg/vect/bb-slp-pr81635-1.c: New test.
>>>         * gcc.dg/vect/bb-slp-pr81635-2.c: Likewise.
>>>
>>> Index: gcc/tree-data-ref.c
>>> ===================================================================
>>> --- gcc/tree-data-ref.c 2018-01-13 18:02:00.946360352 +0000
>>> +++ gcc/tree-data-ref.c 2018-01-31 13:26:13.488630604 +0000
>>> @@ -704,11 +704,46 @@ split_constant_offset_1 (tree type, tree
>>>            and the outer precision is at least as large as the inner.  */
>>>         tree itype = TREE_TYPE (op0);
>>>         if ((POINTER_TYPE_P (itype)
>>> -            || (INTEGRAL_TYPE_P (itype) && TYPE_OVERFLOW_UNDEFINED (itype)))
>>> +            || (INTEGRAL_TYPE_P (itype) && !TYPE_OVERFLOW_TRAPS (itype)))
>>>             && TYPE_PRECISION (type) >= TYPE_PRECISION (itype)
>>>             && (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type)))
>>>           {
>>> -           split_constant_offset (op0, &var0, off);
>>> +           if (INTEGRAL_TYPE_P (itype) && TYPE_OVERFLOW_WRAPS (itype))
>>> +             {
>>> +               /* Split the unconverted operand and try to prove that
>>> +                  wrapping isn't a problem.  */
>>> +               tree tmp_var, tmp_off;
>>> +               split_constant_offset (op0, &tmp_var, &tmp_off);
>>> +
>>> +               /* See whether we have an SSA_NAME whose range is known
>>> +                  to be [A, B].  */
>>> +               if (TREE_CODE (tmp_var) != SSA_NAME)
>>> +                 return false;
>>> +               wide_int var_min, var_max;
>>> +               if (get_range_info (tmp_var, &var_min, &var_max) != VR_RANGE)
>>> +                 return false;
>>> +
>>> +               /* See whether the range of OP0 (i.e. TMP_VAR + TMP_OFF)
>>> +                  is known to be [A + TMP_OFF, B + TMP_OFF], with all
>>> +                  operations done in ITYPE.  The addition must overflow
>>> +                  at both ends of the range or at neither.  */
>>
>> Hmm, if they overflow at both ends that's still an issue given we effectively
>> widen the addition?  Consider char -> short and [255U, 255U] + 1 which
>> when widened will be 256 but when wrapping in char it's 0...
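>>
>> (For reference, a small standalone illustration of that concern, using
>> hypothetical values rather than anything from the patch:
>>
>>   unsigned char c = 255;                            /* range [255U, 255U] */
>>   short widened = (short) c + 1;                    /* 256 */
>>   short wrapped = (short) (unsigned char) (c + 1);  /* 0 */
>>
>> so naively rewriting (short) (c + 1) as (short) c + 1 is off by 256
>> whenever the inner addition wraps.)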
>>
>>> +               bool overflow[2];
>>> +               signop sgn = TYPE_SIGN (itype);
>>> +               unsigned int prec = TYPE_PRECISION (itype);
>>> +               wide_int woff = wi::to_wide (tmp_off, prec);
>>> +               wide_int op0_min = wi::add (var_min, woff, sgn, &overflow[0]);
>>> +               wi::add (var_max, woff, sgn, &overflow[1]);
>>> +               if (overflow[0] != overflow[1])
>>> +                 return false;
>>> +
>>> +               /* Calculate (ssizetype) OP0 - (ssizetype) TMP_VAR.  */
>>> +               widest_int diff = (widest_int::from (op0_min, sgn)
>>> +                                  - widest_int::from (var_min, sgn));
>>> +               var0 = tmp_var;
>>> +               *off = wide_int_to_tree (ssizetype, diff);
>>
>> ... or is this part trying to deal with that case?  Because I don't
>> understand it - isn't 'off' simply supposed to be 'tmp_off' literally?
>> I think this needs more explanation...  if there's no overflow, that
>> would be all there is to do, no?
>
> Yeah, like you say, this is to handle the case in which they overflow
> at both ends.  So in your example op0_min is 0 and var_min is 255,
> so we end up with the inner char + -255.
>
> The idea is that we don't care whether overflow happens per se.
> All we care about is whether values of the unconverted operand (op0)
> are displaced from the values of the inner variable by a constant
> amount, calculated as diff here.
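>
> A quick standalone check of that reasoning for the [255U, 255U] + 1 case
> (hypothetical values, just to make the constant displacement concrete):
>
>   #include <assert.h>
>
>   int
>   main (void)
>   {
>     unsigned char var = 255;      /* tmp_var, with range [255, 255] */
>     unsigned char op0 = var + 1;  /* wraps to 0 at both ends of the range */
>     long diff = 0 - 255;          /* op0_min - var_min, i.e. -255 */
>     /* OP0 is just the inner variable displaced by a constant:
>        (ssizetype) op0 == (ssizetype) var + diff.  */
>     assert ((long) op0 == (long) var + diff);
>     return 0;
>   }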

Ok.  Can you amend the "Calculate (ssizetype) OP0 ..." comment by adding,
say, "... to handle the case where both ends wrapped"?

Ok with that change.
Richard.

> Thanks,
> Richard
>
>
>>
>> Thanks,
>> Richard.
>>
>>> +             }
>>> +           else
>>> +             split_constant_offset (op0, &var0, off);
>>>             *var = fold_convert (type, var0);
>>>             return true;
>>>           }
>>> Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-1.c
>>> ===================================================================
>>> --- /dev/null   2018-01-30 17:30:22.185477046 +0000
>>> +++ gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-1.c        2018-01-31 13:26:13.487630644 +0000
>>> @@ -0,0 +1,92 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-additional-options "-fno-tree-loop-vectorize" } */
>>> +/* { dg-require-effective-target vect_double } */
>>> +/* { dg-require-effective-target lp64 } */
>>> +
>>> +void
>>> +f1 (double *p, double *q)
>>> +{
>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
>>> +  for (unsigned int i = 0; i < 1000; i += 4)
>>> +    {
>>> +      double a = q[i] + p[i];
>>> +      double b = q[i + 1] + p[i + 1];
>>> +      q[i] = a;
>>> +      q[i + 1] = b;
>>> +    }
>>> +}
>>> +
>>> +void
>>> +f2 (double *p, double *q)
>>> +{
>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
>>> +  for (unsigned int i = 2; i < ~0U - 4; i += 4)
>>> +    {
>>> +      double a = q[i] + p[i];
>>> +      double b = q[i + 1] + p[i + 1];
>>> +      q[i] = a;
>>> +      q[i + 1] = b;
>>> +    }
>>> +}
>>> +
>>> +void
>>> +f3 (double *p, double *q)
>>> +{
>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
>>> +  for (unsigned int i = 0; i < ~0U - 3; i += 4)
>>> +    {
>>> +      double a = q[i + 2] + p[i + 2];
>>> +      double b = q[i + 3] + p[i + 3];
>>> +      q[i + 2] = a;
>>> +      q[i + 3] = b;
>>> +    }
>>> +}
>>> +
>>> +void
>>> +f4 (double *p, double *q)
>>> +{
>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
>>> +  for (unsigned int i = 0; i < 500; i += 6)
>>> +    for (unsigned int j = 0; j < 500; j += 4)
>>> +      {
>>> +       double a = q[j] + p[i];
>>> +       double b = q[j + 1] + p[i + 1];
>>> +       q[i] = a;
>>> +       q[i + 1] = b;
>>> +      }
>>> +}
>>> +
>>> +void
>>> +f5 (double *p, double *q)
>>> +{
>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
>>> +  for (unsigned int i = 2; i < 1000; i += 4)
>>> +    {
>>> +      double a = q[i - 2] + p[i - 2];
>>> +      double b = q[i - 1] + p[i - 1];
>>> +      q[i - 2] = a;
>>> +      q[i - 1] = b;
>>> +    }
>>> +}
>>> +
>>> +double p[1000];
>>> +double q[1000];
>>> +
>>> +void
>>> +f6 (int n)
>>> +{
>>> +  for (unsigned int i = 0; i < n; i += 4)
>>> +    {
>>> +      double a = q[i] + p[i];
>>> +      double b = q[i + 1] + p[i + 1];
>>> +      q[i] = a;
>>> +      q[i + 1] = b;
>>> +    }
>>> +}
>>> +
>>> +/* { dg-final { scan-tree-dump-times "basic block vectorized" 6 "slp1" } } */
>>> Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-2.c
>>> ===================================================================
>>> --- /dev/null   2018-01-30 17:30:22.185477046 +0000
>>> +++ gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-2.c        2018-01-31 13:26:13.487630644 +0000
>>> @@ -0,0 +1,64 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-additional-options "-fno-tree-loop-vectorize" } */
>>> +/* { dg-require-effective-target lp64 } */
>>> +
>>> +double p[1000];
>>> +double q[1000];
>>> +
>>> +void
>>> +f1 (double *p, double *q)
>>> +{
>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
>>> +  for (unsigned int i = 2; i < ~0U - 4; i += 4)
>>> +    {
>>> +      double a = q[i + 2] + p[i + 2];
>>> +      double b = q[i + 3] + p[i + 3];
>>> +      q[i + 2] = a;
>>> +      q[i + 3] = b;
>>> +    }
>>> +}
>>> +
>>> +void
>>> +f2 (double *p, double *q)
>>> +{
>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
>>> +  for (unsigned int i = 0; i < ~0U - 3; i += 4)
>>> +    {
>>> +      double a = q[i + 4] + p[i + 4];
>>> +      double b = q[i + 5] + p[i + 5];
>>> +      q[i + 4] = a;
>>> +      q[i + 5] = b;
>>> +    }
>>> +}
>>> +
>>> +void
>>> +f3 (double *p, double *q)
>>> +{
>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
>>> +  for (unsigned int i = 0; i < 1000; i += 4)
>>> +    {
>>> +      double a = q[i - 2] + p[i - 2];
>>> +      double b = q[i - 1] + p[i - 1];
>>> +      q[i - 2] = a;
>>> +      q[i - 1] = b;
>>> +    }
>>> +}
>>> +
>>> +void
>>> +f4 (double *p, double *q)
>>> +{
>>> +  p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2);
>>> +  q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2);
>>> +  for (unsigned int i = 2; i < 1000; i += 4)
>>> +    {
>>> +      double a = q[i - 4] + p[i - 4];
>>> +      double b = q[i - 3] + p[i - 3];
>>> +      q[i - 4] = a;
>>> +      q[i - 3] = b;
>>> +    }
>>> +}
>>> +
>>> +/* { dg-final { scan-tree-dump-not "basic block vectorized" "slp1" } } */

Reply via email to