On Wed, Jan 31, 2018 at 4:06 PM, Richard Sandiford <richard.sandif...@linaro.org> wrote: > This patch implements the original suggestion for fixing PR 81635: > use range info in split_constant_offset to see whether a conversion > of a wrapping type can be split. The range info problem described in: > > https://gcc.gnu.org/ml/gcc-patches/2017-08/msg01002.html > > seems to have been fixed. > > The patch is part 1. There needs to be a follow-on patch to handle: > > for (unsigned int i = 0; i < n; i += 4) > { > ...[i + 2]... > ...[i + 3]... > > which the old SCEV test handles, but which the range check doesn't. > At the moment we record that the low two bits of "i" are clear, > but we still end up with a maximum range of 0xffffffff rather than > 0xfffffffc. > > Tested on aarch64-linux-gnu, x86_64-linux-gnu and powerpc64le-linux-gnu. > Also tested by comparing the before and after testsuite assembly output > for at least one target per CPU directory. Excluding a small number > of register renamings on some targets, there were two differences: > > (1) In gcc.c-torture/compile/pr55350.c: > > void > foo (__INTPTR_TYPE__ x, __INTPTR_TYPE__ y) > { > int i; > void **a = (void *) (8UL * (x / 8UL)); > for (i = 0; i < x; i++) > a[i] = (void *) y; > } > > we previously kept base "a" and offset 0, but now use > "(void **) _n" (where _n holds the multiplication result). > > This is because the old test had the side-effect of prohibiting > casts from unsigned ints to pointers of the same size. > What we do for code like this isn't going to help much though. > > (2) In gcc.c-torture/execute/2003016-1.c, we unrolled: > > void f (unsigned int *x) > { > unsigned char i; > int j; > > i = 0x10; > for (j = 0; j < 0x10; j++) > { > i += 0xe8; > x[i] = 0; > i -= 0xe7; > } > } > > and ended up with an unpropagated degenerate phi: > > # i_38 = PHI <16(3)> > ... > i_40 = i_38 + 1; > ... > i_48 = i_40 + 1; > ... etc ... > i_178 = i_168 + 1; > i_16 = i_178 + 232; > _17 = (long unsigned int) i_16; > _18 = _17 * 4; > _19 = &x + _18; > *_19 = 0; > > Calling split_constant_offset on each (long unsigned int) operand > gives i_38 + 0xe8, i_38 + 0xe9, ..., i_38 + 0xf7, with i_38 > still having the range [0x10, 0x20]. We can therefore tell > that i_38 + 0xf0 has the range [0x00, 0x10], and similarly > for +0xf1...+0xf7. > > We should really be folding to constants here though. > > OK to install? > > > 2018-01-31 Richard Sandiford <richard.sandif...@linaro.org> > > gcc/ > PR tree-optimization/81635 > * tree-data-ref.c (split_constant_offset_1): For types that > wrap on overflow, try to use range info to prove that wrapping > cannot occur. > > gcc/testsuite/ > PR tree-optimization/81635 > * gcc.dg/vect/bb-slp-pr81635-1.c: New test. > * gcc.dg/vect/bb-slp-pr81635-2.c: Likewise. > > Index: gcc/tree-data-ref.c > =================================================================== > --- gcc/tree-data-ref.c 2018-01-13 18:02:00.946360352 +0000 > +++ gcc/tree-data-ref.c 2018-01-31 13:26:13.488630604 +0000 > @@ -704,11 +704,46 @@ split_constant_offset_1 (tree type, tree > and the outer precision is at least as large as the inner. */ > tree itype = TREE_TYPE (op0); > if ((POINTER_TYPE_P (itype) > - || (INTEGRAL_TYPE_P (itype) && TYPE_OVERFLOW_UNDEFINED (itype))) > + || (INTEGRAL_TYPE_P (itype) && !TYPE_OVERFLOW_TRAPS (itype))) > && TYPE_PRECISION (type) >= TYPE_PRECISION (itype) > && (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type))) > { > - split_constant_offset (op0, &var0, off); > + if (INTEGRAL_TYPE_P (itype) && TYPE_OVERFLOW_WRAPS (itype)) > + { > + /* Split the unconverted operand and try to prove that > + wrapping isn't a problem. */ > + tree tmp_var, tmp_off; > + split_constant_offset (op0, &tmp_var, &tmp_off); > + > + /* See whether we have an SSA_NAME whose range is known > + to be [A, B]. */ > + if (TREE_CODE (tmp_var) != SSA_NAME) > + return false; > + wide_int var_min, var_max; > + if (get_range_info (tmp_var, &var_min, &var_max) != VR_RANGE) > + return false; > + > + /* See whether the range of OP0 (i.e. TMP_VAR + TMP_OFF) > + is known to be [A + TMP_OFF, B + TMP_OFF], with all > + operations done in ITYPE. The addition must overflow > + at both ends of the range or at neither. */
Hmm, if they overflow at both ends that's still an issue given we effectively widen the addition? Consider char -> short and [255U, 255U] + 1 which when widened will be 256 but when wrapping in char it's 0... > + bool overflow[2]; > + signop sgn = TYPE_SIGN (itype); > + unsigned int prec = TYPE_PRECISION (itype); > + wide_int woff = wi::to_wide (tmp_off, prec); > + wide_int op0_min = wi::add (var_min, woff, sgn, &overflow[0]); > + wi::add (var_max, woff, sgn, &overflow[1]); > + if (overflow[0] != overflow[1]) > + return false; > + > + /* Calculate (ssizetype) OP0 - (ssizetype) TMP_VAR. */ > + widest_int diff = (widest_int::from (op0_min, sgn) > + - widest_int::from (var_min, sgn)); > + var0 = tmp_var; > + *off = wide_int_to_tree (ssizetype, diff); ... or is this part trying to deal with that case? Because I don't understand it - isn't 'off' simply supposed to be 'tmp_off' literally? I think this needs more explanation... if there's no overflow that would be all that is to do, no? Thanks, Richard. > + } > + else > + split_constant_offset (op0, &var0, off); > *var = fold_convert (type, var0); > return true; > } > Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-1.c > =================================================================== > --- /dev/null 2018-01-30 17:30:22.185477046 +0000 > +++ gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-1.c 2018-01-31 > 13:26:13.487630644 +0000 > @@ -0,0 +1,92 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-fno-tree-loop-vectorize" } */ > +/* { dg-require-effective-target vect_double } */ > +/* { dg-require-effective-target lp64 } */ > + > +void > +f1 (double *p, double *q) > +{ > + p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2); > + q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2); > + for (unsigned int i = 0; i < 1000; i += 4) > + { > + double a = q[i] + p[i]; > + double b = q[i + 1] + p[i + 1]; > + q[i] = a; > + q[i + 1] = b; > + } > +} > + > +void > +f2 (double *p, double *q) > +{ > + p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2); > + q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2); > + for (unsigned int i = 2; i < ~0U - 4; i += 4) > + { > + double a = q[i] + p[i]; > + double b = q[i + 1] + p[i + 1]; > + q[i] = a; > + q[i + 1] = b; > + } > +} > + > +void > +f3 (double *p, double *q) > +{ > + p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2); > + q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2); > + for (unsigned int i = 0; i < ~0U - 3; i += 4) > + { > + double a = q[i + 2] + p[i + 2]; > + double b = q[i + 3] + p[i + 3]; > + q[i + 2] = a; > + q[i + 3] = b; > + } > +} > + > +void > +f4 (double *p, double *q) > +{ > + p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2); > + q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2); > + for (unsigned int i = 0; i < 500; i += 6) > + for (unsigned int j = 0; j < 500; j += 4) > + { > + double a = q[j] + p[i]; > + double b = q[j + 1] + p[i + 1]; > + q[i] = a; > + q[i + 1] = b; > + } > +} > + > +void > +f5 (double *p, double *q) > +{ > + p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2); > + q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2); > + for (unsigned int i = 2; i < 1000; i += 4) > + { > + double a = q[i - 2] + p[i - 2]; > + double b = q[i - 1] + p[i - 1]; > + q[i - 2] = a; > + q[i - 1] = b; > + } > +} > + > +double p[1000]; > +double q[1000]; > + > +void > +f6 (int n) > +{ > + for (unsigned int i = 0; i < n; i += 4) > + { > + double a = q[i] + p[i]; > + double b = q[i + 1] + p[i + 1]; > + q[i] = a; > + q[i + 1] = b; > + } > +} > + > +/* { dg-final { scan-tree-dump-times "basic block vectorized" 6 "slp1" } } */ > Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-2.c > =================================================================== > --- /dev/null 2018-01-30 17:30:22.185477046 +0000 > +++ gcc/testsuite/gcc.dg/vect/bb-slp-pr81635-2.c 2018-01-31 > 13:26:13.487630644 +0000 > @@ -0,0 +1,64 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-fno-tree-loop-vectorize" } */ > +/* { dg-require-effective-target lp64 } */ > + > +double p[1000]; > +double q[1000]; > + > +void > +f1 (double *p, double *q) > +{ > + p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2); > + q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2); > + for (unsigned int i = 2; i < ~0U - 4; i += 4) > + { > + double a = q[i + 2] + p[i + 2]; > + double b = q[i + 3] + p[i + 3]; > + q[i + 2] = a; > + q[i + 3] = b; > + } > +} > + > +void > +f2 (double *p, double *q) > +{ > + p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2); > + q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2); > + for (unsigned int i = 0; i < ~0U - 3; i += 4) > + { > + double a = q[i + 4] + p[i + 4]; > + double b = q[i + 5] + p[i + 5]; > + q[i + 4] = a; > + q[i + 5] = b; > + } > +} > + > +void > +f3 (double *p, double *q) > +{ > + p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2); > + q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2); > + for (unsigned int i = 0; i < 1000; i += 4) > + { > + double a = q[i - 2] + p[i - 2]; > + double b = q[i - 1] + p[i - 1]; > + q[i - 2] = a; > + q[i - 1] = b; > + } > +} > + > +void > +f4 (double *p, double *q) > +{ > + p = (double *) __builtin_assume_aligned (p, sizeof (double) * 2); > + q = (double *) __builtin_assume_aligned (q, sizeof (double) * 2); > + for (unsigned int i = 2; i < 1000; i += 4) > + { > + double a = q[i - 4] + p[i - 4]; > + double b = q[i - 3] + p[i - 3]; > + q[i - 4] = a; > + q[i - 3] = b; > + } > +} > + > +/* { dg-final { scan-tree-dump-not "basic block vectorized" "slp1" } } */