On Fri, Jan 4, 2019 at 12:49 PM Richard Sandiford <richard.sandif...@arm.com> wrote: > > This patch folds certain reductions of X & CST to X[I] & CST[I] if I is > the only nonzero element of CST. This includes the motivating case in > which CST[I] is -1. > > We could do the same for REDUC_MAX on unsigned types, but I wasn't sure > that that special case was worth it. > > Tested on aarch64-linux-gnu, aarch64_be-elf and x86_64-linux-gnu. > OK to install? > > Obvious follow-ons include handling multiplications of > single_nonzero_element constants, multiplications of uniform constants, > and additions of any constant, but this is enough to fix the PR.
OK. Thanks, Richard. > Richard > > > 2019-01-04 Richard Sandiford <richard.sandif...@arm.com> > > gcc/ > PR tree-optimization/88598 > * tree.h (single_nonzero_element): Declare. > * tree.c (single_nonzero_element): New function. > * match.pd: Fold certain reductions of X & CST to X[I] & CST[I] > if I is the only nonzero element of CST. > > gcc/testsuite/ > PR tree-optimization/88598 > * gcc.dg/vect/pr88598-1.c: New test. > * gcc.dg/vect/pr88598-2.c: Likewise. > * gcc.dg/vect/pr88598-3.c: Likewise. > * gcc.dg/vect/pr88598-4.c: Likewise. > * gcc.dg/vect/pr88598-5.c: Likewise. > * gcc.dg/vect/pr88598-6.c: Likewise. > > Index: gcc/tree.h > =================================================================== > --- gcc/tree.h 2019-01-04 11:40:33.141683783 +0000 > +++ gcc/tree.h 2019-01-04 11:40:36.581654424 +0000 > @@ -4522,6 +4522,8 @@ extern tree uniform_vector_p (const_tree > > extern tree uniform_integer_cst_p (tree); > > +extern int single_nonzero_element (const_tree); > + > /* Given a CONSTRUCTOR CTOR, return the element values as a vector. */ > > extern vec<tree, va_gc> *ctor_to_vec (tree); > Index: gcc/tree.c > =================================================================== > --- gcc/tree.c 2019-01-04 11:40:33.141683783 +0000 > +++ gcc/tree.c 2019-01-04 11:40:36.577654458 +0000 > @@ -11355,6 +11355,38 @@ uniform_integer_cst_p (tree t) > return NULL_TREE; > } > > +/* If VECTOR_CST T has a single nonzero element, return the index of that > + element, otherwise return -1. */ > + > +int > +single_nonzero_element (const_tree t) > +{ > + unsigned HOST_WIDE_INT nelts; > + unsigned int repeat_nelts; > + if (VECTOR_CST_NELTS (t).is_constant (&nelts)) > + repeat_nelts = nelts; > + else if (VECTOR_CST_NELTS_PER_PATTERN (t) == 2) > + { > + nelts = vector_cst_encoded_nelts (t); > + repeat_nelts = VECTOR_CST_NPATTERNS (t); > + } > + else > + return -1; > + > + int res = -1; > + for (unsigned int i = 0; i < nelts; ++i) > + { > + tree elt = vector_cst_elt (t, i); > + if (!integer_zerop (elt) && !real_zerop (elt)) > + { > + if (res >= 0 || i >= repeat_nelts) > + return -1; > + res = i; > + } > + } > + return res; > +} > + > /* Build an empty statement at location LOC. */ > > tree > Index: gcc/match.pd > =================================================================== > --- gcc/match.pd 2019-01-04 11:40:33.137683817 +0000 > +++ gcc/match.pd 2019-01-04 11:40:36.573654492 +0000 > @@ -5251,3 +5251,19 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > { wide_int_to_tree (sizetype, off); }) > { swap_p ? @0 : @2; })) > { rhs_tree; }))))))))) > + > +/* Fold REDUC (@0 & @1) -> @0[I] & @1[I] if element I is the only nonzero > + element of @1. */ > +(for reduc (IFN_REDUC_PLUS IFN_REDUC_IOR IFN_REDUC_XOR) > + (simplify (reduc (view_convert? (bit_and @0 VECTOR_CST@1))) > + (with { int i = single_nonzero_element (@1); } > + (if (i >= 0) > + (with { tree elt = vector_cst_elt (@1, i); > + tree elt_type = TREE_TYPE (elt); > + unsigned int elt_bits = tree_to_uhwi (TYPE_SIZE (elt_type)); > + tree size = bitsize_int (elt_bits); > + tree pos = bitsize_int (elt_bits * i); } > + (view_convert > + (bit_and:elt_type > + (BIT_FIELD_REF:elt_type @0 { size; } { pos; }) > + { elt; }))))))) > Index: gcc/testsuite/gcc.dg/vect/pr88598-1.c > =================================================================== > --- /dev/null 2018-12-31 11:20:29.178325188 +0000 > +++ gcc/testsuite/gcc.dg/vect/pr88598-1.c 2019-01-04 11:40:36.577654458 > +0000 > @@ -0,0 +1,55 @@ > +/* { dg-do run } */ > +/* { dg-additional-options "-fdump-tree-optimized" } */ > + > +#include "tree-vect.h" > + > +#define N 4 > + > +int a[N]; > + > +int __attribute__ ((noipa)) > +f1 (void) > +{ > + int b[N] = { 15, 0, 0, 0 }, res = 0; > + for (int i = 0; i < N; ++i) > + res += a[i] & b[i]; > + return res; > +} > + > +int __attribute__ ((noipa)) > +f2 (void) > +{ > + int b[N] = { 0, 31, 0, 0 }, res = 0; > + for (int i = 0; i < N; ++i) > + res += a[i] & b[i]; > + return res; > +} > + > +int __attribute__ ((noipa)) > +f3 (void) > +{ > + int b[N] = { 0, 0, 0, -1 }, res = 0; > + for (int i = 0; i < N; ++i) > + res += a[i] & b[i]; > + return res; > +} > + > +int > +main () > +{ > + check_vect (); > + > + for (int i = 0; i < N; ++i) > + a[i] = 0xe0 + i; > + > + if (f1 () != (a[0] & 15) > + || f2 () != (a[1] & 31) > + || f3 () != a[N - 1]) > + __builtin_abort (); > + > + return 0; > +} > + > +/* ??? We need more constant folding for this to work with fully-masked > + loops. */ > +/* { dg-final { scan-tree-dump-not {REDUC_PLUS} "optimized" { xfail > aarch64_sve } } } */ > Index: gcc/testsuite/gcc.dg/vect/pr88598-2.c > =================================================================== > --- /dev/null 2018-12-31 11:20:29.178325188 +0000 > +++ gcc/testsuite/gcc.dg/vect/pr88598-2.c 2019-01-04 11:40:36.577654458 > +0000 > @@ -0,0 +1,55 @@ > +/* { dg-do run } */ > +/* { dg-additional-options "-fdump-tree-optimized" } */ > + > +#include "tree-vect.h" > + > +#define N 4 > + > +int a[N]; > + > +int __attribute__ ((noipa)) > +f1 (void) > +{ > + int b[N] = { 1, 0, 0, 0 }, res = 0; > + for (int i = 0; i < N; ++i) > + res += a[i] * b[i]; > + return res; > +} > + > +int __attribute__ ((noipa)) > +f2 (void) > +{ > + int b[N] = { 0, 1, 0, 0 }, res = 0; > + for (int i = 0; i < N; ++i) > + res += a[i] * b[i]; > + return res; > +} > + > +int __attribute__ ((noipa)) > +f3 (void) > +{ > + int b[N] = { 0, 0, 0, 1 }, res = 0; > + for (int i = 0; i < N; ++i) > + res += a[i] * b[i]; > + return res; > +} > + > +int > +main () > +{ > + check_vect (); > + > + for (int i = 0; i < N; ++i) > + a[i] = 0xe0 + i; > + > + if (f1 () != a[0] > + || f2 () != a[1] > + || f3 () != a[N - 1]) > + __builtin_abort (); > + > + return 0; > +} > + > +/* ??? We need more constant folding for this to work with fully-masked > + loops. */ > +/* { dg-final { scan-tree-dump-not {REDUC_PLUS} "optimized" { xfail > aarch64_sve } } } */ > Index: gcc/testsuite/gcc.dg/vect/pr88598-3.c > =================================================================== > --- /dev/null 2018-12-31 11:20:29.178325188 +0000 > +++ gcc/testsuite/gcc.dg/vect/pr88598-3.c 2019-01-04 11:40:36.577654458 > +0000 > @@ -0,0 +1,55 @@ > +/* { dg-do run } */ > +/* { dg-additional-options "-ffast-math -fdump-tree-optimized" } */ > + > +#include "tree-vect.h" > + > +#define N 4 > + > +float a[N]; > + > +float __attribute__ ((noipa)) > +f1 (void) > +{ > + float b[N] = { 1, 0, 0, 0 }, res = 0; > + for (int i = 0; i < N; ++i) > + res += a[i] * b[i]; > + return res; > +} > + > +float __attribute__ ((noipa)) > +f2 (void) > +{ > + float b[N] = { 0, 1, 0, 0 }, res = 0; > + for (int i = 0; i < N; ++i) > + res += a[i] * b[i]; > + return res; > +} > + > +float __attribute__ ((noipa)) > +f3 (void) > +{ > + float b[N] = { 0, 0, 0, 1 }, res = 0; > + for (int i = 0; i < N; ++i) > + res += a[i] * b[i]; > + return res; > +} > + > +int > +main () > +{ > + check_vect (); > + > + for (int i = 0; i < N; ++i) > + a[i] = 0xe0 + i; > + > + if (f1 () != a[0] > + || f2 () != a[1] > + || f3 () != a[N - 1]) > + __builtin_abort (); > + > + return 0; > +} > + > +/* ??? We need more constant folding for this to work with fully-masked > + loops. */ > +/* { dg-final { scan-tree-dump-not {REDUC_PLUS} "optimized" { xfail > aarch64_sve } } } */ > Index: gcc/testsuite/gcc.dg/vect/pr88598-4.c > =================================================================== > --- /dev/null 2018-12-31 11:20:29.178325188 +0000 > +++ gcc/testsuite/gcc.dg/vect/pr88598-4.c 2019-01-04 11:40:36.577654458 > +0000 > @@ -0,0 +1,51 @@ > +/* { dg-do run } */ > +/* { dg-additional-options "-fdump-tree-optimized" } */ > + > +#include "tree-vect.h" > + > +#define N 4 > + > +int a[N]; > + > +int __attribute__ ((noipa)) > +f1 (void) > +{ > + int b[N] = { 0, 15, 15, 15 }, res = 0; > + for (int i = 0; i < N; ++i) > + res += a[i] & b[i]; > + return res; > +} > + > +int __attribute__ ((noipa)) > +f2 (void) > +{ > + int b[N] = { 0, 31, 0, 31 }, res = 0; > + for (int i = 0; i < N; ++i) > + res += a[i] & b[i]; > + return res; > +} > + > +int __attribute__ ((noipa)) > +f3 (void) > +{ > + int b[N] = { -1, -1, 0, 0 }, res = 0; > + for (int i = 0; i < N; ++i) > + res += a[i] & b[i]; > + return res; > +} > + > +int > +main () > +{ > + check_vect (); > + > + for (int i = 0; i < N; ++i) > + a[i] = 0xe0 + i; > + > + if (f1 () != (a[1] & 15) + (a[2] & 15) + (a[3] & 15) > + || f2 () != (a[1] & 31) + (a[3] & 31) > + || f3 () != a[0] + a[1]) > + __builtin_abort (); > + > + return 0; > +} > Index: gcc/testsuite/gcc.dg/vect/pr88598-5.c > =================================================================== > --- /dev/null 2018-12-31 11:20:29.178325188 +0000 > +++ gcc/testsuite/gcc.dg/vect/pr88598-5.c 2019-01-04 11:40:36.577654458 > +0000 > @@ -0,0 +1,51 @@ > +/* { dg-do run } */ > +/* { dg-additional-options "-fdump-tree-optimized" } */ > + > +#include "tree-vect.h" > + > +#define N 4 > + > +int a[N]; > + > +int __attribute__ ((noipa)) > +f1 (void) > +{ > + int b[N] = { 0, 1, 1, 1 }, res = 0; > + for (int i = 0; i < N; ++i) > + res += a[i] * b[i]; > + return res; > +} > + > +int __attribute__ ((noipa)) > +f2 (void) > +{ > + int b[N] = { 0, 1, 0, 1 }, res = 0; > + for (int i = 0; i < N; ++i) > + res += a[i] * b[i]; > + return res; > +} > + > +int __attribute__ ((noipa)) > +f3 (void) > +{ > + int b[N] = { 1, 1, 0, 0 }, res = 0; > + for (int i = 0; i < N; ++i) > + res += a[i] * b[i]; > + return res; > +} > + > +int > +main () > +{ > + check_vect (); > + > + for (int i = 0; i < N; ++i) > + a[i] = 0xe0 + i; > + > + if (f1 () != a[1] + a[2] + a[3] > + || f2 () != a[1] + a[3] > + || f3 () != a[0] + a[1]) > + __builtin_abort (); > + > + return 0; > +} > Index: gcc/testsuite/gcc.dg/vect/pr88598-6.c > =================================================================== > --- /dev/null 2018-12-31 11:20:29.178325188 +0000 > +++ gcc/testsuite/gcc.dg/vect/pr88598-6.c 2019-01-04 11:40:36.577654458 > +0000 > @@ -0,0 +1,51 @@ > +/* { dg-do run } */ > +/* { dg-additional-options "-ffast-math -fdump-tree-optimized" } */ > + > +#include "tree-vect.h" > + > +#define N 4 > + > +float a[N]; > + > +float __attribute__ ((noipa)) > +f1 (void) > +{ > + float b[N] = { 0, 1, 1, 1 }, res = 0; > + for (int i = 0; i < N; ++i) > + res += a[i] * b[i]; > + return res; > +} > + > +float __attribute__ ((noipa)) > +f2 (void) > +{ > + float b[N] = { 0, 1, 0, 1 }, res = 0; > + for (int i = 0; i < N; ++i) > + res += a[i] * b[i]; > + return res; > +} > + > +float __attribute__ ((noipa)) > +f3 (void) > +{ > + float b[N] = { 1, 1, 0, 0 }, res = 0; > + for (int i = 0; i < N; ++i) > + res += a[i] * b[i]; > + return res; > +} > + > +int > +main () > +{ > + check_vect (); > + > + for (int i = 0; i < N; ++i) > + a[i] = 0xe0 + i; > + > + if (f1 () != a[1] + a[2] + a[3] > + || f2 () != a[1] + a[3] > + || f3 () != a[0] + a[1]) > + __builtin_abort (); > + > + return 0; > +}