On Tue, May 5, 2020 at 12:27 AM Jakub Jelinek via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > Hi! > > The popcount* testcases show yet another creative way to write popcount, > but rather than adjusting the popcount matcher to deal with it, I think > we just should canonicalize those (X + (X << C) to X * (1 + (1 << C)) > and (X << C1) + (X << C2) to X * ((1 << C1) + (1 << C2)), because for > multiplication we already have simplification rules that can handle nested > multiplication (X * CST1 * CST2), while the the shifts and adds we have > nothing like that. And user could have written the multiplication anyway, > so if we don't emit the fastest or smallest code for the multiplication by > constant, we should improve that. At least on the testcases seems the > emitted code is reasonable according to cost, except that perhaps we could > in some cases try to improve expansion of vector multiplication by > uniform constant. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > 2020-05-05 Jakub Jelinek <ja...@redhat.com> > > PR tree-optimization/94800 > * match.pd (X + (X << C) to X * (1 + (1 << C)), > (X << C1) + (X << C2) to X * ((1 << C1) + (1 << C2))): New > canonicalizations. > > * gcc.dg/tree-ssa/pr94800.c: New test. > * gcc.dg/tree-ssa/popcount5.c: New test. > * gcc.dg/tree-ssa/popcount5l.c: New test. > * gcc.dg/tree-ssa/popcount5ll.c: New test. > > --- gcc/match.pd.jj 2020-05-04 12:15:33.220799388 +0200 > +++ gcc/match.pd 2020-05-04 18:54:58.789063922 +0200 > @@ -2570,6 +2570,41 @@ (define_operator_list COND_TERNARY > && single_use (@3)) > (mult (plusminus @2 { build_one_cst (type); }) @0)))))) > > +#if GIMPLE > +/* Canonicalize X + (X << C) into X * (1 + (1 << C)) and > + (X << C1) + (X << C2) into X * ((1 << C1) + (1 << C2)). */ > +(simplify > + (plus:c @0 (lshift:s @0 INTEGER_CST@1)) > + (if (ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0)) > + && tree_fits_uhwi_p (@1) > + && tree_to_uhwi (@1) < element_precision (type)) > + (with { tree t = type; > + if (!TYPE_OVERFLOW_WRAPS (t)) t = unsigned_type_for (t); > + wide_int w = wi::set_bit_in_zero (tree_to_uhwi (@1), > + element_precision (type)); > + w += 1; > + tree cst = wide_int_to_tree (VECTOR_TYPE_P (t) ? TREE_TYPE (t) > + : t, w); > + cst = build_uniform_cst (t, cst); } > + (convert (mult (convert:t @0) { cst; }))))) > +(simplify > + (plus (lshift:s @0 INTEGER_CST@1) (lshift:s @0 INTEGER_CST@2)) > + (if (ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0)) > + && tree_fits_uhwi_p (@1) > + && tree_to_uhwi (@1) < element_precision (type) > + && tree_fits_uhwi_p (@2) > + && tree_to_uhwi (@2) < element_precision (type)) > + (with { tree t = type; > + if (!TYPE_OVERFLOW_WRAPS (t)) t = unsigned_type_for (t); > + unsigned int prec = element_precision (type); > + wide_int w = wi::set_bit_in_zero (tree_to_uhwi (@1), prec); > + w += wi::set_bit_in_zero (tree_to_uhwi (@2), prec); > + tree cst = wide_int_to_tree (VECTOR_TYPE_P (t) ? TREE_TYPE (t) > + : t, w); > + cst = build_uniform_cst (t, cst); } > + (convert (mult (convert:t @0) { cst; }))))) > +#endif > + > /* Simplifications of MIN_EXPR, MAX_EXPR, fmin() and fmax(). */ > > (for minmax (min max FMIN_ALL FMAX_ALL) > --- gcc/testsuite/gcc.dg/tree-ssa/pr94800.c.jj 2020-05-04 19:18:24.683086245 > +0200 > +++ gcc/testsuite/gcc.dg/tree-ssa/pr94800.c 2020-05-04 19:17:26.530954010 > +0200 > @@ -0,0 +1,80 @@ > +/* PR tree-optimization/94800 */ > +/* { dg-do compile { target { ilp32 || lp64 } } } */ > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > +/* { dg-final { scan-tree-dump-times " \* 72340172838076673" 2 "optimized" } > } */ > +/* { dg-final { scan-tree-dump-times " \* 16843009" 2 "optimized" } } */ > +/* { dg-final { scan-tree-dump-times " \* 289360691352306692" 2 "optimized" > } } */ > +/* { dg-final { scan-tree-dump-times " \* 1229782938247303441" 2 "optimized" > } } */ > +/* { dg-final { scan-tree-dump-not "<<" "optimized" } } */ > + > +unsigned long long int > +foo (unsigned long long int x) > +{ > + unsigned long long int a = x + (x << 8); > + unsigned long long int b = a + (a << 16); > + unsigned long long int c = b + (b << 32); > + return c; > +} > + > +unsigned int > +bar (unsigned int x) > +{ > + unsigned int a = x + (x << 8); > + unsigned int b = a + (a << 16); > + return b; > +} > + > +unsigned long long int > +baz (unsigned long long int x) > +{ > + unsigned long long int a = (x << 2) + (x << 10); > + unsigned long long int b = a + (a << 16); > + unsigned long long int c = b + (b << 32); > + return c; > +} > + > +unsigned long long int > +qux (unsigned long long int x) > +{ > + unsigned long long int a = x + (x << 4); > + unsigned long long int b = a + (a << 8); > + unsigned long long int c = b + (b << 16); > + unsigned long long int d = c + (c << 32); > + return d; > +} > + > +long long int > +quux (long long int x) > +{ > + long long int a = x + (x << 8); > + long long int b = a + (a << 16); > + long long int c = b + (b << 32); > + return c; > +} > + > +int > +corge (int x) > +{ > + int a = x + (x << 8); > + int b = a + (a << 16); > + return b; > +} > + > +long long int > +garply (long long int x) > +{ > + long long int a = (x << 2) + (x << 10); > + long long int b = a + (a << 16); > + long long int c = b + (b << 32); > + return c; > +} > + > +long long int > +waldo (long long int x) > +{ > + long long int a = x + (x << 4); > + long long int b = a + (a << 8); > + long long int c = b + (b << 16); > + long long int d = c + (c << 32); > + return d; > +} > --- gcc/testsuite/gcc.dg/tree-ssa/popcount5.c.jj 2020-05-04 > 19:25:56.406345437 +0200 > +++ gcc/testsuite/gcc.dg/tree-ssa/popcount5.c 2020-05-04 19:25:50.447434358 > +0200 > @@ -0,0 +1,22 @@ > +/* PR tree-optimization/94800 */ > +/* { dg-do compile } */ > +/* { dg-require-effective-target popcount } */ > +/* { dg-require-effective-target int32plus } */ > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > + > +const unsigned m1 = 0x55555555UL; > +const unsigned m2 = 0x33333333UL; > +const unsigned m4 = 0x0F0F0F0FUL; > +const int shift = 24; > + > +int popcount64c(unsigned x) > +{ > + x -= (x >> 1) & m1; > + x = (x & m2) + ((x >> 2) & m2); > + x = (x + (x >> 4)) & m4; > + x += (x << 8); > + x += (x << 16); > + return x >> shift; > +} > + > +/* { dg-final { scan-tree-dump-times "\.POPCOUNT" 1 "optimized" } } */ > --- gcc/testsuite/gcc.dg/tree-ssa/popcount5l.c.jj 2020-05-04 > 19:26:48.134573527 +0200 > +++ gcc/testsuite/gcc.dg/tree-ssa/popcount5l.c 2020-05-04 19:28:19.376211969 > +0200 > @@ -0,0 +1,32 @@ > +/* PR tree-optimization/94800 */ > +/* { dg-do compile { target int32plus } } */ > +/* { dg-require-effective-target popcountl } */ > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > + > +#if __SIZEOF_LONG__ == 4 > +const unsigned long m1 = 0x55555555UL; > +const unsigned long m2 = 0x33333333UL; > +const unsigned long m4 = 0x0F0F0F0FUL; > +const int shift = 24; > +#else > +const unsigned long m1 = 0x5555555555555555UL; > +const unsigned long m2 = 0x3333333333333333UL; > +const unsigned long m4 = 0x0f0f0f0f0f0f0f0fUL; > +const int shift = 56; > +#endif > + > + > +int popcount64c(unsigned long x) > +{ > + x -= (x >> 1) & m1; > + x = (x & m2) + ((x >> 2) & m2); > + x = (x + (x >> 4)) & m4; > + x += (x << 8); > + x += (x << 16); > +#if __SIZEOF_LONG__ != 4 > + x += (x << 32); > +#endif > + return x >> shift; > +} > + > +/* { dg-final { scan-tree-dump-times "\.POPCOUNT" 1 "optimized" } } */ > --- gcc/testsuite/gcc.dg/tree-ssa/popcount5ll.c.jj 2020-05-04 > 19:28:30.325048594 +0200 > +++ gcc/testsuite/gcc.dg/tree-ssa/popcount5ll.c 2020-05-04 19:29:18.884323974 > +0200 > @@ -0,0 +1,22 @@ > +/* PR tree-optimization/94800 */ > +/* { dg-do compile } */ > +/* { dg-require-effective-target popcountll } */
Shouldn't dg-additional-options be used to enable POPCOUNT? This test fails on Linux/x86-64 with -m32 -msse4: $ make check-gcc RUNTESTFLAGS="--target_board='unix{-m32\ -msse4,-mx32\ -msse4,-msse4}' tree-ssa.exp=popcount5ll.c" ... FAIL: gcc.dg/tree-ssa/popcount5ll.c scan-tree-dump-times optimized ".POPCOUNT" 1 > +/* { dg-options "-O2 -fdump-tree-optimized" } */ > + > +const unsigned long long m1 = 0x5555555555555555ULL; > +const unsigned long long m2 = 0x3333333333333333ULL; > +const unsigned long long m4 = 0x0F0F0F0F0F0F0F0FULL; > +const int shift = 56; > + > +int popcount64c(unsigned long long x) > +{ > + x -= (x >> 1) & m1; > + x = (x & m2) + ((x >> 2) & m2); > + x = (x + (x >> 4)) & m4; > + x += (x << 8); > + x += (x << 16); > + x += (x << 32); > + return x >> shift; > +} > + > +/* { dg-final { scan-tree-dump-times "\.POPCOUNT" 1 "optimized" } } */ > > Jakub > -- H.J.