On Tue, May 5, 2020 at 12:27 AM Jakub Jelinek via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi!
>
> The popcount* testcases show yet another creative way to write popcount,
> but rather than adjusting the popcount matcher to deal with it, I think
> we just should canonicalize those (X + (X << C) to X * (1 + (1 << C))
> and (X << C1) + (X << C2) to X * ((1 << C1) + (1 << C2)), because for
> multiplication we already have simplification rules that can handle nested
> multiplication (X * CST1 * CST2), while the the shifts and adds we have
> nothing like that.  And user could have written the multiplication anyway,
> so if we don't emit the fastest or smallest code for the multiplication by
> constant, we should improve that.  At least on the testcases seems the
> emitted code is reasonable according to cost, except that perhaps we could
> in some cases try to improve expansion of vector multiplication by
> uniform constant.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2020-05-05  Jakub Jelinek  <ja...@redhat.com>
>
>         PR tree-optimization/94800
>         * match.pd (X + (X << C) to X * (1 + (1 << C)),
>         (X << C1) + (X << C2) to X * ((1 << C1) + (1 << C2))): New
>         canonicalizations.
>
>         * gcc.dg/tree-ssa/pr94800.c: New test.
>         * gcc.dg/tree-ssa/popcount5.c: New test.
>         * gcc.dg/tree-ssa/popcount5l.c: New test.
>         * gcc.dg/tree-ssa/popcount5ll.c: New test.
>
> --- gcc/match.pd.jj     2020-05-04 12:15:33.220799388 +0200
> +++ gcc/match.pd        2020-05-04 18:54:58.789063922 +0200
> @@ -2570,6 +2570,41 @@ (define_operator_list COND_TERNARY
>          && single_use (@3))
>       (mult (plusminus @2 { build_one_cst (type); }) @0))))))
>
> +#if GIMPLE
> +/* Canonicalize X + (X << C) into X * (1 + (1 << C)) and
> +   (X << C1) + (X << C2) into X * ((1 << C1) + (1 << C2)).  */
> +(simplify
> + (plus:c @0 (lshift:s @0 INTEGER_CST@1))
> +  (if (ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0))
> +       && tree_fits_uhwi_p (@1)
> +       && tree_to_uhwi (@1) < element_precision (type))
> +   (with { tree t = type;
> +          if (!TYPE_OVERFLOW_WRAPS (t)) t = unsigned_type_for (t);
> +          wide_int w = wi::set_bit_in_zero (tree_to_uhwi (@1),
> +                                            element_precision (type));
> +          w += 1;
> +          tree cst = wide_int_to_tree (VECTOR_TYPE_P (t) ? TREE_TYPE (t)
> +                                       : t, w);
> +          cst = build_uniform_cst (t, cst); }
> +    (convert (mult (convert:t @0) { cst; })))))
> +(simplify
> + (plus (lshift:s @0 INTEGER_CST@1) (lshift:s @0 INTEGER_CST@2))
> +  (if (ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0))
> +       && tree_fits_uhwi_p (@1)
> +       && tree_to_uhwi (@1) < element_precision (type)
> +       && tree_fits_uhwi_p (@2)
> +       && tree_to_uhwi (@2) < element_precision (type))
> +   (with { tree t = type;
> +          if (!TYPE_OVERFLOW_WRAPS (t)) t = unsigned_type_for (t);
> +          unsigned int prec = element_precision (type);
> +          wide_int w = wi::set_bit_in_zero (tree_to_uhwi (@1), prec);
> +          w += wi::set_bit_in_zero (tree_to_uhwi (@2), prec);
> +          tree cst = wide_int_to_tree (VECTOR_TYPE_P (t) ? TREE_TYPE (t)
> +                                       : t, w);
> +          cst = build_uniform_cst (t, cst); }
> +    (convert (mult (convert:t @0) { cst; })))))
> +#endif
> +
>  /* Simplifications of MIN_EXPR, MAX_EXPR, fmin() and fmax().  */
>
>  (for minmax (min max FMIN_ALL FMAX_ALL)
> --- gcc/testsuite/gcc.dg/tree-ssa/pr94800.c.jj  2020-05-04 19:18:24.683086245 
> +0200
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr94800.c     2020-05-04 19:17:26.530954010 
> +0200
> @@ -0,0 +1,80 @@
> +/* PR tree-optimization/94800 */
> +/* { dg-do compile { target { ilp32 || lp64 } } } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-times " \* 72340172838076673" 2 "optimized" } 
> } */
> +/* { dg-final { scan-tree-dump-times " \* 16843009" 2 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times " \* 289360691352306692" 2 "optimized" 
> } } */
> +/* { dg-final { scan-tree-dump-times " \* 1229782938247303441" 2 "optimized" 
> } } */
> +/* { dg-final { scan-tree-dump-not "<<" "optimized" } } */
> +
> +unsigned long long int
> +foo (unsigned long long int x)
> +{
> +  unsigned long long int a = x + (x << 8);
> +  unsigned long long int b = a + (a << 16);
> +  unsigned long long int c = b + (b << 32);
> +  return c;
> +}
> +
> +unsigned int
> +bar (unsigned int x)
> +{
> +  unsigned int a = x + (x << 8);
> +  unsigned int b = a + (a << 16);
> +  return b;
> +}
> +
> +unsigned long long int
> +baz (unsigned long long int x)
> +{
> +  unsigned long long int a = (x << 2) + (x << 10);
> +  unsigned long long int b = a + (a << 16);
> +  unsigned long long int c = b + (b << 32);
> +  return c;
> +}
> +
> +unsigned long long int
> +qux (unsigned long long int x)
> +{
> +  unsigned long long int a = x + (x << 4);
> +  unsigned long long int b = a + (a << 8);
> +  unsigned long long int c = b + (b << 16);
> +  unsigned long long int d = c + (c << 32);
> +  return d;
> +}
> +
> +long long int
> +quux (long long int x)
> +{
> +  long long int a = x + (x << 8);
> +  long long int b = a + (a << 16);
> +  long long int c = b + (b << 32);
> +  return c;
> +}
> +
> +int
> +corge (int x)
> +{
> +  int a = x + (x << 8);
> +  int b = a + (a << 16);
> +  return b;
> +}
> +
> +long long int
> +garply (long long int x)
> +{
> +  long long int a = (x << 2) + (x << 10);
> +  long long int b = a + (a << 16);
> +  long long int c = b + (b << 32);
> +  return c;
> +}
> +
> +long long int
> +waldo (long long int x)
> +{
> +  long long int a = x + (x << 4);
> +  long long int b = a + (a << 8);
> +  long long int c = b + (b << 16);
> +  long long int d = c + (c << 32);
> +  return d;
> +}
> --- gcc/testsuite/gcc.dg/tree-ssa/popcount5.c.jj        2020-05-04 
> 19:25:56.406345437 +0200
> +++ gcc/testsuite/gcc.dg/tree-ssa/popcount5.c   2020-05-04 19:25:50.447434358 
> +0200
> @@ -0,0 +1,22 @@
> +/* PR tree-optimization/94800 */
> +/* { dg-do compile } */
> +/* { dg-require-effective-target popcount } */
> +/* { dg-require-effective-target int32plus } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +const unsigned m1  = 0x55555555UL;
> +const unsigned m2  = 0x33333333UL;
> +const unsigned m4  = 0x0F0F0F0FUL;
> +const int shift = 24;
> +
> +int popcount64c(unsigned x)
> +{
> +    x -= (x >> 1) & m1;
> +    x = (x & m2) + ((x >> 2) & m2);
> +    x = (x + (x >> 4)) & m4;
> +    x += (x << 8);
> +    x += (x << 16);
> +    return x >> shift;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.POPCOUNT" 1 "optimized" } } */
> --- gcc/testsuite/gcc.dg/tree-ssa/popcount5l.c.jj       2020-05-04 
> 19:26:48.134573527 +0200
> +++ gcc/testsuite/gcc.dg/tree-ssa/popcount5l.c  2020-05-04 19:28:19.376211969 
> +0200
> @@ -0,0 +1,32 @@
> +/* PR tree-optimization/94800 */
> +/* { dg-do compile { target int32plus } } */
> +/* { dg-require-effective-target popcountl } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#if __SIZEOF_LONG__ == 4
> +const unsigned long m1  = 0x55555555UL;
> +const unsigned long m2  = 0x33333333UL;
> +const unsigned long m4  = 0x0F0F0F0FUL;
> +const int shift = 24;
> +#else
> +const unsigned long m1  = 0x5555555555555555UL;
> +const unsigned long m2  = 0x3333333333333333UL;
> +const unsigned long m4  = 0x0f0f0f0f0f0f0f0fUL;
> +const int shift = 56;
> +#endif
> +
> +
> +int popcount64c(unsigned long x)
> +{
> +    x -= (x >> 1) & m1;
> +    x = (x & m2) + ((x >> 2) & m2);
> +    x = (x + (x >> 4)) & m4;
> +    x += (x << 8);
> +    x += (x << 16);
> +#if __SIZEOF_LONG__ != 4
> +    x += (x << 32);
> +#endif
> +    return x >> shift;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.POPCOUNT" 1 "optimized" } } */
> --- gcc/testsuite/gcc.dg/tree-ssa/popcount5ll.c.jj      2020-05-04 
> 19:28:30.325048594 +0200
> +++ gcc/testsuite/gcc.dg/tree-ssa/popcount5ll.c 2020-05-04 19:29:18.884323974 
> +0200
> @@ -0,0 +1,22 @@
> +/* PR tree-optimization/94800 */
> +/* { dg-do compile } */
> +/* { dg-require-effective-target popcountll } */

Shouldn't dg-additional-options be used to enable POPCOUNT?
This test fails on Linux/x86-64 with -m32 -msse4:

$ make check-gcc RUNTESTFLAGS="--target_board='unix{-m32\
-msse4,-mx32\ -msse4,-msse4}' tree-ssa.exp=popcount5ll.c"
...
FAIL: gcc.dg/tree-ssa/popcount5ll.c scan-tree-dump-times optimized ".POPCOUNT" 1


> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +const unsigned long long m1  = 0x5555555555555555ULL;
> +const unsigned long long m2  = 0x3333333333333333ULL;
> +const unsigned long long m4  = 0x0F0F0F0F0F0F0F0FULL;
> +const int shift = 56;
> +
> +int popcount64c(unsigned long long x)
> +{
> +    x -= (x >> 1) & m1;
> +    x = (x & m2) + ((x >> 2) & m2);
> +    x = (x + (x >> 4)) & m4;
> +    x += (x << 8);
> +    x += (x << 16);
> +    x += (x << 32);
> +    return x >> shift;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.POPCOUNT" 1 "optimized" } } */
>
>         Jakub
>


-- 
H.J.

Reply via email to