On Tue, May 5, 2020 at 12:27 AM Jakub Jelinek via Gcc-patches
<[email protected]> wrote:
>
> Hi!
>
> The popcount* testcases show yet another creative way to write popcount,
> but rather than adjusting the popcount matcher to deal with it, I think
> we just should canonicalize those (X + (X << C) to X * (1 + (1 << C))
> and (X << C1) + (X << C2) to X * ((1 << C1) + (1 << C2)), because for
> multiplication we already have simplification rules that can handle nested
> multiplication (X * CST1 * CST2), while the the shifts and adds we have
> nothing like that. And user could have written the multiplication anyway,
> so if we don't emit the fastest or smallest code for the multiplication by
> constant, we should improve that. At least on the testcases seems the
> emitted code is reasonable according to cost, except that perhaps we could
> in some cases try to improve expansion of vector multiplication by
> uniform constant.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2020-05-05 Jakub Jelinek <[email protected]>
>
> PR tree-optimization/94800
> * match.pd (X + (X << C) to X * (1 + (1 << C)),
> (X << C1) + (X << C2) to X * ((1 << C1) + (1 << C2))): New
> canonicalizations.
>
> * gcc.dg/tree-ssa/pr94800.c: New test.
> * gcc.dg/tree-ssa/popcount5.c: New test.
> * gcc.dg/tree-ssa/popcount5l.c: New test.
> * gcc.dg/tree-ssa/popcount5ll.c: New test.
>
> --- gcc/match.pd.jj 2020-05-04 12:15:33.220799388 +0200
> +++ gcc/match.pd 2020-05-04 18:54:58.789063922 +0200
> @@ -2570,6 +2570,41 @@ (define_operator_list COND_TERNARY
> && single_use (@3))
> (mult (plusminus @2 { build_one_cst (type); }) @0))))))
>
> +#if GIMPLE
> +/* Canonicalize X + (X << C) into X * (1 + (1 << C)) and
> + (X << C1) + (X << C2) into X * ((1 << C1) + (1 << C2)). */
> +(simplify
> + (plus:c @0 (lshift:s @0 INTEGER_CST@1))
> + (if (ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0))
> + && tree_fits_uhwi_p (@1)
> + && tree_to_uhwi (@1) < element_precision (type))
> + (with { tree t = type;
> + if (!TYPE_OVERFLOW_WRAPS (t)) t = unsigned_type_for (t);
> + wide_int w = wi::set_bit_in_zero (tree_to_uhwi (@1),
> + element_precision (type));
> + w += 1;
> + tree cst = wide_int_to_tree (VECTOR_TYPE_P (t) ? TREE_TYPE (t)
> + : t, w);
> + cst = build_uniform_cst (t, cst); }
> + (convert (mult (convert:t @0) { cst; })))))
> +(simplify
> + (plus (lshift:s @0 INTEGER_CST@1) (lshift:s @0 INTEGER_CST@2))
> + (if (ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0))
> + && tree_fits_uhwi_p (@1)
> + && tree_to_uhwi (@1) < element_precision (type)
> + && tree_fits_uhwi_p (@2)
> + && tree_to_uhwi (@2) < element_precision (type))
> + (with { tree t = type;
> + if (!TYPE_OVERFLOW_WRAPS (t)) t = unsigned_type_for (t);
> + unsigned int prec = element_precision (type);
> + wide_int w = wi::set_bit_in_zero (tree_to_uhwi (@1), prec);
> + w += wi::set_bit_in_zero (tree_to_uhwi (@2), prec);
> + tree cst = wide_int_to_tree (VECTOR_TYPE_P (t) ? TREE_TYPE (t)
> + : t, w);
> + cst = build_uniform_cst (t, cst); }
> + (convert (mult (convert:t @0) { cst; })))))
> +#endif
> +
> /* Simplifications of MIN_EXPR, MAX_EXPR, fmin() and fmax(). */
>
> (for minmax (min max FMIN_ALL FMAX_ALL)
> --- gcc/testsuite/gcc.dg/tree-ssa/pr94800.c.jj 2020-05-04 19:18:24.683086245
> +0200
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr94800.c 2020-05-04 19:17:26.530954010
> +0200
> @@ -0,0 +1,80 @@
> +/* PR tree-optimization/94800 */
> +/* { dg-do compile { target { ilp32 || lp64 } } } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-times " \* 72340172838076673" 2 "optimized" }
> } */
> +/* { dg-final { scan-tree-dump-times " \* 16843009" 2 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times " \* 289360691352306692" 2 "optimized"
> } } */
> +/* { dg-final { scan-tree-dump-times " \* 1229782938247303441" 2 "optimized"
> } } */
> +/* { dg-final { scan-tree-dump-not "<<" "optimized" } } */
> +
> +unsigned long long int
> +foo (unsigned long long int x)
> +{
> + unsigned long long int a = x + (x << 8);
> + unsigned long long int b = a + (a << 16);
> + unsigned long long int c = b + (b << 32);
> + return c;
> +}
> +
> +unsigned int
> +bar (unsigned int x)
> +{
> + unsigned int a = x + (x << 8);
> + unsigned int b = a + (a << 16);
> + return b;
> +}
> +
> +unsigned long long int
> +baz (unsigned long long int x)
> +{
> + unsigned long long int a = (x << 2) + (x << 10);
> + unsigned long long int b = a + (a << 16);
> + unsigned long long int c = b + (b << 32);
> + return c;
> +}
> +
> +unsigned long long int
> +qux (unsigned long long int x)
> +{
> + unsigned long long int a = x + (x << 4);
> + unsigned long long int b = a + (a << 8);
> + unsigned long long int c = b + (b << 16);
> + unsigned long long int d = c + (c << 32);
> + return d;
> +}
> +
> +long long int
> +quux (long long int x)
> +{
> + long long int a = x + (x << 8);
> + long long int b = a + (a << 16);
> + long long int c = b + (b << 32);
> + return c;
> +}
> +
> +int
> +corge (int x)
> +{
> + int a = x + (x << 8);
> + int b = a + (a << 16);
> + return b;
> +}
> +
> +long long int
> +garply (long long int x)
> +{
> + long long int a = (x << 2) + (x << 10);
> + long long int b = a + (a << 16);
> + long long int c = b + (b << 32);
> + return c;
> +}
> +
> +long long int
> +waldo (long long int x)
> +{
> + long long int a = x + (x << 4);
> + long long int b = a + (a << 8);
> + long long int c = b + (b << 16);
> + long long int d = c + (c << 32);
> + return d;
> +}
> --- gcc/testsuite/gcc.dg/tree-ssa/popcount5.c.jj 2020-05-04
> 19:25:56.406345437 +0200
> +++ gcc/testsuite/gcc.dg/tree-ssa/popcount5.c 2020-05-04 19:25:50.447434358
> +0200
> @@ -0,0 +1,22 @@
> +/* PR tree-optimization/94800 */
> +/* { dg-do compile } */
> +/* { dg-require-effective-target popcount } */
> +/* { dg-require-effective-target int32plus } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +const unsigned m1 = 0x55555555UL;
> +const unsigned m2 = 0x33333333UL;
> +const unsigned m4 = 0x0F0F0F0FUL;
> +const int shift = 24;
> +
> +int popcount64c(unsigned x)
> +{
> + x -= (x >> 1) & m1;
> + x = (x & m2) + ((x >> 2) & m2);
> + x = (x + (x >> 4)) & m4;
> + x += (x << 8);
> + x += (x << 16);
> + return x >> shift;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.POPCOUNT" 1 "optimized" } } */
> --- gcc/testsuite/gcc.dg/tree-ssa/popcount5l.c.jj 2020-05-04
> 19:26:48.134573527 +0200
> +++ gcc/testsuite/gcc.dg/tree-ssa/popcount5l.c 2020-05-04 19:28:19.376211969
> +0200
> @@ -0,0 +1,32 @@
> +/* PR tree-optimization/94800 */
> +/* { dg-do compile { target int32plus } } */
> +/* { dg-require-effective-target popcountl } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#if __SIZEOF_LONG__ == 4
> +const unsigned long m1 = 0x55555555UL;
> +const unsigned long m2 = 0x33333333UL;
> +const unsigned long m4 = 0x0F0F0F0FUL;
> +const int shift = 24;
> +#else
> +const unsigned long m1 = 0x5555555555555555UL;
> +const unsigned long m2 = 0x3333333333333333UL;
> +const unsigned long m4 = 0x0f0f0f0f0f0f0f0fUL;
> +const int shift = 56;
> +#endif
> +
> +
> +int popcount64c(unsigned long x)
> +{
> + x -= (x >> 1) & m1;
> + x = (x & m2) + ((x >> 2) & m2);
> + x = (x + (x >> 4)) & m4;
> + x += (x << 8);
> + x += (x << 16);
> +#if __SIZEOF_LONG__ != 4
> + x += (x << 32);
> +#endif
> + return x >> shift;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.POPCOUNT" 1 "optimized" } } */
> --- gcc/testsuite/gcc.dg/tree-ssa/popcount5ll.c.jj 2020-05-04
> 19:28:30.325048594 +0200
> +++ gcc/testsuite/gcc.dg/tree-ssa/popcount5ll.c 2020-05-04 19:29:18.884323974
> +0200
> @@ -0,0 +1,22 @@
> +/* PR tree-optimization/94800 */
> +/* { dg-do compile } */
> +/* { dg-require-effective-target popcountll } */
Shouldn't dg-additional-options be used to enable POPCOUNT?
This test fails on Linux/x86-64 with -m32 -msse4:
$ make check-gcc RUNTESTFLAGS="--target_board='unix{-m32\
-msse4,-mx32\ -msse4,-msse4}' tree-ssa.exp=popcount5ll.c"
...
FAIL: gcc.dg/tree-ssa/popcount5ll.c scan-tree-dump-times optimized ".POPCOUNT" 1
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +const unsigned long long m1 = 0x5555555555555555ULL;
> +const unsigned long long m2 = 0x3333333333333333ULL;
> +const unsigned long long m4 = 0x0F0F0F0F0F0F0F0FULL;
> +const int shift = 56;
> +
> +int popcount64c(unsigned long long x)
> +{
> + x -= (x >> 1) & m1;
> + x = (x & m2) + ((x >> 2) & m2);
> + x = (x + (x >> 4)) & m4;
> + x += (x << 8);
> + x += (x << 16);
> + x += (x << 32);
> + return x >> shift;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.POPCOUNT" 1 "optimized" } } */
>
> Jakub
>
--
H.J.