Hi!

This patch fixes various issues in the popcount{64,32}c matcher.

The first issue is that it blindly uses tree_to_uhwi on all the INTEGER_CST
operands.  That is fine for the operands whose type we know, after the
prec <= 64 && TYPE_UNSIGNED (type)
verification, but shift counts can have different types, so we need to
handle e.g. invalid shifts by negative amounts.
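
For illustration (a hedged sketch along the lines of the new test below;
the function name is made up), a shift count can be a negative signed
constant that must not be read with an unsigned accessor:

  /* The -1 below is a signed int INTEGER_CST in the IL; calling
     tree_to_uhwi on it would fail the tree_fits_uhwi_p assertion
     (in checking builds), so the matcher has to reject it instead.  */
  unsigned long long
  shift_by_negative (unsigned long long x)
  {
    return (x >> -1) & 0x5555555555555555ULL;
  }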

Another issue is that the transformation is, I believe, only valid for
16, 32 and 64-bit precision; if it were done in 24-bit or 62-bit precision,
the result wouldn't be a popcount.  For 8-bit precision it would likely not
match anyway, because the final shift by prec - 8 == 0 is folded away.
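
For reference, the 32-bit instance of the pattern being matched (the same
form quoted in the match.pd comment); the magic constants and the final
>> (prec - 8) only add up to a byte-wise popcount for these precisions:

  unsigned
  popcount32 (unsigned x)
  {
    x -= (x >> 1) & 0x55555555;
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x + (x >> 4)) & 0x0F0F0F0F;
    return (x * 0x01010101) >> 24;
  }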

Yet another issue is that the >> shift computations could very well shift
by negative amounts, e.g. for 128-bit precision, invoking UB in the compiler
itself before we actually check the precision later.  And I think we want
to use the HOST_WIDE_INT_UC macro instead of hardcoding ULL suffixes.
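
To make the fix concrete, a minimal standalone sketch (with a made-up
example precision) of what the new (64 - prec) & 63 computes: prec is
unsigned, so the subtraction wraps around and the mask keeps the shift
count in [0, 63]; the constants computed for prec > 64 are garbage, but
harmless, since the prec <= 64 check rejects the match afterwards.

  #include <stdio.h>

  int
  main (void)
  {
    unsigned prec = 128;                  /* e.g. 128-bit precision */
    int shift = (64 - prec) & 63;         /* wraps to 0 instead of -64 */
    unsigned long long c1 = 0x0101010101010101ULL >> shift;  /* well defined */
    printf ("%d %llx\n", shift, c1);      /* prints: 0 101010101010101 */
    return 0;
  }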

The formatting didn't match the match.pd formatting style either.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

One thing I haven't done anything about yet is that there is
FAIL: gcc.dg/tree-ssa/popcount4ll.c scan-tree-dump-times optimized ".POPCOUNT" 1
before/after this patch with -m32/-march=skylake-avx512.  That is because
the popcountll effective target tests that we don't emit a call for
__builtin_popcountll, which we indeed don't on ia32 skylake-avx512, but
direct_internal_fn_supported_p isn't true there - that is because we expand
the double-word popcount using 2 word popcounts + addition.  Shall the
match.pd case handle that too, by allowing the optimization even if there
is a type with half the precision for which direct_internal_fn_supported_p
is true?
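
For illustration, that double-word expansion is essentially the following
(a hand-written sketch of what the target does, not literal expander
output; the function name is made up):

  /* On ia32 with popcnt, __builtin_popcountll becomes two word
     popcounts plus an addition, so no library call is emitted even
     though IFN_POPCOUNT isn't directly supported for the 64-bit type.  */
  int
  popcountll_as_words (unsigned long long x)
  {
    return __builtin_popcount ((unsigned int) x)
	   + __builtin_popcount ((unsigned int) (x >> 32));
  }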

2019-12-31  Jakub Jelinek  <ja...@redhat.com>

        PR tree-optimization/93098
        * match.pd (popcount): For shift amounts, use integer_onep
        or wi::to_widest () == cst instead of tree_to_uhwi () == cst
        tests.  Make sure that precision is power of two larger than or equal
        to 16.  Ensure shift is never negative.  Use HOST_WIDE_INT_UC macro
        instead of ULL suffixed constants.  Formatting fixes.

        * gcc.c-torture/compile/pr93098.c: New test.

--- gcc/match.pd.jj     2019-12-09 11:12:48.351010794 +0100
+++ gcc/match.pd        2019-12-30 09:52:05.423499909 +0100
@@ -5786,43 +5786,50 @@ (define_operator_list COND_TERNARY
      return (x * 0x01010101) >> 24;
    }  */
 (simplify
-  (rshift
-    (mult
-      (bit_and
-       (plus:c
-         (rshift @8 INTEGER_CST@5)
-         (plus:c@8
-           (bit_and @6 INTEGER_CST@7)
-           (bit_and
-             (rshift
-               (minus@6
-                 @0
-                 (bit_and
-                   (rshift @0 INTEGER_CST@4)
-                   INTEGER_CST@11))
-             INTEGER_CST@10)
-           INTEGER_CST@9)))
-       INTEGER_CST@3)
-      INTEGER_CST@2)
-    INTEGER_CST@1)
+ (rshift
+  (mult
+   (bit_and
+    (plus:c
+     (rshift @8 INTEGER_CST@5)
+      (plus:c@8
+       (bit_and @6 INTEGER_CST@7)
+       (bit_and
+        (rshift
+         (minus@6 @0
+          (bit_and (rshift @0 INTEGER_CST@4) INTEGER_CST@11))
+         INTEGER_CST@10)
+        INTEGER_CST@9)))
+    INTEGER_CST@3)
+   INTEGER_CST@2)
+  INTEGER_CST@1)
   /* Check constants and optab.  */
-  (with
-     {
-       unsigned prec = TYPE_PRECISION (type);
-       int shift = 64 - prec;
-       const unsigned HOST_WIDE_INT c1 = 0x0101010101010101ULL >> shift,
-                                   c2 = 0x0F0F0F0F0F0F0F0FULL >> shift,
-                                   c3 = 0x3333333333333333ULL >> shift,
-                                   c4 = 0x5555555555555555ULL >> shift;
-     }
-    (if (prec <= 64 && TYPE_UNSIGNED (type) && tree_to_uhwi (@4) == 1
-          && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4
-          && tree_to_uhwi (@1) == prec - 8 && tree_to_uhwi (@2) == c1
-          && tree_to_uhwi (@3) == c2 && tree_to_uhwi (@9) == c3
-          && tree_to_uhwi (@7) == c3 && tree_to_uhwi (@11) == c4
-          && direct_internal_fn_supported_p (IFN_POPCOUNT, type,
-           OPTIMIZE_FOR_BOTH))
-        (convert (IFN_POPCOUNT:type @0)))))
+  (with { unsigned prec = TYPE_PRECISION (type);
+         int shift = (64 - prec) & 63;
+         unsigned HOST_WIDE_INT c1
+           = HOST_WIDE_INT_UC (0x0101010101010101) >> shift;
+         unsigned HOST_WIDE_INT c2
+           = HOST_WIDE_INT_UC (0x0F0F0F0F0F0F0F0F) >> shift;
+         unsigned HOST_WIDE_INT c3
+           = HOST_WIDE_INT_UC (0x3333333333333333) >> shift;
+         unsigned HOST_WIDE_INT c4
+           = HOST_WIDE_INT_UC (0x5555555555555555) >> shift;
+   }
+   (if (prec >= 16
+       && prec <= 64
+       && pow2p_hwi (prec)
+       && TYPE_UNSIGNED (type)
+       && integer_onep (@4)
+       && wi::to_widest (@10) == 2
+       && wi::to_widest (@5) == 4
+       && wi::to_widest (@1) == prec - 8
+       && tree_to_uhwi (@2) == c1
+       && tree_to_uhwi (@3) == c2
+       && tree_to_uhwi (@9) == c3
+       && tree_to_uhwi (@7) == c3
+       && tree_to_uhwi (@11) == c4
+       && direct_internal_fn_supported_p (IFN_POPCOUNT, type,
+                                          OPTIMIZE_FOR_BOTH))
+    (convert (IFN_POPCOUNT:type @0)))))
 #endif
 
 /* Simplify:
--- gcc/testsuite/gcc.c-torture/compile/pr93098.c.jj    2019-12-30 10:18:28.750401210 +0100
+++ gcc/testsuite/gcc.c-torture/compile/pr93098.c       2019-12-30 10:18:12.491648669 +0100
@@ -0,0 +1,37 @@
+/* PR tree-optimization/93098 */
+
+int
+foo (unsigned long long x)
+{
+  x -= (x >> -1) & 0x5555555555555555ULL;
+  x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
+  x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
+  return (x * 0x0101010101010101ULL) >> 56;
+}
+
+int
+bar (unsigned long long x)
+{
+  x -= (x >> 1) & 0x5555555555555555ULL;
+  x = (x & 0x3333333333333333ULL) + ((x >> -2) & 0x3333333333333333ULL);
+  x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
+  return (x * 0x0101010101010101ULL) >> 56;
+}
+
+int
+baz (unsigned long long x)
+{
+  x -= (x >> 1) & 0x5555555555555555ULL;
+  x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
+  x = (x + (x >> -4)) & 0x0f0f0f0f0f0f0f0fULL;
+  return (x * 0x0101010101010101ULL) >> 56;
+}
+
+int
+qux (unsigned long long x)
+{
+  x -= (x >> 1) & 0x5555555555555555ULL;
+  x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
+  x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
+  return (x * 0x0101010101010101ULL) >> -56;
+}

        Jakub
