On December 31, 2019 12:00:56 PM GMT+01:00, Jakub Jelinek <ja...@redhat.com> wrote:
>Hi!
>
>This patch fixes various issues in the popcount{64,32}c matcher.
>
>The first issue is that it blindly uses tree_to_uhwi on all the
>INTEGER_CST operands.  That is fine for those where we know their
>type, after the prec <= 64 && TYPE_UNSIGNED (type) verification, but
>shift counts can have different types and so we need to handle e.g.
>invalid shifts by negative amounts.
>
>Another issue is that the transformation is, I believe, only valid for
>16, 32 and 64-bit precision; e.g. if it were done in 24-bit or 62-bit
>precision, it wouldn't be a popcount.  For 8-bit, it would likely not
>match anyway, due to the final >> 0, etc.
>
>Yet another issue is that the >> shift computations could very well
>shift by negative amounts, e.g. for 128-bit precision, invoking UB in
>the compiler until we actually check the precision later.  And, I
>think we want to use the HOST_WIDE_INT_UC macro instead of hardcoding
>ULL suffixes.
>
>The formatting didn't match the match.pd formatting style either.
>
>Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Ok.
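For context, the 32-bit form of the idiom being matched is the sequence
already shown in the match.pd comment quoted in the hunk below; written
out as a standalone function it looks roughly like this (illustration
only, the function name is made up):

  /* 32-bit variant of the matched idiom.  The masks correspond to the
     pattern's c4, c3 and c2 constants, the multiplier to c1, and the
     final shift amount is prec - 8 = 24.  */
  unsigned int
  popcount32 (unsigned int x)
  {
    x -= (x >> 1) & 0x55555555;                      /* c4 */
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);  /* c3 */
    x = (x + (x >> 4)) & 0x0f0f0f0f;                 /* c2 */
    return (x * 0x01010101) >> 24;                   /* c1, prec - 8 */
  }

The precision restriction makes sense from this shape: the byte-wise
accumulation via the multiply only sums whole bytes, so it is a real
popcount only for byte-multiple, power-of-two precisions.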
>One thing I haven't done anything about yet is that there is
>FAIL: gcc.dg/tree-ssa/popcount4ll.c scan-tree-dump-times optimized ".POPCOUNT" 1
>before/after this patch with -m32/-march=skylake-avx512.  That is
>because the popcountll effective target tests that we don't emit a
>call for __builtin_popcountll, which we don't on ia32 skylake-avx512,
>but direct_internal_fn_supported_p isn't true - that is because we
>expand the double-word popcount using 2 word popcounts + addition.
>Shall the match.pd case handle that case too, by allowing the
>optimization even if there is a type with half the precision for which
>direct_internal_fn_supported_p is true?

You mean emitting a single builtin call, or an add of two IFNs?
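To make the second alternative concrete: the ia32 expansion being
discussed does the double-word popcount as two word popcounts plus an
addition, roughly like the following (an illustrative sketch with a
made-up function name, not the expander's actual code):

  /* 64-bit popcount on a 32-bit target, done as two 32-bit word
     popcounts plus an addition.  */
  static int
  popcountll_via_words (unsigned long long x)
  {
    return __builtin_popcount ((unsigned int) x)
	   + __builtin_popcount ((unsigned int) (x >> 32));
  }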
>2019-12-31  Jakub Jelinek  <ja...@redhat.com>
>
>	PR tree-optimization/93098
>	* match.pd (popcount): For shift amounts, use integer_onep
>	or wi::to_widest () == cst instead of tree_to_uhwi () == cst
>	tests.  Make sure that precision is power of two larger than or
>	equal to 16.  Ensure shift is never negative.  Use
>	HOST_WIDE_INT_UC macro instead of ULL suffixed constants.
>	Formatting fixes.
>
>	* gcc.c-torture/compile/pr93098.c: New test.
>
>--- gcc/match.pd.jj	2019-12-09 11:12:48.351010794 +0100
>+++ gcc/match.pd	2019-12-30 09:52:05.423499909 +0100
>@@ -5786,43 +5786,50 @@ (define_operator_list COND_TERNARY
>     return (x * 0x01010101) >> 24;
>   } */
> (simplify
>- (rshift
>-  (mult
>-   (bit_and
>-    (plus:c
>-     (rshift @8 INTEGER_CST@5)
>-     (plus:c@8
>-      (bit_and @6 INTEGER_CST@7)
>-      (bit_and
>-       (rshift
>-        (minus@6
>-         @0
>-         (bit_and
>-          (rshift @0 INTEGER_CST@4)
>-          INTEGER_CST@11))
>-        INTEGER_CST@10)
>-       INTEGER_CST@9)))
>-    INTEGER_CST@3)
>-   INTEGER_CST@2)
>-  INTEGER_CST@1)
>+ (rshift
>+  (mult
>+   (bit_and
>+    (plus:c
>+     (rshift @8 INTEGER_CST@5)
>+     (plus:c@8
>+      (bit_and @6 INTEGER_CST@7)
>+      (bit_and
>+       (rshift
>+        (minus@6 @0
>+         (bit_and (rshift @0 INTEGER_CST@4) INTEGER_CST@11))
>+        INTEGER_CST@10)
>+       INTEGER_CST@9)))
>+    INTEGER_CST@3)
>+   INTEGER_CST@2)
>+  INTEGER_CST@1)
>  /* Check constants and optab.  */
>- (with
>-  {
>-   unsigned prec = TYPE_PRECISION (type);
>-   int shift = 64 - prec;
>-   const unsigned HOST_WIDE_INT c1 = 0x0101010101010101ULL >> shift,
>-                                 c2 = 0x0F0F0F0F0F0F0F0FULL >> shift,
>-                                 c3 = 0x3333333333333333ULL >> shift,
>-                                 c4 = 0x5555555555555555ULL >> shift;
>-  }
>-  (if (prec <= 64 && TYPE_UNSIGNED (type) && tree_to_uhwi (@4) == 1
>-      && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4
>-      && tree_to_uhwi (@1) == prec - 8 && tree_to_uhwi (@2) == c1
>-      && tree_to_uhwi (@3) == c2 && tree_to_uhwi (@9) == c3
>-      && tree_to_uhwi (@7) == c3 && tree_to_uhwi (@11) == c4
>-      && direct_internal_fn_supported_p (IFN_POPCOUNT, type,
>-                                         OPTIMIZE_FOR_BOTH))
>-   (convert (IFN_POPCOUNT:type @0)))))
>+ (with { unsigned prec = TYPE_PRECISION (type);
>+         int shift = (64 - prec) & 63;
>+         unsigned HOST_WIDE_INT c1
>+           = HOST_WIDE_INT_UC (0x0101010101010101) >> shift;
>+         unsigned HOST_WIDE_INT c2
>+           = HOST_WIDE_INT_UC (0x0F0F0F0F0F0F0F0F) >> shift;
>+         unsigned HOST_WIDE_INT c3
>+           = HOST_WIDE_INT_UC (0x3333333333333333) >> shift;
>+         unsigned HOST_WIDE_INT c4
>+           = HOST_WIDE_INT_UC (0x5555555555555555) >> shift;
>+       }
>+  (if (prec >= 16
>+       && prec <= 64
>+       && pow2p_hwi (prec)
>+       && TYPE_UNSIGNED (type)
>+       && integer_onep (@4)
>+       && wi::to_widest (@10) == 2
>+       && wi::to_widest (@5) == 4
>+       && wi::to_widest (@1) == prec - 8
>+       && tree_to_uhwi (@2) == c1
>+       && tree_to_uhwi (@3) == c2
>+       && tree_to_uhwi (@9) == c3
>+       && tree_to_uhwi (@7) == c3
>+       && tree_to_uhwi (@11) == c4
>+       && direct_internal_fn_supported_p (IFN_POPCOUNT, type,
>+                                          OPTIMIZE_FOR_BOTH))
>+   (convert (IFN_POPCOUNT:type @0)))))
> #endif
>
> /* Simplify:
>--- gcc/testsuite/gcc.c-torture/compile/pr93098.c.jj	2019-12-30 10:18:28.750401210 +0100
>+++ gcc/testsuite/gcc.c-torture/compile/pr93098.c	2019-12-30 10:18:12.491648669 +0100
>@@ -0,0 +1,37 @@
>+/* PR tree-optimization/93098 */
>+
>+int
>+foo (unsigned long long x)
>+{
>+  x -= (x >> -1) & 0x5555555555555555ULL;
>+  x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
>+  x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
>+  return (x * 0x0101010101010101ULL) >> 56;
>+}
>+
>+int
>+bar (unsigned long long x)
>+{
>+  x -= (x >> 1) & 0x5555555555555555ULL;
>+  x = (x & 0x3333333333333333ULL) + ((x >> -2) & 0x3333333333333333ULL);
>+  x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
>+  return (x * 0x0101010101010101ULL) >> 56;
>+}
>+
>+int
>+baz (unsigned long long x)
>+{
>+  x -= (x >> 1) & 0x5555555555555555ULL;
>+  x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
>+  x = (x + (x >> -4)) & 0x0f0f0f0f0f0f0f0fULL;
>+  return (x * 0x0101010101010101ULL) >> 56;
>+}
>+
>+int
>+qux (unsigned long long x)
>+{
>+  x -= (x >> 1) & 0x5555555555555555ULL;
>+  x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
>+  x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
>+  return (x * 0x0101010101010101ULL) >> -56;
>+}
>
>	Jakub
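One closing note on the new shift computation: masking with & 63 keeps
the shift amount in [0, 63] even when prec > 64, where the unmasked
64 - prec would be negative and the >> would be UB in the compiler; the
prec <= 64 test later rejects such precisions anyway.  A small
standalone sketch (illustration only, not code from the patch):

  #include <stdio.h>

  /* Mirror the patch's (64 - prec) & 63 computation and show it stays
     well defined for prec == 128, where the unmasked count would be
     negative.  */
  static unsigned long long
  masked_c4 (unsigned prec)
  {
    int shift = (64 - prec) & 63;	/* always in [0, 63] */
    return 0x5555555555555555ULL >> shift;
  }

  int
  main (void)
  {
    printf ("%016llx\n", masked_c4 (32));	/* 0000000055555555 */
    printf ("%016llx\n", masked_c4 (64));	/* 5555555555555555 */
    printf ("%016llx\n", masked_c4 (128));	/* shift is 0; the matcher
						   discards the value via
						   the prec <= 64 check */
    return 0;
  }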