Add simplifications for popcount (x) > 1 to (x & (x-1)) != 0 and
popcount (x) == 1 into (x-1) <u (x & -x).  These trigger only for
single-use cases and support an optional convert.  A microbenchmark
shows a speedup of 2-2.5x on both x64 and AArch64.

Bootstrap OK, OK for commit?

ChangeLog:
2019-08-13  Wilco Dijkstra  <wdijk...@arm.com>

gcc/
        PR middle-end/90693
        * match.pd: Add popcount simplifications.

testsuite/
        PR middle-end/90693
        * gcc.dg/fold-popcount-5.c: Add new test.

---

diff --git a/gcc/match.pd b/gcc/match.pd
index 
0317bc704f771f626ab72189b3a54de00087ad5a..bf4351a330f45f3a1424d9792cefc3da6267597d
 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5356,7 +5356,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
        rep (eq eq ne ne)
     (simplify
       (cmp (popcount @0) integer_zerop)
-      (rep @0 { build_zero_cst (TREE_TYPE (@0)); }))))
+      (rep @0 { build_zero_cst (TREE_TYPE (@0)); })))
+  /* popcount(X) == 1 -> (X-1) <u (X & -X).  */
+  (for cmp (eq ne)
+       rep (lt ge)
+    (simplify
+      (cmp (convert? (popcount:s @0)) integer_onep)
+      (with {
+             tree utype = unsigned_type_for (TREE_TYPE (@0));
+             tree a0 = fold_convert (utype, @0); }
+       (rep (plus { a0; } { build_minus_one_cst (utype); })
+            (bit_and (negate { a0; }) { a0; })))))
+  /* popcount(X) > 1 -> (X & (X-1)) != 0.  */
+  (for cmp (gt le)
+       rep (ne eq)
+    (simplify
+      (cmp (convert? (popcount:s @0)) integer_onep)
+      (rep (bit_and (plus @0 { build_minus_one_cst (TREE_TYPE (@0)); }) @0)
+          { build_zero_cst (TREE_TYPE (@0)); }))))
 
 /* Simplify:
 
diff --git a/gcc/testsuite/gcc.dg/fold-popcount-5.c 
b/gcc/testsuite/gcc.dg/fold-popcount-5.c
new file mode 100644
index 
0000000000000000000000000000000000000000..fcf3910587caacb8e39cf437dc3971df892f405a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-popcount-5.c
@@ -0,0 +1,69 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+/* Test popcount (x) > 1 -> (x & (x-1)) != 0.  */
+
+int test_1 (long x)
+{
+  return __builtin_popcountl (x) >= 2;
+}
+
+int test_2 (int x)
+{
+  return (unsigned) __builtin_popcount (x) <= 1u;
+}
+
+int test_3 (unsigned x)
+{
+  return __builtin_popcount (x) > 1u;
+}
+
+int test_4 (unsigned long x)
+{
+  return (unsigned char) __builtin_popcountl (x) > 1;
+}
+
+int test_5 (unsigned long x)
+{
+  return (signed char) __builtin_popcountl (x) <= (signed char)1;
+}
+
+int test_6 (unsigned long long x)
+{
+  return 2u <= __builtin_popcountll (x);
+}
+
+/* Test popcount (x) == 1 -> (x-1) <u (x & -x).  */
+
+int test_7 (unsigned long x)
+{
+  return __builtin_popcountl (x) != 1;
+}
+
+int test_8 (long long x)
+{
+  return (unsigned) __builtin_popcountll (x) == 1u;
+}
+
+int test_9 (int x)
+{
+  return (unsigned char) __builtin_popcount (x) != 1u;
+}
+
+int test_10 (unsigned x)
+{
+  return (unsigned char) __builtin_popcount (x) == 1;
+}
+
+int test_11 (long x)
+{
+  return (signed char) __builtin_popcountl (x) == 1;
+}
+
+int test_12 (long x)
+{
+  return 1u == __builtin_popcountl (x);
+}
+
+/* { dg-final { scan-tree-dump-times "popcount" 0 "optimized" } } */
+

Reply via email to