https://gcc.gnu.org/g:c6eb92973ea308e248ce23927a9ac58ef81ee7a2

commit r16-1201-gc6eb92973ea308e248ce23927a9ac58ef81ee7a2
Author: Richard Biener <rguent...@suse.de>
Date:   Wed May 28 15:09:19 2025 +0200

    tree-optimization/120032 - matching of table based CLZ
    
    The following adds the ability to match a table based CLZ implementation
    similar as to how we can do for CTZ.  I'm re-using the workers for
    matching up array and string tables by using a lambda and templates
    and kept the transform step for CLZ/CTZ inter-mangled.
    
            PR tree-optimization/120032
            * match.pd (clz_table_index): New match.
            * tree-ssa-forwprop.cc (check_table_array): Rename from
            check_ctz_array.  Split out actual verification to a functor.
            (check_table_string): Rename from check_ctz_string and likewise.
            (check_table): Rename from check_ctz_table and adjust.
            (gimple_clz_table_index): Declare.
            (simplify_count_zeroes): Rename from simplify_count_trailing_zeroes.
            Extend to cover CLZ.
            (pass_forwprop::execute): Adjust.
    
            * gcc.target/i386/pr120032-1.c: New testcase.
            * gcc.target/i386/pr120032-2.c: Likewise.

Diff:
---
 gcc/match.pd                               |  48 +++++++++++-
 gcc/testsuite/gcc.target/i386/pr120032-1.c |  22 ++++++
 gcc/testsuite/gcc.target/i386/pr120032-2.c |  22 ++++++
 gcc/tree-ssa-forwprop.cc                   | 121 +++++++++++++++++++++--------
 4 files changed, 181 insertions(+), 32 deletions(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index 656572423aa7..0f53c162fce3 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -11339,7 +11339,7 @@ and,
          (vec_perm @2 @5 { op0; })))))))))))
 
 
-/* Match count trailing zeroes for simplify_count_trailing_zeroes in fwprop.
+/* Match count trailing zeroes for simplify_count_zeroes in forwprop.
    The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic
    constant which when multiplied by a power of 2 contains a unique value
    in the top 5 or 6 bits.  This is then indexed into a table which maps it
@@ -11347,6 +11347,52 @@ and,
 (match (ctz_table_index @1 @2 @3)
   (rshift (mult (bit_and:c (negate @1) @1) INTEGER_CST@2) INTEGER_CST@3))
 
+/* Match count leading zeros for simplify_count_zeroes in forwprop.
+   One canonical form is 31 - array[idx] where IDX is computed from X
+   by first setting all bits from the topmost set bits down via a
+   series of shifts and ors to X' and then computing (X' * C) >> SHIFT.  */
+(match (clz_table_index @1 @2 @3)
+  (rshift (mult
+   (bit_ior (rshift
+     (bit_ior@d (rshift
+       (bit_ior@c (rshift
+         (bit_ior@b (rshift
+          (bit_ior@a (rshift @1 INTEGER_CST@c1) @1)
+          INTEGER_CST@c2) @a)
+        INTEGER_CST@c4) @b)
+       INTEGER_CST@c8) @c)
+     INTEGER_CST@c16) @d) INTEGER_CST@2) INTEGER_CST@3)
+  (if (INTEGRAL_TYPE_P (type)
+       && TYPE_UNSIGNED (type)
+       && TYPE_PRECISION (type) == 32
+       && compare_tree_int (@c1, 1) == 0
+       && compare_tree_int (@c2, 2) == 0
+       && compare_tree_int (@c4, 4) == 0
+       && compare_tree_int (@c8, 8) == 0
+       && compare_tree_int (@c16, 16) == 0)))
+(match (clz_table_index @1 @2 @3)
+  (rshift (mult
+   (bit_ior (rshift
+     (bit_ior@e (rshift
+       (bit_ior@d (rshift
+         (bit_ior@c (rshift
+           (bit_ior@b (rshift
+            (bit_ior@a (rshift @1 INTEGER_CST@c1) @1)
+            INTEGER_CST@c2) @a)
+          INTEGER_CST@c4) @b)
+         INTEGER_CST@c8) @c)
+       INTEGER_CST@c16) @d)
+     INTEGER_CST@c32) @e) INTEGER_CST@2) INTEGER_CST@3)
+  (if (INTEGRAL_TYPE_P (type)
+       && TYPE_UNSIGNED (type)
+       && TYPE_PRECISION (type) == 64
+       && compare_tree_int (@c1, 1) == 0
+       && compare_tree_int (@c2, 2) == 0
+       && compare_tree_int (@c4, 4) == 0
+       && compare_tree_int (@c8, 8) == 0
+       && compare_tree_int (@c16, 16) == 0
+       && compare_tree_int (@c32, 32) == 0)))
+
 /* Floatint point/integer comparison and integer->integer
    or floating point -> float point conversion.  */
 (match (cond_expr_convert_p @0 @2 @3 @6)
diff --git a/gcc/testsuite/gcc.target/i386/pr120032-1.c 
b/gcc/testsuite/gcc.target/i386/pr120032-1.c
new file mode 100644
index 000000000000..c51512492b6f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr120032-1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mlzcnt" } */
+
+unsigned int
+ZSTD_countLeadingZeros32_fallback(unsigned int val)
+{
+  static const unsigned int DeBruijnClz[32]
+    = { 0, 9, 1, 10, 13, 21, 2, 29,
+       11, 14, 16, 18, 22, 25, 3, 30,
+       8, 12, 20, 28, 15, 17, 24, 7,
+       19, 27, 23, 6, 26, 5, 4, 31};
+  if (val == 0)
+    __builtin_abort ();
+  val |= val >> 1;
+  val |= val >> 2;
+  val |= val >> 4;
+  val |= val >> 8;
+  val |= val >> 16;
+  return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27];
+}
+
+/* { dg-final { scan-assembler "lzcnt" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr120032-2.c 
b/gcc/testsuite/gcc.target/i386/pr120032-2.c
new file mode 100644
index 000000000000..ea2ad4086189
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr120032-2.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mlzcnt" } */
+
+unsigned int
+ZSTD_countLeadingZeros32_fallback(unsigned int val)
+{
+  static const unsigned char DeBruijnClz[32]
+    = { 0, 9, 1, 10, 13, 21, 2, 29,
+       11, 14, 16, 18, 22, 25, 3, 30,
+       8, 12, 20, 28, 15, 17, 24, 7,
+       19, 27, 23, 6, 26, 5, 4, 31};
+  if (val == 0)
+    __builtin_abort ();
+  val |= val >> 1;
+  val |= val >> 2;
+  val |= val >> 4;
+  val |= val >> 8;
+  val |= val >> 16;
+  return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27];
+}
+
+/* { dg-final { scan-assembler "lzcnt" } } */
diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index a60862a4b1a9..0c2b10e92aa4 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -2508,17 +2508,16 @@ simplify_rotate (gimple_stmt_iterator *gsi)
 }
 
 
-/* Check whether an array contains a valid ctz table.  */
+/* Check whether an array contains a valid table according to VALIDATE_FN.  */
+template<typename ValidateFn>
 static bool
-check_ctz_array (tree ctor, unsigned HOST_WIDE_INT mulc,
-                HOST_WIDE_INT &zero_val, unsigned shift, unsigned bits)
+check_table_array (tree ctor, HOST_WIDE_INT &zero_val, unsigned bits,
+                 ValidateFn validate_fn)
 {
   tree elt, idx;
-  unsigned HOST_WIDE_INT i, mask, raw_idx = 0;
+  unsigned HOST_WIDE_INT i, raw_idx = 0;
   unsigned matched = 0;
 
-  mask = ((HOST_WIDE_INT_1U << (bits - shift)) - 1) << shift;
-
   zero_val = 0;
 
   FOR_EACH_CONSTRUCTOR_ELT (CONSTRUCTOR_ELTS (ctor), i, idx, elt)
@@ -2558,7 +2557,7 @@ check_ctz_array (tree ctor, unsigned HOST_WIDE_INT mulc,
          matched++;
        }
 
-      if (val >= 0 && val < bits && (((mulc << val) & mask) >> shift) == index)
+      if (val >= 0 && val < bits && validate_fn (val, index))
        matched++;
 
       if (matched > bits)
@@ -2568,47 +2567,47 @@ check_ctz_array (tree ctor, unsigned HOST_WIDE_INT mulc,
   return false;
 }
 
-/* Check whether a string contains a valid ctz table.  */
+/* Check whether a string contains a valid table according to VALIDATE_FN.  */
+template<typename ValidateFn>
 static bool
-check_ctz_string (tree string, unsigned HOST_WIDE_INT mulc,
-                 HOST_WIDE_INT &zero_val, unsigned shift, unsigned bits)
+check_table_string (tree string, HOST_WIDE_INT &zero_val,unsigned bits,
+                   ValidateFn validate_fn)
 {
   unsigned HOST_WIDE_INT len = TREE_STRING_LENGTH (string);
-  unsigned HOST_WIDE_INT mask;
   unsigned matched = 0;
   const unsigned char *p = (const unsigned char *) TREE_STRING_POINTER 
(string);
 
   if (len < bits || len > bits * 2)
     return false;
 
-  mask = ((HOST_WIDE_INT_1U << (bits - shift)) - 1) << shift;
-
   zero_val = p[0];
 
   for (unsigned i = 0; i < len; i++)
-    if (p[i] < bits && (((mulc << p[i]) & mask) >> shift) == i)
+    if (p[i] < bits && validate_fn (p[i], i))
       matched++;
 
   return matched == bits;
 }
 
-/* Check whether CTOR contains a valid ctz table.  */
+/* Check whether CTOR contains a valid table according to VALIDATE_FN.  */
+template<typename ValidateFn>
 static bool
-check_ctz_table (tree ctor, tree type, unsigned HOST_WIDE_INT mulc,
-                HOST_WIDE_INT &zero_val, unsigned shift, unsigned bits)
+check_table (tree ctor, tree type, HOST_WIDE_INT &zero_val, unsigned bits,
+            ValidateFn validate_fn)
 {
   if (TREE_CODE (ctor) == CONSTRUCTOR)
-    return check_ctz_array (ctor, mulc, zero_val, shift, bits);
+    return check_table_array (ctor, zero_val, bits, validate_fn);
   else if (TREE_CODE (ctor) == STRING_CST
           && TYPE_PRECISION (type) == CHAR_TYPE_SIZE)
-    return check_ctz_string (ctor, mulc, zero_val, shift, bits);
+    return check_table_string (ctor, zero_val, bits, validate_fn);
   return false;
 }
 
 /* Match.pd function to match the ctz expression.  */
 extern bool gimple_ctz_table_index (tree, tree *, tree (*)(tree));
+extern bool gimple_clz_table_index (tree, tree *, tree (*)(tree));
 
-/* Recognize count trailing zeroes idiom.
+/* Recognize count leading and trailing zeroes idioms.
    The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic
    constant which when multiplied by a power of 2 creates a unique value
    in the top 5 or 6 bits.  This is then indexed into a table which maps it
@@ -2617,7 +2616,7 @@ extern bool gimple_ctz_table_index (tree, tree *, tree 
(*)(tree));
    the target.  */
 
 static bool
-simplify_count_trailing_zeroes (gimple_stmt_iterator *gsi)
+simplify_count_zeroes (gimple_stmt_iterator *gsi)
 {
   gimple *stmt = gsi_stmt (*gsi);
   tree array_ref = gimple_assign_rhs1 (stmt);
@@ -2625,7 +2624,23 @@ simplify_count_trailing_zeroes (gimple_stmt_iterator 
*gsi)
 
   gcc_checking_assert (TREE_CODE (array_ref) == ARRAY_REF);
 
-  if (!gimple_ctz_table_index (TREE_OPERAND (array_ref, 1), &res_ops[0], NULL))
+  internal_fn fn = IFN_LAST;
+  /* For CTZ we recognize ((x & -x) * C) >> SHIFT where the array data
+     represents the number of trailing zeros.  */
+  if (gimple_ctz_table_index (TREE_OPERAND (array_ref, 1), &res_ops[0], NULL))
+    fn = IFN_CTZ;
+  /* For CLZ we recognize
+       x |= x >> 1;
+       x |= x >> 2;
+       x |= x >> 4;
+       x |= x >> 8;
+       x |= x >> 16;
+       (x * C) >> SHIFT
+     where 31 minus the array data represents the number of leading zeros.  */
+  else if (gimple_clz_table_index (TREE_OPERAND (array_ref, 1), &res_ops[0],
+                                  NULL))
+    fn = IFN_CLZ;
+  else
     return false;
 
   HOST_WIDE_INT zero_val;
@@ -2641,7 +2656,7 @@ simplify_count_trailing_zeroes (gimple_stmt_iterator *gsi)
   if (input_bits != 32 && input_bits != 64)
     return false;
 
-  if (!direct_internal_fn_supported_p (IFN_CTZ, input_type, OPTIMIZE_FOR_BOTH))
+  if (!direct_internal_fn_supported_p (fn, input_type, OPTIMIZE_FOR_BOTH))
     return false;
 
   /* Check the lower bound of the array is zero.  */
@@ -2657,13 +2672,46 @@ simplify_count_trailing_zeroes (gimple_stmt_iterator 
*gsi)
   tree ctor = ctor_for_folding (array);
   if (!ctor)
     return false;
-  unsigned HOST_WIDE_INT val = tree_to_uhwi (res_ops[1]);
-  if (!check_ctz_table (ctor, type, val, zero_val, shiftval, input_bits))
-    return false;
+  unsigned HOST_WIDE_INT mulval = tree_to_uhwi (res_ops[1]);
+  if (fn == IFN_CTZ)
+    {
+      auto checkfn = [&](unsigned data, unsigned i) -> bool
+       {
+         unsigned HOST_WIDE_INT mask
+           = ((HOST_WIDE_INT_1U << (input_bits - shiftval)) - 1) << shiftval;
+         return (((mulval << data) & mask) >> shiftval) == i;
+       };
+      if (!check_table (ctor, type, zero_val, input_bits, checkfn))
+       return false;
+    }
+  else if (fn == IFN_CLZ)
+    {
+      auto checkfn = [&](unsigned data, unsigned i) -> bool
+       {
+         unsigned HOST_WIDE_INT mask
+           = ((HOST_WIDE_INT_1U << (input_bits - shiftval)) - 1) << shiftval;
+         return (((((HOST_WIDE_INT_1U << (data + 1)) - 1) * mulval) & mask)
+                 >> shiftval) == i;
+       };
+    if (!check_table (ctor, type, zero_val, input_bits, checkfn))
+      return false;
+    }
 
-  HOST_WIDE_INT ctz_val = 0;
-  bool zero_ok = CTZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (input_type),
-                                           ctz_val) == 2;
+  HOST_WIDE_INT ctz_val = -1;
+  bool zero_ok;
+  if (fn == IFN_CTZ)
+    {
+      ctz_val = 0;
+      zero_ok = CTZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (input_type),
+                                          ctz_val) == 2;
+    }
+  else if (fn == IFN_CLZ)
+    {
+      ctz_val = 32;
+      zero_ok = CLZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (input_type),
+                                          ctz_val) == 2;
+      zero_val = input_bits - 1 - zero_val;
+    }
   int nargs = 2;
 
   /* If the input value can't be zero, don't special case ctz (0).  */
@@ -2689,7 +2737,7 @@ simplify_count_trailing_zeroes (gimple_stmt_iterator *gsi)
 
   gimple_seq seq = NULL;
   gimple *g;
-  gcall *call = gimple_build_call_internal (IFN_CTZ, nargs, res_ops[0],
+  gcall *call = gimple_build_call_internal (fn, nargs, res_ops[0],
                                            nargs == 1 ? NULL_TREE
                                            : build_int_cst (integer_type_node,
                                                             ctz_val));
@@ -2698,6 +2746,17 @@ simplify_count_trailing_zeroes (gimple_stmt_iterator 
*gsi)
   gimple_seq_add_stmt (&seq, call);
 
   tree prev_lhs = gimple_call_lhs (call);
+  if (fn == IFN_CLZ)
+    {
+      g = gimple_build_assign (make_ssa_name (integer_type_node),
+                              MINUS_EXPR,
+                              build_int_cst (integer_type_node,
+                                             input_bits - 1),
+                              prev_lhs);
+      gimple_set_location (g, gimple_location (stmt));
+      gimple_seq_add_stmt (&seq, g);
+      prev_lhs = gimple_assign_lhs (g);
+    }
 
   /* Emit ctz (x) & 31 if ctz (0) is 32 but we need to return 0.  */
   if (zero_val == 0 && ctz_val == input_bits)
@@ -4829,7 +4888,7 @@ pass_forwprop::execute (function *fun)
                             && TREE_CODE (TREE_TYPE (rhs1)) == VECTOR_TYPE)
                      changed |= simplify_vector_constructor (&gsi);
                    else if (code == ARRAY_REF)
-                     changed |= simplify_count_trailing_zeroes (&gsi);
+                     changed |= simplify_count_zeroes (&gsi);
                    break;
                  }

Reply via email to