https://gcc.gnu.org/g:c6eb92973ea308e248ce23927a9ac58ef81ee7a2
commit r16-1201-gc6eb92973ea308e248ce23927a9ac58ef81ee7a2 Author: Richard Biener <rguent...@suse.de> Date: Wed May 28 15:09:19 2025 +0200 tree-optimization/120032 - matching of table based CLZ The following adds the ability to match a table based CLZ implementation similar as to how we can do for CTZ. I'm re-using the workers for matching up array and string tables by using a lambda and templates and kept the transform step for CLZ/CTZ inter-mangled. PR tree-optimization/120032 * match.pd (clz_table_index): New match. * tree-ssa-forwprop.cc (check_table_array): Rename from check_ctz_array. Split out actual verification to a functor. (check_table_string): Rename from check_ctz_string and likewise. (check_table): Rename from check_ctz_table and adjust. (gimple_clz_table_index): Declare. (simplify_count_zeroes): Rename from simplify_count_trailing_zeroes. Extend to cover CLZ. (pass_forwprop::execute): Adjust. * gcc.target/i386/pr120032-1.c: New testcase. * gcc.target/i386/pr120032-2.c: Likewise. Diff: --- gcc/match.pd | 48 +++++++++++- gcc/testsuite/gcc.target/i386/pr120032-1.c | 22 ++++++ gcc/testsuite/gcc.target/i386/pr120032-2.c | 22 ++++++ gcc/tree-ssa-forwprop.cc | 121 +++++++++++++++++++++-------- 4 files changed, 181 insertions(+), 32 deletions(-) diff --git a/gcc/match.pd b/gcc/match.pd index 656572423aa7..0f53c162fce3 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -11339,7 +11339,7 @@ and, (vec_perm @2 @5 { op0; }))))))))))) -/* Match count trailing zeroes for simplify_count_trailing_zeroes in fwprop. +/* Match count trailing zeroes for simplify_count_zeroes in forwprop. The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic constant which when multiplied by a power of 2 contains a unique value in the top 5 or 6 bits. This is then indexed into a table which maps it @@ -11347,6 +11347,52 @@ and, (match (ctz_table_index @1 @2 @3) (rshift (mult (bit_and:c (negate @1) @1) INTEGER_CST@2) INTEGER_CST@3)) +/* Match count leading zeros for simplify_count_zeroes in forwprop. + One canonical form is 31 - array[idx] where IDX is computed from X + by first setting all bits from the topmost set bits down via a + series of shifts and ors to X' and then computing (X' * C) >> SHIFT. */ +(match (clz_table_index @1 @2 @3) + (rshift (mult + (bit_ior (rshift + (bit_ior@d (rshift + (bit_ior@c (rshift + (bit_ior@b (rshift + (bit_ior@a (rshift @1 INTEGER_CST@c1) @1) + INTEGER_CST@c2) @a) + INTEGER_CST@c4) @b) + INTEGER_CST@c8) @c) + INTEGER_CST@c16) @d) INTEGER_CST@2) INTEGER_CST@3) + (if (INTEGRAL_TYPE_P (type) + && TYPE_UNSIGNED (type) + && TYPE_PRECISION (type) == 32 + && compare_tree_int (@c1, 1) == 0 + && compare_tree_int (@c2, 2) == 0 + && compare_tree_int (@c4, 4) == 0 + && compare_tree_int (@c8, 8) == 0 + && compare_tree_int (@c16, 16) == 0))) +(match (clz_table_index @1 @2 @3) + (rshift (mult + (bit_ior (rshift + (bit_ior@e (rshift + (bit_ior@d (rshift + (bit_ior@c (rshift + (bit_ior@b (rshift + (bit_ior@a (rshift @1 INTEGER_CST@c1) @1) + INTEGER_CST@c2) @a) + INTEGER_CST@c4) @b) + INTEGER_CST@c8) @c) + INTEGER_CST@c16) @d) + INTEGER_CST@c32) @e) INTEGER_CST@2) INTEGER_CST@3) + (if (INTEGRAL_TYPE_P (type) + && TYPE_UNSIGNED (type) + && TYPE_PRECISION (type) == 64 + && compare_tree_int (@c1, 1) == 0 + && compare_tree_int (@c2, 2) == 0 + && compare_tree_int (@c4, 4) == 0 + && compare_tree_int (@c8, 8) == 0 + && compare_tree_int (@c16, 16) == 0 + && compare_tree_int (@c32, 32) == 0))) + /* Floatint point/integer comparison and integer->integer or floating point -> float point conversion. */ (match (cond_expr_convert_p @0 @2 @3 @6) diff --git a/gcc/testsuite/gcc.target/i386/pr120032-1.c b/gcc/testsuite/gcc.target/i386/pr120032-1.c new file mode 100644 index 000000000000..c51512492b6f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr120032-1.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mlzcnt" } */ + +unsigned int +ZSTD_countLeadingZeros32_fallback(unsigned int val) +{ + static const unsigned int DeBruijnClz[32] + = { 0, 9, 1, 10, 13, 21, 2, 29, + 11, 14, 16, 18, 22, 25, 3, 30, + 8, 12, 20, 28, 15, 17, 24, 7, + 19, 27, 23, 6, 26, 5, 4, 31}; + if (val == 0) + __builtin_abort (); + val |= val >> 1; + val |= val >> 2; + val |= val >> 4; + val |= val >> 8; + val |= val >> 16; + return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; +} + +/* { dg-final { scan-assembler "lzcnt" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr120032-2.c b/gcc/testsuite/gcc.target/i386/pr120032-2.c new file mode 100644 index 000000000000..ea2ad4086189 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr120032-2.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mlzcnt" } */ + +unsigned int +ZSTD_countLeadingZeros32_fallback(unsigned int val) +{ + static const unsigned char DeBruijnClz[32] + = { 0, 9, 1, 10, 13, 21, 2, 29, + 11, 14, 16, 18, 22, 25, 3, 30, + 8, 12, 20, 28, 15, 17, 24, 7, + 19, 27, 23, 6, 26, 5, 4, 31}; + if (val == 0) + __builtin_abort (); + val |= val >> 1; + val |= val >> 2; + val |= val >> 4; + val |= val >> 8; + val |= val >> 16; + return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; +} + +/* { dg-final { scan-assembler "lzcnt" } } */ diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc index a60862a4b1a9..0c2b10e92aa4 100644 --- a/gcc/tree-ssa-forwprop.cc +++ b/gcc/tree-ssa-forwprop.cc @@ -2508,17 +2508,16 @@ simplify_rotate (gimple_stmt_iterator *gsi) } -/* Check whether an array contains a valid ctz table. */ +/* Check whether an array contains a valid table according to VALIDATE_FN. */ +template<typename ValidateFn> static bool -check_ctz_array (tree ctor, unsigned HOST_WIDE_INT mulc, - HOST_WIDE_INT &zero_val, unsigned shift, unsigned bits) +check_table_array (tree ctor, HOST_WIDE_INT &zero_val, unsigned bits, + ValidateFn validate_fn) { tree elt, idx; - unsigned HOST_WIDE_INT i, mask, raw_idx = 0; + unsigned HOST_WIDE_INT i, raw_idx = 0; unsigned matched = 0; - mask = ((HOST_WIDE_INT_1U << (bits - shift)) - 1) << shift; - zero_val = 0; FOR_EACH_CONSTRUCTOR_ELT (CONSTRUCTOR_ELTS (ctor), i, idx, elt) @@ -2558,7 +2557,7 @@ check_ctz_array (tree ctor, unsigned HOST_WIDE_INT mulc, matched++; } - if (val >= 0 && val < bits && (((mulc << val) & mask) >> shift) == index) + if (val >= 0 && val < bits && validate_fn (val, index)) matched++; if (matched > bits) @@ -2568,47 +2567,47 @@ check_ctz_array (tree ctor, unsigned HOST_WIDE_INT mulc, return false; } -/* Check whether a string contains a valid ctz table. */ +/* Check whether a string contains a valid table according to VALIDATE_FN. */ +template<typename ValidateFn> static bool -check_ctz_string (tree string, unsigned HOST_WIDE_INT mulc, - HOST_WIDE_INT &zero_val, unsigned shift, unsigned bits) +check_table_string (tree string, HOST_WIDE_INT &zero_val,unsigned bits, + ValidateFn validate_fn) { unsigned HOST_WIDE_INT len = TREE_STRING_LENGTH (string); - unsigned HOST_WIDE_INT mask; unsigned matched = 0; const unsigned char *p = (const unsigned char *) TREE_STRING_POINTER (string); if (len < bits || len > bits * 2) return false; - mask = ((HOST_WIDE_INT_1U << (bits - shift)) - 1) << shift; - zero_val = p[0]; for (unsigned i = 0; i < len; i++) - if (p[i] < bits && (((mulc << p[i]) & mask) >> shift) == i) + if (p[i] < bits && validate_fn (p[i], i)) matched++; return matched == bits; } -/* Check whether CTOR contains a valid ctz table. */ +/* Check whether CTOR contains a valid table according to VALIDATE_FN. */ +template<typename ValidateFn> static bool -check_ctz_table (tree ctor, tree type, unsigned HOST_WIDE_INT mulc, - HOST_WIDE_INT &zero_val, unsigned shift, unsigned bits) +check_table (tree ctor, tree type, HOST_WIDE_INT &zero_val, unsigned bits, + ValidateFn validate_fn) { if (TREE_CODE (ctor) == CONSTRUCTOR) - return check_ctz_array (ctor, mulc, zero_val, shift, bits); + return check_table_array (ctor, zero_val, bits, validate_fn); else if (TREE_CODE (ctor) == STRING_CST && TYPE_PRECISION (type) == CHAR_TYPE_SIZE) - return check_ctz_string (ctor, mulc, zero_val, shift, bits); + return check_table_string (ctor, zero_val, bits, validate_fn); return false; } /* Match.pd function to match the ctz expression. */ extern bool gimple_ctz_table_index (tree, tree *, tree (*)(tree)); +extern bool gimple_clz_table_index (tree, tree *, tree (*)(tree)); -/* Recognize count trailing zeroes idiom. +/* Recognize count leading and trailing zeroes idioms. The canonical form is array[((x & -x) * C) >> SHIFT] where C is a magic constant which when multiplied by a power of 2 creates a unique value in the top 5 or 6 bits. This is then indexed into a table which maps it @@ -2617,7 +2616,7 @@ extern bool gimple_ctz_table_index (tree, tree *, tree (*)(tree)); the target. */ static bool -simplify_count_trailing_zeroes (gimple_stmt_iterator *gsi) +simplify_count_zeroes (gimple_stmt_iterator *gsi) { gimple *stmt = gsi_stmt (*gsi); tree array_ref = gimple_assign_rhs1 (stmt); @@ -2625,7 +2624,23 @@ simplify_count_trailing_zeroes (gimple_stmt_iterator *gsi) gcc_checking_assert (TREE_CODE (array_ref) == ARRAY_REF); - if (!gimple_ctz_table_index (TREE_OPERAND (array_ref, 1), &res_ops[0], NULL)) + internal_fn fn = IFN_LAST; + /* For CTZ we recognize ((x & -x) * C) >> SHIFT where the array data + represents the number of trailing zeros. */ + if (gimple_ctz_table_index (TREE_OPERAND (array_ref, 1), &res_ops[0], NULL)) + fn = IFN_CTZ; + /* For CLZ we recognize + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + (x * C) >> SHIFT + where 31 minus the array data represents the number of leading zeros. */ + else if (gimple_clz_table_index (TREE_OPERAND (array_ref, 1), &res_ops[0], + NULL)) + fn = IFN_CLZ; + else return false; HOST_WIDE_INT zero_val; @@ -2641,7 +2656,7 @@ simplify_count_trailing_zeroes (gimple_stmt_iterator *gsi) if (input_bits != 32 && input_bits != 64) return false; - if (!direct_internal_fn_supported_p (IFN_CTZ, input_type, OPTIMIZE_FOR_BOTH)) + if (!direct_internal_fn_supported_p (fn, input_type, OPTIMIZE_FOR_BOTH)) return false; /* Check the lower bound of the array is zero. */ @@ -2657,13 +2672,46 @@ simplify_count_trailing_zeroes (gimple_stmt_iterator *gsi) tree ctor = ctor_for_folding (array); if (!ctor) return false; - unsigned HOST_WIDE_INT val = tree_to_uhwi (res_ops[1]); - if (!check_ctz_table (ctor, type, val, zero_val, shiftval, input_bits)) - return false; + unsigned HOST_WIDE_INT mulval = tree_to_uhwi (res_ops[1]); + if (fn == IFN_CTZ) + { + auto checkfn = [&](unsigned data, unsigned i) -> bool + { + unsigned HOST_WIDE_INT mask + = ((HOST_WIDE_INT_1U << (input_bits - shiftval)) - 1) << shiftval; + return (((mulval << data) & mask) >> shiftval) == i; + }; + if (!check_table (ctor, type, zero_val, input_bits, checkfn)) + return false; + } + else if (fn == IFN_CLZ) + { + auto checkfn = [&](unsigned data, unsigned i) -> bool + { + unsigned HOST_WIDE_INT mask + = ((HOST_WIDE_INT_1U << (input_bits - shiftval)) - 1) << shiftval; + return (((((HOST_WIDE_INT_1U << (data + 1)) - 1) * mulval) & mask) + >> shiftval) == i; + }; + if (!check_table (ctor, type, zero_val, input_bits, checkfn)) + return false; + } - HOST_WIDE_INT ctz_val = 0; - bool zero_ok = CTZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (input_type), - ctz_val) == 2; + HOST_WIDE_INT ctz_val = -1; + bool zero_ok; + if (fn == IFN_CTZ) + { + ctz_val = 0; + zero_ok = CTZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (input_type), + ctz_val) == 2; + } + else if (fn == IFN_CLZ) + { + ctz_val = 32; + zero_ok = CLZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (input_type), + ctz_val) == 2; + zero_val = input_bits - 1 - zero_val; + } int nargs = 2; /* If the input value can't be zero, don't special case ctz (0). */ @@ -2689,7 +2737,7 @@ simplify_count_trailing_zeroes (gimple_stmt_iterator *gsi) gimple_seq seq = NULL; gimple *g; - gcall *call = gimple_build_call_internal (IFN_CTZ, nargs, res_ops[0], + gcall *call = gimple_build_call_internal (fn, nargs, res_ops[0], nargs == 1 ? NULL_TREE : build_int_cst (integer_type_node, ctz_val)); @@ -2698,6 +2746,17 @@ simplify_count_trailing_zeroes (gimple_stmt_iterator *gsi) gimple_seq_add_stmt (&seq, call); tree prev_lhs = gimple_call_lhs (call); + if (fn == IFN_CLZ) + { + g = gimple_build_assign (make_ssa_name (integer_type_node), + MINUS_EXPR, + build_int_cst (integer_type_node, + input_bits - 1), + prev_lhs); + gimple_set_location (g, gimple_location (stmt)); + gimple_seq_add_stmt (&seq, g); + prev_lhs = gimple_assign_lhs (g); + } /* Emit ctz (x) & 31 if ctz (0) is 32 but we need to return 0. */ if (zero_val == 0 && ctz_val == input_bits) @@ -4829,7 +4888,7 @@ pass_forwprop::execute (function *fun) && TREE_CODE (TREE_TYPE (rhs1)) == VECTOR_TYPE) changed |= simplify_vector_constructor (&gsi); else if (code == ARRAY_REF) - changed |= simplify_count_trailing_zeroes (&gsi); + changed |= simplify_count_zeroes (&gsi); break; }