Hi! This patch fixes various issues in the popcount{64,32}c matcher.
The first issue is that it blindly uses tree_to_uhwi on all the INTEGER_CST operands. That is fine for those where we know their type, after the prec <= 64 && TYPE_UNSIGNED (type) verification, but shift counts can have different types and so we need to handle e.g. invalid shifts by negative amounts. Another issue is that the transformation is, I believe, only valid for 16, 32 and 64-bit precision, e.g. if it would be done in 24-bit or 62-bit, it wouldn't be a popcount. For 8-bit, it would likely not match due to >> 0, etc. Yet another issue is that the >> shift computations could very well shift by negative amounts e.g. for 128-bit precision, invoking UB in the compiler until we actually check the precision later. And, I think we want to use the HOST_WIDE_INT_UC macro instead of hardcoding ULL suffixes. The formatting didn't match the match.pd formatting style either. Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? One thing I haven't done anything about yet is that there is FAIL: gcc.dg/tree-ssa/popcount4ll.c scan-tree-dump-times optimized ".POPCOUNT" 1 before/after this patch with -m32/-march=skylake-avx512. That is because the popcountll effective target tests that we don't emit a call for __builtin_popcountll, which we don't on ia32 skylake-avx512, but direct_internal_fn_supported_p isn't true - that is because we expand the double word popcount using 2 word popcounts + addition. Shall the match.pd case handle that case too by allowing the optimization even if there is a type with half precision for which direct_internal_fn_supported_p is true? 2019-12-31 Jakub Jelinek <ja...@redhat.com> PR tree-optimization/93098 * match.pd (popcount): For shift amounts, use integer_onep or wi::to_widest () == cst instead of tree_to_uhwi () == cst tests. Make sure that precision is power of two larger than or equal to 16. Ensure shift is never negative. Use HOST_WIDE_INT_UC macro instead of ULL suffixed constants. Formatting fixes. 
* gcc.c-torture/compile/pr93098.c: New test. --- gcc/match.pd.jj 2019-12-09 11:12:48.351010794 +0100 +++ gcc/match.pd 2019-12-30 09:52:05.423499909 +0100 @@ -5786,43 +5786,50 @@ (define_operator_list COND_TERNARY return (x * 0x01010101) >> 24; } */ (simplify - (rshift - (mult - (bit_and - (plus:c - (rshift @8 INTEGER_CST@5) - (plus:c@8 - (bit_and @6 INTEGER_CST@7) - (bit_and - (rshift - (minus@6 - @0 - (bit_and - (rshift @0 INTEGER_CST@4) - INTEGER_CST@11)) - INTEGER_CST@10) - INTEGER_CST@9))) - INTEGER_CST@3) - INTEGER_CST@2) - INTEGER_CST@1) + (rshift + (mult + (bit_and + (plus:c + (rshift @8 INTEGER_CST@5) + (plus:c@8 + (bit_and @6 INTEGER_CST@7) + (bit_and + (rshift + (minus@6 @0 + (bit_and (rshift @0 INTEGER_CST@4) INTEGER_CST@11)) + INTEGER_CST@10) + INTEGER_CST@9))) + INTEGER_CST@3) + INTEGER_CST@2) + INTEGER_CST@1) /* Check constants and optab. */ - (with - { - unsigned prec = TYPE_PRECISION (type); - int shift = 64 - prec; - const unsigned HOST_WIDE_INT c1 = 0x0101010101010101ULL >> shift, - c2 = 0x0F0F0F0F0F0F0F0FULL >> shift, - c3 = 0x3333333333333333ULL >> shift, - c4 = 0x5555555555555555ULL >> shift; - } - (if (prec <= 64 && TYPE_UNSIGNED (type) && tree_to_uhwi (@4) == 1 - && tree_to_uhwi (@10) == 2 && tree_to_uhwi (@5) == 4 - && tree_to_uhwi (@1) == prec - 8 && tree_to_uhwi (@2) == c1 - && tree_to_uhwi (@3) == c2 && tree_to_uhwi (@9) == c3 - && tree_to_uhwi (@7) == c3 && tree_to_uhwi (@11) == c4 - && direct_internal_fn_supported_p (IFN_POPCOUNT, type, - OPTIMIZE_FOR_BOTH)) - (convert (IFN_POPCOUNT:type @0))))) + (with { unsigned prec = TYPE_PRECISION (type); + int shift = (64 - prec) & 63; + unsigned HOST_WIDE_INT c1 + = HOST_WIDE_INT_UC (0x0101010101010101) >> shift; + unsigned HOST_WIDE_INT c2 + = HOST_WIDE_INT_UC (0x0F0F0F0F0F0F0F0F) >> shift; + unsigned HOST_WIDE_INT c3 + = HOST_WIDE_INT_UC (0x3333333333333333) >> shift; + unsigned HOST_WIDE_INT c4 + = HOST_WIDE_INT_UC (0x5555555555555555) >> shift; + } + (if (prec >= 16 + && prec <= 64 + && 
pow2p_hwi (prec) + && TYPE_UNSIGNED (type) + && integer_onep (@4) + && wi::to_widest (@10) == 2 + && wi::to_widest (@5) == 4 + && wi::to_widest (@1) == prec - 8 + && tree_to_uhwi (@2) == c1 + && tree_to_uhwi (@3) == c2 + && tree_to_uhwi (@9) == c3 + && tree_to_uhwi (@7) == c3 + && tree_to_uhwi (@11) == c4 + && direct_internal_fn_supported_p (IFN_POPCOUNT, type, + OPTIMIZE_FOR_BOTH)) + (convert (IFN_POPCOUNT:type @0))))) #endif /* Simplify: --- gcc/testsuite/gcc.c-torture/compile/pr93098.c.jj 2019-12-30 10:18:28.750401210 +0100 +++ gcc/testsuite/gcc.c-torture/compile/pr93098.c 2019-12-30 10:18:12.491648669 +0100 @@ -0,0 +1,37 @@ +/* PR tree-optimization/93098 */ + +int +foo (unsigned long long x) +{ + x -= (x >> -1) & 0x5555555555555555ULL; + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; + return (x * 0x0101010101010101ULL) >> 56; +} + +int +bar (unsigned long long x) +{ + x -= (x >> 1) & 0x5555555555555555ULL; + x = (x & 0x3333333333333333ULL) + ((x >> -2) & 0x3333333333333333ULL); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; + return (x * 0x0101010101010101ULL) >> 56; +} + +int +baz (unsigned long long x) +{ + x -= (x >> 1) & 0x5555555555555555ULL; + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); + x = (x + (x >> -4)) & 0x0f0f0f0f0f0f0f0fULL; + return (x * 0x0101010101010101ULL) >> 56; +} + +int +qux (unsigned long long x) +{ + x -= (x >> 1) & 0x5555555555555555ULL; + x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL; + return (x * 0x0101010101010101ULL) >> -56; +} Jakub