On Sat, 18 Nov 2023, Jakub Jelinek wrote: > Hi! > > On Fri, Nov 17, 2023 at 03:01:04PM +0100, Jakub Jelinek wrote: > > As a follow-up, I'm considering changing in this routine the popcount > > call to IFN_POPCOUNT with 2 arguments and during expansion test costs. > > Here is the follow-up which does the rtx costs testing. > While having to tweak internal-fn.def so that POPCOUNT can have a custom > expand_POPCOUNT, I have noticed we are inconsistent, some DEF_INTERNAL* > macros (most of them) were undefined at the end of internal-fn.def (but in > some cases uselessly undefined again after inclusion), while others were not > (and sometimes undefined after the inclusion). I've changed it to always > undefine at the end of internal-fn.def.
The last bit is cleanup that could go in independently. > Ok for trunk if it passes bootstrap/regtest? > > 2023-11-18 Jakub Jelinek <ja...@redhat.com> > > PR tree-optimization/90693 > * tree-ssa-math-opts.cc (match_single_bit_test): Mark POPCOUNT with > result only used in equality comparison against 1 with direct optab > support as .POPCOUNT call with 2 arguments. > * internal-fn.h (expand_POPCOUNT): Declare. > * internal-fn.def: Document missing DEF_INTERNAL* macros and make sure > they are all undefined at the end. > (DEF_INTERNAL_INT_EXT_FN): New macro. > (POPCOUNT): Use it instead of DEF_INTERNAL_INT_FN. > * internal-fn.cc (lookup_hilo_internal_fn, lookup_evenodd_internal_fn, > widening_fn_p, get_len_internal_fn): Don't undef DEF_INTERNAL_*FN > macros after inclusion of internal-fn.def. > (DEF_INTERNAL_INT_EXT_FN): Define to nothing before inclusion to > define expanders. > (expand_POPCOUNT): New function. > > --- gcc/tree-ssa-math-opts.cc.jj 2023-11-18 09:38:03.460813910 +0100 > +++ gcc/tree-ssa-math-opts.cc 2023-11-18 10:25:18.751207936 +0100 > @@ -5195,7 +5195,16 @@ match_single_bit_test (gimple_stmt_itera > if (!INTEGRAL_TYPE_P (type)) > return; > if (direct_internal_fn_supported_p (IFN_POPCOUNT, type, OPTIMIZE_FOR_BOTH)) > - return; > + { > + /* Tell expand_POPCOUNT the popcount result is only used in equality > + comparison with one, so that it can decide based on rtx costs. */ > + gimple *g = gimple_build_call_internal (IFN_POPCOUNT, 2, arg, > + integer_one_node); > + gimple_call_set_lhs (g, gimple_call_lhs (call)); > + gimple_stmt_iterator gsi2 = gsi_for_stmt (call); > + gsi_replace (&gsi2, g, true); > + return; > + } > tree argm1 = make_ssa_name (type); > gimple *g = gimple_build_assign (argm1, PLUS_EXPR, arg, > build_int_cst (type, -1)); > --- gcc/internal-fn.h.jj 2023-11-02 12:15:12.223998565 +0100 > +++ gcc/internal-fn.h 2023-11-18 10:14:25.834340505 +0100 > @@ -262,6 +262,7 @@ extern void expand_MULBITINT (internal_f > extern void expand_DIVMODBITINT (internal_fn, gcall *); > extern void expand_FLOATTOBITINT (internal_fn, gcall *); > extern void expand_BITINTTOFLOAT (internal_fn, gcall *); > +extern void expand_POPCOUNT (internal_fn, gcall *); > > extern bool vectorized_internal_fn_supported_p (internal_fn, tree); > > --- gcc/internal-fn.def.jj 2023-11-17 15:51:02.642802521 +0100 > +++ gcc/internal-fn.def 2023-11-18 10:12:10.329236626 +0100 > @@ -33,9 +33,13 @@ along with GCC; see the file COPYING3. > DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR, SIGNED_OPTAB, > UNSIGNED_OPTAB, TYPE) > DEF_INTERNAL_FLT_FN (NAME, FLAGS, OPTAB, TYPE) > + DEF_INTERNAL_FLT_FLOATN_FN (NAME, FLAGS, OPTAB, TYPE) > DEF_INTERNAL_INT_FN (NAME, FLAGS, OPTAB, TYPE) > + DEF_INTERNAL_INT_EXT_FN (NAME, FLAGS, OPTAB, TYPE) > DEF_INTERNAL_COND_FN (NAME, FLAGS, OPTAB, TYPE) > DEF_INTERNAL_SIGNED_COND_FN (NAME, FLAGS, OPTAB, TYPE) > + DEF_INTERNAL_WIDENING_OPTAB_FN (NAME, FLAGS, SELECTOR, SOPTAB, UOPTAB, > + TYPE) > > where NAME is the name of the function, FLAGS is a set of > ECF_* flags and FNSPEC is a string describing functions fnspec. > @@ -97,6 +101,10 @@ along with GCC; see the file COPYING3. > says that the function extends the C-level BUILT_IN_<NAME>{,L,LL,IMAX} > group of functions to any integral mode (including vector modes). > > + DEF_INTERNAL_INT_EXT_FN is like DEF_INTERNAL_INT_FN, except that it > + has expand_##NAME defined in internal-fn.cc to override the > + DEF_INTERNAL_INT_FN expansion behavior. > + > DEF_INTERNAL_WIDENING_OPTAB_FN is a wrapper that defines five internal > functions with DEF_INTERNAL_SIGNED_OPTAB_FN: > - one that describes a widening operation with the same number of elements > @@ -160,6 +168,11 @@ along with GCC; see the file COPYING3. > DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE) > #endif > > +#ifndef DEF_INTERNAL_INT_EXT_FN > +#define DEF_INTERNAL_INT_EXT_FN(NAME, FLAGS, OPTAB, TYPE) \ > + DEF_INTERNAL_INT_FN (NAME, FLAGS, OPTAB, TYPE) > +#endif > + > #ifndef DEF_INTERNAL_WIDENING_OPTAB_FN > #define DEF_INTERNAL_WIDENING_OPTAB_FN(NAME, FLAGS, SELECTOR, SOPTAB, > UOPTAB, TYPE) \ > DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR, SOPTAB, UOPTAB, TYPE) > \ > @@ -432,7 +445,7 @@ DEF_INTERNAL_INT_FN (CLZ, ECF_CONST | EC > DEF_INTERNAL_INT_FN (CTZ, ECF_CONST | ECF_NOTHROW, ctz, unary) > DEF_INTERNAL_INT_FN (FFS, ECF_CONST | ECF_NOTHROW, ffs, unary) > DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity, unary) > -DEF_INTERNAL_INT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW, popcount, unary) > +DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW, popcount, unary) > > DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) > DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) > @@ -572,6 +585,10 @@ DEF_INTERNAL_FN (DIVMODBITINT, ECF_LEAF, > DEF_INTERNAL_FN (FLOATTOBITINT, ECF_LEAF | ECF_NOTHROW, ". O . . ") > DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE | ECF_LEAF, ". R . ") > > +#undef DEF_INTERNAL_WIDENING_OPTAB_FN > +#undef DEF_INTERNAL_SIGNED_COND_FN > +#undef DEF_INTERNAL_COND_FN > +#undef DEF_INTERNAL_INT_EXT_FN > #undef DEF_INTERNAL_INT_FN > #undef DEF_INTERNAL_FLT_FN > #undef DEF_INTERNAL_FLT_FLOATN_FN > --- gcc/internal-fn.cc.jj 2023-11-07 08:32:01.741254155 +0100 > +++ gcc/internal-fn.cc 2023-11-18 11:06:31.740703230 +0100 > @@ -102,8 +102,6 @@ lookup_hilo_internal_fn (internal_fn ifn > { > default: > gcc_unreachable (); > -#undef DEF_INTERNAL_FN > -#undef DEF_INTERNAL_WIDENING_OPTAB_FN > #define DEF_INTERNAL_FN(NAME, FLAGS, TYPE) > #define DEF_INTERNAL_WIDENING_OPTAB_FN(NAME, F, S, SO, UO, T) \ > case IFN_##NAME: \ > @@ -111,8 +109,6 @@ lookup_hilo_internal_fn (internal_fn ifn > *hi = internal_fn (IFN_##NAME##_HI); \ > break; > #include "internal-fn.def" > -#undef DEF_INTERNAL_FN > -#undef DEF_INTERNAL_WIDENING_OPTAB_FN > } > } > > @@ -129,8 +125,6 @@ lookup_evenodd_internal_fn (internal_fn > { > default: > gcc_unreachable (); > -#undef DEF_INTERNAL_FN > -#undef DEF_INTERNAL_WIDENING_OPTAB_FN > #define DEF_INTERNAL_FN(NAME, FLAGS, TYPE) > #define DEF_INTERNAL_WIDENING_OPTAB_FN(NAME, F, S, SO, UO, T) \ > case IFN_##NAME: \ > @@ -138,8 +132,6 @@ lookup_evenodd_internal_fn (internal_fn > *odd = internal_fn (IFN_##NAME##_ODD); \ > break; > #include "internal-fn.def" > -#undef DEF_INTERNAL_FN > -#undef DEF_INTERNAL_WIDENING_OPTAB_FN > } > } > > @@ -4261,7 +4253,6 @@ widening_fn_p (code_helper code) > internal_fn fn = as_internal_fn ((combined_fn) code); > switch (fn) > { > - #undef DEF_INTERNAL_WIDENING_OPTAB_FN > #define DEF_INTERNAL_WIDENING_OPTAB_FN(NAME, F, S, SO, UO, T) \ > case IFN_##NAME: \ > case IFN_##NAME##_HI: \ > @@ -4270,7 +4261,6 @@ widening_fn_p (code_helper code) > case IFN_##NAME##_ODD: \ > return true; > #include "internal-fn.def" > - #undef DEF_INTERNAL_WIDENING_OPTAB_FN > > default: > return false; > @@ -4295,6 +4285,7 @@ set_edom_supported_p (void) > { \ > expand_##TYPE##_optab_fn (fn, stmt, OPTAB##_optab); \ > } > +#define DEF_INTERNAL_INT_EXT_FN(CODE, FLAGS, OPTAB, TYPE) > #define DEF_INTERNAL_SIGNED_OPTAB_FN(CODE, FLAGS, SELECTOR, SIGNED_OPTAB, \ > UNSIGNED_OPTAB, TYPE) \ > static void > \ > @@ -4305,8 +4296,6 @@ set_edom_supported_p (void) > expand_##TYPE##_optab_fn (fn, stmt, which_optab); > \ > } > #include "internal-fn.def" > -#undef DEF_INTERNAL_OPTAB_FN > -#undef DEF_INTERNAL_SIGNED_OPTAB_FN > > /* Routines to expand each internal function, indexed by function number. > Each routine has the prototype: > @@ -4465,8 +4454,6 @@ get_len_internal_fn (internal_fn fn) > { > switch (fn) > { > -#undef DEF_INTERNAL_COND_FN > -#undef DEF_INTERNAL_SIGNED_COND_FN > #define DEF_INTERNAL_COND_FN(NAME, ...) > \ > case IFN_COND_##NAME: > \ > return IFN_COND_LEN_##NAME; > @@ -4474,8 +4461,6 @@ get_len_internal_fn (internal_fn fn) > case IFN_COND_##NAME: > \ > return IFN_COND_LEN_##NAME; > #include "internal-fn.def" > -#undef DEF_INTERNAL_COND_FN > -#undef DEF_INTERNAL_SIGNED_COND_FN > default: > return IFN_LAST; > } > @@ -5113,3 +5098,67 @@ expand_BITINTTOFLOAT (internal_fn, gcall > if (val != target) > emit_move_insn (target, val); > } > + > +void > +expand_POPCOUNT (internal_fn fn, gcall *stmt) > +{ > + if (gimple_call_num_args (stmt) == 1) > + { > + expand_unary_optab_fn (fn, stmt, popcount_optab); > + return; > + } > + /* If .POPCOUNT call has 2 arguments, match_single_bit_test marked it > + because the result is only used in an equality comparison against 1. > + Use rtx costs in that case to determine if .POPCOUNT (arg) == 1 > + or (arg ^ (arg - 1)) > arg - 1 is cheaper. */ We're doing plenty of RTX costing during GIMPLE optimizations, I wonder if it could be done equally well there given that seq_cost of an RTX sequence isn't going to capture the compare & branch cost very reliably anyway ... > + bool speed_p = optimize_insn_for_speed_p (); > + tree lhs = gimple_call_lhs (stmt); > + tree arg = gimple_call_arg (stmt, 0); > + tree type = TREE_TYPE (arg); > + machine_mode mode = TYPE_MODE (type); > + do_pending_stack_adjust (); > + start_sequence (); > + expand_unary_optab_fn (fn, stmt, popcount_optab); > + rtx_insn *popcount_insns = get_insns (); > + end_sequence (); > + start_sequence (); > + rtx plhs = expand_normal (lhs); > + rtx pcmp = emit_store_flag (NULL_RTX, EQ, plhs, const1_rtx, mode, 0, 0); > + if (pcmp == NULL_RTX) > + { > + fail: > + end_sequence (); > + emit_insn (popcount_insns); > + return; > + } > + rtx_insn *popcount_cmp_insns = get_insns (); > + end_sequence (); > + start_sequence (); > + rtx op0 = expand_normal (arg); > + rtx argm1 = expand_simple_binop (mode, PLUS, op0, constm1_rtx, NULL_RTX, > + 1, OPTAB_DIRECT); > + if (argm1 == NULL_RTX) > + goto fail; > + rtx argxorargm1 = expand_simple_binop (mode, XOR, op0, argm1, NULL_RTX, > + 1, OPTAB_DIRECT); > + if (argxorargm1 == NULL_RTX) > + goto fail; > + rtx cmp = emit_store_flag (NULL_RTX, GTU, argxorargm1, argm1, mode, 1, 1); > + if (cmp == NULL_RTX) > + goto fail; > + rtx_insn *cmp_insns = get_insns (); > + end_sequence (); > + unsigned popcount_cost = (seq_cost (popcount_insns, speed_p) > + + seq_cost (popcount_cmp_insns, speed_p)); > + unsigned cmp_cost = seq_cost (cmp_insns, speed_p); > + if (popcount_cost < cmp_cost) > + emit_insn (popcount_insns); > + else > + { > + emit_insn (cmp_insns); > + plhs = expand_normal (lhs); > + if (GET_MODE (cmp) != GET_MODE (plhs)) > + cmp = convert_to_mode (GET_MODE (plhs), cmp, 1); > + emit_move_insn (plhs, cmp); > + } > +} > > > Jakub > > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)