The only part I changed is related to size_cost of sse_to_integer, as below
114+ /* Under TARGET_SSE4_1, it's vmovd + vpextrd/vpinsrd. 115+ W/o it, it's movd + psrlq/unpckldq + movd. */ 116+ else if (!TARGET_64BIT && smode != SImode) 117+ cost *= TARGET_SSE4_1 ? 2 : 3; 118+ Ok for trunk? In some benchmarks, I noticed STV failed due to the cost model being unprofitable: the igain is inside the loop, but the sse<->integer conversion is outside the loop, and the current cost model doesn't consider the frequency of those gains/costs. The patch weights those costs with frequency. The patch regressed gcc.target/i386/minmax-6.c under -m32. Since the place of the integer<->sse conversion is before the branch, and the conversion to min/max is in the branch, with a static profile the cost model is no longer profitable, which is exactly what the patch tries to do. Considering the original testcase is there to guard against an RA issue, restricting the testcase to ! ia32 should still be ok. gcc/ChangeLog: * config/i386/i386-features.cc (scalar_chain::mark_dual_mode_def): Weight n_integer_to_sse/n_sse_to_integer with bb frequency. (general_scalar_chain::compute_convert_gain): Ditto, and adjust function prototype to return true/false when cost model is profitable or not. (timode_scalar_chain::compute_convert_gain): Ditto. (convert_scalars_to_vector): Adjust after the upper two function prototype are changed. * config/i386/i386-features.h (class scalar_chain): Change n_integer_to_sse/n_sse_to_integer to cost_sse_integer, and add weighted_cost_sse_integer. (class general_scalar_chain): Adjust prototype to return bool instead of int. (class timode_scalar_chain): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/minmax-6.c: Adjust testcase. 
--- gcc/config/i386/i386-features.cc | 216 ++++++++++++----------- gcc/config/i386/i386-features.h | 11 +- gcc/testsuite/gcc.target/i386/minmax-6.c | 2 +- 3 files changed, 124 insertions(+), 105 deletions(-) diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index c35ac24fd8a..5f21130db58 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -296,8 +296,8 @@ scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_) insns_conv = BITMAP_ALLOC (NULL); queue = NULL; - n_sse_to_integer = 0; - n_integer_to_sse = 0; + cost_sse_integer = 0; + weighted_cost_sse_integer = 0 ; max_visits = x86_stv_max_visits; } @@ -337,20 +337,40 @@ scalar_chain::mark_dual_mode_def (df_ref def) /* Record the def/insn pair so we can later efficiently iterate over the defs to convert on insns not in the chain. */ bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def)); + basic_block bb = BLOCK_FOR_INSN (DF_REF_INSN (def)); + profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count; + bool speed_p = optimize_bb_for_speed_p (bb); + sreal bb_freq = bb->count.to_sreal_scale (entry_count); + int cost = 0; + if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def))) { if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def)) && !reg_new) return; - n_integer_to_sse++; + + /* ??? integer_to_sse but we only have that in the RA cost table. + Assume sse_to_integer/integer_to_sse are the same which they. */ + cost = speed_p ? ix86_cost->sse_to_integer + : ix86_size_cost.sse_to_integer; } else { if (!reg_new) return; - n_sse_to_integer++; + cost = speed_p ? ix86_cost->sse_to_integer + : ix86_size_cost.sse_to_integer; } + if (speed_p) + weighted_cost_sse_integer += bb_freq * cost; + /* Under TARGET_SSE4_1, it's vmovd + vpextrd/vpinsrd. + W/o it, it's movd + psrlq/unpckldq + movd. */ + else if (!TARGET_64BIT && smode != SImode) + cost *= TARGET_SSE4_1 ? 
2 : 3; + + cost_sse_integer += cost; + if (dump_file) fprintf (dump_file, " Mark r%d def in insn %d as requiring both modes in chain #%d\n", @@ -529,15 +549,15 @@ general_scalar_chain::vector_const_cost (rtx exp) return ix86_cost->sse_load[smode == DImode ? 1 : 0]; } -/* Compute a gain for chain conversion. */ +/* Return true if it's cost profitable for chain conversion. */ -int +bool general_scalar_chain::compute_convert_gain () { bitmap_iterator bi; unsigned insn_uid; int gain = 0; - int cost = 0; + sreal weighted_gain = 0; if (dump_file) fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id); @@ -556,25 +576,30 @@ general_scalar_chain::compute_convert_gain () rtx src = SET_SRC (def_set); rtx dst = SET_DEST (def_set); int igain = 0; + basic_block bb = BLOCK_FOR_INSN (insn); + profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count; + bool speed_p = optimize_bb_for_speed_p (bb); + sreal bb_freq = bb->count.to_sreal_scale (entry_count); + const struct processor_costs* uarch_cost + = speed_p ? ix86_cost : &ix86_size_cost; if (REG_P (src) && REG_P (dst)) - igain += 2 * m - ix86_cost->xmm_move; + igain += 2 * m - uarch_cost->xmm_move; else if (REG_P (src) && MEM_P (dst)) igain - += m * ix86_cost->int_store[2] - ix86_cost->sse_store[sse_cost_idx]; + += m * uarch_cost->int_store[2] - uarch_cost->sse_store[sse_cost_idx]; else if (MEM_P (src) && REG_P (dst)) - igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx]; + igain += m * uarch_cost->int_load[2] - uarch_cost->sse_load[sse_cost_idx]; else { /* For operations on memory operands, include the overhead of explicit load and store instructions. */ if (MEM_P (dst)) - igain += optimize_insn_for_size_p () - ? -COSTS_N_BYTES (8) - : (m * (ix86_cost->int_load[2] - + ix86_cost->int_store[2]) - - (ix86_cost->sse_load[sse_cost_idx] + - ix86_cost->sse_store[sse_cost_idx])); + igain += !speed_p ? 
-COSTS_N_BYTES (8) + : (m * (uarch_cost->int_load[2] + + uarch_cost->int_store[2]) + - (uarch_cost->sse_load[sse_cost_idx] + + uarch_cost->sse_store[sse_cost_idx])); switch (GET_CODE (src)) { @@ -584,15 +609,15 @@ general_scalar_chain::compute_convert_gain () if (m == 2) { if (INTVAL (XEXP (src, 1)) >= 32) - igain += ix86_cost->add; + igain += uarch_cost->add; /* Gain for extend highpart case. */ else if (GET_CODE (XEXP (src, 0)) == ASHIFT) - igain += ix86_cost->shift_const - ix86_cost->sse_op; + igain += uarch_cost->shift_const - uarch_cost->sse_op; else - igain += ix86_cost->shift_const; + igain += uarch_cost->shift_const; } - igain += ix86_cost->shift_const - ix86_cost->sse_op; + igain += uarch_cost->shift_const - uarch_cost->sse_op; if (CONST_INT_P (XEXP (src, 0))) igain -= vector_const_cost (XEXP (src, 0)); @@ -600,23 +625,23 @@ general_scalar_chain::compute_convert_gain () case ROTATE: case ROTATERT: - igain += m * ix86_cost->shift_const; + igain += m * uarch_cost->shift_const; if (TARGET_AVX512VL) - igain -= ix86_cost->sse_op; + igain -= uarch_cost->sse_op; else if (smode == DImode) { int bits = INTVAL (XEXP (src, 1)); if ((bits & 0x0f) == 0) - igain -= ix86_cost->sse_op; + igain -= uarch_cost->sse_op; else if ((bits & 0x07) == 0) - igain -= 2 * ix86_cost->sse_op; + igain -= 2 * uarch_cost->sse_op; else - igain -= 3 * ix86_cost->sse_op; + igain -= 3 * uarch_cost->sse_op; } else if (INTVAL (XEXP (src, 1)) == 16) - igain -= ix86_cost->sse_op; + igain -= uarch_cost->sse_op; else - igain -= 2 * ix86_cost->sse_op; + igain -= 2 * uarch_cost->sse_op; break; case AND: @@ -624,11 +649,11 @@ general_scalar_chain::compute_convert_gain () case XOR: case PLUS: case MINUS: - igain += m * ix86_cost->add - ix86_cost->sse_op; + igain += m * uarch_cost->add - uarch_cost->sse_op; /* Additional gain for andnot for targets without BMI. 
*/ if (GET_CODE (XEXP (src, 0)) == NOT && !TARGET_BMI) - igain += m * ix86_cost->add; + igain += m * uarch_cost->add; if (CONST_INT_P (XEXP (src, 0))) igain -= vector_const_cost (XEXP (src, 0)); @@ -636,21 +661,21 @@ general_scalar_chain::compute_convert_gain () igain -= vector_const_cost (XEXP (src, 1)); if (MEM_P (XEXP (src, 1))) { - if (optimize_insn_for_size_p ()) + if (!speed_p) igain -= COSTS_N_BYTES (m == 2 ? 3 : 5); else - igain += m * ix86_cost->int_load[2] - - ix86_cost->sse_load[sse_cost_idx]; + igain += m * uarch_cost->int_load[2] + - uarch_cost->sse_load[sse_cost_idx]; } break; case NEG: case NOT: - igain -= ix86_cost->sse_op + COSTS_N_INSNS (1); + igain -= uarch_cost->sse_op + COSTS_N_INSNS (1); if (GET_CODE (XEXP (src, 0)) != ABS) { - igain += m * ix86_cost->add; + igain += m * uarch_cost->add; break; } /* FALLTHRU */ @@ -662,9 +687,9 @@ general_scalar_chain::compute_convert_gain () case UMIN: /* We do not have any conditional move cost, estimate it as a reg-reg move. Comparisons are costed as adds. */ - igain += m * (COSTS_N_INSNS (2) + ix86_cost->add); + igain += m * (COSTS_N_INSNS (2) + uarch_cost->add); /* Integer SSE ops are all costed the same. */ - igain -= ix86_cost->sse_op; + igain -= uarch_cost->sse_op; break; case COMPARE: @@ -698,7 +723,7 @@ general_scalar_chain::compute_convert_gain () case CONST_INT: if (REG_P (dst)) { - if (optimize_insn_for_size_p ()) + if (!speed_p) { /* xor (2 bytes) vs. xorps (3 bytes). */ if (src == const0_rtx) @@ -727,8 +752,8 @@ general_scalar_chain::compute_convert_gain () } else if (MEM_P (dst)) { - igain += (m * ix86_cost->int_store[2] - - ix86_cost->sse_store[sse_cost_idx]); + igain += (m * uarch_cost->int_store[2] + - uarch_cost->sse_store[sse_cost_idx]); igain -= vector_const_cost (src); } break; @@ -737,16 +762,16 @@ general_scalar_chain::compute_convert_gain () if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx) { // movd (4 bytes) replaced with movdqa (4 bytes). 
- if (!optimize_insn_for_size_p ()) - igain += ix86_cost->sse_to_integer - ix86_cost->xmm_move; + if (speed_p) + igain += uarch_cost->sse_to_integer - uarch_cost->xmm_move; } else { // pshufd; movd replaced with pshufd. - if (optimize_insn_for_size_p ()) + if (!speed_p) igain += COSTS_N_BYTES (4); else - igain += ix86_cost->sse_to_integer; + igain += uarch_cost->sse_to_integer; } break; @@ -755,55 +780,35 @@ general_scalar_chain::compute_convert_gain () } } + if (speed_p) + weighted_gain += bb_freq * igain; if (igain != 0 && dump_file) { - fprintf (dump_file, " Instruction gain %d for ", igain); + fprintf (dump_file, " Instruction gain %d with bb_freq %.2f for", + igain, bb_freq.to_double ()); dump_insn_slim (dump_file, insn); } gain += igain; } if (dump_file) - fprintf (dump_file, " Instruction conversion gain: %d\n", gain); - - /* Cost the integer to sse and sse to integer moves. */ - if (!optimize_function_for_size_p (cfun)) - { - cost += n_sse_to_integer * ix86_cost->sse_to_integer; - /* ??? integer_to_sse but we only have that in the RA cost table. - Assume sse_to_integer/integer_to_sse are the same which they - are at the moment. */ - cost += n_integer_to_sse * ix86_cost->sse_to_integer; - } - else if (TARGET_64BIT || smode == SImode) - { - cost += n_sse_to_integer * COSTS_N_BYTES (4); - cost += n_integer_to_sse * COSTS_N_BYTES (4); - } - else if (TARGET_SSE4_1) { - /* vmovd (4 bytes) + vpextrd (6 bytes). */ - cost += n_sse_to_integer * COSTS_N_BYTES (10); - /* vmovd (4 bytes) + vpinsrd (6 bytes). */ - cost += n_integer_to_sse * COSTS_N_BYTES (10); - } - else - { - /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes). */ - cost += n_sse_to_integer * COSTS_N_BYTES (13); - /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes). 
*/ - cost += n_integer_to_sse * COSTS_N_BYTES (12); - } - - if (dump_file) - fprintf (dump_file, " Registers conversion cost: %d\n", cost); + fprintf (dump_file, " Instruction conversion gain: %d, \n", + gain); + fprintf (dump_file, " Registers conversion cost: %d\n", + cost_sse_integer); - gain -= cost; + fprintf (dump_file, " Weighted instruction conversion gain: %.2f, \n", + weighted_gain.to_double ()); - if (dump_file) - fprintf (dump_file, " Total gain: %d\n", gain); + fprintf (dump_file, " Weighted registers conversion cost: %.2f\n", + weighted_cost_sse_integer.to_double ()); + } - return gain; + if (weighted_gain != weighted_cost_sse_integer) + return weighted_gain > weighted_cost_sse_integer; + else + return gain > cost_sse_integer;; } /* Insert generated conversion instruction sequence INSNS @@ -1520,21 +1525,22 @@ timode_immed_const_gain (rtx cst) return 0; } -/* Compute a gain for chain conversion. */ +/* Return true it's cost profitable for for chain conversion. */ -int +bool timode_scalar_chain::compute_convert_gain () { /* Assume that if we have to move TImode values between units, then transforming this chain isn't worth it. */ - if (n_sse_to_integer || n_integer_to_sse) - return -1; + if (cost_sse_integer) + return false; bitmap_iterator bi; unsigned insn_uid; /* Split ties to prefer V1TImode when not optimizing for size. */ int gain = optimize_size ? 0 : 1; + sreal weighted_gain = 0; if (dump_file) fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id); @@ -1549,31 +1555,35 @@ timode_scalar_chain::compute_convert_gain () int scost, vcost; int igain = 0; + basic_block bb = BLOCK_FOR_INSN (insn); + profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count; + bool speed_p = optimize_bb_for_speed_p (bb); + sreal bb_freq = bb->count.to_sreal_scale (entry_count); + switch (GET_CODE (src)) { case REG: - if (optimize_insn_for_size_p ()) + if (!speed_p) igain = MEM_P (dst) ? 
COSTS_N_BYTES (6) : COSTS_N_BYTES (3); else igain = COSTS_N_INSNS (1); break; case MEM: - igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (7) + igain = !speed_p ? COSTS_N_BYTES (7) : COSTS_N_INSNS (1); break; case CONST_INT: if (MEM_P (dst) && standard_sse_constant_p (src, V1TImode)) - igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (11) : 1; + igain = !speed_p ? COSTS_N_BYTES (11) : 1; break; case CONST_WIDE_INT: /* 2 x mov vs. vmovdqa. */ if (MEM_P (dst)) - igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (3) - : COSTS_N_INSNS (1); + igain = !speed_p ? COSTS_N_BYTES (3) : COSTS_N_INSNS (1); break; case NOT: @@ -1594,7 +1604,7 @@ timode_scalar_chain::compute_convert_gain () case LSHIFTRT: /* See ix86_expand_v1ti_shift. */ op1val = INTVAL (XEXP (src, 1)); - if (optimize_insn_for_size_p ()) + if (!speed_p) { if (op1val == 64 || op1val == 65) scost = COSTS_N_BYTES (5); @@ -1628,7 +1638,7 @@ timode_scalar_chain::compute_convert_gain () case ASHIFTRT: /* See ix86_expand_v1ti_ashiftrt. */ op1val = INTVAL (XEXP (src, 1)); - if (optimize_insn_for_size_p ()) + if (!speed_p) { if (op1val == 64 || op1val == 127) scost = COSTS_N_BYTES (7); @@ -1706,7 +1716,7 @@ timode_scalar_chain::compute_convert_gain () case ROTATERT: /* See ix86_expand_v1ti_rotate. */ op1val = INTVAL (XEXP (src, 1)); - if (optimize_insn_for_size_p ()) + if (!speed_p) { scost = COSTS_N_BYTES (13); if ((op1val & 31) == 0) @@ -1738,15 +1748,15 @@ timode_scalar_chain::compute_convert_gain () { if (GET_CODE (XEXP (src, 0)) == AND) /* and;and;or (9 bytes) vs. ptest (5 bytes). */ - igain = optimize_insn_for_size_p() ? COSTS_N_BYTES (4) + igain = !speed_p ? COSTS_N_BYTES (4) : COSTS_N_INSNS (2); /* or (3 bytes) vs. ptest (5 bytes). */ - else if (optimize_insn_for_size_p ()) + else if (!speed_p) igain = -COSTS_N_BYTES (2); } else if (XEXP (src, 1) == const1_rtx) /* and;cmp -1 (7 bytes) vs. pcmpeqd;pxor;ptest (13 bytes). */ - igain = optimize_insn_for_size_p() ? -COSTS_N_BYTES (6) + igain = !speed_p ? 
-COSTS_N_BYTES (6) : -COSTS_N_INSNS (1); break; @@ -1756,16 +1766,24 @@ timode_scalar_chain::compute_convert_gain () if (igain != 0 && dump_file) { - fprintf (dump_file, " Instruction gain %d for ", igain); + fprintf (dump_file, " Instruction gain %d with bb_freq %.2f for ", + igain, bb_freq.to_double ()); dump_insn_slim (dump_file, insn); } + gain += igain; + if (speed_p) + weighted_gain += bb_freq * igain; } if (dump_file) - fprintf (dump_file, " Total gain: %d\n", gain); + fprintf (dump_file, " Total gain: %d, weighted gain %.2f\n", + gain, weighted_gain.to_double ()); - return gain; + if (weighted_gain > (sreal) 0) + return true; + else + return gain > 0; } /* Fix uses of converted REG in debug insns. */ @@ -2561,7 +2579,7 @@ convert_scalars_to_vector (bool timode_p) conversions. */ if (chain->build (&candidates[i], uid, disallowed)) { - if (chain->compute_convert_gain () > 0) + if (chain->compute_convert_gain ()) converted_insns += chain->convert (); else if (dump_file) fprintf (dump_file, "Chain #%d conversion is not profitable\n", diff --git a/gcc/config/i386/i386-features.h b/gcc/config/i386/i386-features.h index 24b0c4ed0cd..ea2c189fea8 100644 --- a/gcc/config/i386/i386-features.h +++ b/gcc/config/i386/i386-features.h @@ -153,12 +153,13 @@ class scalar_chain bitmap insns_conv; hash_map<rtx, rtx> defs_map; - unsigned n_sse_to_integer; - unsigned n_integer_to_sse; + /* Cost of inserted conversion between ineteger and sse. 
*/ + int cost_sse_integer; + sreal weighted_cost_sse_integer; auto_vec<rtx_insn *> control_flow_insns; bool build (bitmap candidates, unsigned insn_uid, bitmap disallowed); - virtual int compute_convert_gain () = 0; + virtual bool compute_convert_gain () = 0; int convert (); protected: @@ -184,7 +185,7 @@ class general_scalar_chain : public scalar_chain public: general_scalar_chain (enum machine_mode smode_, enum machine_mode vmode_) : scalar_chain (smode_, vmode_) {} - int compute_convert_gain () final override; + bool compute_convert_gain () final override; private: void convert_insn (rtx_insn *insn) final override; @@ -196,7 +197,7 @@ class timode_scalar_chain : public scalar_chain { public: timode_scalar_chain () : scalar_chain (TImode, V1TImode) {} - int compute_convert_gain () final override; + bool compute_convert_gain () final override; private: void fix_debug_reg_uses (rtx reg); diff --git a/gcc/testsuite/gcc.target/i386/minmax-6.c b/gcc/testsuite/gcc.target/i386/minmax-6.c index 615f919ba0a..b5b4363f931 100644 --- a/gcc/testsuite/gcc.target/i386/minmax-6.c +++ b/gcc/testsuite/gcc.target/i386/minmax-6.c @@ -15,4 +15,4 @@ UMVLine16Y_11 (short unsigned int * Pic, int y, int width) /* We do not want the RA to spill %esi for it's dual-use but using pmaxsd is OK. */ /* { dg-final { scan-assembler-not "rsp" { target { ! { ia32 } } } } } */ -/* { dg-final { scan-assembler "pmaxsd" } } */ +/* { dg-final { scan-assembler "pmaxsd" { target { ! ia32 } } } } */ -- 2.34.1