When forcing STV for all chains I run into the same issue I already fixed for the analysis phase: looping over all defs of a pseudo rather than just the interesting ones.  The following fixes this by recording insn/pseudo pairs in mark_dual_mode_def; well, not actually pairs, but tracking insns in addition to pseudos, which still allows us to cheaply maintain the set property and lets us visit only the interesting defs (assuming the number of defs on an insn is far less unbounded than the number of defs of a pseudo).
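To illustrate the scheme (just a sketch distilled from the hunks below, reusing the existing defs_conv/insns bitmaps and the new insns_conv bitmap; not a drop-in replacement for either function): analysis records the UID of each out-of-chain defining insn once, and conversion then walks only the defs of those recorded insns instead of all defs of every pseudo in defs_conv:

  /* Analysis: remember the pseudo and, for out-of-chain defs, the insn.  */
  bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
  if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
    bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def));

  /* Conversion: visit only the interesting defs of the recorded insns.  */
  EXECUTE_IF_SET_IN_BITMAP (insns_conv, 0, id, bi)
    for (df_ref ref = DF_INSN_UID_DEFS (id); ref; ref = DF_REF_NEXT_LOC (ref))
      if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
        make_vector_copies (DF_REF_INSN (ref), DF_REF_REAL_REG (ref));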
This way the original testcase with STV forced goes down from 200s to 24s compile-time, with

 machine dep reorg                   :   7.09 ( 20%)   0.01 (  2%)   8.18 ( 21%)    1854 kB (  2%)

where almost all of the compile-time is spent in DF's deferred rescan processing.  I'm not sure we even need that, but I'm not too eager to dig deeper into DF than necessary right now (we call df_finish () at the end, which removes the DU/UD_CHAIN and the MD problem (what do we need that for!?)).  We also don't appropriately rescan all insns we add, so the prevailing problems like LIVE are not updated, but well...

Bootstrapped and tested on x86_64-unknown-linux-gnu with STV forced, and for Haswell this time.

OK?

Thanks,
Richard.

2019-08-27  Richard Biener  <rguent...@suse.de>

	* config/i386/i386-features.h
	(general_scalar_chain::~general_scalar_chain): Add.
	(general_scalar_chain::insns_conv): New bitmap.
	(general_scalar_chain::n_sse_to_integer): New.
	(general_scalar_chain::n_integer_to_sse): Likewise.
	(general_scalar_chain::make_vector_copies): Adjust signature.
	* config/i386/i386-features.c
	(general_scalar_chain::general_scalar_chain): Outline, initialize
	new members.
	(general_scalar_chain::~general_scalar_chain): New.
	(general_scalar_chain::mark_dual_mode_def): Record insns we need
	to insert conversions at and count them.
	(general_scalar_chain::compute_convert_gain): Account for
	conversion instructions at the chain boundary.
	(general_scalar_chain::make_vector_copies): Generate a single
	copy for a def by a specific insn.
	(general_scalar_chain::convert_registers): First populate
	defs_map, then make copies at out-of-chain insns.

Index: gcc/config/i386/i386-features.c
===================================================================
--- gcc/config/i386/i386-features.c	(revision 274945)
+++ gcc/config/i386/i386-features.c	(working copy)
@@ -320,6 +320,20 @@ scalar_chain::add_to_queue (unsigned ins
   bitmap_set_bit (queue, insn_uid);
 }
 
+general_scalar_chain::general_scalar_chain (enum machine_mode smode_,
+					    enum machine_mode vmode_)
+  : scalar_chain (smode_, vmode_)
+{
+  insns_conv = BITMAP_ALLOC (NULL);
+  n_sse_to_integer = 0;
+  n_integer_to_sse = 0;
+}
+
+general_scalar_chain::~general_scalar_chain ()
+{
+  BITMAP_FREE (insns_conv);
+}
+
 /* For DImode conversion, mark register defined by DEF as requiring
    conversion.  */
 
@@ -328,15 +342,27 @@ general_scalar_chain::mark_dual_mode_def
 {
   gcc_assert (DF_REF_REG_DEF_P (def));
 
-  if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
-    return;
-
+  /* Record the def/insn pair so we can later efficiently iterate over
+     the defs to convert on insns not in the chain.  */
+  bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
+  if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
+    {
+      if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
+	  && !reg_new)
+	return;
+      n_integer_to_sse++;
+    }
+  else
+    {
+      if (!reg_new)
+	return;
+      n_sse_to_integer++;
+    }
+
   if (dump_file)
     fprintf (dump_file,
 	     "  Mark r%d def in insn %d as requiring both modes in chain #%d\n",
 	     DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
-
-  bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
 }
 
 /* For TImode conversion, it is unused.  */
@@ -523,7 +549,7 @@ general_scalar_chain::compute_convert_ga
 	    || GET_CODE (src) == ASHIFTRT
 	    || GET_CODE (src) == LSHIFTRT)
 	  {
-	    if (CONST_INT_P (XEXP (src, 0)))
+	    if (CONST_INT_P (XEXP (src, 0)))
 	      igain -= vector_const_cost (XEXP (src, 0));
 	    igain += m * ix86_cost->shift_const - ix86_cost->sse_op;
 	    if (INTVAL (XEXP (src, 1)) >= 32)
@@ -588,9 +614,12 @@ general_scalar_chain::compute_convert_ga
   if (dump_file)
     fprintf (dump_file, "  Instruction conversion gain: %d\n", gain);
 
-  /* ???  What about integer to SSE?  */
-  EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
-    cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->sse_to_integer;
+  /* Cost the integer to sse and sse to integer moves.  */
+  cost += n_sse_to_integer * ix86_cost->sse_to_integer;
+  /* ??? integer_to_sse but we only have that in the RA cost table.
+     Assume sse_to_integer/integer_to_sse are the same which they
+     are at the moment.  */
+  cost += n_integer_to_sse * ix86_cost->sse_to_integer;
 
   if (dump_file)
     fprintf (dump_file, "  Registers conversion cost: %d\n", cost);
 
@@ -649,85 +678,64 @@ gen_gpr_to_xmm_move_src (enum machine_mo
    and replace its uses in a chain.  */
 
 void
-general_scalar_chain::make_vector_copies (unsigned regno)
+general_scalar_chain::make_vector_copies (rtx_insn *insn, rtx reg)
 {
-  rtx reg = regno_reg_rtx[regno];
-  rtx vreg = gen_reg_rtx (smode);
-  df_ref ref;
+  rtx vreg = *defs_map.get (reg);
 
-  defs_map.put (reg, vreg);
-
-  /* For each insn defining REGNO, see if it is defined by an insn
-     not part of the chain but with uses in insns part of the chain
-     and insert a copy in that case.  */
-  for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+  start_sequence ();
+  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
     {
-      if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
-	continue;
-      df_link *use;
-      for (use = DF_REF_CHAIN (ref); use; use = use->next)
-	if (!DF_REF_REG_MEM_P (use->ref)
-	    && bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref)))
-	  break;
-      if (!use)
-	continue;
-
-      start_sequence ();
-      if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
+      rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
+      if (smode == DImode && !TARGET_64BIT)
 	{
-	  rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
-	  if (smode == DImode && !TARGET_64BIT)
-	    {
-	      emit_move_insn (adjust_address (tmp, SImode, 0),
-			      gen_rtx_SUBREG (SImode, reg, 0));
-	      emit_move_insn (adjust_address (tmp, SImode, 4),
-			      gen_rtx_SUBREG (SImode, reg, 4));
-	    }
-	  else
-	    emit_move_insn (copy_rtx (tmp), reg);
-	  emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
-				  gen_gpr_to_xmm_move_src (vmode, tmp)));
+	  emit_move_insn (adjust_address (tmp, SImode, 0),
			  gen_rtx_SUBREG (SImode, reg, 0));
+	  emit_move_insn (adjust_address (tmp, SImode, 4),
+			  gen_rtx_SUBREG (SImode, reg, 4));
 	}
-      else if (!TARGET_64BIT && smode == DImode)
+      else
+	emit_move_insn (copy_rtx (tmp), reg);
+      emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
+			      gen_gpr_to_xmm_move_src (vmode, tmp)));
+    }
+  else if (!TARGET_64BIT && smode == DImode)
+    {
+      if (TARGET_SSE4_1)
 	{
-	  if (TARGET_SSE4_1)
-	    {
-	      emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
-					  CONST0_RTX (V4SImode),
-					  gen_rtx_SUBREG (SImode, reg, 0)));
-	      emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
-					    gen_rtx_SUBREG (V4SImode, vreg, 0),
-					    gen_rtx_SUBREG (SImode, reg, 4),
-					    GEN_INT (2)));
-	    }
-	  else
-	    {
-	      rtx tmp = gen_reg_rtx (DImode);
-	      emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
-					  CONST0_RTX (V4SImode),
-					  gen_rtx_SUBREG (SImode, reg, 0)));
-	      emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
-					  CONST0_RTX (V4SImode),
-					  gen_rtx_SUBREG (SImode, reg, 4)));
-	      emit_insn (gen_vec_interleave_lowv4si
-			 (gen_rtx_SUBREG (V4SImode, vreg, 0),
-			  gen_rtx_SUBREG (V4SImode, vreg, 0),
-			  gen_rtx_SUBREG (V4SImode, tmp, 0)));
-	    }
+	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
+				      CONST0_RTX (V4SImode),
+				      gen_rtx_SUBREG (SImode, reg, 0)));
+	  emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
+					gen_rtx_SUBREG (V4SImode, vreg, 0),
+					gen_rtx_SUBREG (SImode, reg, 4),
+					GEN_INT (2)));
 	}
       else
-	emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
-				gen_gpr_to_xmm_move_src (vmode, reg)));
-      rtx_insn *seq = get_insns ();
-      end_sequence ();
-      rtx_insn *insn = DF_REF_INSN (ref);
-      emit_conversion_insns (seq, insn);
-
-      if (dump_file)
-	fprintf (dump_file,
-		 "  Copied r%d to a vector register r%d for insn %d\n",
-		 regno, REGNO (vreg), INSN_UID (insn));
+	{
+	  rtx tmp = gen_reg_rtx (DImode);
+	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
+				      CONST0_RTX (V4SImode),
+				      gen_rtx_SUBREG (SImode, reg, 0)));
+	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
+				      CONST0_RTX (V4SImode),
+				      gen_rtx_SUBREG (SImode, reg, 4)));
+	  emit_insn (gen_vec_interleave_lowv4si
+		     (gen_rtx_SUBREG (V4SImode, vreg, 0),
+		      gen_rtx_SUBREG (V4SImode, vreg, 0),
+		      gen_rtx_SUBREG (V4SImode, tmp, 0)));
+	}
     }
+  else
+    emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
+			    gen_gpr_to_xmm_move_src (vmode, reg)));
+  rtx_insn *seq = get_insns ();
+  end_sequence ();
+  emit_conversion_insns (seq, insn);
+
+  if (dump_file)
+    fprintf (dump_file,
+	     "  Copied r%d to a vector register r%d for insn %d\n",
+	     REGNO (reg), REGNO (vreg), INSN_UID (insn));
 }
 
 /* Copy the definition SRC of INSN inside the chain to DST for
@@ -1158,7 +1164,11 @@ general_scalar_chain::convert_registers
   bitmap_iterator bi;
   unsigned id;
   EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
-    make_vector_copies (id);
+    defs_map.put (regno_reg_rtx[id], gen_reg_rtx (smode));
+  EXECUTE_IF_SET_IN_BITMAP (insns_conv, 0, id, bi)
+    for (df_ref ref = DF_INSN_UID_DEFS (id); ref; ref = DF_REF_NEXT_LOC (ref))
+      if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
+	make_vector_copies (DF_REF_INSN (ref), DF_REF_REAL_REG (ref));
 }
 
 /* Convert whole chain creating required register
Index: gcc/config/i386/i386-features.h
===================================================================
--- gcc/config/i386/i386-features.h	(revision 274945)
+++ gcc/config/i386/i386-features.h	(working copy)
@@ -167,16 +167,19 @@ class scalar_chain
 class general_scalar_chain : public scalar_chain
 {
  public:
-  general_scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
-    : scalar_chain (smode_, vmode_) {}
+  general_scalar_chain (enum machine_mode smode_, enum machine_mode vmode_);
+  ~general_scalar_chain ();
   int compute_convert_gain ();
  private:
   hash_map<rtx, rtx> defs_map;
+  bitmap insns_conv;
+  unsigned n_sse_to_integer;
+  unsigned n_integer_to_sse;
   void mark_dual_mode_def (df_ref def);
   void convert_insn (rtx_insn *insn);
   void convert_op (rtx *op, rtx_insn *insn);
   void convert_reg (rtx_insn *insn, rtx dst, rtx src);
-  void make_vector_copies (unsigned regno);
+  void make_vector_copies (rtx_insn *, rtx);
   void convert_registers ();
   int vector_const_cost (rtx exp);
 };