> we need to generate
>
> vxorp[ds] %xmmN, %xmmN, %xmmN
> ...
> vcvtss2sd f(%rip), %xmmN, %xmmX
> ...
> vcvtsi2ss i(%rip), %xmmN, %xmmY
>
> to avoid partial XMM register stall. This patch adds a pass to generate
> a single
>
> vxorps %xmmN, %xmmN, %xmmN
>
> at function entry, which is shared by all SF and DF conversions, instead
> of generating one
>
> vxorp[ds] %xmmN, %xmmN, %xmmN
>
> for each SF/DF conversion.
>
> Performance impacts on SPEC CPU 2017 rate with 1 copy using
>
> -Ofast -march=native -mfpmath=sse -fno-associative-math -funroll-loops
>
> are
>
> 1. On Broadwell server:
>
> 500.perlbench_r (-0.82%)
> 502.gcc_r (0.73%)
> 505.mcf_r (-0.24%)
> 520.omnetpp_r (-2.22%)
> 523.xalancbmk_r (-1.47%)
> 525.x264_r (0.31%)
> 531.deepsjeng_r (0.27%)
> 541.leela_r (0.85%)
> 548.exchange2_r (-0.11%)
> 557.xz_r (-0.34%)
> Geomean: (-0.23%)
>
> 503.bwaves_r (0.00%)
> 507.cactuBSSN_r (-1.88%)
> 508.namd_r (0.00%)
> 510.parest_r (-0.56%)
> 511.povray_r (0.49%)
> 519.lbm_r (-1.28%)
> 521.wrf_r (-0.28%)
> 526.blender_r (0.55%)
> 527.cam4_r (-0.20%)
> 538.imagick_r (2.52%)
> 544.nab_r (-0.18%)
> 549.fotonik3d_r (-0.51%)
> 554.roms_r (-0.22%)
> Geomean: (0.00%)
I wonder why the patch seems to have more effect on SPECint, which should not
care much about float<->double conversions?
> number of vxorp[ds]:
>
> before after difference
> 14570 4515 -69%
>
> OK for trunk?
This looks very nice though.
+/* At function entry, generate a single
+ vxorps %xmmN, %xmmN, %xmmN
+ for all
+ vcvtss2sd op, %xmmN, %xmmX
+ vcvtsd2ss op, %xmmN, %xmmX
+ vcvtsi2ss op, %xmmN, %xmmX
+ vcvtsi2sd op, %xmmN, %xmmX
+ */
+
+static unsigned int
+remove_partial_avx_dependency (void)
+{
+ timevar_push (TV_MACH_DEP);
+
+ calculate_dominance_info (CDI_DOMINATORS);
+ df_set_flags (DF_DEFER_INSN_RESCAN);
+ df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+ df_md_add_problem ();
+ df_analyze ();
+
+ basic_block bb;
+ rtx_insn *insn, *set_insn;
+ rtx set;
+ rtx v4sf_const0 = NULL_RTX;
+
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ FOR_BB_INSNS (bb, insn)
+ {
+ if (!NONDEBUG_INSN_P (insn))
+ continue;
+
+ set = single_set (insn);
+ if (set)
+ {
+ machine_mode dest_vecmode, dest_mode;
+ rtx src = SET_SRC (set);
+ rtx dest, vec, zero;
+
+ /* Check for conversions to SF or DF. */
+ switch (GET_CODE (src))
+ {
+ case FLOAT_TRUNCATE:
+ /* DF -> SF. */
+ if (GET_MODE (XEXP (src, 0)) != DFmode)
+ continue;
+ /* Fall through. */
+ case FLOAT_EXTEND:
+ /* SF -> DF. */
+ case FLOAT:
+ /* SI -> SF, SI -> DF, DI -> SF, DI -> DF. */
+ dest = SET_DEST (set);
+ dest_mode = GET_MODE (dest);
+ switch (dest_mode)
+ {
+ case E_SFmode:
+ dest_vecmode = V4SFmode;
+ break;
+ case E_DFmode:
+ dest_vecmode = V2DFmode;
+ break;
+ default:
+ continue;
+ }
+
+ if (!TARGET_64BIT
+ && GET_MODE (XEXP (src, 0)) == DImode)
+ continue;
+
+ if (!v4sf_const0)
+ v4sf_const0 = gen_reg_rtx (V4SFmode);
+
+ if (dest_vecmode == V4SFmode)
+ zero = v4sf_const0;
+ else
+ zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
+
+ /* Change source to vector mode. */
+ src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
+ src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
+ GEN_INT (HOST_WIDE_INT_1U));
+ /* Change destination to vector mode. */
+ vec = gen_reg_rtx (dest_vecmode);
+ /* Generate a XMM vector SET. */
+ set = gen_rtx_SET (vec, src);
+ set_insn = emit_insn_before (set, insn);
+ df_insn_rescan (set_insn);
+
+ src = gen_rtx_SUBREG (dest_mode, vec, 0);
+ set = gen_rtx_SET (dest, src);
+
+ /* Drop possible dead definitions. */
+ PATTERN (insn) = set;
+
+ INSN_CODE (insn) = -1;
+ recog_memoized (insn);
+ df_insn_rescan (insn);
+ break;
+
+ default:
+ break;
+ }
+ }
+ }
+ }
+
+ if (v4sf_const0)
+ {
+ /* Generate a single vxorps at function entry and perform df
+ rescan. */
+ bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
+ insn = BB_HEAD (bb);
+ set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));
+ set_insn = emit_insn_after (set, insn);
+ df_insn_rescan (set_insn);
+ df_process_deferred_rescans ();
+ }
It seems suboptimal to place the const0 at the entry of the function - if the
conversion happens in a cold region of the function, this will just increase
register pressure. I guess the right answer would be to look for the
postdominance frontier of the set of all uses of the zero register?
Honza