Hello! As exposed by corner cases in PR 89902 and PR 89903, we can't always reliably convert variable DImode shifts from a scalar to a vector instruction. The problem is in count register of a SSE vector shift, where the full DImode value is considered as a shift argument. Scalar operations with general register set use size-masked value of a QImode count register, so we have to zero-extend count operand from QImode to DImode if we want to fully emulate scalar instruction with a vector instruction. The STV infrastructure does not correctly handle corner cases where shift operand and count operand are matched, e.g.: "psllq %xmm0, %xmm0", it substitutes every instance with its V2DImode subreg representation. It can happen that a DImode result of a previous vector operation is passed as a count operand without clearing high bits outside QImode subreg, leading to runtime failures (n.b.: SSE shifts don't mask count operand).
Attached patch removes unreliable support to STV variable DImode shifts. 2019-04-02 Uroš Bizjak <ubiz...@gmail.com> PR target/89902 PR target/89903 * config/i386/i386.c (dimode_scalar_to_vector_candidate_p): Return false for variable DImode shifts. (dimode_scalar_chain::compute_convert_gain): Do not handle register count operand in variable DImode shifts. (dimode_scalar_chain::make_vector_copies): Remove support to copy count argument of a variable shift instruction to a vector register. (dimode_scalar_chain::convert_reg): Remove support to convert count argument of a variable shift instruction. testsuite/ChangeLog: 2019-04-02 Uroš Bizjak <ubiz...@gmail.com> PR target/89902 PR target/89903 * gcc.target/i386/pr70799-4.c: Remove. * gcc.target/i386/pr70799-5.c: Remove. * gcc.target/i386/pr89902.c: New test. * gcc.target/i386/pr89903.c: Ditto. Patch was bootstrapped and regression tested on x86_64-linux-gnu {,-m32}. I plan to commit the patch later today to mainline and gcc-8 branch. Uros.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 6926085fdd2f..a7544946e0ad 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1058,16 +1058,8 @@ dimode_scalar_to_vector_candidate_p (rtx_insn *insn) case ASHIFT: case LSHIFTRT: - if (!REG_P (XEXP (src, 1)) - && (!SUBREG_P (XEXP (src, 1)) - || SUBREG_BYTE (XEXP (src, 1)) != 0 - || !REG_P (SUBREG_REG (XEXP (src, 1)))) - && (!CONST_INT_P (XEXP (src, 1)) - || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63))) - return false; - - if (GET_MODE (XEXP (src, 1)) != QImode - && !CONST_INT_P (XEXP (src, 1))) + if (!CONST_INT_P (XEXP (src, 1)) + || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)) return false; break; @@ -1664,15 +1656,10 @@ dimode_scalar_chain::compute_convert_gain () { if (CONST_INT_P (XEXP (src, 0))) gain -= vector_const_cost (XEXP (src, 0)); - if (CONST_INT_P (XEXP (src, 1))) - { - gain += ix86_cost->shift_const; - if (INTVAL (XEXP (src, 1)) >= 32) - gain -= COSTS_N_INSNS (1); - } - else - /* Additional gain for omitting two CMOVs. */ - gain += ix86_cost->shift_var + COSTS_N_INSNS (2); + + gain += ix86_cost->shift_const; + if (INTVAL (XEXP (src, 1)) >= 32) + gain -= COSTS_N_INSNS (1); } else if (GET_CODE (src) == PLUS || GET_CODE (src) == MINUS @@ -1788,60 +1775,14 @@ dimode_scalar_chain::make_vector_copies (unsigned regno) { rtx reg = regno_reg_rtx[regno]; rtx vreg = gen_reg_rtx (DImode); - bool count_reg = false; df_ref ref; for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref)) if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) { - df_ref use; - - /* Detect the count register of a shift instruction. */ - for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use)) - if (bitmap_bit_p (insns, DF_REF_INSN_UID (use))) - { - rtx_insn *insn = DF_REF_INSN (use); - rtx def_set = single_set (insn); - - gcc_assert (def_set); - - rtx src = SET_SRC (def_set); - - if ((GET_CODE (src) == ASHIFT - || GET_CODE (src) == ASHIFTRT - || GET_CODE (src) == LSHIFTRT) - && !CONST_INT_P (XEXP (src, 1)) - && reg_or_subregno (XEXP (src, 1)) == regno) - count_reg = true; - } - start_sequence (); - if (count_reg) - { - rtx qreg = gen_lowpart (QImode, reg); - rtx tmp = gen_reg_rtx (SImode); - if (TARGET_ZERO_EXTEND_WITH_AND - && optimize_function_for_speed_p (cfun)) - { - emit_move_insn (tmp, const0_rtx); - emit_insn (gen_movstrictqi - (gen_lowpart (QImode, tmp), qreg)); - } - else - emit_insn (gen_rtx_SET - (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg))); - - if (!TARGET_INTER_UNIT_MOVES_TO_VEC) - { - rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP); - emit_move_insn (slot, tmp); - tmp = copy_rtx (slot); - } - - emit_insn (gen_zero_extendsidi2 (vreg, tmp)); - } - else if (!TARGET_INTER_UNIT_MOVES_TO_VEC) + if (!TARGET_INTER_UNIT_MOVES_TO_VEC) { rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP); emit_move_insn (adjust_address (tmp, SImode, 0), @@ -1889,25 +1830,8 @@ dimode_scalar_chain::make_vector_copies (unsigned regno) if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))) { rtx_insn *insn = DF_REF_INSN (ref); - if (count_reg) - { - rtx def_set = single_set (insn); - gcc_assert (def_set); - rtx src = SET_SRC (def_set); - - if ((GET_CODE (src) == ASHIFT - || GET_CODE (src) == ASHIFTRT - || GET_CODE (src) == LSHIFTRT) - && !CONST_INT_P (XEXP (src, 1)) - && reg_or_subregno (XEXP (src, 1)) == regno) - { - XEXP (src, 0) = replace_with_subreg (XEXP (src, 0), reg, reg); - XEXP (src, 1) = vreg; - } - } - else - replace_with_subreg_in_insn (insn, reg, vreg); + replace_with_subreg_in_insn (insn, reg, vreg); if (dump_file) fprintf (dump_file, " Replaced r%d with r%d in insn %d\n", @@ -2010,43 +1934,7 @@ dimode_scalar_chain::convert_reg (unsigned regno) rtx src = SET_SRC (def_set); rtx dst = SET_DEST (def_set); - if ((GET_CODE (src) == ASHIFT - || GET_CODE (src) == ASHIFTRT - || GET_CODE (src) == LSHIFTRT) - && !CONST_INT_P (XEXP (src, 1)) - && reg_or_subregno (XEXP (src, 1)) == regno) - { - rtx tmp2 = gen_reg_rtx (V2DImode); - - start_sequence (); - - if (TARGET_SSE4_1) - emit_insn (gen_sse4_1_zero_extendv2qiv2di2 - (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0))); - else - { - rtx vec_cst - = gen_rtx_CONST_VECTOR (V2DImode, - gen_rtvec (2, GEN_INT (0xff), - const0_rtx)); - vec_cst - = validize_mem (force_const_mem (V2DImode, vec_cst)); - - emit_insn (gen_rtx_SET - (tmp2, - gen_rtx_AND (V2DImode, - gen_rtx_SUBREG (V2DImode, reg, 0), - vec_cst))); - } - rtx_insn *seq = get_insns (); - end_sequence (); - - emit_insn_before (seq, insn); - - XEXP (src, 0) = replace_with_subreg (XEXP (src, 0), reg, reg); - XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0); - } - else if (!MEM_P (dst) || !REG_P (src)) + if (!MEM_P (dst) || !REG_P (src)) replace_with_subreg_in_insn (insn, reg, reg); bitmap_clear_bit (conv, INSN_UID (insn));