On 20 May 23:27, Vladimir Makarov wrote:
>
>
> On 20/05/15 04:17 AM, Ilya Enkovich wrote:
> >On 19 May 11:22, Vladimir Makarov wrote:
> >>On 05/18/2015 08:13 AM, Ilya Enkovich wrote:
> >>>2015-05-06 17:18 GMT+03:00 Ilya Enkovich <[email protected]>:
> >>>Hi Vladimir,
> >>>
> >>>Could you please comment on this?
> >>>
> >>>
> >>Ilya, I think the idea is worth trying but the results might be
> >>mixed. It is hard to say until you actually try it (as an example, Jan
> >>implemented -fpmath=both and it looks like a pretty good idea, at least
> >>to me, but when I checked SPEC2000 the results were not so good even
> >>with IRA/LRA).
> >>
> >>Long ago I did some experiments and found that spilling into SSE
> >>registers would be beneficial for Intel CPUs but not for AMD ones. As I
> >>remember, I also found that storing several scalar values into one SSE
> >>reg and extracting them when you need to do some (fp) arithmetic would
> >>be beneficial for AMD but not for Intel CPUs. In the literature the
> >>more general approach is called a bitwise register allocator. Actually
> >>it would be a pretty big IRA/LRA project from which some targets might
> >>benefit.
> >I suspect such things cannot be done trivially in IRA/LRA and I want to
> >make this an independent optimization because its application seems to be
> >quite narrow.
> Yes, that is true. The complications and implementation complexity
> of such a project would probably be very high, and positive results
> are not certain. So the project might have little value.
> >>
> >>As for the wrong code, it is hard for me to say anything w/o RA
> >>dumps. If you send me the dump (-fira-verbose=16), I might say more
> >>about what is going on.
> >>
> >>
> >Here are some dumps from my reproducer. The problematic register is r108.
> >
> Thanks. To me it looks like an inheritance bug. It is really hard
> to fix the bug w/o the source code. Could you send me your patch so
> that I can debug RA with it and investigate further?
>
Sure! Here is a patch and a testcase. I applied the patch to r222125. Command
to reproduce:
gcc -m32 -msse4.2 -O2 pr65105.c -S -march=slm -fPIE
Thanks,
Ilya
void
counter (long long l);

void
test (long long *arr)
{
  register unsigned long long tmp;
  tmp = arr[0] | arr[1] & arr[2];
  while (tmp)
    {
      counter (tmp);
      tmp = *(arr++) & tmp;
    }
}
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index a607ef4..a9dbfea 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -2554,6 +2554,789 @@ rest_of_handle_insert_vzeroupper (void)
return 0;
}
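+
+/* Return true if INSN defines a hard register (other than a pure
+ clobber) or uses a hard register outside of a memory address. */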
+static bool
+has_non_address_hard_reg (rtx_insn *insn)
+{
+ df_ref ref;
+ FOR_EACH_INSN_DEF (ref, insn)
+ if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
+ && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER))
+ return true;
+
+ FOR_EACH_INSN_USE (ref, insn)
+ if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
+ return true;
+
+ return false;
+}
+
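+/* Return true if INSN is a candidate for conversion: a single set
+ of a DImode PLUS, MINUS, IOR, XOR or AND with register or memory
+ operands and no hard register references. */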
+static bool
+scalar_to_vector_candidate_p (rtx_insn *insn)
+{
+ rtx def_set = single_set (insn);
+
+ if (!def_set)
+ return false;
+
+ if (has_non_address_hard_reg (insn))
+ return false;
+
+ rtx src = SET_SRC (def_set);
+ rtx dst = SET_DEST (def_set);
+
+ /* We are interested in DImode -> V2DImode conversion only. */
+ if (GET_MODE (src) != DImode
+ || GET_MODE (dst) != DImode)
+ return false;
+
+ if (!REG_P (dst) && !MEM_P (dst))
+ return false;
+
+ switch (GET_CODE (src))
+ {
+ case PLUS:
+ case MINUS:
+ case IOR:
+ case XOR:
+ case AND:
+ break;
+
+ default:
+ return false;
+ }
+
+ if (!REG_P (XEXP (src, 0)) && !MEM_P (XEXP (src, 0)))
+ return false;
+
+ if (!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
+ return false;
+
+ if (GET_MODE (XEXP (src, 0)) != DImode
+ || GET_MODE (XEXP (src, 1)) != DImode)
+ return false;
+
+ return true;
+}
+
+/* Remove regs having both convertible and
+ not convertible definitions. */
+static void
+remove_non_convertible_regs (bitmap insns)
+{
+ bitmap_iterator bi;
+ unsigned id;
+ bitmap regs = BITMAP_ALLOC (NULL);
+
+ EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
+ {
+ rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
+ rtx reg = SET_DEST (def_set);
+
+ if (!REG_P (reg) || bitmap_bit_p (regs, REGNO (reg)))
+ continue;
+
+ for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
+ def;
+ def = DF_REF_NEXT_REG (def))
+ {
+ if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
+ {
+ if (dump_file)
+ fprintf (dump_file,
+ "r%d has non convertible definition in insn %d\n",
+ REGNO (reg), DF_REF_INSN_UID (def));
+
+ bitmap_set_bit (regs, REGNO (reg));
+ break;
+ }
+ }
+ }
+
+ EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
+ {
+ for (df_ref def = DF_REG_DEF_CHAIN (id);
+ def;
+ def = DF_REF_NEXT_REG (def))
+ if (bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
+ {
+ if (dump_file)
+ fprintf (dump_file, "Removing insn %d from candidates list\n",
+ DF_REF_INSN_UID (def));
+
+ bitmap_clear_bit (insns, DF_REF_INSN_UID (def));
+ }
+ }
+
+ BITMAP_FREE (regs);
+}
+
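+/* Return true if INSN is a DImode load, store or register copy
+ which may be converted together with a candidate insn. */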
+static bool
+convertible_insn_p (rtx_insn *insn)
+{
+ rtx def_set = single_set (insn);
+
+ if (!def_set)
+ return false;
+
+ if (has_non_address_hard_reg (insn))
+ return false;
+
+ rtx src = SET_SRC (def_set);
+ rtx dst = SET_DEST (def_set);
+
+ if (GET_MODE (src) != DImode
+ || GET_MODE (dst) != DImode)
+ return false;
+
+ /* Convert simple loads, stores and register copies. */
+ return ((REG_P (src) && MEM_P (dst))
+ || (MEM_P (src) && REG_P (dst))
+ || (REG_P (src) && REG_P (dst)));
+}
+
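+/* A chain of DImode instructions which is converted to vector mode
+ as a whole, together with the registers which need copies between
+ the scalar and vector representations. */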
+class scalar_chain
+{
+ public:
+ scalar_chain ();
+ ~scalar_chain ();
+
+ static unsigned max_id;
+
+ unsigned int chain_id;
+ bitmap queue;
+ bitmap insns;
+ bitmap defs;
+ bitmap defs_conv;
+
+ void build (bitmap candidates, unsigned insn_uid);
+ int compute_convert_gain ();
+ void convert ();
+
+ private:
+ void add_insn (bitmap candidates, unsigned insn_uid);
+ void add_to_queue (unsigned insn_uid);
+ void mark_dual_mode_def (df_ref def);
+ void analyze_register_chain (bitmap candidates, df_ref ref);
+ void convert_insn (rtx_insn *insn);
+ void convert_op (rtx *op, rtx_insn *insn);
+ void convert_insn_defs (unsigned regno);
+ void make_scalar_copies (unsigned regno);
+ void make_vector_copies (unsigned regno);
+};
+
+unsigned scalar_chain::max_id = 0;
+
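+/* Initialize a new chain and assign it a unique identifier. */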
+scalar_chain::scalar_chain ()
+{
+ chain_id = ++max_id;
+
+ if (dump_file)
+ fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
+
+ bitmap_obstack_initialize (NULL);
+ insns = BITMAP_ALLOC (NULL);
+ defs = BITMAP_ALLOC (NULL);
+ defs_conv = BITMAP_ALLOC (NULL);
+ queue = NULL;
+}
+
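+/* Free all bitmaps used by the chain. */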
+scalar_chain::~scalar_chain ()
+{
+ BITMAP_FREE (insns);
+ BITMAP_FREE (defs);
+ BITMAP_FREE (defs_conv);
+ bitmap_obstack_release (NULL);
+}
+
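+/* Add instruction with uid INSN_UID into the chain's analysis queue
+ unless it is already in the chain or in the queue. */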
+void
+scalar_chain::add_to_queue (unsigned insn_uid)
+{
+ if (bitmap_bit_p (insns, insn_uid)
+ || bitmap_bit_p (queue, insn_uid))
+ return;
+
+ if (dump_file)
+ fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
+ insn_uid, chain_id);
+ bitmap_set_bit (queue, insn_uid);
+}
+
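+/* Mark the register defined by DEF as requiring both scalar and
+ vector modes. */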
+void
+scalar_chain::mark_dual_mode_def (df_ref def)
+{
+ gcc_assert (DF_REF_REG_DEF_P (def));
+
+ if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
+ return;
+
+ if (dump_file)
+ fprintf (dump_file,
+ " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
+ DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
+
+ bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
+}
+
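+/* Follow the def-use/use-def links of REF. Convertible insns found
+ this way are queued for the chain; references which cannot be
+ converted cause the register to be marked as requiring both modes. */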
+void
+scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
+{
+ df_link *chain;
+ for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
+ {
+ unsigned uid = DF_REF_INSN_UID (chain->ref);
+ if (!DF_REF_REG_MEM_P (chain->ref))
+ {
+ if (bitmap_bit_p (insns, uid))
+ continue;
+
+ if (bitmap_bit_p (candidates, uid))
+ {
+ add_to_queue (uid);
+ continue;
+ }
+
+ if (!DF_REF_REG_MEM_P (chain->ref)
+ && convertible_insn_p (DF_REF_INSN (chain->ref)))
+ {
+ if (dump_file)
+ fprintf (dump_file, " Mark insn %d as convertible\n", uid);
+ bitmap_set_bit (candidates, uid);
+ add_to_queue (uid);
+ continue;
+ }
+ }
+
+ if (DF_REF_REG_DEF_P (chain->ref))
+ {
+ if (dump_file)
+ fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
+ DF_REF_REGNO (chain->ref), uid);
+ mark_dual_mode_def (chain->ref);
+ }
+ else
+ {
+ if (dump_file)
+ fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
+ DF_REF_REGNO (chain->ref), uid);
+ mark_dual_mode_def (ref);
+ }
+ }
+}
+
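+/* Add instruction with uid INSN_UID into the chain and analyze all
+ registers it defines and uses. */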
+void
+scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
+{
+ if (bitmap_bit_p (insns, insn_uid))
+ return;
+
+ if (dump_file)
+ fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
+
+ bitmap_set_bit (insns, insn_uid);
+
+ rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
+ rtx def_set = single_set (insn);
+ if (def_set && REG_P (SET_DEST (def_set)))
+ bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
+
+ df_ref ref;
+ for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
+ analyze_register_chain (candidates, ref);
+ for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
+ if (!DF_REF_REG_MEM_P (ref))
+ analyze_register_chain (candidates, ref);
+}
+
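+/* Build the chain starting from the candidate instruction with uid
+ INSN_UID, pulling in related insns from CANDIDATES. */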
+void
+scalar_chain::build (bitmap candidates, unsigned insn_uid)
+{
+ queue = BITMAP_ALLOC (NULL);
+ bitmap_set_bit (queue, insn_uid);
+
+ if (dump_file)
+ fprintf (dump_file, "Building chain #%d...\n", chain_id);
+
+ while (!bitmap_empty_p (queue))
+ {
+ insn_uid = bitmap_first_set_bit (queue);
+ bitmap_clear_bit (queue, insn_uid);
+ bitmap_clear_bit (candidates, insn_uid);
+ add_insn (candidates, insn_uid);
+ }
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "Collected chain #%d...\n", chain_id);
+ fprintf (dump_file, " insns: ");
+ dump_bitmap (dump_file, insns);
+ if (!bitmap_empty_p (defs_conv))
+ {
+ bitmap_iterator bi;
+ unsigned id;
+ const char *comma = "";
+ fprintf (dump_file, " defs to convert: ");
+ EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
+ {
+ fprintf (dump_file, "%sr%d", comma, id);
+ comma = ", ";
+ }
+ fprintf (dump_file, "\n");
+ }
+ }
+
+ BITMAP_FREE (queue);
+}
+
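+/* Estimate the gain of converting the chain into vector mode. A
+ positive result means the conversion is profitable. */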
+int
+scalar_chain::compute_convert_gain ()
+{
+ bitmap_iterator bi;
+ unsigned insn_uid;
+ int gain = 0;
+ int cost = 0;
+
+ if (dump_file)
+ fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
+
+ EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
+ {
+ rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
+ rtx def_set = single_set (insn);
+ rtx src = SET_SRC (def_set);
+ rtx dst = SET_DEST (def_set);
+
+ if (REG_P (src) && REG_P (dst))
+ gain += COSTS_N_INSNS (2) - ix86_cost->sse_move;
+ else if (REG_P (src) && MEM_P (dst))
+ gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
+ else if (MEM_P (src) && REG_P (dst))
+ gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
+ else if (GET_CODE (src) == PLUS
+ || GET_CODE (src) == MINUS
+ || GET_CODE (src) == IOR
+ || GET_CODE (src) == XOR
+ || GET_CODE (src) == AND)
+ gain += ix86_cost->add;
+ else
+ gcc_unreachable ();
+ }
+
+ if (dump_file)
+ fprintf (dump_file, " Instruction convertion gain: %d\n", gain);
+
+ EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
+ cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
+
+ if (dump_file)
+ fprintf (dump_file, " Registers convertion cost: %d\n", cost);
+
+ gain -= cost;
+
+ if (dump_file)
+ fprintf (dump_file, " Total gain: %d\n", gain);
+
+ return gain;
+}
+
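+/* Copy register REGNO into a vector register and replace its uses
+ in the chain's insns with the vector copy. */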
+void
+scalar_chain::make_vector_copies (unsigned regno)
+{
+ rtx reg = regno_reg_rtx[regno];
+ rtx sireg = gen_reg_rtx (SImode);
+ rtx vcopy = gen_rtx_SUBREG (V2DImode, sireg, 0);
+ rtx vcopy1 = gen_rtx_SUBREG (V4SImode, sireg, 0);
+ df_ref ref;
+
+ for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+ if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
+ {
+ start_sequence ();
+ if (TARGET_SSE4_1)
+ {
+ rtx tmp = gen_reg_rtx (SImode);
+ emit_move_insn (sireg, gen_rtx_SUBREG (SImode, reg, 0));
+ emit_insn (gen_sse4_1_pinsrd (vcopy1, vcopy1,
+ gen_rtx_SUBREG (SImode, reg, 4),
+ GEN_INT (2)));
+ }
+ else
+ {
+ rtx tmp = gen_reg_rtx (SImode);
+ emit_move_insn (sireg,
+ gen_rtx_SUBREG (SImode, reg, 0));
+ emit_move_insn (tmp, gen_rtx_SUBREG (SImode, reg, 4));
+ emit_move_insn (gen_rtx_SUBREG (V2DImode, tmp, 0),
+ gen_rtx_ASHIFT (V2DImode,
+ gen_rtx_SUBREG (V2DImode, tmp, 0),
+ GEN_INT (32)));
+ emit_move_insn (vcopy,
+ gen_rtx_IOR (V2DImode, vcopy,
+ gen_rtx_SUBREG (V2DImode, tmp, 0)));
+ }
+ emit_insn_after (get_insns (), DF_REF_INSN (ref));
+ end_sequence ();
+
+ if (dump_file)
+ fprintf (dump_file,
+ " Copied r%d to a vector register r%d for insn %d\n",
+ regno, REGNO (sireg), DF_REF_INSN_UID (ref));
+ }
+
+ for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+ if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
+ {
+ replace_rtx (DF_REF_INSN (ref), reg, vcopy);
+
+ if (dump_file)
+ fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
+ regno, REGNO (sireg), DF_REF_INSN_UID (ref));
+ }
+}
+
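+/* Convert all definitions of register REGNO within the chain to
+ vector mode and, if the register also has scalar uses, emit code
+ copying its value back into a scalar register. */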
+void
+scalar_chain::convert_insn_defs (unsigned regno)
+{
+ bool scalar_copy = bitmap_bit_p (defs_conv, regno);
+ rtx reg = regno_reg_rtx[regno];
+ rtx new_reg = NULL_RTX;
+ rtx scopy = NULL_RTX;
+ df_ref ref;
+ bitmap conv;
+
+ conv = BITMAP_ALLOC (NULL);
+ bitmap_copy (conv, insns);
+
+ /* Check whether we have a load or store. In that case we cannot
+ just convert the register to V2DImode and have to use a subreg. */
+ for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+ {
+ rtx def_set = single_set (DF_REF_INSN (ref));
+ if (def_set && MEM_P (SET_SRC (def_set)))
+ {
+ new_reg = gen_rtx_SUBREG (V2DImode, reg, 0);
+ break;
+ }
+ }
+
+ if (!new_reg)
+ for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+ {
+ rtx def_set = single_set (DF_REF_INSN (ref));
+ if (def_set
+ && MEM_P (SET_DEST (def_set))
+ && REG_P (SET_SRC (def_set)))
+ {
+ new_reg = gen_rtx_SUBREG (V2DImode, reg, 0);
+ break;
+ }
+ }
+
+ if (!new_reg)
+ new_reg = gen_rtx_SUBREG (V2DImode, reg, 0);
+
+ if (scalar_copy)
+ scopy = gen_reg_rtx (DImode);
+
+ for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+ {
+ rtx def_set = single_set (DF_REF_INSN (ref));
+ rtx src = SET_SRC (def_set);
+ rtx reg = DF_REF_REG (ref);
+
+ if (!MEM_P (src))
+ {
+ replace_rtx (DF_REF_INSN (ref), reg, new_reg);
+ bitmap_clear_bit (conv, DF_REF_INSN_UID (ref));
+ }
+
+ if (scalar_copy)
+ {
+ rtx vcopy = gen_reg_rtx (V2DImode);
+
+ start_sequence ();
+ emit_move_insn (vcopy, new_reg);
+ emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
+ gen_rtx_SUBREG (SImode, vcopy, 0));
+ emit_move_insn (vcopy,
+ gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
+ emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
+ gen_rtx_SUBREG (SImode, vcopy, 0));
+ emit_insn_after (get_insns (), DF_REF_INSN (ref));
+ end_sequence ();
+
+ if (dump_file)
+ fprintf (dump_file,
+ " Copied r%d to a scalar register r%d for insn %d\n",
+ regno, REGNO (scopy), DF_REF_INSN_UID (ref));
+ }
+ }
+
+ for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+ if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
+ {
+ if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
+ {
+ rtx def_set = single_set (DF_REF_INSN (ref));
+ if (!MEM_P (SET_DEST (def_set))
+ || !REG_P (SET_SRC (def_set)))
+ replace_rtx (DF_REF_INSN (ref), reg, new_reg);
+ bitmap_clear_bit (conv, DF_REF_INSN_UID (ref));
+ }
+ }
+ else
+ {
+ replace_rtx (DF_REF_INSN (ref), reg, scopy);
+ df_insn_rescan (DF_REF_INSN (ref));
+ }
+
+ BITMAP_FREE (conv);
+}
+
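+/* Copy register REGNO from its vector form back into a scalar
+ register after its definitions in the chain and replace its uses
+ outside of the chain with the scalar copy. */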
+void
+scalar_chain::make_scalar_copies (unsigned regno)
+{
+ rtx scopy = gen_reg_rtx (DImode);
+ rtx vcopy = gen_reg_rtx (V2DImode);
+ df_ref ref;
+
+ for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+ if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
+ {
+ rtx reg = DF_REF_REG (ref);
+
+ start_sequence ();
+ emit_move_insn (vcopy, reg);
+ emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
+ gen_rtx_SUBREG (SImode, vcopy, 0));
+ emit_move_insn (vcopy,
+ gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
+ emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
+ gen_rtx_SUBREG (SImode, vcopy, 0));
+ emit_insn_after (get_insns (), DF_REF_INSN (ref));
+ end_sequence ();
+
+ if (dump_file)
+ fprintf (dump_file,
+ " Copied r%d to a scalar register r%d for insn %d\n",
+ REGNO (reg), REGNO (scopy), DF_REF_INSN_UID (ref));
+ }
+
+ for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
+ if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
+ {
+ replace_rtx (DF_REF_INSN (ref), DF_REF_REG (ref), scopy);
+ df_insn_rescan (DF_REF_INSN (ref));
+
+ if (dump_file)
+ fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
+ regno, REGNO (scopy), DF_REF_INSN_UID (ref));
+ }
+}
+
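+/* Convert operand OP of INSN into vector mode. Memory operands are
+ preloaded into a register first. */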
+void
+scalar_chain::convert_op (rtx *op, rtx_insn *insn)
+{
+ *op = copy_rtx_if_shared (*op);
+
+ if (MEM_P (*op))
+ {
+ rtx tmp = gen_reg_rtx (DImode);
+ rtx tmpv2di = gen_rtx_SUBREG (V2DImode, tmp, 0);
+
+ emit_insn_before (gen_move_insn (tmp, *op), insn);
+ *op = tmpv2di;
+
+ if (dump_file)
+ fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
+ INSN_UID (insn), REGNO (tmp));
+ }
+ else if (REG_P (*op))
+ {
+ //*op = gen_rtx_SUBREG (V2DImode, *op, 0);
+ gcc_assert (bitmap_bit_p
+ (insns, DF_REF_INSN_UID (DF_REG_DEF_CHAIN (REGNO (*op))))
+ || bitmap_bit_p (defs_conv, REGNO (*op)));
+ }
+ else
+ {
+ gcc_assert (SUBREG_P (*op));
+ gcc_assert (GET_MODE (*op) == V2DImode);
+ }
+}
+
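+/* Convert INSN into its vector mode equivalent by adjusting the
+ modes of its operands and rescanning it. */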
+void
+scalar_chain::convert_insn (rtx_insn *insn)
+{
+ rtx def_set = single_set (insn);
+ rtx src = copy_rtx_if_shared (SET_SRC (def_set));
+ rtx dst = SET_DEST (def_set);
+
+ if (MEM_P (dst))
+ {
+ if (!REG_P (src))
+ {
+ /* The destination is memory while the source becomes a vector
+ operation, therefore a temporary register is required. */
+ rtx tmp = gen_reg_rtx (DImode);
+ emit_insn_after (gen_move_insn (dst, tmp), insn);
+ dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
+ }
+ }
+#if 0
+ else if (REG_P (dst))
+ {
+ df_ref def = DF_REG_DEF_CHAIN (REGNO (dst));
+
+ if (MEM_P (src))
+ {
+ df_link *link;
+ rtx subreg = gen_rtx_SUBREG (V2DImode, dst, 0);
+ for (link = DF_REF_CHAIN (def); link; link = link->next)
+ if (bitmap_bit_p (insns, DF_REF_INSN_UID (link->ref)))
+ {
+ /* replace_rtx dives into subregs and would recurse, so go
+ through a temporary register. */
+ rtx tmp = gen_reg_rtx (VOIDmode);
+ replace_rtx (DF_REF_INSN (link->ref), dst, tmp);
+ replace_rtx (DF_REF_INSN (link->ref), tmp, subreg);
+
+ if (dump_file)
+ fprintf (dump_file, " Replace r%d with a subreg in insn
%d\n",
+ REGNO (dst), DF_REF_INSN_UID (link->ref));
+ }
+ }
+ else
+ {
+ PUT_MODE (dst, V2DImode);
+ }
+ }
+ else
+ gcc_unreachable ();
+#endif
+
+ switch (GET_CODE (src))
+ {
+ case PLUS:
+ case MINUS:
+ case IOR:
+ case XOR:
+ case AND:
+ convert_op (&XEXP (src, 0), insn);
+ convert_op (&XEXP (src, 1), insn);
+ PUT_MODE (src, V2DImode);
+ break;
+
+ case MEM:
+ if (!REG_P (dst))
+ convert_op (&src, insn);
+ break;
+
+ case REG:
+ break;
+
+ case SUBREG:
+ gcc_assert (GET_MODE (src) == V2DImode);
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ SET_SRC (def_set) = src;
+ SET_DEST (def_set) = dst;
+
+ /* Drop possible dead definitions. */
+ PATTERN (insn) = def_set;
+
+ INSN_CODE (insn) = -1;
+ recog_memoized (insn);
+ df_insn_rescan (insn);
+}
+
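+/* Convert the whole chain into vector mode: convert definitions and
+ insns, then make vector copies for registers defined outside of
+ the chain but used in it. */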
+void
+scalar_chain::convert ()
+{
+ bitmap_iterator bi;
+ unsigned id;
+
+ if (dump_file)
+ fprintf (dump_file, "Converting chain #%d...\n", chain_id);
+
+ EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
+ convert_insn_defs (id);
+
+ EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
+ convert_insn (DF_INSN_UID_GET (id)->insn);
+
+ EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
+ make_vector_copies (id);
+}
+
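+/* Main pass entry point. Find candidate instructions, build chains
+ from them and convert profitable chains into vector mode. */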
+static unsigned int
+convert_scalars_to_vector ()
+{
+ basic_block bb;
+ bitmap candidates;
+
+ bitmap_obstack_initialize (NULL);
+ candidates = BITMAP_ALLOC (NULL);
+
+ calculate_dominance_info (CDI_DOMINATORS);
+ df_set_flags (DF_DEFER_INSN_RESCAN);
+ df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
+ df_md_add_problem ();
+ df_analyze ();
+
+ /* 1. Find all instructions we want to convert into vector mode. */
+ if (dump_file)
+ fprintf (dump_file, "Searching for mode convertion candidates...\n");
+
+ FOR_EACH_BB_FN (bb, cfun)
+ {
+ rtx_insn *insn;
+ FOR_BB_INSNS (bb, insn)
+ if (scalar_to_vector_candidate_p (insn))
+ {
+ if (dump_file)
+ fprintf (dump_file, " insn %d is marked as a candidate\n",
+ INSN_UID (insn));
+
+ bitmap_set_bit (candidates, INSN_UID (insn));
+ }
+ }
+
+ remove_non_convertible_regs (candidates);
+
+ if (bitmap_empty_p (candidates))
+ if (dump_file)
+ fprintf (dump_file, "There are no candidates for optimization.\n");
+
+ while (!bitmap_empty_p (candidates))
+ {
+ unsigned uid = bitmap_first_set_bit (candidates);
+ scalar_chain chain;
+
+ /* Find the instruction chain we want to convert to vector mode.
+ Check all uses and definitions to estimate all required
+ conversions. */
+ chain.build (candidates, uid);
+
+ if (chain.compute_convert_gain () > 0)
+ chain.convert ();
+ else
+ if (dump_file)
+ fprintf (dump_file, "Chain #%d convertion is not profitable\n",
+ chain.chain_id);
+ }
+
+ BITMAP_FREE (candidates);
+ bitmap_obstack_release (NULL);
+ df_process_deferred_rescans ();
+ df_verify ();
+
+ return 0;
+}
+
namespace {
const pass_data pass_data_insert_vzeroupper =
@@ -2591,6 +3374,39 @@ public:
}; // class pass_insert_vzeroupper
+const pass_data pass_data_stv =
+{
+ RTL_PASS, /* type */
+ "stv", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_NONE, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_stv : public rtl_opt_pass
+{
+public:
+ pass_stv (gcc::context *ctxt)
+ : rtl_opt_pass (pass_data_stv, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ virtual bool gate (function *)
+ {
+ return !TARGET_64BIT && TARGET_SSE2 && optimize > 1;
+ }
+
+ virtual unsigned int execute (function *)
+ {
+ return convert_scalars_to_vector ();
+ }
+
+}; // class pass_stv
+
} // anon namespace
rtl_opt_pass *
@@ -2599,6 +3415,12 @@ make_pass_insert_vzeroupper (gcc::context *ctxt)
return new pass_insert_vzeroupper (ctxt);
}
+rtl_opt_pass *
+make_pass_stv (gcc::context *ctxt)
+{
+ return new pass_stv (ctxt);
+}
+
/* Return true if a red-zone is in use. */
static inline bool
@@ -4363,12 +5185,18 @@ ix86_option_override (void)
= { pass_insert_vzeroupper, "reload",
1, PASS_POS_INSERT_AFTER
};
+ opt_pass *pass_stv = make_pass_stv (g);
+ struct register_pass_info stv_info
+ = { pass_stv, "combine",
+ 1, PASS_POS_INSERT_AFTER
+ };
ix86_option_override_internal (true, &global_options, &global_options_set);
/* This needs to be done at start up. It's convenient to do it here. */
register_pass (&insert_vzeroupper_info);
+ register_pass (&stv_info);
}
/* Implement the TARGET_OFFLOAD_OPTIONS hook. */
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 7195882..6aae22c 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -972,6 +972,11 @@
(HI "TARGET_HIMODE_MATH")
SI])
+;; Math-dependant integer modes with DImode.
+(define_mode_iterator SWIM1248x [(QI "TARGET_QIMODE_MATH")
+ (HI "TARGET_HIMODE_MATH")
+ SI DI])
+
;; Math-dependant single word integer modes without QImode.
(define_mode_iterator SWIM248 [(HI "TARGET_HIMODE_MATH")
SI (DI "TARGET_64BIT")])
@@ -7731,9 +7736,9 @@
;; it should be done with splitters.
(define_expand "and<mode>3"
- [(set (match_operand:SWIM 0 "nonimmediate_operand")
- (and:SWIM (match_operand:SWIM 1 "nonimmediate_operand")
- (match_operand:SWIM 2 "<general_szext_operand>")))]
+ [(set (match_operand:SWIM1248x 0 "nonimmediate_operand")
+ (and:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand")
+ (match_operand:SWIM1248x 2 "<general_szext_operand>")))]
""
{
machine_mode mode = <MODE>mode;
@@ -7811,6 +7816,43 @@
(const_string "*")))
(set_attr "mode" "SI,DI,DI,SI,DI")])
+(define_insn_and_split "*anddi3_doubleword"
+ [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r")
+ (and:DI
+ (match_operand:DI 1 "nonimmediate_operand" "%0,0,0")
+ (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,rm")))
+ (clobber (reg:CC FLAGS_REG))]
+ "!TARGET_64BIT && ix86_binary_operator_ok (AND, DImode, operands)"
+ "#"
+ "!TARGET_64BIT && reload_completed"
+ [(parallel [(set (match_dup 0)
+ (and:SI (match_dup 1) (match_dup 2)))
+ (clobber (reg:CC FLAGS_REG))])
+ (parallel [(set (match_dup 3)
+ (and:SI (match_dup 4) (match_dup 5)))
+ (clobber (reg:CC FLAGS_REG))])]
+ "split_double_mode (DImode, &operands[0], 3, &operands[0], &operands[3]);")
+
+(define_insn_and_split "*zext<mode>_doubleword"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (zero_extend:DI (match_operand:SWI24 1 "nonimmediate_operand" "rm")))]
+ "!TARGET_64BIT"
+ "#"
+ "!TARGET_64BIT && reload_completed"
+ [(set (match_dup 0) (zero_extend:SI (match_dup 1)))
+ (set (match_dup 2) (const_int 0))]
+ "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);")
+
+(define_insn_and_split "*zextqi_doubleword"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (zero_extend:DI (match_operand:QI 1 "nonimmediate_operand" "qm")))]
+ "!TARGET_64BIT"
+ "#"
+ "!TARGET_64BIT && reload_completed"
+ [(set (match_dup 0) (zero_extend:SI (match_dup 1)))
+ (set (match_dup 2) (const_int 0))]
+ "split_double_mode (DImode, &operands[0], 1, &operands[0], &operands[2]);")
+
(define_insn "*andsi_1"
[(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r,Ya,!k")
(and:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,qm,k")
@@ -8298,9 +8340,9 @@
;; If this is considered useful, it should be done with splitters.
(define_expand "<code><mode>3"
- [(set (match_operand:SWIM 0 "nonimmediate_operand")
- (any_or:SWIM (match_operand:SWIM 1 "nonimmediate_operand")
- (match_operand:SWIM 2 "<general_operand>")))]
+ [(set (match_operand:SWIM1248x 0 "nonimmediate_operand")
+ (any_or:SWIM1248x (match_operand:SWIM1248x 1 "nonimmediate_operand")
+ (match_operand:SWIM1248x 2 "<general_operand>")))]
""
"ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;")
@@ -8318,6 +8360,23 @@
[(set_attr "type" "alu,alu,msklog")
(set_attr "mode" "<MODE>")])
+(define_insn_and_split "*<code>di3_doubleword"
+ [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r")
+ (any_or:DI
+ (match_operand:DI 1 "nonimmediate_operand" "%0,0,0")
+ (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,rm")))
+ (clobber (reg:CC FLAGS_REG))]
+ "!TARGET_64BIT && ix86_binary_operator_ok (<CODE>, DImode, operands)"
+ "#"
+ "!TARGET_64BIT && reload_completed"
+ [(parallel [(set (match_dup 0)
+ (any_or:SI (match_dup 1) (match_dup 2)))
+ (clobber (reg:CC FLAGS_REG))])
+ (parallel [(set (match_dup 3)
+ (any_or:SI (match_dup 4) (match_dup 5)))
+ (clobber (reg:CC FLAGS_REG))])]
+ "split_double_mode (DImode, &operands[0], 3, &operands[0], &operands[3]);")
+
(define_insn "*<code>hi_1"
[(set (match_operand:HI 0 "nonimmediate_operand" "=r,rm,!k")
(any_or:HI