Hello Richard:
On 12/08/24 5:30 pm, Richard Sandiford wrote:
> Ajit Agarwal <[email protected]> writes:
>> [...]
>> +static void
>> +update_change (set_info *set)
>> +{
>> + if (!set->has_any_uses ())
>> + return;
>> +
>> + auto *use = *set->all_uses ().begin ();
>> + do
>> + {
>> + auto *next_use = use->next_use ();
>> + if (use->is_in_phi ())
>> + {
>> + update_change (use->phi ());
>> + }
>> + else
>> + {
>> + crtl->ssa->remove_use (use);
>
> This isn't right. AFAICT it's simply removing uses from a debug
> instruction even though the debug instruction pattern still refers
> to the register.
>
> Instead, the debug instruction needs to be updated to refer to the
> correct half of the new double-width load result. If the update fails
> for any reason, the debug insn should be "reset" (i.e. changed to
>
> INSN_VAR_LOCATION_LOC (use_rtl) = gen_rtx_UNKNOWN_VAR_LOC ();
>
> see insn_combination::substitute_debug_use in late-combine.cc for
> an example).
>
Addressed in v8 of the patch.
>> + }
>> + use = next_use;
>> + }
>> + while (use);
>> +}
>> [...]
>> +// Generate new reg rtx with copy of OLD_DEST for OOmode pair.
>> +static rtx
>> +new_reg_rtx (rtx old_dest)
>> +{
>> + rtx new_dest_exp = gen_reg_rtx (OOmode);
>> + ORIGINAL_REGNO (new_dest_exp) = ORIGINAL_REGNO (old_dest);
>> + REG_USERVAR_P (new_dest_exp) = REG_USERVAR_P (old_dest);
>> + REG_POINTER (new_dest_exp) = REG_POINTER (old_dest);
>> + REG_ATTRS (new_dest_exp) = REG_ATTRS (old_dest);
>> + max_regno = max_reg_num ();
>> + return new_dest_exp;
>> +}
>
> The new register is a different size and mode from OLD_DEST, so it isn't
> appropriate to copy across this much information. The caller should just
> use gen_reg_rtx directly.
>
Addressed in v8 of the patch.
>> +
>> +// Set subreg with use of INSN given SRC rtx instruction.
>> +static void
>> +set_load_subreg (insn_info *i1, rtx src)
>> +{
>> + rtx set = single_set (i1->rtl());
>> + rtx old_dest = SET_DEST (set);
>> +
>> + for (auto def : i1->defs ())
>> + {
>> + auto set = dyn_cast<set_info *> (def);
>> + for (auto use : set->nondebug_insn_uses ())
>> + {
>> + insn_info *info = use->insn ();
>> + if (!info || !info->rtl ())
>> + continue;
>> +
>> + rtx_insn *rtl_insn = info->rtl ();
>> + df_ref ref;
>> +
>> + FOR_EACH_INSN_USE (ref, rtl_insn)
>> + {
>> + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ()));
>> + if (REG_P (dest_exp)
>> + && DF_REF_REGNO (ref) == REGNO (dest_exp))
>> + {
>> + rtx *loc = DF_REF_LOC (ref);
>> + insn_propagation prop (rtl_insn, old_dest, src);
>> + if (GET_CODE (*loc) == SUBREG)
>> + {
>> + if (!prop.apply_to_pattern (loc))
>> + {
>> + if (dump_file != NULL)
>> + {
>> + fprintf (dump_file,
>> + "Cannot propagate insn \n");
>> + print_rtl_single (dump_file, rtl_insn);
>
> We can't simply ignore the failure, since it would leave the instruction
> referring to the wrong register. We either need to assert that the failure
> can't happen (dangerous) or bail out of the load fusion if propagation fails.
>
Addressed in v8 of the patch.
>> + }
>> + return;
>> + }
>> + }
>> + else
>> + *loc = copy_rtx (src);
>> + }
>> + }
>> + }
>
> We shouldn't have two ways of doing this. The above FOR_EACH_INSN_USE
> loop should be replaced by something like:
>
> insn_propagation prop (rtl_insn, old_dest, src);
> if (!prop.apply_to_pattern (loc))
> ...
>
> so that the substitution is done in one step, and only done using
> insn_propagation.
>
> Like I mentioned before, this change needs to be described as an
> insn_change as well, so that rtl-ssa framework will see it. This is
> important because later insn movement decisions require the def-use
> information to be up-to-date.
>
Addressed in v8 of the patch.
>> + }
>> +}
>> +
>> +// Set subreg for OO mode store pair to generate registers in pairs
>> +// given insn_info I1 and I2.
>> +static void
>> +set_multiword_subreg_store (insn_info *i1, insn_info *i2)
>> +{
>> + rtx_insn *insn1 = i1->rtl ();
>> + rtx_insn *insn2 = i2->rtl ();
>> + rtx body = PATTERN (insn1);
>> + rtx src_exp = SET_SRC (body);
>> + rtx insn2_body = PATTERN (insn2);
>> + rtx insn2_dest_exp = SET_DEST (insn2_body);
>> + machine_mode mode = GET_MODE (src_exp);
>> + int regoff;
>> + rtx src;
>> + rtx addr = XEXP (insn2_dest_exp, 0);
>> +
>> + PUT_MODE_RAW (src_exp, OOmode);
>> + if (GET_CODE (addr) == PLUS
>> + && XEXP (addr, 1) && CONST_INT_P (XEXP (addr, 1)))
>> + regoff = 16;
>> + else
>> + regoff = 0;
>> +
>> + src = simplify_gen_subreg (mode,
>> + src_exp, GET_MODE (src_exp),
>> + regoff);
>> +
>> + set_store_subreg (i1, src, regoff);
>> +
>> + int regoff1 = 0;
>> + rtx src1;
>> +
>> + src1 = simplify_gen_subreg (mode,
>> + src_exp, GET_MODE (src_exp),
>> + regoff1);
>> +
>> + set_store_subreg (i2, src1, regoff1);
>> + set_rescan_store (i1);
>> + set_rescan_store (i2);
>> + df_insn_rescan (insn1);
>> +}
>> +
>> +// Return true iff insn I1 has already existing subreg.
>> +static bool
>> +use_has_subreg_p (insn_info *i1)
>> +{
>> + for (auto def : i1->defs ())
>> + {
>> + auto set = dyn_cast<set_info *> (def);
>> + for (auto use : set->nondebug_insn_uses ())
>> + {
>> + insn_info *info = use->insn ();
>> + if (info && info->rtl ())
>> + {
>> + rtx_insn *rtl_insn = info->rtl ();
>> + df_ref ref;
>> + FOR_EACH_INSN_USE (ref, rtl_insn)
>> + {
>> + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ()));
>> + if (REG_P (dest_exp)
>> + && DF_REF_REGNO (ref) == REGNO (dest_exp))
>> + {
>> + rtx *loc = DF_REF_LOC (ref);
>> + if (GET_CODE (*loc) == SUBREG)
>> + return true;
>> + }
>> + }
>> + }
>> + }
>> + }
>> + return false;
>> +}
>> +
>> +// Set subreg for OO mode pair load for existing subreg rtx to generate
>> +// registers in pairs given insn_info I2 and I2.
>> +static void
>> +set_multiword_subreg_load (insn_info *i1, insn_info *i2)
>> +
>> +{
>> + rtx_insn *insn1 = i1->rtl ();
>> + rtx body = PATTERN (insn1);
>> + rtx dest_exp = SET_DEST (body);
>> + machine_mode mode = GET_MODE (dest_exp);
>> + int regoff1;
>> + regoff1 = 16;
>> +
>> + bool subreg_p = false;
>> +
>> + if (use_has_subreg_p (i1))
>> + subreg_p = true;
>
> Like I said in the last review, I don't think we should have a separate
> path for uses that involved subregs and uses that didn't. What is currently
> the "subreg_p" path should be the only path. We should use it even if
> there were no references involving subregs.
>
> We shouldn't in general change the width of an existing pseudo register,
> since it would invalidate REG_ATTRS and any debug information associated
> with the register. The rs6000 form of load fusion should always create
> a fresh pseudo register for the double-width fused result.
>
Addressed in v8 of the patch.
>> +
>> + rtx new_dest_exp = dest_exp;
>> +
>> + if (subreg_p)
>> + new_dest_exp = new_reg_rtx (dest_exp);
>> + else
>> + PUT_MODE_RAW (new_dest_exp, OOmode);
>> +
>> + rtx src = simplify_gen_subreg (mode,
>> + new_dest_exp,
>> + OOmode,
>> + regoff1);
>> +
>> + set_load_subreg (i1, src);
>> + if (subreg_p)
>> + propagate_insn (i1, new_dest_exp);
>> +
>> + int regoff = 0;
>> + rtx sset = single_set (i2->rtl ());
>> + rtx insn2_dest_exp = SET_DEST (sset);
>> + machine_mode insn2_mode = GET_MODE (insn2_dest_exp);
>> +
>> + src = simplify_gen_subreg (insn2_mode,
>> + new_dest_exp,
>> + OOmode,
>> + regoff);
>> +
>> + set_load_subreg (i2, src);
>> +
>> + if (subreg_p)
>> + propagate_insn (i2, new_dest_exp);
>> +
>> + if (subreg_p)
>> + {
>> + auto attempt = crtl->ssa->new_change_attempt ();
>> + resource_info resource = { GET_MODE (new_dest_exp), REGNO
>> (new_dest_exp) };
>> + auto *set = crtl->ssa->allocate<set_info> (i1, resource);
>> + if (set)
>> + {
>> + auto def = find_access (i1->defs (), REGNO (new_dest_exp));
>> + if (!def)
>> + i1->defs() = insert_access (attempt, set, i1->defs());
>> + }
>
> Changes to rtl-ssa insn_infos should always happen through insn_changes,
> rather than by direct modification of insns. This allows the framework
> to do all the required book-keeping. E.g. the above does not extend
> the function_info::m_defs array. (And it shouldn't extend that array
> directly -- rtl-ssa should do that internally.)
>
> We already have:
>
> function_info::create_set
>
> which creates a temporary definition of something, for use in an
> insn_change. I'm not sure that it handles completely new registers,
> because no-one's needed that yet, but it would be a reasonable thing
> to add (in function_info::change_insns).
>
> In summary:
>
> - please always use a new pseudo register for the fused load result
> - please always use insn_propagation to update uses of the load results
> - please create an insn_change for each use, and add the changes to the
> existing list of insn_changes made by the main pair-fusion.cc code
> - please don't try to change the rtl-ssa ir directly from rs6000 code;
> instead, describe the new instruction using insn_changes
> - once we know that the fusion is going to go ahead, we should update
> all debug uses to refer to the new register half, in a similar way
> to late-combine's insn_combination::substitute_optional_uses
> (perhaps I should try to provide an abstraction around that).
>
Addressed in v8 of the patch.
> Thanks,
> Richard
>
Thanks & Regards
Ajit
>> + }
>> +
>> + set_rescan_load (i1);
>> + set_rescan_load (i2);
>> + df_insn_rescan (insn1);
>> +}
>> +
>> +// Set subreg for OO mode pair to generate sequential registers given
>> +// insn_info pairs I1, I2 and LOAD_P is true iff load insn and false
>> +// if store insn.
>> +void
>> +rs6000_pair_fusion::set_multiword_subreg (insn_info *i1, insn_info *i2,
>> + bool load_p)
>> +{
>> + if (load_p)
>> + set_multiword_subreg_load (i1, i2);
>> + else
>> + set_multiword_subreg_store (i1, i2);
>> +}
>> +
>> +rtx
>> +rs6000_pair_fusion::gen_pair (rtx *pats, rtx, bool load_p)
>> +{
>> + rtx i1 = pats[0];
>> + rtx src_exp = SET_SRC (i1);
>> + rtx dest_exp = SET_DEST (i1);
>> + PUT_MODE_RAW (src_exp, OOmode);
>> + PUT_MODE_RAW (dest_exp, OOmode);
>> + rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest_exp),
>> + gen_rtvec (1, src_exp),
>> + UNSPEC_LXVP);
>> + rtx set = gen_rtx_SET (dest_exp, unspec);
>> + if (dump_file)
>> + {
>> + if (load_p)
>> + fprintf (dump_file, "lxv with lxvp ");
>> + else
>> + fprintf (dump_file, "stxv with stxvp ");
>> + print_rtl_single (dump_file, set);
>> + }
>> + return set;
>> +}
>> +
>> +const pass_data pass_data_mem_fusion =
>> +{
>> + RTL_PASS, /* type */
>> + "mem_fusion", /* name */
>> + OPTGROUP_NONE, /* optinfo_flags */
>> + TV_NONE, /* tv_id */
>> + 0, /* properties_required */
>> + 0, /* properties_provided */
>> + 0, /* properties_destroyed */
>> + 0, /* todo_flags_start */
>> + TODO_df_finish, /* todo_flags_finish */
>> +};
>> +
>> +class pass_mem_fusion : public rtl_opt_pass
>> +{
>> +public:
>> + pass_mem_fusion (gcc::context *ctxt)
>> + : rtl_opt_pass (pass_data_mem_fusion, ctxt)
>> + {}
>> +
>> + opt_pass *clone () override { return new pass_mem_fusion (m_ctxt);}
>> +
>> + /* opt_pass methods: */
>> + bool gate (function *)
>> + {
>> + return (optimize > 0 && TARGET_VSX && TARGET_POWER10);
>> + }
>> +
>> + unsigned int execute (function *) final override
>> + {
>> + rs6000_pair_fusion pass;
>> + pass.run ();
>> + return 0;
>> + }
>> +}; // class pass_mem_fusion
>> +
>> +rtl_opt_pass *
>> +make_pass_mem_fusion (gcc::context *ctxt)
>> +{
>> + return new pass_mem_fusion (ctxt);
>> +}
>> diff --git a/gcc/config/rs6000/rs6000-passes.def
>> b/gcc/config/rs6000/rs6000-passes.def
>> index 46a0d0b8c56..0b48f57014d 100644
>> --- a/gcc/config/rs6000/rs6000-passes.def
>> +++ b/gcc/config/rs6000/rs6000-passes.def
>> @@ -28,7 +28,9 @@ along with GCC; see the file COPYING3. If not see
>> The power8 does not have instructions that automaticaly do the byte
>> swaps
>> for loads and stores. */
>> INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps);
>> -
>> + /* Pass to replace adjacent memory addresses lxv/stxv instruction with
>> + lxvp/stxvp instruction. */
>> + INSERT_PASS_BEFORE (pass_early_remat, 1, pass_mem_fusion);
>> /* Pass to do the PCREL_OPT optimization that combines the load of an
>> external symbol's address along with a single load or store using that
>> address as a base register. */
>> diff --git a/gcc/config/rs6000/rs6000-protos.h
>> b/gcc/config/rs6000/rs6000-protos.h
>> index b40557a8557..5295ff66950 100644
>> --- a/gcc/config/rs6000/rs6000-protos.h
>> +++ b/gcc/config/rs6000/rs6000-protos.h
>> @@ -342,6 +342,7 @@ namespace gcc { class context; }
>> class rtl_opt_pass;
>>
>> extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *);
>> +extern rtl_opt_pass *make_pass_mem_fusion (gcc::context *);
>> extern rtl_opt_pass *make_pass_pcrel_opt (gcc::context *);
>> extern bool rs6000_sum_of_two_registers_p (const_rtx expr);
>> extern bool rs6000_quadword_masked_address_p (const_rtx exp);
>> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
>> index 5ed64b1e686..449d9e446d5 100644
>> --- a/gcc/config/rs6000/rs6000.cc
>> +++ b/gcc/config/rs6000/rs6000.cc
>> @@ -27442,7 +27442,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>> reg_mode = word_mode;
>> reg_mode_size = GET_MODE_SIZE (reg_mode);
>>
>> - gcc_assert (reg_mode_size * nregs == GET_MODE_SIZE (mode));
>> + gcc_assert (mode == OOmode
>> + || reg_mode_size * nregs == GET_MODE_SIZE (mode));
>>
>> /* TDmode residing in FP registers is special, since the ISA requires that
>> the lower-numbered word of a register pair is always the most
>> significant
>> @@ -27489,6 +27490,11 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>> int reg_mode_nregs = hard_regno_nregs (reg, reg_mode);
>> if (MEM_P (dst))
>> {
>> + rtx addr = XEXP (dst, 0);
>> + rtx opnd1 = NULL_RTX;
>> + if (addr && GET_CODE (addr) == PLUS)
>> + opnd1 = XEXP (addr,1);
>> +
>> unsigned offset = 0;
>> unsigned size = GET_MODE_SIZE (reg_mode);
>>
>> @@ -27502,7 +27508,13 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>> {
>> unsigned subreg
>> = WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i);
>> - rtx dst2 = adjust_address (dst, reg_mode, offset);
>> + rtx dst2 = dst;
>> +
>> + if ((GET_CODE (addr) != PLUS
>> + || (opnd1 && CONST_INT_P(opnd1))))
>> + dst2 = adjust_address (dst, reg_mode, offset);
>> + else
>> + PUT_MODE_RAW (dst, reg_mode);
>> rtx src2 = gen_rtx_REG (reg_mode, reg + subreg);
>> offset += size;
>> emit_insn (gen_rtx_SET (dst2, src2));
>> @@ -27513,15 +27525,25 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>
>> if (MEM_P (src))
>> {
>> + rtx addr = XEXP (src, 0);
>> + rtx opnd1 = NULL_RTX;
>> + if (addr && GET_CODE (addr) == PLUS)
>> + opnd1 = XEXP (addr,1);
>> +
>> unsigned offset = 0;
>> unsigned size = GET_MODE_SIZE (reg_mode);
>>
>> - for (int i = 0; i < nregs; i += reg_mode_nregs)
>> + for (int i = nregs-1; i >= 0; i -= reg_mode_nregs)
>> {
>> unsigned subreg
>> = WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i);
>> rtx dst2 = gen_rtx_REG (reg_mode, reg + subreg);
>> - rtx src2 = adjust_address (src, reg_mode, offset);
>> + rtx src2 = src;
>> +
>> + if ((GET_CODE (addr) != PLUS || (opnd1 && CONST_INT_P (opnd1))))
>> + src2 = adjust_address (src, reg_mode, offset);
>> + else
>> + PUT_MODE_RAW (src2, reg_mode);
>> offset += size;
>> emit_insn (gen_rtx_SET (dst2, src2));
>> }
>> @@ -27529,7 +27551,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>> /* If we are writing an accumulator register, we have to
>> prime it after we've written it. */
>> if (TARGET_MMA
>> - && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst)))
>> + && REG_P (dst) && GET_MODE (dst) == XOmode
>> + && FP_REGNO_P (REGNO (dst)))
>> emit_insn (gen_mma_xxmtacc (dst, dst));
>>
>> return;
>> @@ -27622,9 +27645,12 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>> {
>> for (i = nregs - 1; i >= 0; i--)
>> {
>> - rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i);
>> - rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i);
>> - emit_insn (gen_rtx_SET (dst_i, src_i));
>> + if (REG_P (dst) && REG_P (src))
>> + {
>> + rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i);
>> + rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i);
>> + emit_insn (gen_rtx_SET (dst_i, src_i));
>> + }
>> }
>> }
>> else
>> @@ -27639,7 +27665,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>> /* If we are writing an accumulator register, we have to
>> prime it after we've written it. */
>> if (TARGET_MMA
>> - && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst)))
>> + && REG_P (dst) && GET_MODE (dst) == XOmode
>> + && FP_REGNO_P (REGNO (dst)))
>> emit_insn (gen_mma_xxmtacc (dst, dst));
>> }
>> else
>> @@ -27696,7 +27723,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>
>> /* If the base register we are using to address memory is
>> also a destination reg, then change that register last. */
>> - if (REG_P (breg)
>> + if (REG_P (dst) && REG_P (breg)
>> && REGNO (breg) >= REGNO (dst)
>> && REGNO (breg) < REGNO (dst) + nregs)
>> j = REGNO (breg) - REGNO (dst);
>> @@ -27794,9 +27821,12 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>> /* XO/OO are opaque so cannot use subregs. */
>> if (mode == OOmode || mode == XOmode )
>> {
>> - rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j);
>> - rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j);
>> - emit_insn (gen_rtx_SET (dst_i, src_i));
>> + if (REG_P (dst) && REG_P (src))
>> + {
>> + rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j);
>> + rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j);
>> + emit_insn (gen_rtx_SET (dst_i, src_i));
>> + }
>> }
>> else
>> emit_insn (gen_rtx_SET (simplify_gen_subreg (reg_mode, dst, mode,
>> @@ -27814,7 +27844,9 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>> if (restore_basereg != NULL_RTX)
>> emit_insn (restore_basereg);
>> }
>> + return;
>> }
>> +
>>
>> /* Return true if the peephole2 can combine a load involving a combination
>> of
>> an addis instruction and a load with an offset that can be fused
>> together on
>> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
>> index f59be536570..99f070ee24d 100644
>> --- a/gcc/config/rs6000/rs6000.md
>> +++ b/gcc/config/rs6000/rs6000.md
>> @@ -159,6 +159,7 @@
>> UNSPEC_XXSPLTIW_CONST
>> UNSPEC_FMAX
>> UNSPEC_FMIN
>> + UNSPEC_LXVP
>> ])
>>
>> ;;
>> diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
>> index 155788de40a..a052163eec4 100644
>> --- a/gcc/config/rs6000/t-rs6000
>> +++ b/gcc/config/rs6000/t-rs6000
>> @@ -34,6 +34,11 @@ rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.cc
>> $(COMPILE) $<
>> $(POSTCOMPILE)
>>
>> +rs6000-mem-fusion.o: $(srcdir)/config/rs6000/rs6000-mem-fusion.cc
>> + $(COMPILE) $<
>> + $(POSTCOMPILE)
>> +
>> +
>> rs6000-d.o: $(srcdir)/config/rs6000/rs6000-d.cc
>> $(COMPILE) $<
>> $(POSTCOMPILE)
>> diff --git a/gcc/pair-fusion.cc b/gcc/pair-fusion.cc
>> index 31d2c21c88f..b1521253474 100644
>> --- a/gcc/pair-fusion.cc
>> +++ b/gcc/pair-fusion.cc
>> @@ -312,9 +312,9 @@ static int
>> encode_lfs (lfs_fields fields)
>> {
>> int size_log2 = exact_log2 (fields.size);
>> - gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
>> - return ((int)fields.load_p << 3)
>> - | ((int)fields.fpsimd_p << 2)
>> + gcc_checking_assert (size_log2 >= 2 && size_log2 <= 9);
>> + return ((int)fields.load_p << 4)
>> + | ((int)fields.fpsimd_p << 3)
>> | (size_log2 - 2);
>> }
>>
>> @@ -322,8 +322,8 @@ encode_lfs (lfs_fields fields)
>> static lfs_fields
>> decode_lfs (int lfs)
>> {
>> - bool load_p = (lfs & (1 << 3));
>> - bool fpsimd_p = (lfs & (1 << 2));
>> + bool load_p = (lfs & (1 << 4));
>> + bool fpsimd_p = (lfs & (1 << 3));
>> unsigned size = 1U << ((lfs & 3) + 2);
>> return { load_p, fpsimd_p, size };
>> }
>> @@ -425,6 +425,9 @@ pair_fusion_bb_info::track_access (insn_info *insn, bool
>> load_p, rtx mem)
>> if (MEM_VOLATILE_P (mem))
>> return;
>>
>> + if (load_p && !m_pass->fuseable_load_p (insn))
>> + return;
>> +
>> // Ignore writeback accesses if the hook says to do so.
>> if (!m_pass->should_handle_writeback (writeback_type::EXISTING)
>> && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
>> @@ -1814,7 +1817,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
>> }
>>
>> rtx reg_notes = combine_reg_notes (first, second, load_p);
>> -
>> + m_pass->set_multiword_subreg (i1, i2, load_p);
>> rtx pair_pat = m_pass->gen_pair (pats, writeback_effect, load_p);
>> insn_change *pair_change = nullptr;
>> auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
>> @@ -1828,18 +1831,18 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
>> changes.safe_push (make_delete (first));
>> pair_change = make_change (second);
>> changes.safe_push (pair_change);
>> -
>> pair_change->move_range = move_range;
>> pair_change->new_defs = merge_access_arrays (attempt,
>> input_defs[0],
>> input_defs[1]);
>> + m_pass->modify_new_rtx_insn (first, &attempt, &pair_change, changes);
>> gcc_assert (pair_change->new_defs.is_valid ());
>> -
>> pair_change->new_uses
>> = merge_access_arrays (attempt,
>> drop_memory_access (input_uses[0]),
>> drop_memory_access (input_uses[1]));
>> gcc_assert (pair_change->new_uses.is_valid ());
>> + m_pass->set_uses (i1, i2, &pair_change, &attempt);
>> set_pair_pat (pair_change);
>> }
>> else
>> @@ -1872,6 +1875,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
>> store_def,
>> change->new_defs);
>> gcc_assert (change->new_defs.is_valid ());
>> + m_pass->set_defs (i1, i2, &change, &attempt);
>> change->move_range = move_range;
>> pair_change = change;
>> break;
>> @@ -1908,6 +1912,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
>> change->new_defs = insert_access (attempt, new_set,
>> change->new_defs);
>> gcc_assert (change->new_defs.is_valid ());
>> + m_pass->set_defs (i1, i2, &change, &attempt);
>> pair_change = change;
>> break;
>> }
>> @@ -2405,6 +2410,15 @@ pair_fusion_bb_info::try_fuse_pair (bool load_p,
>> unsigned access_size,
>> reg_ops[i] = XEXP (pats[i], !load_p);
>> }
>>
>> + if (!load_p && !m_pass->fuseable_store_p (i1, i2))
>> + {
>> + if (dump_file)
>> + fprintf (dump_file,
>> + "punting on store-mem-pairs due to non fuseable cand
>> (%d,%d)\n",
>> + insns[0]->uid (), insns[1]->uid ());
>> + return false;
>> + }
>> +
>> if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
>> {
>> if (dump_file)
>> @@ -2997,6 +3011,8 @@ void pair_fusion::process_block (bb_info *bb)
>> if (GET_CODE (pat) != SET)
>> continue;
>>
>> + change_existing_multword_mode (rti);
>> +
>> if (track_stores && MEM_P (XEXP (pat, 0)))
>> bb_state.track_access (insn, false, XEXP (pat, 0));
>> else if (track_loads && MEM_P (XEXP (pat, 1)))
>> diff --git a/gcc/pair-fusion.h b/gcc/pair-fusion.h
>> index 45e4edceecb..b3af49de892 100644
>> --- a/gcc/pair-fusion.h
>> +++ b/gcc/pair-fusion.h
>> @@ -26,8 +26,12 @@ namespace rtl_ssa {
>> class insn_info;
>> class insn_range_info;
>> class bb_info;
>> + class insn_change;
>> + class insn_range_info;
>> }
>>
>> +class obstack_watermark;
>> +
>> // Information about a potential base candidate, used in try_fuse_pair.
>> // There may be zero, one, or two viable RTL bases for a given pair.
>> struct base_cand
>> @@ -142,6 +146,19 @@ struct pair_fusion {
>> // true iff INSN is a load pair.
>> virtual bool pair_mem_insn_p (rtx_insn *insn, bool &load_p) = 0;
>>
>> + // Given INSN change multiword mode load and store to respective
>> + // unspec instruction.
>> + virtual void change_existing_multword_mode (rtx_insn *insn) = 0;
>> +
>> + // Given INSN and watermark ATTEMPT and PAIR_CHANGE sets the
>> + // new rtx with INSN. Remove all uses of definition that are
>> + // removed given CHANGES.
>> + virtual void modify_new_rtx_insn (rtl_ssa::insn_info *first,
>> + obstack_watermark *attempt,
>> + rtl_ssa::insn_change **pair_change,
>> + auto_vec<rtl_ssa::insn_change *> &changes)
>> + = 0;
>> +
>> // Return true if we should track loads.
>> virtual bool track_loads_p ()
>> {
>> @@ -171,6 +188,37 @@ struct pair_fusion {
>> virtual rtx gen_promote_writeback_pair (rtx wb_effect, rtx mem,
>> rtx regs[2], bool load_p) = 0;
>>
>> + // Given insn_info pair I1 and I2, sets subreg with multiword registers
>> + // to assign register pairs by allocators.
>> + // LOAD_P is true iff the pair is a load.
>> + virtual void set_multiword_subreg (rtl_ssa::insn_info *i1,
>> + rtl_ssa::insn_info *i2,
>> + bool load_p) = 0;
>> +
>> + // Given insn_info pair I1 and I2, sets uses in PAIR_CHANGE.
>> + virtual void set_uses (rtl_ssa::insn_info *i1,
>> + rtl_ssa::insn_info *i2,
>> + rtl_ssa::insn_change **pair_change,
>> + obstack_watermark *attempt) = 0;
>> +
>> + // Given insn_info pair I1 and I2, sets defss in PAIR_CHANGE.
>> + virtual void set_defs (rtl_ssa::insn_info *i1,
>> + rtl_ssa::insn_info *i2,
>> + rtl_ssa::insn_change **pair_change,
>> + obstack_watermark *attempt) = 0;
>> +
>> +
>> + // Given insn_info pair I1 and I2, checks if pairs are feasible to perform
>> + // store mem pairs.
>> + // Return true if feasible to perform store mem pairs otherwise false.
>> + virtual bool fuseable_store_p (rtl_ssa::insn_info *i1,
>> + rtl_ssa::insn_info *i2) = 0;
>> +
>> + // Given insn_info pair I1 and I2, checks if pairs are feasible to perform
>> + // load mem pairs.
>> + // Return true if feasible to perform load mem pairs otherwise false.
>> + virtual bool fuseable_load_p (rtl_ssa::insn_info *info) = 0;
>> +
>> void process_block (rtl_ssa::bb_info *bb);
>> rtl_ssa::insn_info *find_trailing_add (rtl_ssa::insn_info *insns[2],
>> const rtl_ssa::insn_range_info
>> diff --git a/gcc/rtl-ssa/functions.h b/gcc/rtl-ssa/functions.h
>> index 8be04f1aa96..27a732b7353 100644
>> --- a/gcc/rtl-ssa/functions.h
>> +++ b/gcc/rtl-ssa/functions.h
>> @@ -222,6 +222,13 @@ public:
>> template<typename T, typename... Ts>
>> T *change_alloc (obstack_watermark &wm, Ts... args);
>>
>> + auto_vec<access_info *> &get_m_temp_defs () { return m_temp_defs; }
>> +
>> + template<typename T, typename... Ts>
>> + T *allocate (Ts... args);
>> +
>> + void remove_use (use_info *);
>> +
>> private:
>> class bb_phi_info;
>> class build_info;
>> @@ -231,9 +238,6 @@ private:
>> // allocate_temp during its lifetime.
>> obstack_watermark temp_watermark () { return &m_temp_obstack; }
>>
>> - template<typename T, typename... Ts>
>> - T *allocate (Ts... args);
>> -
>> template<typename T, typename... Ts>
>> T *allocate_temp (Ts... args);
>>
>> @@ -269,7 +273,6 @@ private:
>> static void insert_use_after (use_info *, use_info *);
>>
>> void add_use (use_info *);
>> - void remove_use (use_info *);
>>
>> insn_info::order_node *need_order_node (insn_info *);
>>
>> diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
>> b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
>> new file mode 100644
>> index 00000000000..d10ff0cdf36
>> --- /dev/null
>> +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
>> @@ -0,0 +1,22 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target power10_ok } */
>> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
>> +
>> +#include <altivec.h>
>> +
>> +void
>> +foo2 ()
>> +{
>> + __vector_quad *dst1;
>> + __vector_quad *dst2;
>> + vector unsigned char src;
>> + __vector_quad acc;
>> + vector unsigned char *ptr;
>> + __builtin_mma_xvf32ger(&acc, src, ptr[0]);
>> + __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
>> + *dst1 = acc;
>> + __builtin_mma_xvf32ger(&acc, src, ptr[2]);
>> + __builtin_mma_xvf32gerpp(&acc, src, ptr[3]);
>> + *dst2 = acc;
>> +}
>> +/* { dg-final { scan-assembler {\mlxvp\M} } } */
>> diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion.C
>> b/gcc/testsuite/g++.target/powerpc/mem-fusion.C
>> new file mode 100644
>> index 00000000000..c523572cf3c
>> --- /dev/null
>> +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion.C
>> @@ -0,0 +1,15 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target power10_ok } */
>> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
>> +
>> +#include <altivec.h>
>> +
>> +void
>> +foo (__vector_quad *dst, vector unsigned char *ptr, vector unsigned char
>> src)
>> +{
>> + __vector_quad acc;
>> + __builtin_mma_xvf32ger(&acc, src, ptr[0]);
>> + __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
>> + *dst = acc;
>> +}
>> +/* { dg-final { scan-assembler {\mlxvp\M} } } */
>> diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
>> b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
>> index 69ee826e1be..ae29127f954 100644
>> --- a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
>> +++ b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
>> @@ -258,8 +258,8 @@ foo13b (__vector_quad *dst, __vector_quad *src, vec_t
>> *vec)
>> dst[13] = acc;
>> }
>>
>> -/* { dg-final { scan-assembler-times {\mlxv\M} 40 } } */
>> -/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */
>> +/* { dg-final { scan-assembler-times {\mlxv\M} 0 } } */
>> +/* { dg-final { scan-assembler-times {\mlxvp\M} 32 } } */
>> /* { dg-final { scan-assembler-times {\mstxvp\M} 40 } } */
>> /* { dg-final { scan-assembler-times {\mxxmfacc\M} 20 } } */
>> /* { dg-final { scan-assembler-times {\mxxmtacc\M} 6 } } */