Ajit Agarwal <[email protected]> writes:
> Hello All:
>
> This version of patch relaxes store fusion for more use cases.
>
> Common infrastructure using generic code for pair mem fusion of different
> targets.
>
> rs6000 target specific code implement virtual functions defined by generic
> code.
>
> Target specific code are added in rs6000-mem-fusion.cc.
>
> Bootstrapped and regtested on powerpc64-linux-gnu.
>
> Thanks & Regards
> Ajit
>
>
> rs6000, middle-end: Add implementation for different targets for pair mem
> fusion
>
> Common infrastructure using generic code for pair mem fusion of different
> targets.
>
> rs6000 target specific code implement virtual functions defined by generic
> code.
>
> Target specific code are added in rs6000-mem-fusion.cc.
>
> 2024-07-02 Ajit Kumar Agarwal <[email protected]>
>
> gcc/ChangeLog:
>
> * config/rs6000/rs6000-passes.def: New mem fusion pass
> before pass_early_remat.
> * pair-fusion.h: Add additional pure virtual function
> required for rs6000 target implementation.
> * pair-fusion.cc: Use of virtual functions for additional
> virtual function added for rs6000 target.
> * config/rs6000/rs6000-mem-fusion.cc: Add new pass.
> Add target specific implementation for generic pure virtual
> functions.
> * config/rs6000/mma.md: Modify movoo machine description.
> Add new machine description movoo1.
> * config/rs6000/rs6000.cc: Modify rs6000_split_multireg_move
> to expand movoo machine description for all constraints.
> * config.gcc: Add new object file.
> * config/rs6000/rs6000-protos.h: Add new prototype for mem
> fusion pass.
> * config/rs6000/t-rs6000: Add new rule.
> * rtl-ssa/functions.h: Move out allocate function from private
> to public and add get_m_temp_defs function.
>
> gcc/testsuite/ChangeLog:
>
> * g++.target/powerpc/mem-fusion.C: New test.
> * g++.target/powerpc/mem-fusion-1.C: New test.
> * gcc.target/powerpc/mma-builtin-1.c: Modify test.
> ---
> gcc/config.gcc | 2 +
> gcc/config/rs6000/mma.md | 26 +-
> gcc/config/rs6000/rs6000-mem-fusion.cc | 708 ++++++++++++++++++
> gcc/config/rs6000/rs6000-passes.def | 4 +-
> gcc/config/rs6000/rs6000-protos.h | 1 +
> gcc/config/rs6000/rs6000.cc | 57 +-
> gcc/config/rs6000/rs6000.md | 1 +
> gcc/config/rs6000/t-rs6000 | 5 +
> gcc/pair-fusion.cc | 27 +-
> gcc/pair-fusion.h | 34 +
> gcc/rtl-ssa/functions.h | 11 +-
> .../g++.target/powerpc/mem-fusion-1.C | 22 +
> gcc/testsuite/g++.target/powerpc/mem-fusion.C | 15 +
> .../gcc.target/powerpc/mma-builtin-1.c | 4 +-
> 14 files changed, 890 insertions(+), 27 deletions(-)
> create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc
> create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
> create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C
>
> diff --git a/gcc/config.gcc b/gcc/config.gcc
> index bc45615741b..12f79a78177 100644
> --- a/gcc/config.gcc
> +++ b/gcc/config.gcc
> @@ -524,6 +524,7 @@ powerpc*-*-*)
> extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
> extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
> extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
> + extra_objs="${extra_objs} rs6000-mem-fusion.o"
> extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
> extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
> extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
> @@ -560,6 +561,7 @@ rs6000*-*-*)
> extra_options="${extra_options} g.opt fused-madd.opt
> rs6000/rs6000-tables.opt"
> extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
> extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
> + extra_objs="${extra_objs} rs6000-mem-fusion.o"
> target_gtfiles="$target_gtfiles
> \$(srcdir)/config/rs6000/rs6000-logue.cc
> \$(srcdir)/config/rs6000/rs6000-call.cc"
> target_gtfiles="$target_gtfiles
> \$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
> ;;
> diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
> index 04e2d0066df..88413926a02 100644
> --- a/gcc/config/rs6000/mma.md
> +++ b/gcc/config/rs6000/mma.md
> @@ -294,7 +294,31 @@
>
> (define_insn_and_split "*movoo"
> [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa")
> - (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
> + (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
> + "TARGET_MMA
> + && (gpc_reg_operand (operands[0], OOmode)
> + || gpc_reg_operand (operands[1], OOmode))"
> +;; ""
> + "@
> + #
> + #
> + #"
> + "&& reload_completed"
> + [(const_int 0)]
> +{
> + rs6000_split_multireg_move (operands[0], operands[1]);
> + DONE;
> +}
> + [(set_attr "type" "vecload,vecstore,veclogical")
> + (set_attr "length" "*,*,8")])
> +;; (set_attr "max_prefixed_insns" "2,2,*")])
> +
> +
> +(define_insn_and_split "*movoo1"
> + [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa")
> + (unspec [
> + (match_operand:OO 1 "input_operand" "ZwO,wa,wa")
> + ] UNSPEC_LXVP))]
> "TARGET_MMA
> && (gpc_reg_operand (operands[0], OOmode)
> || gpc_reg_operand (operands[1], OOmode))"
> diff --git a/gcc/config/rs6000/rs6000-mem-fusion.cc
> b/gcc/config/rs6000/rs6000-mem-fusion.cc
> new file mode 100644
> index 00000000000..b63b6f31001
> --- /dev/null
> +++ b/gcc/config/rs6000/rs6000-mem-fusion.cc
> @@ -0,0 +1,708 @@
> +/* Subroutines used to perform adjacent load/store into
> + paired memory accesses for TARGET_POWER10 and TARGET_VSX.
> +
> + Copyright (C) 2024 Free Software Foundation, Inc.
> +
> + This file is part of GCC.
> +
> + GCC is free software; you can redistribute it and/or modify it
> + under the terms of the GNU General Public License as published
> + by the Free Software Foundation; either version 3, or (at your
> + option) any later version.
> +
> + GCC is distributed in the hope that it will be useful, but WITHOUT
> + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
> + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
> + License for more details.
> +
> + You should have received a copy of the GNU General Public License
> + along with GCC; see the file COPYING3. If not see
> + <http://www.gnu.org/licenses/>. */
> +
> +#define INCLUDE_ALGORITHM
> +#define INCLUDE_FUNCTIONAL
> +#define INCLUDE_LIST
> +#define INCLUDE_TYPE_TRAITS
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "backend.h"
> +#include "rtl.h"
> +#include "df.h"
> +#include "rtl-iter.h"
> +#include "rtl-ssa.h"
> +#include "rtl-ssa/internals.h"
> +#include "rtl-ssa/internals.inl"
> +#include "cfgcleanup.h"
> +#include "tree-pass.h"
> +#include "pair-fusion.h"
> +
> +using namespace rtl_ssa;
> +
> +struct rs6000_pair_fusion : public pair_fusion
> +{
> + bool fpsimd_op_p (rtx , machine_mode , bool) override final
> + {
> + return false;
> + }
> +
> + bool pair_mem_insn_p (rtx_insn *, bool &) override final
> + {
> + return false;
> + }
> +
> + void change_existing_multword_mode (rtx_insn *insn) override final;
> +
> + bool pair_mem_ok_with_policy (rtx, bool) override final
> + {
> + return true;
> + }
> +
> + bool pair_operand_mode_ok_p (machine_mode mode) override final;
> +
> + rtx gen_pair (rtx *pats, rtx, bool load_p) override final;
> +
> + bool pair_reg_operand_ok_p (bool, rtx, machine_mode) override final
> + {
> + return true;
> + }
> +
> + int pair_mem_alias_check_limit () override final
> + {
> + return 0;
> + }
> +
> + bool should_handle_writeback (enum writeback_type) override final
> + {
> + return false;
> + }
> +
> + bool track_loads_p () override final
> + {
> + return true;
> + }
> +
> + bool track_stores_p () override final
> + {
> + return true;
> + }
> +
> + bool pair_mem_in_range_p (HOST_WIDE_INT) override final
> + {
> + return true;
> + }
> +
> + rtx gen_promote_writeback_pair (rtx, rtx, rtx *, bool) override final
> + {
> + return NULL_RTX;
> + }
> +
> + rtx destructure_pair (rtx_def **, rtx, bool) override final
> + {
> + return NULL_RTX;
> + }
> +
> + bool fuseable_store_p (insn_info *i1, insn_info *i2) override final;
> +
> + bool fuseable_load_p (insn_info *insn) override final;
> +
> + void set_multiword_subreg (insn_info *i1, insn_info *i2,
> + bool load_p) override final;
> +
> + void modify_new_rtx_insn (insn_info *first, obstack_watermark *attempt,
> + insn_change **pair_change,
> + auto_vec <insn_change *> &changes) override final;
> +};
> +
> +bool
> +rs6000_pair_fusion::pair_operand_mode_ok_p (machine_mode mode)
> +{
> + return (ALTIVEC_OR_VSX_VECTOR_MODE (mode));
> +}
> +
> +void
> +rs6000_pair_fusion::change_existing_multword_mode (rtx_insn *insn)
> +{
> + rtx set = single_set (insn);
> + rtx src = SET_SRC (set);
> + rtx dest = SET_DEST (set);
> + rtx copy = NULL_RTX;
> +
> + if ((MEM_P (src) && GET_MODE (src) == OOmode)
> + || (MEM_P (dest) && GET_MODE (dest) == OOmode))
> + {
> + rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest),
> + gen_rtvec (1, src),
> + UNSPEC_LXVP);
> + copy = gen_rtx_SET (dest, unspec);
> + rtx_insn *new_insn = emit_insn_after (copy, insn);
> + set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn));
> + df_insn_rescan (new_insn);
> + df_insn_delete (insn);
> + remove_insn (insn);
> + insn->set_deleted ();
> + }
> +}
> +
> +static void
> +update_change (set_info *set)
> +{
> + if (!set->has_any_uses ())
> + return;
> +
> + auto *use = *set->all_uses ().begin ();
> + do
> + {
> + auto *next_use = use->next_use ();
> + if (use->is_in_phi ())
> + {
> + update_change (use->phi ());
> + }
> + else
> + {
> + crtl->ssa->remove_use (use);
> + }
> + use = next_use;
> + }
> + while (use);
> +}
> +
> +void
> +rs6000_pair_fusion::modify_new_rtx_insn (insn_info *first,
> + obstack_watermark *attempt,
> + insn_change **pair_change,
> + auto_vec<insn_change *> &changes)
> +{
> + for (insn_change *change : changes)
> + for (auto def : change->old_defs ())
> + {
> + auto set = dyn_cast<set_info *> (def);
> + update_change (set);
> + }
> +
> + auto &new_defs = (*pair_change)->new_defs;
> + vec_rtx_properties properties;
> + properties.add_insn (first->rtl (), true);
> + // Build up the new list of definitions.
> + for (rtx_obj_reference ref : properties.refs ())
> + if (ref.is_write ())
> + {
> + auto *set = crtl->ssa->allocate<set_info> (first,
> + full_register (ref.regno));
> + if (set)
> + {
> + auto def = find_access (new_defs, ref.regno);
> + if (!def)
> + {
> + new_defs = insert_access (*attempt, set,
> + new_defs);
> + auto &m_temp_defs = crtl->ssa->get_m_temp_defs ();
> + m_temp_defs.safe_push (set);
> + }
> + }
> + }
> +}
> +
> +// df_insn_rescan dependent instruction where operands
> +// are reversed given insn_info I1.
> +static void
> +set_rescan_load (insn_info *i1)
> +{
> + for (auto def : i1->defs ())
> + {
> + auto set = dyn_cast<set_info *> (def);
> + for (auto use : set->all_uses ())
> + {
> + insn_info *info = use->insn ();
> + if (info && info->rtl ())
> + {
> + rtx_insn *rtl_insn = info->rtl ();
> + df_insn_rescan (rtl_insn);
> + }
> + }
> + }
> +}
> +
> +// df_insn_rescan the def instruction where operands are reversed given INSN.
> +static bool
> +set_rescan_store (insn_info *insn)
> +{
> + for (auto use : insn->uses())
> + {
> + auto def = use->def ();
> +
> + if (!def)
> + return false;
> +
> + if (def->insn ()->is_artificial ())
> + return false;
> +
> + if (def->insn () && def->insn ()->rtl ()
> + && def->insn()->is_real ())
> + {
> + rtx_insn *rtl_insn = def->insn ()->rtl ();
> + rtx set = single_set (rtl_insn);
> +
> + if (set == NULL_RTX)
> + return false;
> + df_insn_rescan (rtl_insn);
> + }
> + }
> + return true;
> +}
> +
> +// Check for feasibility of store to be fuseable or not. Return true if
> +// feasible otherwise false.
> +static bool
> +feasible_store_p (insn_info *insn)
> +{
> + for (auto use : insn->uses ())
> + {
> + auto def = use->def ();
> +
> + if (def->insn ()->is_artificial ())
> + return false;
> +
> + if (def->insn () && def->insn ()->rtl ()
> + && def->insn()->is_real ())
> + {
> + rtx_insn *rtl_insn = def->insn ()->rtl ();
> + rtx set = single_set (rtl_insn);
> +
> + if (set == NULL_RTX)
> + return false;
> +
> + // Return false if dependent def is load.
> + // This is done as def instruction could be a fused load and
> + // to avoid already existing subreg (reg:OO R) offset.
> + if (rtl_insn && MEM_P (SET_SRC (set)))
> + return false;
> +
> + // Return false if dependent def is store.
> + if (rtl_insn && MEM_P (SET_DEST (set)))
> + return false;
I don't understand these tests. It might help to turn it around and
say: what sort of cases do you want to handle?
> + }
> + }
> + return true;
> +}
> +
> +// Check if store can be fuseable or not. Return true if fuseable otherwise
> +// false.
> +bool
> +rs6000_pair_fusion::fuseable_store_p (insn_info *i1, insn_info *i2)
> +{
> + rtx_insn *insn1 = i1->rtl ();
> + rtx_insn *insn2 = i2->rtl ();
> + rtx body = PATTERN (insn1);
> + rtx src_exp = SET_SRC (body);
> + rtx insn2_body = PATTERN (insn2);
> + rtx insn2_src_exp = SET_SRC (insn2_body);
> +
> + if (!(REG_P (src_exp)
> + && crtl->ssa->single_dominating_def (REGNO (src_exp))))
> + return false;
> +
> + // This is done as def instruction could be a fused load and
> + // to avoid already existing subreg (reg:OO R) offset.
> + if (DF_REG_USE_COUNT (REGNO (src_exp)) > 1)
> + return false;
> +
> + // Return false if src of insn1 and src of insn2 are same.
> + if (src_exp == insn2_src_exp)
> + return false;
> +
> + // Return false if src of insn1 is subreg.
> + if (GET_CODE (src_exp) == SUBREG)
> + return false;
This can't be true after the REG_P check above.
> +
> + // Return false if src of insn2 is subreg.
> + if (GET_CODE (insn2_src_exp) == SUBREG)
> + return false;
Shouldn't the tests for i1 and i2 be symmetrical, with i2 also
requiring a single dominating definition?
> +
> + if (!feasible_store_p (i1))
> + return false;;
> +
> + if (!feasible_store_p (i2))
> + return false;
> +
> + return true;
> +}
> +
> +// Set subreg for def of store INSN given rtx SRC instruction.
> +static void
> +set_store_subreg (insn_info *i1, rtx src, int regoff)
> +{
> + for (auto use: i1->uses ())
> + {
> + auto def = use->def ();
> + if (!def)
> + return;
> +
> + insn_info *info = def->insn ();
> +
> + if (info->is_artificial ())
> + return;
> +
> + if (info && info->is_real ())
> + {
> + rtx_insn *rtl_insn = info->rtl ();
> + rtx set = single_set (rtl_insn);
> + if (set == NULL_RTX)
> + return;
> + df_ref ref;
> + FOR_EACH_INSN_DEF (ref, rtl_insn)
> + {
> + rtx src_exp = SET_SRC (PATTERN (i1->rtl ()));
> + if (REG_P (src_exp) && DF_REF_REGNO (ref) == REGNO (src_exp))
> + {
> + rtx *loc = DF_REF_LOC (ref);
> + if (GET_CODE (*loc) == SUBREG)
> + {
> + rtx src1 = simplify_gen_subreg (GET_MODE (*loc),
> + SUBREG_REG (src),
> + OOmode,
> + regoff);
> + *loc = copy_rtx (src1);
> + }
> + else
> + *loc = copy_rtx (src);
> + }
> + }
> + }
> + }
> +}
> +
> +// Check whether load can be fusable or not.
> +// Return true if fuseable otherwise false.
> +bool
> +rs6000_pair_fusion::fuseable_load_p (insn_info *i1)
> +{
> + rtx_insn *insn = i1->rtl ();
> + rtx body = PATTERN (insn);
> + rtx dest_exp = SET_DEST (body);
> +
> + if (!(REG_P (dest_exp)
> + && crtl->ssa->single_dominating_def (REGNO (dest_exp))))
> + return false;
> + return true;
> +}
> +
> +// Propagate insn I1 with new rtx NEW_DEST_EXP.
> +static void
> +propagate_insn (insn_info *i1, rtx new_dest_exp)
> +{
> + df_ref ref;
> + FOR_EACH_INSN_DEF (ref, i1->rtl())
> + {
> + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ()));
> + if (REG_P (dest_exp)
> + && DF_REF_REGNO (ref) == REGNO (dest_exp))
> + {
> + rtx *loc = DF_REF_LOC (ref);
> + *loc = new_dest_exp;
> + }
> + }
> +}
> +
> +// Generate new reg rtx with copy of OLD_DEST for OOmode pair.
> +static rtx
> +new_reg_rtx (rtx old_dest)
> +{
> + rtx new_dest_exp = gen_reg_rtx (OOmode);
> + ORIGINAL_REGNO (new_dest_exp) = ORIGINAL_REGNO (old_dest);
> + REG_USERVAR_P (new_dest_exp) = REG_USERVAR_P (old_dest);
> + REG_POINTER (new_dest_exp) = REG_POINTER (old_dest);
> + REG_ATTRS (new_dest_exp) = REG_ATTRS (old_dest);
> + max_regno = max_reg_num ();
> + return new_dest_exp;
> +}
> +
> +// Set subreg with use of INSN given SRC rtx instruction.
> +static void
> +set_load_subreg (insn_info *i1, rtx src)
> +{
> + rtx set = single_set (i1->rtl());
> + rtx old_dest = SET_DEST (set);
> +
> + for (auto def : i1->defs ())
> + {
> + auto set = dyn_cast<set_info *> (def);
> + for (auto use : set->nondebug_insn_uses ())
> + {
> + insn_info *info = use->insn ();
> + if (!info || !info->rtl ())
> + continue;
> +
> + rtx_insn *rtl_insn = info->rtl ();
> + df_ref ref;
> +
> + FOR_EACH_INSN_USE (ref, rtl_insn)
> + {
> + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ()));
> + if (REG_P (dest_exp)
> + && DF_REF_REGNO (ref) == REGNO (dest_exp))
> + {
> + rtx *loc = DF_REF_LOC (ref);
> + insn_propagation prop (rtl_insn, old_dest, src);
> + if (GET_CODE (*loc) == SUBREG)
> + {
> + if (!prop.apply_to_pattern (loc))
> + {
> + if (dump_file != NULL)
> + {
> + fprintf (dump_file,
> + "Cannot propagate insn \n");
> + print_rtl_single (dump_file, rtl_insn);
> + }
> + return;
> + }
> + }
> + else
> + *loc = copy_rtx (src);
> + }
> + }
> + }
> + }
> +}
> +
> +// Set subreg for OO mode store pair to generate registers in pairs
> +// given insn_info I1 and I2.
> +static void
> +set_multiword_subreg_store (insn_info *i1, insn_info *i2)
> +{
> + rtx_insn *insn1 = i1->rtl ();
> + rtx_insn *insn2 = i2->rtl ();
> + rtx body = PATTERN (insn1);
> + rtx src_exp = SET_SRC (body);
> + rtx insn2_body = PATTERN (insn2);
> + rtx insn2_dest_exp = SET_DEST (insn2_body);
> + machine_mode mode = GET_MODE (src_exp);
> + int regoff;
> + rtx src;
> + rtx addr = XEXP (insn2_dest_exp, 0);
> +
> + PUT_MODE_RAW (src_exp, OOmode);
> + if (GET_CODE (addr) == PLUS
> + && XEXP (addr, 1) && CONST_INT_P (XEXP (addr, 1)))
> + regoff = 16;
> + else
> + regoff = 0;
> +
> + src = simplify_gen_subreg (mode,
> + src_exp, GET_MODE (src_exp),
> + regoff);
> +
> + set_store_subreg (i1, src, regoff);
> +
> + int regoff1 = 0;
> + rtx src1;
> +
> + src1 = simplify_gen_subreg (mode,
> + src_exp, GET_MODE (src_exp),
> + regoff1);
> +
> + set_store_subreg (i2, src1, regoff1);
> + set_rescan_store (i1);
> + set_rescan_store (i2);
> + df_insn_rescan (insn1);
> +}
> +
> +// Set subreg for OO mode pair load to generate registers in pairs given
> +// insn_info I1 and I2.
> +static void
> +set_multiword_subreg_load (insn_info *i1, insn_info *i2)
> +{
> + rtx_insn *insn1 = i1->rtl ();
> + rtx body = PATTERN (insn1);
> + rtx dest_exp = SET_DEST (body);
> + machine_mode mode = GET_MODE (dest_exp);
> + PUT_MODE_RAW (dest_exp, OOmode);
> +
> + int regoff = 0;
> + rtx src;
> +
> + src = simplify_gen_subreg (mode,
> + dest_exp, GET_MODE (dest_exp),
> + regoff);
> +
> + set_load_subreg (i2, src);
> +
> + int regoff1;
> + rtx src1;
> +
> + regoff1 = 16;
> + src1 = simplify_gen_subreg (mode,
> + dest_exp, GET_MODE (dest_exp),
> + regoff1);
> + set_load_subreg (i1, src1);
> +
> + set_rescan_load (i1);
> + set_rescan_load (i2);
> + df_insn_rescan (insn1);
> +}
> +
> +// Set subreg for OO mode pair load for existing subreg rtx to generate
> +// registers in pairs given insn_info I1 and I2.
> +static void
> +set_multiword_existing_subreg (insn_info *i1, insn_info *i2)
> +{
> + rtx_insn *insn1 = i1->rtl ();
> + rtx body = PATTERN (insn1);
> + rtx dest_exp = SET_DEST (body);
> + machine_mode mode = GET_MODE (dest_exp);
> + int regoff1;
> + regoff1 = 16;
> + rtx new_dest_exp = new_reg_rtx (dest_exp);
> +
> + rtx src = simplify_gen_subreg (mode,
> + new_dest_exp,
> + OOmode,
> + regoff1);
> +
> + set_load_subreg (i1, src);
> + propagate_insn (i1, new_dest_exp);
> +
> + int regoff = 0;
> + rtx sset = single_set (i2->rtl ());
> + rtx insn2_dest_exp = SET_DEST (sset);
> + machine_mode insn2_mode = GET_MODE (insn2_dest_exp);
> +
> + src = simplify_gen_subreg (insn2_mode,
> + new_dest_exp,
> + OOmode,
> + regoff);
> +
> + set_load_subreg (i2, src);
> + propagate_insn (i2, new_dest_exp);
> +
> + auto attempt = crtl->ssa->new_change_attempt ();
> + resource_info resource = { GET_MODE (new_dest_exp), REGNO (new_dest_exp) };
> + auto *set = crtl->ssa->allocate<set_info> (i1, resource);
> + if (set)
> + {
> + auto def = find_access (i1->defs (), REGNO (new_dest_exp));
> + if (!def)
> + i1->defs() = insert_access (attempt, set, i1->defs());
> + }
> +
> + set_rescan_load (i1);
> + set_rescan_load (i2);
> + df_insn_rescan (insn1);
> +}
> +
> +// Return true iff insn I1 has already existing subreg.
> +static bool
> +use_has_subreg_p (insn_info *i1)
> +{
> + for (auto def : i1->defs ())
> + {
> + auto set = dyn_cast<set_info *> (def);
> + for (auto use : set->nondebug_insn_uses ())
> + {
> + insn_info *info = use->insn ();
> + if (info && info->rtl ())
> + {
> + rtx_insn *rtl_insn = info->rtl ();
> + df_ref ref;
> + FOR_EACH_INSN_USE (ref, rtl_insn)
> + {
> + rtx dest_exp = SET_DEST (PATTERN (i1->rtl ()));
> + if (REG_P (dest_exp)
> + && DF_REF_REGNO (ref) == REGNO (dest_exp))
> + {
> + rtx *loc = DF_REF_LOC (ref);
> + if (GET_CODE (*loc) == SUBREG)
> + return true;
> + }
> + }
> + }
> + }
> + }
> + return false;
> +}
> +
> +// Set subreg for OO mode pair to generate sequential registers given
> +// insn_info pairs I1, I2 and LOAD_P is true iff load insn and false
> +// if store insn.
> +void
> +rs6000_pair_fusion::set_multiword_subreg (insn_info *i1, insn_info *i2,
> + bool load_p)
> +{
> + if (load_p)
> + {
> + bool i1_subreg_p = use_has_subreg_p (i1);
> + bool i2_subreg_p = use_has_subreg_p (i2);
> +
> + if (i1_subreg_p || i2_subreg_p)
> + set_multiword_existing_subreg (i1, i2);
> + else
> + set_multiword_subreg_load (i1, i2);
I don't understand this. Why do we have both set_multiword_existing_subreg
and set_multiword_subreg_load? i1_subreg_p and i2_subreg_p are logically
independent of one another (since i1 and i2 were separate instructions
until now). So "i1_subreg_p || i2_subreg_p" implies that
set_multiword_existing_subreg can handle i1s that have no existing
subreg (used when i2_subreg_p) and that it can handle i2s that have no
existing subreg (used when i1_subreg_p). So doesn't this mean that
set_multiword_existing_subreg can handle everything?
IMO, the way the update should work is that:
(a) all references to the old registers should be updated via
insn_propagation (regardless of whether the old references
involved subregs).
(b) those updates should be part of the same insn_change group as
the change to the load itself.
For stores, definitions of the stored register can probably be handled
directly using df_refs, but there too, the updates should IMO be part
of the same insn_change group as the change to the store itself.
In both cases, it's the:
crtl->ssa->change_insns (changes);
in pair_fusion_bb_info::fuse_pair that should be responsible for
updating the rtl-ssa IR. The changes that the pass wants to make
should be described as insn_changes and passed to change_insns.
The reason for funneling all changes through change_insns is that
it allows rtl-ssa to maintain more complex datastructures. Clients
aren't supposed to manually update the datastructures piecemeal.
Thanks,
Richard
> + }
> + else
> + set_multiword_subreg_store (i1, i2);
> +}
> +
> +rtx
> +rs6000_pair_fusion::gen_pair (rtx *pats, rtx, bool load_p)
> +{
> + rtx i1 = pats[0];
> + rtx src_exp = SET_SRC (i1);
> + rtx dest_exp = SET_DEST (i1);
> + PUT_MODE_RAW (src_exp, OOmode);
> + PUT_MODE_RAW (dest_exp, OOmode);
> + rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest_exp),
> + gen_rtvec (1, src_exp),
> + UNSPEC_LXVP);
> + rtx set = gen_rtx_SET (dest_exp, unspec);
> + if (dump_file)
> + {
> + if (load_p)
> + fprintf (dump_file, "lxv with lxvp ");
> + else
> + fprintf (dump_file, "stxv with stxvp ");
> + print_rtl_single (dump_file, set);
> + }
> + return set;
> +}
> +
> +const pass_data pass_data_mem_fusion =
> +{
> + RTL_PASS, /* type */
> + "mem_fusion", /* name */
> + OPTGROUP_NONE, /* optinfo_flags */
> + TV_NONE, /* tv_id */
> + 0, /* properties_required */
> + 0, /* properties_provided */
> + 0, /* properties_destroyed */
> + 0, /* todo_flags_start */
> + TODO_df_finish, /* todo_flags_finish */
> +};
> +
> +class pass_mem_fusion : public rtl_opt_pass
> +{
> +public:
> + pass_mem_fusion (gcc::context *ctxt)
> + : rtl_opt_pass (pass_data_mem_fusion, ctxt)
> + {}
> +
> + opt_pass *clone () override { return new pass_mem_fusion (m_ctxt);}
> +
> + /* opt_pass methods: */
> + bool gate (function *)
> + {
> + return (optimize > 0 && TARGET_VSX && TARGET_POWER10);
> + }
> +
> + unsigned int execute (function *) final override
> + {
> + rs6000_pair_fusion pass;
> + pass.run ();
> + return 0;
> + }
> +}; // class pass_mem_fusion
> +
> +rtl_opt_pass *
> +make_pass_mem_fusion (gcc::context *ctxt)
> +{
> + return new pass_mem_fusion (ctxt);
> +}
> diff --git a/gcc/config/rs6000/rs6000-passes.def
> b/gcc/config/rs6000/rs6000-passes.def
> index 46a0d0b8c56..0b48f57014d 100644
> --- a/gcc/config/rs6000/rs6000-passes.def
> +++ b/gcc/config/rs6000/rs6000-passes.def
> @@ -28,7 +28,9 @@ along with GCC; see the file COPYING3. If not see
> The power8 does not have instructions that automaticaly do the byte
> swaps
> for loads and stores. */
> INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps);
> -
> + /* Pass to replace adjacent memory addresses lxv/stxv instruction with
> + lxvp/stxvp instruction. */
> + INSERT_PASS_BEFORE (pass_early_remat, 1, pass_mem_fusion);
> /* Pass to do the PCREL_OPT optimization that combines the load of an
> external symbol's address along with a single load or store using that
> address as a base register. */
> diff --git a/gcc/config/rs6000/rs6000-protos.h
> b/gcc/config/rs6000/rs6000-protos.h
> index 09a57a806fa..1412b31c2eb 100644
> --- a/gcc/config/rs6000/rs6000-protos.h
> +++ b/gcc/config/rs6000/rs6000-protos.h
> @@ -343,6 +343,7 @@ namespace gcc { class context; }
> class rtl_opt_pass;
>
> extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *);
> +extern rtl_opt_pass *make_pass_mem_fusion (gcc::context *);
> extern rtl_opt_pass *make_pass_pcrel_opt (gcc::context *);
> extern bool rs6000_sum_of_two_registers_p (const_rtx expr);
> extern bool rs6000_quadword_masked_address_p (const_rtx exp);
> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
> index 58553ff66f4..6da4e70973d 100644
> --- a/gcc/config/rs6000/rs6000.cc
> +++ b/gcc/config/rs6000/rs6000.cc
> @@ -27428,7 +27428,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
> reg_mode = word_mode;
> reg_mode_size = GET_MODE_SIZE (reg_mode);
>
> - gcc_assert (reg_mode_size * nregs == GET_MODE_SIZE (mode));
> + gcc_assert (mode == OOmode
> + || reg_mode_size * nregs == GET_MODE_SIZE (mode));
>
> /* TDmode residing in FP registers is special, since the ISA requires that
> the lower-numbered word of a register pair is always the most
> significant
> @@ -27475,6 +27476,11 @@ rs6000_split_multireg_move (rtx dst, rtx src)
> int reg_mode_nregs = hard_regno_nregs (reg, reg_mode);
> if (MEM_P (dst))
> {
> + rtx addr = XEXP (dst, 0);
> + rtx opnd1 = NULL_RTX;
> + if (addr && GET_CODE (addr) == PLUS)
> + opnd1 = XEXP (addr,1);
> +
> unsigned offset = 0;
> unsigned size = GET_MODE_SIZE (reg_mode);
>
> @@ -27488,7 +27494,13 @@ rs6000_split_multireg_move (rtx dst, rtx src)
> {
> unsigned subreg
> = WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i);
> - rtx dst2 = adjust_address (dst, reg_mode, offset);
> + rtx dst2 = dst;
> +
> + if ((GET_CODE (addr) != PLUS
> + || (opnd1 && CONST_INT_P(opnd1))))
> + dst2 = adjust_address (dst, reg_mode, offset);
> + else
> + PUT_MODE_RAW (dst, reg_mode);
> rtx src2 = gen_rtx_REG (reg_mode, reg + subreg);
> offset += size;
> emit_insn (gen_rtx_SET (dst2, src2));
> @@ -27499,15 +27511,25 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>
> if (MEM_P (src))
> {
> + rtx addr = XEXP (src, 0);
> + rtx opnd1 = NULL_RTX;
> + if (addr && GET_CODE (addr) == PLUS)
> + opnd1 = XEXP (addr,1);
> +
> unsigned offset = 0;
> unsigned size = GET_MODE_SIZE (reg_mode);
>
> - for (int i = 0; i < nregs; i += reg_mode_nregs)
> + for (int i = nregs-1; i >= 0; i -= reg_mode_nregs)
> {
> unsigned subreg
> = WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i);
> rtx dst2 = gen_rtx_REG (reg_mode, reg + subreg);
> - rtx src2 = adjust_address (src, reg_mode, offset);
> + rtx src2 = src;
> +
> + if ((GET_CODE (addr) != PLUS || (opnd1 && CONST_INT_P (opnd1))))
> + src2 = adjust_address (src, reg_mode, offset);
> + else
> + PUT_MODE_RAW (src2, reg_mode);
> offset += size;
> emit_insn (gen_rtx_SET (dst2, src2));
> }
> @@ -27515,7 +27537,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
> /* If we are writing an accumulator register, we have to
> prime it after we've written it. */
> if (TARGET_MMA
> - && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst)))
> + && REG_P (dst) && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO
> (dst)))
> emit_insn (gen_mma_xxmtacc (dst, dst));
>
> return;
> @@ -27608,9 +27630,12 @@ rs6000_split_multireg_move (rtx dst, rtx src)
> {
> for (i = nregs - 1; i >= 0; i--)
> {
> - rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i);
> - rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i);
> - emit_insn (gen_rtx_SET (dst_i, src_i));
> + if (REG_P (dst) && REG_P (src))
> + {
> + rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i);
> + rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i);
> + emit_insn (gen_rtx_SET (dst_i, src_i));
> + }
> }
> }
> else
> @@ -27625,7 +27650,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
> /* If we are writing an accumulator register, we have to
> prime it after we've written it. */
> if (TARGET_MMA
> - && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst)))
> + && REG_P (dst) && GET_MODE (dst) == XOmode
> + && FP_REGNO_P (REGNO (dst)))
> emit_insn (gen_mma_xxmtacc (dst, dst));
> }
> else
> @@ -27682,7 +27708,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>
> /* If the base register we are using to address memory is
> also a destination reg, then change that register last. */
> - if (REG_P (breg)
> + if (REG_P (dst) && REG_P (breg)
> && REGNO (breg) >= REGNO (dst)
> && REGNO (breg) < REGNO (dst) + nregs)
> j = REGNO (breg) - REGNO (dst);
> @@ -27780,9 +27806,12 @@ rs6000_split_multireg_move (rtx dst, rtx src)
> /* XO/OO are opaque so cannot use subregs. */
> if (mode == OOmode || mode == XOmode )
> {
> - rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j);
> - rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j);
> - emit_insn (gen_rtx_SET (dst_i, src_i));
> + if (REG_P (dst) && REG_P (src))
> + {
> + rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j);
> + rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j);
> + emit_insn (gen_rtx_SET (dst_i, src_i));
> + }
> }
> else
> emit_insn (gen_rtx_SET (simplify_gen_subreg (reg_mode, dst, mode,
> @@ -27800,7 +27829,9 @@ rs6000_split_multireg_move (rtx dst, rtx src)
> if (restore_basereg != NULL_RTX)
> emit_insn (restore_basereg);
> }
> + return;
> }
> +
>
> /* Return true if the peephole2 can combine a load involving a combination of
> an addis instruction and a load with an offset that can be fused together
> on
> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
> index a5d20594789..2106e1a1fed 100644
> --- a/gcc/config/rs6000/rs6000.md
> +++ b/gcc/config/rs6000/rs6000.md
> @@ -159,6 +159,7 @@
> UNSPEC_XXSPLTIW_CONST
> UNSPEC_FMAX
> UNSPEC_FMIN
> + UNSPEC_LXVP
> ])
>
> ;;
> diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
> index b3ce09d523b..df9b3a35b66 100644
> --- a/gcc/config/rs6000/t-rs6000
> +++ b/gcc/config/rs6000/t-rs6000
> @@ -35,6 +35,11 @@ rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.cc
> $(COMPILE) $<
> $(POSTCOMPILE)
>
> +rs6000-mem-fusion.o: $(srcdir)/config/rs6000/rs6000-mem-fusion.cc
> + $(COMPILE) $<
> + $(POSTCOMPILE)
> +
> +
> rs6000-d.o: $(srcdir)/config/rs6000/rs6000-d.cc
> $(COMPILE) $<
> $(POSTCOMPILE)
> diff --git a/gcc/pair-fusion.cc b/gcc/pair-fusion.cc
> index 31d2c21c88f..ff77a0bc8c6 100644
> --- a/gcc/pair-fusion.cc
> +++ b/gcc/pair-fusion.cc
> @@ -312,9 +312,9 @@ static int
> encode_lfs (lfs_fields fields)
> {
> int size_log2 = exact_log2 (fields.size);
> - gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
> - return ((int)fields.load_p << 3)
> - | ((int)fields.fpsimd_p << 2)
> + gcc_checking_assert (size_log2 >= 2 && size_log2 <= 9);
> + return ((int)fields.load_p << 4)
> + | ((int)fields.fpsimd_p << 3)
> | (size_log2 - 2);
> }
>
> @@ -322,8 +322,8 @@ encode_lfs (lfs_fields fields)
> static lfs_fields
> decode_lfs (int lfs)
> {
> - bool load_p = (lfs & (1 << 3));
> - bool fpsimd_p = (lfs & (1 << 2));
> + bool load_p = (lfs & (1 << 4));
> + bool fpsimd_p = (lfs & (1 << 3));
> unsigned size = 1U << ((lfs & 3) + 2);
> return { load_p, fpsimd_p, size };
> }
> @@ -425,6 +425,9 @@ pair_fusion_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
> if (MEM_VOLATILE_P (mem))
> return;
>
> + if (load_p && !m_pass->fuseable_load_p (insn))
> + return;
> +
> // Ignore writeback accesses if the hook says to do so.
> if (!m_pass->should_handle_writeback (writeback_type::EXISTING)
> && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
> @@ -1814,7 +1817,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
> }
>
> rtx reg_notes = combine_reg_notes (first, second, load_p);
> -
> + m_pass->set_multiword_subreg (i1, i2, load_p);
> rtx pair_pat = m_pass->gen_pair (pats, writeback_effect, load_p);
> insn_change *pair_change = nullptr;
> auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
> @@ -1833,6 +1836,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
> pair_change->new_defs = merge_access_arrays (attempt,
> input_defs[0],
> input_defs[1]);
> + m_pass->modify_new_rtx_insn (first, &attempt, &pair_change, changes);
> gcc_assert (pair_change->new_defs.is_valid ());
>
> pair_change->new_uses
> @@ -2405,6 +2409,15 @@ pair_fusion_bb_info::try_fuse_pair (bool load_p, unsigned access_size,
> reg_ops[i] = XEXP (pats[i], !load_p);
> }
>
> + if (!load_p && !m_pass->fuseable_store_p (i1, i2))
> + {
> + if (dump_file)
> + fprintf (dump_file,
> + "punting on store-mem-pairs due to non fuseable cand (%d,%d)\n",
> + insns[0]->uid (), insns[1]->uid ());
> + return false;
> + }
> +
> if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
> {
> if (dump_file)
> @@ -2997,6 +3010,8 @@ void pair_fusion::process_block (bb_info *bb)
> if (GET_CODE (pat) != SET)
> continue;
>
> + change_existing_multword_mode (rti);
> +
> if (track_stores && MEM_P (XEXP (pat, 0)))
> bb_state.track_access (insn, false, XEXP (pat, 0));
> else if (track_loads && MEM_P (XEXP (pat, 1)))
> diff --git a/gcc/pair-fusion.h b/gcc/pair-fusion.h
> index 45e4edceecb..756357db794 100644
> --- a/gcc/pair-fusion.h
> +++ b/gcc/pair-fusion.h
> @@ -26,8 +26,11 @@ namespace rtl_ssa {
> class insn_info;
> class insn_range_info;
> class bb_info;
> + class insn_change;
> }
>
> +class obstack_watermark;
> +
> // Information about a potential base candidate, used in try_fuse_pair.
> // There may be zero, one, or two viable RTL bases for a given pair.
> struct base_cand
> @@ -142,6 +145,19 @@ struct pair_fusion {
> // true iff INSN is a load pair.
> virtual bool pair_mem_insn_p (rtx_insn *insn, bool &load_p) = 0;
>
> + // Given INSN, change a multiword mode load or store to the respective
> + // unspec instruction.
> + virtual void change_existing_multword_mode (rtx_insn *insn) = 0;
> +
> + // Given insn FIRST, watermark ATTEMPT and PAIR_CHANGE, set the
> + // new rtx using FIRST. Remove all uses of definitions that are
> + // removed, given CHANGES.
> + virtual void modify_new_rtx_insn (rtl_ssa::insn_info *first,
> + obstack_watermark *attempt,
> + rtl_ssa::insn_change **pair_change,
> + auto_vec<rtl_ssa::insn_change *> &changes)
> + = 0;
> +
> // Return true if we should track loads.
> virtual bool track_loads_p ()
> {
> @@ -171,6 +187,24 @@ struct pair_fusion {
> virtual rtx gen_promote_writeback_pair (rtx wb_effect, rtx mem,
> rtx regs[2], bool load_p) = 0;
>
> + // Given insn_info pair I1 and I2, set subregs of multiword registers
> + // so that the register allocators can assign register pairs.
> + // LOAD_P is true iff the pair is a load.
> + virtual void set_multiword_subreg (rtl_ssa::insn_info *i1,
> + rtl_ssa::insn_info *i2,
> + bool load_p) = 0;
> +
> + // Given insn_info pair I1 and I2, check whether the pair is a feasible
> + // candidate for a store mem pair.
> + // Return true if a store mem pair is feasible, otherwise false.
> + virtual bool fuseable_store_p (rtl_ssa::insn_info *i1,
> + rtl_ssa::insn_info *i2) = 0;
> +
> + // Given insn_info INFO for a load, check whether it is a feasible
> + // candidate for a load mem pair.
> + // Return true if a load mem pair is feasible, otherwise false.
> + virtual bool fuseable_load_p (rtl_ssa::insn_info *info) = 0;
> +
> void process_block (rtl_ssa::bb_info *bb);
> rtl_ssa::insn_info *find_trailing_add (rtl_ssa::insn_info *insns[2],
> const rtl_ssa::insn_range_info
> diff --git a/gcc/rtl-ssa/functions.h b/gcc/rtl-ssa/functions.h
> index e2134621723..d5c5b80f8aa 100644
> --- a/gcc/rtl-ssa/functions.h
> +++ b/gcc/rtl-ssa/functions.h
> @@ -222,6 +222,13 @@ public:
> template<typename T, typename... Ts>
> T *change_alloc (obstack_watermark &wm, Ts... args);
>
> + auto_vec<access_info *> &get_m_temp_defs () { return m_temp_defs; }
> +
> + template<typename T, typename... Ts>
> + T *allocate (Ts... args);
> +
> + void remove_use (use_info *);
> +
> private:
> class bb_phi_info;
> class build_info;
> @@ -231,9 +238,6 @@ private:
> // allocate_temp during its lifetime.
> obstack_watermark temp_watermark () { return &m_temp_obstack; }
>
> - template<typename T, typename... Ts>
> - T *allocate (Ts... args);
> -
> template<typename T, typename... Ts>
> T *allocate_temp (Ts... args);
>
> @@ -269,7 +273,6 @@ private:
> static void insert_use_after (use_info *, use_info *);
>
> void add_use (use_info *);
> - void remove_use (use_info *);
>
> insn_info::order_node *need_order_node (insn_info *);
>
> diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
> new file mode 100644
> index 00000000000..d10ff0cdf36
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target power10_ok } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
> +
> +#include <altivec.h>
> +
> +void
> +foo2 ()
> +{
> + __vector_quad *dst1;
> + __vector_quad *dst2;
> + vector unsigned char src;
> + __vector_quad acc;
> + vector unsigned char *ptr;
> + __builtin_mma_xvf32ger(&acc, src, ptr[0]);
> + __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
> + *dst1 = acc;
> + __builtin_mma_xvf32ger(&acc, src, ptr[2]);
> + __builtin_mma_xvf32gerpp(&acc, src, ptr[3]);
> + *dst2 = acc;
> +}
> +/* { dg-final { scan-assembler {\mlxvp\M} } } */
> diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion.C b/gcc/testsuite/g++.target/powerpc/mem-fusion.C
> new file mode 100644
> index 00000000000..c523572cf3c
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion.C
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target power10_ok } */
> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
> +
> +#include <altivec.h>
> +
> +void
> +foo (__vector_quad *dst, vector unsigned char *ptr, vector unsigned char src)
> +{
> + __vector_quad acc;
> + __builtin_mma_xvf32ger(&acc, src, ptr[0]);
> + __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
> + *dst = acc;
> +}
> +/* { dg-final { scan-assembler {\mlxvp\M} } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
> index 69ee826e1be..ae29127f954 100644
> --- a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
> +++ b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
> @@ -258,8 +258,8 @@ foo13b (__vector_quad *dst, __vector_quad *src, vec_t *vec)
> dst[13] = acc;
> }
>
> -/* { dg-final { scan-assembler-times {\mlxv\M} 40 } } */
> -/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */
> +/* { dg-final { scan-assembler-times {\mlxv\M} 0 } } */
> +/* { dg-final { scan-assembler-times {\mlxvp\M} 32 } } */
> /* { dg-final { scan-assembler-times {\mstxvp\M} 40 } } */
> /* { dg-final { scan-assembler-times {\mxxmfacc\M} 20 } } */
> /* { dg-final { scan-assembler-times {\mxxmtacc\M} 6 } } */