Hello Richard:

On 18/07/24 2:04 pm, Ajit Agarwal wrote:
> Hello Richard:
> 
> On 18/07/24 1:17 am, Richard Sandiford wrote:
>> Ajit Agarwal <aagar...@linux.ibm.com> writes:
>>> Hello All:
>>>
>>> This version of patch relaxes store fusion for more use cases.
>>>
>>> Common infrastructure using generic code for pair mem fusion of different
>>> targets.
>>>
>>> rs6000 target specific code implement virtual functions defined by generic 
>>> code.
>>>
>>> Target specific code are added in rs6000-mem-fusion.cc.
>>>
>>> Bootstrapped and regtested on powerpc64-linux-gnu.
>>>
>>> Thanks & Regards
>>> Ajit
>>>
>>>
>>> rs6000, middle-end: Add implementation for different targets for pair mem 
>>> fusion
>>>
>>> Common infrastructure using generic code for pair mem fusion of different
>>> targets.
>>>
>>> rs6000 target specific code implement virtual functions defined by generic 
>>> code.
>>>
>>> Target specific code are added in rs6000-mem-fusion.cc.
>>>
>>> 2024-07-02  Ajit Kumar Agarwal  <aagar...@linux.ibm.com>
>>>
>>> gcc/ChangeLog:
>>>
>>>     * config/rs6000/rs6000-passes.def: New mem fusion pass
>>>     before pass_early_remat.
>>>     * pair-fusion.h: Add additional pure virtual function
>>>     required for rs6000 target implementation.
>>>     * pair-fusion.cc: Use of virtual functions for additional
>>>     virtual function added for rs6000 target.
>>>     * config/rs6000/rs6000-mem-fusion.cc: Add new pass.
>>>     Add target specific implementation for generic pure virtual
>>>     functions.
>>>     * config/rs6000/mma.md: Modify movoo machine description.
>>>     Add new machine description movoo1.
>>>     * config/rs6000/rs6000.cc: Modify rs6000_split_multireg_move
>>>     to expand movoo machine description for all constraints.
>>>     * config.gcc: Add new object file.
>>>     * config/rs6000/rs6000-protos.h: Add new prototype for mem
>>>     fusion pass.
>>>     * config/rs6000/t-rs6000: Add new rule.
>>>     * rtl-ssa/functions.h: Move out allocate function from private
>>>     to public and add get_m_temp_defs function.
>>>
>>> gcc/testsuite/ChangeLog:
>>>
>>>     * g++.target/powerpc/mem-fusion.C: New test.
>>>     * g++.target/powerpc/mem-fusion-1.C: New test.
>>>     * gcc.target/powerpc/mma-builtin-1.c: Modify test.
>>> ---
>>>  gcc/config.gcc                                |   2 +
>>>  gcc/config/rs6000/mma.md                      |  26 +-
>>>  gcc/config/rs6000/rs6000-mem-fusion.cc        | 708 ++++++++++++++++++
>>>  gcc/config/rs6000/rs6000-passes.def           |   4 +-
>>>  gcc/config/rs6000/rs6000-protos.h             |   1 +
>>>  gcc/config/rs6000/rs6000.cc                   |  57 +-
>>>  gcc/config/rs6000/rs6000.md                   |   1 +
>>>  gcc/config/rs6000/t-rs6000                    |   5 +
>>>  gcc/pair-fusion.cc                            |  27 +-
>>>  gcc/pair-fusion.h                             |  34 +
>>>  gcc/rtl-ssa/functions.h                       |  11 +-
>>>  .../g++.target/powerpc/mem-fusion-1.C         |  22 +
>>>  gcc/testsuite/g++.target/powerpc/mem-fusion.C |  15 +
>>>  .../gcc.target/powerpc/mma-builtin-1.c        |   4 +-
>>>  14 files changed, 890 insertions(+), 27 deletions(-)
>>>  create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc
>>>  create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
>>>  create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C
>>>
>>> diff --git a/gcc/config.gcc b/gcc/config.gcc
>>> index bc45615741b..12f79a78177 100644
>>> --- a/gcc/config.gcc
>>> +++ b/gcc/config.gcc
>>> @@ -524,6 +524,7 @@ powerpc*-*-*)
>>>     extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
>>>     extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
>>>     extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
>>> +   extra_objs="${extra_objs} rs6000-mem-fusion.o"
>>>     extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
>>>     extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
>>>     extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
>>> @@ -560,6 +561,7 @@ rs6000*-*-*)
>>>     extra_options="${extra_options} g.opt fused-madd.opt 
>>> rs6000/rs6000-tables.opt"
>>>     extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
>>>     extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
>>> +   extra_objs="${extra_objs} rs6000-mem-fusion.o"
>>>     target_gtfiles="$target_gtfiles 
>>> \$(srcdir)/config/rs6000/rs6000-logue.cc 
>>> \$(srcdir)/config/rs6000/rs6000-call.cc"
>>>     target_gtfiles="$target_gtfiles 
>>> \$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
>>>     ;;
>>> diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
>>> index 04e2d0066df..88413926a02 100644
>>> --- a/gcc/config/rs6000/mma.md
>>> +++ b/gcc/config/rs6000/mma.md
>>> @@ -294,7 +294,31 @@
>>>  
>>>  (define_insn_and_split "*movoo"
>>>    [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa")
>>> -   (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
>>> +        (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
>>> +  "TARGET_MMA
>>> +   && (gpc_reg_operand (operands[0], OOmode)
>>> +       || gpc_reg_operand (operands[1], OOmode))"
>>> +;;    ""
>>> +  "@
>>> +   #
>>> +   #
>>> +   #"
>>> +  "&& reload_completed"
>>> +  [(const_int 0)]
>>> +{
>>> +  rs6000_split_multireg_move (operands[0], operands[1]);
>>> +  DONE;
>>> +}
>>> +  [(set_attr "type" "vecload,vecstore,veclogical")
>>> +   (set_attr "length" "*,*,8")])
>>> +;;   (set_attr "max_prefixed_insns" "2,2,*")])
>>> +
>>> +
>>> +(define_insn_and_split "*movoo1"
>>> +  [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa")
>>> +        (unspec [
>>> +          (match_operand:OO 1 "input_operand" "ZwO,wa,wa")
>>> +        ] UNSPEC_LXVP))]
>>>    "TARGET_MMA
>>>     && (gpc_reg_operand (operands[0], OOmode)
>>>         || gpc_reg_operand (operands[1], OOmode))"
>>> diff --git a/gcc/config/rs6000/rs6000-mem-fusion.cc 
>>> b/gcc/config/rs6000/rs6000-mem-fusion.cc
>>> new file mode 100644
>>> index 00000000000..b63b6f31001
>>> --- /dev/null
>>> +++ b/gcc/config/rs6000/rs6000-mem-fusion.cc
>>> @@ -0,0 +1,708 @@
>>> +/* Subroutines used to perform adjacent load/store into
>>> +   paired memory accesses for TARGET_POWER10 and TARGET_VSX.
>>> +
>>> +   Copyright (C) 2024 Free Software Foundation, Inc.
>>> +
>>> +   This file is part of GCC.
>>> +
>>> +   GCC is free software; you can redistribute it and/or modify it
>>> +   under the terms of the GNU General Public License as published
>>> +   by the Free Software Foundation; either version 3, or (at your
>>> +   option) any later version.
>>> +
>>> +   GCC is distributed in the hope that it will be useful, but WITHOUT
>>> +   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
>>> +   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
>>> +   License for more details.
>>> +
>>> +   You should have received a copy of the GNU General Public License
>>> +   along with GCC; see the file COPYING3.  If not see
>>> +   <http://www.gnu.org/licenses/>.  */
>>> +
>>> +#define INCLUDE_ALGORITHM
>>> +#define INCLUDE_FUNCTIONAL
>>> +#define INCLUDE_LIST
>>> +#define INCLUDE_TYPE_TRAITS
>>> +#include "config.h"
>>> +#include "system.h"
>>> +#include "coretypes.h"
>>> +#include "backend.h"
>>> +#include "rtl.h"
>>> +#include "df.h"
>>> +#include "rtl-iter.h"
>>> +#include "rtl-ssa.h"
>>> +#include "rtl-ssa/internals.h"
>>> +#include "rtl-ssa/internals.inl"
>>> +#include "cfgcleanup.h"
>>> +#include "tree-pass.h"
>>> +#include "pair-fusion.h"
>>> +
>>> +using namespace rtl_ssa;
>>> +
>>> +struct rs6000_pair_fusion : public pair_fusion
>>> +{
>>> +  bool fpsimd_op_p (rtx , machine_mode , bool)  override final
>>> +  {
>>> +    return false;
>>> +  }
>>> +
>>> +  bool pair_mem_insn_p (rtx_insn *, bool &) override final
>>> +  {
>>> +    return false;
>>> +  }
>>> +
>>> +  void change_existing_multword_mode (rtx_insn *insn) override final;
>>> +
>>> +  bool pair_mem_ok_with_policy (rtx, bool) override final
>>> +  {
>>> +    return true;
>>> +  }
>>> +
>>> +  bool pair_operand_mode_ok_p (machine_mode mode) override final;
>>> +
>>> +  rtx gen_pair (rtx *pats, rtx, bool load_p) override final;
>>> +
>>> +  bool pair_reg_operand_ok_p (bool, rtx, machine_mode) override final
>>> +  {
>>> +    return true;
>>> +  }
>>> +
>>> +  int pair_mem_alias_check_limit () override final
>>> +  {
>>> +    return 0;
>>> +  }
>>> +
>>> +  bool should_handle_writeback (enum writeback_type) override final
>>> +  {
>>> +    return false;
>>> +  }
>>> +
>>> +  bool track_loads_p () override final
>>> +  {
>>> +    return true;
>>> +  }
>>> +
>>> +  bool track_stores_p () override final
>>> +  {
>>> +    return true;
>>> +  }
>>> +
>>> +  bool pair_mem_in_range_p (HOST_WIDE_INT) override final
>>> +  {
>>> +    return true;
>>> +  }
>>> +
>>> +  rtx gen_promote_writeback_pair (rtx, rtx, rtx *, bool) override final
>>> +  {
>>> +    return NULL_RTX;
>>> +  }
>>> +
>>> +  rtx destructure_pair (rtx_def **, rtx, bool) override final
>>> +  {
>>> +    return NULL_RTX;
>>> +  }
>>> +
>>> +  bool fuseable_store_p (insn_info *i1, insn_info *i2) override final;
>>> +
>>> +  bool fuseable_load_p (insn_info *insn) override final;
>>> +
>>> +  void set_multiword_subreg (insn_info *i1, insn_info *i2,
>>> +                        bool load_p) override final;
>>> +
>>> +  void modify_new_rtx_insn (insn_info *first, obstack_watermark *attempt,
>>> +                       insn_change **pair_change,
>>> +                       auto_vec <insn_change *> &changes) override final;
>>> +};
>>> +
>>> +bool
>>> +rs6000_pair_fusion::pair_operand_mode_ok_p (machine_mode mode)
>>> +{
>>> +  return (ALTIVEC_OR_VSX_VECTOR_MODE (mode));
>>> +}
>>> +
>>> +void
>>> +rs6000_pair_fusion::change_existing_multword_mode (rtx_insn *insn)
>>> +{
>>> +  rtx set = single_set (insn);
>>> +  rtx src = SET_SRC (set);
>>> +  rtx dest = SET_DEST (set);
>>> +  rtx copy = NULL_RTX;
>>> +
>>> +  if ((MEM_P (src) && GET_MODE (src) == OOmode)
>>> +       || (MEM_P (dest) && GET_MODE (dest) == OOmode))
>>> +    {
>>> +      rtx unspec  = gen_rtx_UNSPEC (GET_MODE (dest),
>>> +                               gen_rtvec (1, src),
>>> +                               UNSPEC_LXVP);
>>> +      copy =  gen_rtx_SET (dest, unspec);
>>> +      rtx_insn *new_insn = emit_insn_after (copy, insn);
>>> +      set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn));
>>> +      df_insn_rescan (new_insn);
>>> +      df_insn_delete (insn);
>>> +      remove_insn (insn);
>>> +      insn->set_deleted ();
>>> +    }
>>> +}
>>> +
>>> +static void
>>> +update_change (set_info *set)
>>> +{
>>> +  if (!set->has_any_uses ())
>>> +    return;
>>> +
>>> +  auto *use = *set->all_uses ().begin ();
>>> +  do
>>> +    {
>>> +      auto *next_use = use->next_use ();
>>> +      if (use->is_in_phi ())
>>> +   {
>>> +     update_change (use->phi ());
>>> +   }
>>> +      else
>>> +   {
>>> +     crtl->ssa->remove_use (use);
>>> +   }
>>> +      use = next_use;
>>> +    }
>>> +  while (use);
>>> +}
>>> +
>>> +void
>>> +rs6000_pair_fusion::modify_new_rtx_insn (insn_info *first,
>>> +                                    obstack_watermark *attempt,
>>> +                                    insn_change **pair_change,
>>> +                                    auto_vec<insn_change *> &changes)
>>> +{
>>> +  for (insn_change *change : changes)
>>> +    for (auto def : change->old_defs ())
>>> +      {
>>> +   auto set = dyn_cast<set_info *> (def);
>>> +   update_change (set);
>>> +      }
>>> +
>>> +  auto &new_defs = (*pair_change)->new_defs;
>>> +  vec_rtx_properties properties;
>>> +  properties.add_insn (first->rtl (), true);
>>> +  // Build up the new list of definitions.
>>> +  for (rtx_obj_reference ref : properties.refs ())
>>> +    if (ref.is_write ())
>>> +      {
>>> +   auto *set = crtl->ssa->allocate<set_info> (first,
>>> +                                              full_register (ref.regno));
>>> +   if (set)
>>> +     {
>>> +       auto def = find_access (new_defs, ref.regno);
>>> +       if (!def)
>>> +         {
>>> +           new_defs = insert_access (*attempt, set,
>>> +                                     new_defs);
>>> +           auto &m_temp_defs = crtl->ssa->get_m_temp_defs ();
>>> +           m_temp_defs.safe_push (set);
>>> +         }
>>> +     }
>>> +      }
>>> +}
>>> +
>>> +// df_insn_rescan dependent instruction where operands
>>> +// are reversed given insn_info INFO.
>>> +static void
>>> +set_rescan_load (insn_info *i1)
>>> +{
>>> +  for (auto def : i1->defs ())
>>> +    {
>>> +      auto set = dyn_cast<set_info *> (def);
>>> +      for (auto use : set->all_uses ())
>>> +   {
>>> +     insn_info *info = use->insn ();
>>> +     if (info && info->rtl ())
>>> +       {
>>> +         rtx_insn *rtl_insn = info->rtl ();
>>> +         df_insn_rescan (rtl_insn);
>>> +       }
>>> +   }
>>> +    }
>>> +}
>>> +
>>> +// df_insn_rescan the def instruction where operands are reversed given 
>>> INSN.
>>> +static bool
>>> +set_rescan_store (insn_info *insn)
>>> +{
>>> +  for (auto use : insn->uses())
>>> +    {
>>> +      auto def = use->def ();
>>> +
>>> +      if (!def)
>>> +   return false;
>>> +
>>> +      if (def->insn ()->is_artificial ())
>>> +   return false;
>>> +
>>> +      if (def->insn () && def->insn ()->rtl ()
>>> +     && def->insn()->is_real ())
>>> +   {
>>> +     rtx_insn *rtl_insn = def->insn ()->rtl ();
>>> +     rtx set = single_set (rtl_insn);
>>> +
>>> +     if (set == NULL_RTX)
>>> +       return false;
>>> +     df_insn_rescan (rtl_insn);
>>> +   }
>>> +    }
>>> +  return true;
>>> +}
>>> +
>>> +// Check for feasibility of store to be fuseable or not. Return true if
>>> +// feasible otherwise false.
>>> +static bool
>>> +feasible_store_p (insn_info *insn)
>>> +{
>>> +  for (auto use : insn->uses ())
>>> +    {
>>> +      auto def = use->def ();
>>> +
>>> +      if (def->insn ()->is_artificial ())
>>> +   return false;
>>> +
>>> +      if (def->insn () && def->insn ()->rtl ()
>>> +     && def->insn()->is_real ())
>>> +   {
>>> +     rtx_insn *rtl_insn = def->insn ()->rtl ();
>>> +     rtx set = single_set (rtl_insn);
>>> +
>>> +     if (set == NULL_RTX)
>>> +       return false;
>>> +
>>> +     // Return false if dependent def is load.
>>> +     // This is done as def instruction could be a fused load and
>>> +     // to avoid already existing subreg (reg:OO R) offset.
>>> +     if (rtl_insn && MEM_P (SET_SRC (set)))
>>> +       return false;
>>> +
>>> +     // Return false if dependent def is store.
>>> +     if (rtl_insn && MEM_P (SET_DEST (set)))
>>> +       return false;
>>
>> I don't understand these tests.  It might help to turn it around and
>> say: what sort of cases do you want to handle?

If the def instruction is a load or a store that has already been fused,
then I bail out of store fusion.

>>
>>> +   }
>>> +    }
>>> +  return true;
>>> +}
>>> +
>>> +// Check if store can be fuseable or not.  Return true if fuseable 
>>> otherwise
>>> +// false.
>>> +bool
>>> +rs6000_pair_fusion::fuseable_store_p (insn_info *i1, insn_info *i2)
>>> +{
>>> +  rtx_insn *insn1 = i1->rtl ();
>>> +  rtx_insn *insn2 = i2->rtl ();
>>> +  rtx body = PATTERN (insn1);
>>> +  rtx src_exp = SET_SRC (body);
>>> +  rtx insn2_body = PATTERN (insn2);
>>> +  rtx insn2_src_exp = SET_SRC (insn2_body);
>>> +
>>> +  if (!(REG_P (src_exp)
>>> +      && crtl->ssa->single_dominating_def (REGNO (src_exp))))
>>> +    return false;
>>> +
>>> +  // This is done as def instruction could be a fused load and
>>> +  // to avoid  already existing subreg (reg:OO R) offset.
>>> +  if (DF_REG_USE_COUNT (REGNO (src_exp)) > 1)
>>> +    return false;
>>> +
>>> +  // Return false if src of insn1 and src of insn2 are same.
>>> +  if (src_exp == insn2_src_exp)
>>> +    return false;
>>> +
>>> +  // Return false if src of insn1 is subreg.
>>> +  if (GET_CODE (src_exp) == SUBREG)
>>> +    return false;
>>
>> This can't be true after the REG_P check above.
>>
> 
> I will make this change.
>  

I have made these changes and will send them in a separate patch.

>>> +
>>> +  // Return false if src of insn2 is subreg.
>>> +  if (GET_CODE (insn2_src_exp) == SUBREG)
>>> +    return false;
>>
>> Shouldn't the tests for i1 and i2 be symmetrical, with i2 also
>> requiring a single dominating definition?
>>
> 
> I will make this change.
>

I have made these changes in a separate patch.
 
>>> +
>>> +  if (!feasible_store_p (i1))
>>> +    return false;;
>>> +
>>> +  if (!feasible_store_p (i2))
>>> +    return false;
>>> +
>>> +  return true;
>>> +}
>>> +
>>> +// Set subreg for def of store INSN given rtx SRC instruction.
>>> +static void
>>> +set_store_subreg (insn_info *i1, rtx src, int regoff)
>>> +{
>>> +  for (auto use: i1->uses ())
>>> +    {
>>> +      auto def = use->def ();
>>> +      if (!def)
>>> +   return;
>>> +
>>> +      insn_info *info = def->insn ();
>>> +
>>> +      if (info->is_artificial ())
>>> +   return;
>>> +
>>> +      if (info && info->is_real ())
>>> +   {
>>> +     rtx_insn *rtl_insn = info->rtl ();
>>> +     rtx set = single_set (rtl_insn);
>>> +     if (set == NULL_RTX)
>>> +       return;
>>> +     df_ref ref;
>>> +     FOR_EACH_INSN_DEF (ref, rtl_insn)
>>> +       {
>>> +         rtx src_exp = SET_SRC (PATTERN (i1->rtl ()));
>>> +         if (REG_P (src_exp) && DF_REF_REGNO (ref) == REGNO (src_exp))
>>> +           {
>>> +             rtx *loc = DF_REF_LOC (ref);
>>> +             if (GET_CODE (*loc) == SUBREG)
>>> +               {
>>> +                 rtx src1 = simplify_gen_subreg (GET_MODE (*loc),
>>> +                                                 SUBREG_REG (src),
>>> +                                                 OOmode,
>>> +                                                 regoff);
>>> +                 *loc =  copy_rtx (src1);
>>> +               }
>>> +              else
>>> +                *loc = copy_rtx (src);
>>> +           }
>>> +       }
>>> +   }
>>> +    }
>>> +}
>>> +
>>> +// Check whether load can be fusable or not.
>>> +// Return true if fuseable otherwise false.
>>> +bool
>>> +rs6000_pair_fusion::fuseable_load_p (insn_info *i1)
>>> +{
>>> +  rtx_insn *insn = i1->rtl ();
>>> +  rtx body = PATTERN (insn);
>>> +  rtx dest_exp = SET_DEST (body);
>>> +
>>> +  if (!(REG_P (dest_exp)
>>> +      && crtl->ssa->single_dominating_def (REGNO (dest_exp))))
>>> +    return false;
>>> +  return true;
>>> +}
>>> +
>>> +// Propagate insn I1 with new rtx NEW_DEST_EXP.
>>> +static void
>>> +propagate_insn (insn_info *i1, rtx new_dest_exp)
>>> +{
>>> +  df_ref ref;
>>> +  FOR_EACH_INSN_DEF (ref, i1->rtl())
>>> +    {
>>> +      rtx dest_exp = SET_DEST (PATTERN (i1->rtl ()));
>>> +      if (REG_P (dest_exp)
>>> +     && DF_REF_REGNO (ref) == REGNO (dest_exp))
>>> +   {
>>> +     rtx *loc = DF_REF_LOC (ref);
>>> +     *loc = new_dest_exp;
>>> +   }
>>> +     }
>>> +}
>>> +
>>> +// Generate new reg rtx with copy of OLD_DEST for OOmode pair.
>>> +static rtx
>>> +new_reg_rtx (rtx old_dest)
>>> +{
>>> +  rtx new_dest_exp = gen_reg_rtx (OOmode);
>>> +  ORIGINAL_REGNO (new_dest_exp) = ORIGINAL_REGNO (old_dest);
>>> +  REG_USERVAR_P (new_dest_exp) = REG_USERVAR_P (old_dest);
>>> +  REG_POINTER (new_dest_exp) = REG_POINTER (old_dest);
>>> +  REG_ATTRS (new_dest_exp) = REG_ATTRS (old_dest);
>>> +  max_regno = max_reg_num ();
>>> +  return new_dest_exp;
>>> +}
>>> +
>>> +// Set subreg with use of INSN given SRC rtx instruction.
>>> +static void
>>> +set_load_subreg (insn_info *i1, rtx src)
>>> +{
>>> +  rtx set = single_set (i1->rtl());
>>> +  rtx old_dest = SET_DEST (set);
>>> +
>>> +  for (auto def : i1->defs ())
>>> +    {
>>> +      auto set = dyn_cast<set_info *> (def);
>>> +      for (auto use : set->nondebug_insn_uses ())
>>> +   {
>>> +     insn_info *info = use->insn ();
>>> +     if (!info || !info->rtl ())
>>> +       continue;
>>> +
>>> +     rtx_insn *rtl_insn = info->rtl ();
>>> +     df_ref ref;
>>> +
>>> +     FOR_EACH_INSN_USE (ref, rtl_insn)
>>> +       {
>>> +         rtx dest_exp = SET_DEST (PATTERN (i1->rtl ()));
>>> +         if (REG_P (dest_exp)
>>> +             && DF_REF_REGNO (ref) == REGNO (dest_exp))
>>> +           {
>>> +             rtx *loc = DF_REF_LOC (ref);
>>> +             insn_propagation prop (rtl_insn, old_dest, src);
>>> +             if (GET_CODE (*loc) == SUBREG)
>>> +               {
>>> +                 if (!prop.apply_to_pattern (loc))
>>> +                   {
>>> +                     if (dump_file != NULL)
>>> +                       {
>>> +                         fprintf (dump_file,
>>> +                                  "Cannot propagate insn \n");
>>> +                         print_rtl_single (dump_file, rtl_insn);
>>> +                       }
>>> +                     return;
>>> +                   }
>>> +               }
>>> +             else
>>> +               *loc = copy_rtx (src);
>>> +           }
>>> +       }
>>> +   }
>>> +    }
>>> +}
>>> +
>>> +// Set subreg for OO mode store pair to generate registers in pairs
>>> +// given insn_info I1 and I2.
>>> +static void
>>> +set_multiword_subreg_store (insn_info *i1, insn_info *i2)
>>> +{
>>> +  rtx_insn *insn1 = i1->rtl ();
>>> +  rtx_insn *insn2 = i2->rtl ();
>>> +  rtx body = PATTERN (insn1);
>>> +  rtx src_exp = SET_SRC (body);
>>> +  rtx insn2_body = PATTERN (insn2);
>>> +  rtx insn2_dest_exp = SET_DEST (insn2_body);
>>> +  machine_mode mode = GET_MODE (src_exp);
>>> +  int regoff;
>>> +  rtx src;
>>> +  rtx addr = XEXP (insn2_dest_exp, 0);
>>> +
>>> +  PUT_MODE_RAW (src_exp, OOmode);
>>> +  if (GET_CODE (addr) == PLUS
>>> +      && XEXP (addr, 1) && CONST_INT_P (XEXP (addr, 1)))
>>> +    regoff = 16;
>>> +  else
>>> +    regoff = 0;
>>> +
>>> +  src = simplify_gen_subreg (mode,
>>> +                        src_exp, GET_MODE (src_exp),
>>> +                        regoff);
>>> +
>>> +  set_store_subreg (i1, src, regoff);
>>> +
>>> +  int regoff1 = 0;
>>> +  rtx src1;
>>> +
>>> +  src1 = simplify_gen_subreg (mode,
>>> +                         src_exp, GET_MODE (src_exp),
>>> +                         regoff1);
>>> +
>>> +  set_store_subreg (i2, src1, regoff1);
>>> +  set_rescan_store (i1);
>>> +  set_rescan_store (i2);
>>> +  df_insn_rescan (insn1);
>>> +}
>>> +
>>> +// Set subreg for OO mode pair load to generate registers in pairs given
>>> +// insn_info I1 and I2.
>>> +static void
>>> +set_multiword_subreg_load (insn_info *i1, insn_info *i2)
>>> +{
>>> +  rtx_insn *insn1 = i1->rtl ();
>>> +  rtx body = PATTERN (insn1);
>>> +  rtx dest_exp = SET_DEST (body);
>>> +  machine_mode mode = GET_MODE (dest_exp);
>>> +  PUT_MODE_RAW (dest_exp, OOmode);
>>> +
>>> +  int regoff = 0;
>>> +  rtx src;
>>> +
>>> +  src = simplify_gen_subreg (mode,
>>> +                        dest_exp, GET_MODE (dest_exp),
>>> +                        regoff);
>>> +
>>> +  set_load_subreg (i2, src);
>>> +
>>> +  int regoff1;
>>> +  rtx src1;
>>> +
>>> +  regoff1 = 16;
>>> +  src1 = simplify_gen_subreg (mode,
>>> +                         dest_exp, GET_MODE (dest_exp),
>>> +                         regoff1);
>>> +  set_load_subreg (i1, src1);
>>> +
>>> +  set_rescan_load (i1);
>>> +  set_rescan_load (i2);
>>> +  df_insn_rescan (insn1);
>>> +}
>>> +
>>> +// Set subreg for OO mode pair load for existing subreg rtx to generate
>>> +// registers in pairs given insn_info I1 and I2.
>>> +static void
>>> +set_multiword_existing_subreg (insn_info *i1, insn_info *i2)
>>> +{
>>> +  rtx_insn *insn1 = i1->rtl ();
>>> +  rtx body = PATTERN (insn1);
>>> +  rtx dest_exp = SET_DEST (body);
>>> +  machine_mode mode = GET_MODE (dest_exp);
>>> +  int regoff1;
>>> +  regoff1 = 16;
>>> +  rtx new_dest_exp = new_reg_rtx (dest_exp);
>>> +
>>> +  rtx src = simplify_gen_subreg (mode,
>>> +                            new_dest_exp,
>>> +                            OOmode,
>>> +                            regoff1);
>>> +
>>> +  set_load_subreg (i1, src);
>>> +  propagate_insn (i1, new_dest_exp);
>>> +
>>> +  int regoff = 0;
>>> +  rtx sset = single_set (i2->rtl ());
>>> +  rtx insn2_dest_exp = SET_DEST (sset);
>>> +  machine_mode insn2_mode = GET_MODE (insn2_dest_exp);
>>> +
>>> +  src = simplify_gen_subreg (insn2_mode,
>>> +                        new_dest_exp,
>>> +                        OOmode,
>>> +                        regoff);
>>> +
>>> +  set_load_subreg (i2, src);
>>> +  propagate_insn (i2, new_dest_exp);
>>> +
>>> +  auto attempt = crtl->ssa->new_change_attempt ();
>>> +  resource_info resource = { GET_MODE (new_dest_exp), REGNO (new_dest_exp) 
>>> };
>>> +  auto *set = crtl->ssa->allocate<set_info> (i1, resource);
>>> +  if (set)
>>> +    {
>>> +      auto def = find_access (i1->defs (), REGNO (new_dest_exp));
>>> +      if (!def)
>>> +   i1->defs() = insert_access (attempt, set, i1->defs());
>>> +    }
>>> +
>>> +  set_rescan_load (i1);
>>> +  set_rescan_load (i2);
>>> +  df_insn_rescan (insn1);
>>> +}
>>> +
>>> +// Return true iff insn I1 has already existing subreg.
>>> +static bool
>>> +use_has_subreg_p (insn_info *i1)
>>> +{
>>> +  for (auto def : i1->defs ())
>>> +    {
>>> +      auto set = dyn_cast<set_info *> (def);
>>> +      for (auto use : set->nondebug_insn_uses ())
>>> +   {
>>> +     insn_info *info = use->insn ();
>>> +     if (info && info->rtl ())
>>> +       {
>>> +         rtx_insn *rtl_insn = info->rtl ();
>>> +         df_ref ref;
>>> +         FOR_EACH_INSN_USE (ref, rtl_insn)
>>> +           {
>>> +             rtx dest_exp = SET_DEST (PATTERN (i1->rtl ()));
>>> +             if (REG_P (dest_exp)
>>> +                 && DF_REF_REGNO (ref) == REGNO (dest_exp))
>>> +               {
>>> +                 rtx *loc = DF_REF_LOC (ref);
>>> +                 if (GET_CODE (*loc) == SUBREG)
>>> +                   return true;
>>> +               }
>>> +           }
>>> +        }
>>> +   }
>>> +    }
>>> +   return false;
>>> +}
>>> +
>>> +// Set subreg for OO mode pair to generate sequential registers given
>>> +// insn_info pairs I1, I2 and LOAD_P is true iff load insn and false
>>> +// if store insn.
>>> +void
>>> +rs6000_pair_fusion::set_multiword_subreg (insn_info *i1, insn_info *i2,
>>> +                                     bool load_p)
>>> +{
>>> +  if (load_p)
>>> +    {
>>> +      bool i1_subreg_p = use_has_subreg_p (i1);
>>> +      bool i2_subreg_p = use_has_subreg_p (i2);
>>> +
>>> +      if (i1_subreg_p || i2_subreg_p)
>>> +   set_multiword_existing_subreg (i1, i2);
>>> +      else
>>> +   set_multiword_subreg_load (i1, i2);
>>
>> I don't understand this.  Why do we have both set_multiword_existing_subreg
>> and set_multiword_subreg_load?  i1_subreg_p and i2_subreg_p are logically
>> independent of one another (since i1 and i2 were separate instructions
>> until now).  So "i1_subreg_p || i2_subreg_p" implies that
>> set_multiword_existing_subreg can handle i1s that have no existing
>> subreg (used when i2_subreg_p) and that it can handle i2s that have no
>> existing subreg (used when i1_subreg_p).  So doesn't this mean that
>> set_multiword_existing_subreg can handle everything?
>>
> 
> I will make the following change.
>  if (load_p)
>     {
>       bool i1_subreg_p = use_has_subreg_p (i1);
>       bool i2_subreg_p = use_has_subreg_p (i2);
> 
>       if (!i1_subreg_p && !i2_subreg_p) 
>         set_multiword_subreg_load (i1, i2);
>       else
>         set_multiword_existing_subreg (i1, i2);
>     }
> 
> Is this okay.
> 

I have made these changes.
> 
>> IMO, the way the update should work is that:
>>
>> (a) all references to the old registers should be updated via
>>     insn_propagation (regardless of whether the old references
>>     involved subregs).
>>
>> (b) those updates should be part of the same insn_change group as
>>     the change to the load itself.
>>
>> For stores, definitions of the stored register can probably be handled
>> directly using df_refs, but there too, the updates should IMO be part
>> of the same insn_change group as the change to the store itself.
>>
>> In both cases, it's the:
>>
>>   crtl->ssa->change_insns (changes);
>>
>> in pair_fusion_bb_info::fuse_pair that should be responsible for
>> updating the rtl-ssa IR.  The changes that the pass wants to make
>> should be described as insn_changes and passed to change_insns.
>>
>> The reason for funneling all changes through change_insns is that
>> it allows rtl-ssa to maintain more complex datastructures.  Clients
>> aren't supposed to manually update the datastructures piecemeal.
>>
> 
> I am afraid I am not getting this. Would you mind elaborating this.
> Sorry for that.
> 

I have made the references to all uses (for loads) and all defs (for stores)
part of the same change. I will send a separate patch with these changes.

>> Thanks,
>> Richard
>>
>

Thanks & Regards
Ajit
 
> Thanks & Regards
> Ajit
>  
>>> +    }
>>> +  else
>>> +    set_multiword_subreg_store (i1, i2);
>>> +}
>>> +
>>> +rtx
>>> +rs6000_pair_fusion::gen_pair (rtx *pats, rtx,  bool load_p)
>>> +{
>>> +  rtx i1 = pats[0];
>>> +  rtx src_exp = SET_SRC (i1);
>>> +  rtx dest_exp = SET_DEST (i1);
>>> +  PUT_MODE_RAW (src_exp, OOmode);
>>> +  PUT_MODE_RAW (dest_exp, OOmode);
>>> +  rtx unspec  = gen_rtx_UNSPEC (GET_MODE (dest_exp),
>>> +                           gen_rtvec (1, src_exp),
>>> +                           UNSPEC_LXVP);
>>> +  rtx set =  gen_rtx_SET (dest_exp, unspec);
>>> +  if (dump_file)
>>> +    {
>>> +      if (load_p)
>>> +   fprintf (dump_file, "lxv with lxvp ");
>>> +      else
>>> +   fprintf (dump_file, "stxv with stxvp ");
>>> +      print_rtl_single (dump_file, set);
>>> +    }
>>> +  return set;
>>> +}
>>> +
>>> +const pass_data pass_data_mem_fusion =
>>> +{
>>> +  RTL_PASS, /* type */
>>> +  "mem_fusion", /* name */
>>> +  OPTGROUP_NONE, /* optinfo_flags */
>>> +  TV_NONE, /* tv_id */
>>> +  0, /* properties_required */
>>> +  0, /* properties_provided */
>>> +  0, /* properties_destroyed */
>>> +  0, /* todo_flags_start */
>>> +  TODO_df_finish, /* todo_flags_finish */
>>> +};
>>> +
>>> +class pass_mem_fusion : public rtl_opt_pass
>>> +{
>>> +public:
>>> +  pass_mem_fusion (gcc::context *ctxt)
>>> +    : rtl_opt_pass (pass_data_mem_fusion, ctxt)
>>> +  {}
>>> +
>>> +  opt_pass *clone () override { return new pass_mem_fusion (m_ctxt);}
>>> +
>>> +  /* opt_pass methods: */
>>> +  bool gate (function *)
>>> +    {
>>> +      return (optimize > 0 && TARGET_VSX && TARGET_POWER10);
>>> +    }
>>> +
>>> +  unsigned int execute (function *) final override
>>> +    {
>>> +      rs6000_pair_fusion pass;
>>> +      pass.run ();
>>> +      return 0;
>>> +    }
>>> +}; // class pass_mem_fusion
>>> +
>>> +rtl_opt_pass *
>>> +make_pass_mem_fusion (gcc::context *ctxt)
>>> +{
>>> +  return new pass_mem_fusion (ctxt);
>>> +}
>>> diff --git a/gcc/config/rs6000/rs6000-passes.def 
>>> b/gcc/config/rs6000/rs6000-passes.def
>>> index 46a0d0b8c56..0b48f57014d 100644
>>> --- a/gcc/config/rs6000/rs6000-passes.def
>>> +++ b/gcc/config/rs6000/rs6000-passes.def
>>> @@ -28,7 +28,9 @@ along with GCC; see the file COPYING3.  If not see
>>>       The power8 does not have instructions that automaticaly do the byte 
>>> swaps
>>>       for loads and stores.  */
>>>    INSERT_PASS_BEFORE (pass_cse, 1, pass_analyze_swaps);
>>> -
>>> +  /* Pass to replace adjacent memory addresses lxv/stxv instruction with
>>> +     lxvp/stxvp instruction.  */
>>> +  INSERT_PASS_BEFORE (pass_early_remat, 1, pass_mem_fusion);
>>>    /* Pass to do the PCREL_OPT optimization that combines the load of an
>>>       external symbol's address along with a single load or store using that
>>>       address as a base register.  */
>>> diff --git a/gcc/config/rs6000/rs6000-protos.h 
>>> b/gcc/config/rs6000/rs6000-protos.h
>>> index 09a57a806fa..1412b31c2eb 100644
>>> --- a/gcc/config/rs6000/rs6000-protos.h
>>> +++ b/gcc/config/rs6000/rs6000-protos.h
>>> @@ -343,6 +343,7 @@ namespace gcc { class context; }
>>>  class rtl_opt_pass;
>>>  
>>>  extern rtl_opt_pass *make_pass_analyze_swaps (gcc::context *);
>>> +extern rtl_opt_pass *make_pass_mem_fusion (gcc::context *);
>>>  extern rtl_opt_pass *make_pass_pcrel_opt (gcc::context *);
>>>  extern bool rs6000_sum_of_two_registers_p (const_rtx expr);
>>>  extern bool rs6000_quadword_masked_address_p (const_rtx exp);
>>> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
>>> index 58553ff66f4..6da4e70973d 100644
>>> --- a/gcc/config/rs6000/rs6000.cc
>>> +++ b/gcc/config/rs6000/rs6000.cc
>>> @@ -27428,7 +27428,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>>      reg_mode = word_mode;
>>>    reg_mode_size = GET_MODE_SIZE (reg_mode);
>>>  
>>> -  gcc_assert (reg_mode_size * nregs == GET_MODE_SIZE (mode));
>>> +  gcc_assert (mode == OOmode
>>> +         || reg_mode_size * nregs == GET_MODE_SIZE (mode));
>>>  
>>>    /* TDmode residing in FP registers is special, since the ISA requires 
>>> that
>>>       the lower-numbered word of a register pair is always the most 
>>> significant
>>> @@ -27475,6 +27476,11 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>>        int reg_mode_nregs = hard_regno_nregs (reg, reg_mode);
>>>        if (MEM_P (dst))
>>>     {
>>> +     rtx addr = XEXP (dst, 0);
>>> +     rtx opnd1 = NULL_RTX;
>>> +     if (addr && GET_CODE (addr) == PLUS)
>>> +       opnd1 = XEXP (addr,1);
>>> +
>>>       unsigned offset = 0;
>>>       unsigned size = GET_MODE_SIZE (reg_mode);
>>>  
>>> @@ -27488,7 +27494,13 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>>         {
>>>           unsigned subreg
>>>             = WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i);
>>> -         rtx dst2 = adjust_address (dst, reg_mode, offset);
>>> +         rtx dst2 = dst;
>>> +
>>> +         if ((GET_CODE (addr) != PLUS
>>> +             || (opnd1 && CONST_INT_P(opnd1))))
>>> +           dst2 = adjust_address (dst, reg_mode, offset);
>>> +         else
>>> +           PUT_MODE_RAW (dst, reg_mode);
>>>           rtx src2 = gen_rtx_REG (reg_mode, reg + subreg);
>>>           offset += size;
>>>           emit_insn (gen_rtx_SET (dst2, src2));
>>> @@ -27499,15 +27511,25 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>>  
>>>        if (MEM_P (src))
>>>     {
>>> +     rtx  addr = XEXP (src, 0);
>>> +     rtx opnd1 = NULL_RTX;
>>> +     if (addr && GET_CODE (addr) == PLUS)
>>> +       opnd1 = XEXP (addr,1);
>>> +
>>>       unsigned offset = 0;
>>>       unsigned size = GET_MODE_SIZE (reg_mode);
>>>  
>>> -     for (int i = 0; i < nregs; i += reg_mode_nregs)
>>> +     for (int i = nregs-1; i >= 0; i -= reg_mode_nregs)
>>>         {
>>>           unsigned subreg
>>>             = WORDS_BIG_ENDIAN ? i : (nregs - reg_mode_nregs - i);
>>>           rtx dst2 = gen_rtx_REG (reg_mode, reg + subreg);
>>> -         rtx src2 = adjust_address (src, reg_mode, offset);
>>> +         rtx src2 = src;
>>> +
>>> +         if ((GET_CODE (addr) != PLUS || (opnd1 && CONST_INT_P (opnd1))))
>>> +           src2 = adjust_address (src, reg_mode, offset);
>>> +         else
>>> +           PUT_MODE_RAW (src2, reg_mode);
>>>           offset += size;
>>>           emit_insn (gen_rtx_SET (dst2, src2));
>>>         }
>>> @@ -27515,7 +27537,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>>       /* If we are writing an accumulator register, we have to
>>>          prime it after we've written it.  */
>>>       if (TARGET_MMA
>>> -         && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst)))
>>> +         && REG_P (dst) && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO 
>>> (dst)))
>>>         emit_insn (gen_mma_xxmtacc (dst, dst));
>>>  
>>>       return;
>>> @@ -27608,9 +27630,12 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>>     {
>>>       for (i = nregs - 1; i >= 0; i--)
>>>         {
>>> -         rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i);
>>> -         rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i);
>>> -         emit_insn (gen_rtx_SET (dst_i, src_i));
>>> +         if (REG_P (dst) && REG_P (src))
>>> +           {
>>> +             rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + i);
>>> +             rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + i);
>>> +             emit_insn (gen_rtx_SET (dst_i, src_i));
>>> +           }
>>>         }
>>>     }
>>>        else
>>> @@ -27625,7 +27650,8 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>>        /* If we are writing an accumulator register, we have to
>>>      prime it after we've written it.  */
>>>        if (TARGET_MMA
>>> -     && GET_MODE (dst) == XOmode && FP_REGNO_P (REGNO (dst)))
>>> +     && REG_P (dst) && GET_MODE (dst) == XOmode
>>> +     && FP_REGNO_P (REGNO (dst)))
>>>     emit_insn (gen_mma_xxmtacc (dst, dst));
>>>      }
>>>    else
>>> @@ -27682,7 +27708,7 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>>  
>>>       /* If the base register we are using to address memory is
>>>          also a destination reg, then change that register last.  */
>>> -     if (REG_P (breg)
>>> +     if (REG_P (dst) && REG_P (breg)
>>>           && REGNO (breg) >= REGNO (dst)
>>>           && REGNO (breg) < REGNO (dst) + nregs)
>>>         j = REGNO (breg) - REGNO (dst);
>>> @@ -27780,9 +27806,12 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>>       /* XO/OO are opaque so cannot use subregs. */
>>>       if (mode == OOmode || mode == XOmode )
>>>         {
>>> -         rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j);
>>> -         rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j);
>>> -         emit_insn (gen_rtx_SET (dst_i, src_i));
>>> +         if (REG_P (dst) &&  REG_P (src))
>>> +           {
>>> +             rtx dst_i = gen_rtx_REG (reg_mode, REGNO (dst) + j);
>>> +             rtx src_i = gen_rtx_REG (reg_mode, REGNO (src) + j);
>>> +             emit_insn (gen_rtx_SET (dst_i, src_i));
>>> +           }
>>>         }
>>>       else
>>>         emit_insn (gen_rtx_SET (simplify_gen_subreg (reg_mode, dst, mode,
>>> @@ -27800,7 +27829,9 @@ rs6000_split_multireg_move (rtx dst, rtx src)
>>>        if (restore_basereg != NULL_RTX)
>>>     emit_insn (restore_basereg);
>>>      }
>>> +  return;
>>>  }
>>> +
>>>  
>>>  /* Return true if the peephole2 can combine a load involving a combination 
>>> of
>>>     an addis instruction and a load with an offset that can be fused 
>>> together on
>>> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
>>> index a5d20594789..2106e1a1fed 100644
>>> --- a/gcc/config/rs6000/rs6000.md
>>> +++ b/gcc/config/rs6000/rs6000.md
>>> @@ -159,6 +159,7 @@
>>>     UNSPEC_XXSPLTIW_CONST
>>>     UNSPEC_FMAX
>>>     UNSPEC_FMIN
>>> +   UNSPEC_LXVP
>>>    ])
>>>  
>>>  ;;
>>> diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000
>>> index b3ce09d523b..df9b3a35b66 100644
>>> --- a/gcc/config/rs6000/t-rs6000
>>> +++ b/gcc/config/rs6000/t-rs6000
>>> @@ -35,6 +35,11 @@ rs6000-p8swap.o: $(srcdir)/config/rs6000/rs6000-p8swap.cc
>>>     $(COMPILE) $<
>>>     $(POSTCOMPILE)
>>>  
>>> +rs6000-mem-fusion.o: $(srcdir)/config/rs6000/rs6000-mem-fusion.cc
>>> +   $(COMPILE) $<
>>> +   $(POSTCOMPILE)
>>> +
>>> +
>>>  rs6000-d.o: $(srcdir)/config/rs6000/rs6000-d.cc
>>>     $(COMPILE) $<
>>>     $(POSTCOMPILE)
>>> diff --git a/gcc/pair-fusion.cc b/gcc/pair-fusion.cc
>>> index 31d2c21c88f..ff77a0bc8c6 100644
>>> --- a/gcc/pair-fusion.cc
>>> +++ b/gcc/pair-fusion.cc
>>> @@ -312,9 +312,9 @@ static int
>>>  encode_lfs (lfs_fields fields)
>>>  {
>>>    int size_log2 = exact_log2 (fields.size);
>>> -  gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
>>> -  return ((int)fields.load_p << 3)
>>> -    | ((int)fields.fpsimd_p << 2)
>>> +  gcc_checking_assert (size_log2 >= 2 && size_log2 <= 9);
>>> +  return ((int)fields.load_p << 4)
>>> +    | ((int)fields.fpsimd_p << 3)
>>>      | (size_log2 - 2);
>>>  }
>>>  
>>> @@ -322,8 +322,8 @@ encode_lfs (lfs_fields fields)
>>>  static lfs_fields
>>>  decode_lfs (int lfs)
>>>  {
>>> -  bool load_p = (lfs & (1 << 3));
>>> -  bool fpsimd_p = (lfs & (1 << 2));
>>> +  bool load_p = (lfs & (1 << 4));
>>> +  bool fpsimd_p = (lfs & (1 << 3));
>>>    unsigned size = 1U << ((lfs & 3) + 2);
>>>    return { load_p, fpsimd_p, size };
>>>  }
>>> @@ -425,6 +425,9 @@ pair_fusion_bb_info::track_access (insn_info *insn, 
>>> bool load_p, rtx mem)
>>>    if (MEM_VOLATILE_P (mem))
>>>      return;
>>>  
>>> +  if (load_p && !m_pass->fuseable_load_p (insn))
>>> +    return;
>>> +
>>>    // Ignore writeback accesses if the hook says to do so.
>>>    if (!m_pass->should_handle_writeback (writeback_type::EXISTING)
>>>        && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC)
>>> @@ -1814,7 +1817,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
>>>      }
>>>  
>>>    rtx reg_notes = combine_reg_notes (first, second, load_p);
>>> -
>>> +  m_pass->set_multiword_subreg (i1, i2, load_p);
>>>    rtx pair_pat = m_pass->gen_pair (pats, writeback_effect, load_p);
>>>    insn_change *pair_change = nullptr;
>>>    auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
>>> @@ -1833,6 +1836,7 @@ pair_fusion_bb_info::fuse_pair (bool load_p,
>>>        pair_change->new_defs = merge_access_arrays (attempt,
>>>                                                input_defs[0],
>>>                                                input_defs[1]);
>>> +      m_pass->modify_new_rtx_insn (first, &attempt, &pair_change, changes);
>>>        gcc_assert (pair_change->new_defs.is_valid ());
>>>  
>>>        pair_change->new_uses
>>> @@ -2405,6 +2409,15 @@ pair_fusion_bb_info::try_fuse_pair (bool load_p, 
>>> unsigned access_size,
>>>        reg_ops[i] = XEXP (pats[i], !load_p);
>>>      }
>>>  
>>> +  if (!load_p && !m_pass->fuseable_store_p (i1, i2))
>>> +    {
>>> +      if (dump_file)
>>> +   fprintf (dump_file,
>>> +            "punting on store-mem-pairs due to non fuseable cand 
>>> (%d,%d)\n",
>>> +            insns[0]->uid (), insns[1]->uid ());
>>> +      return false;
>>> +    }
>>> +
>>>    if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
>>>      {
>>>        if (dump_file)
>>> @@ -2997,6 +3010,8 @@ void pair_fusion::process_block (bb_info *bb)
>>>        if (GET_CODE (pat) != SET)
>>>     continue;
>>>  
>>> +      change_existing_multword_mode (rti);
>>> +
>>>        if (track_stores && MEM_P (XEXP (pat, 0)))
>>>     bb_state.track_access (insn, false, XEXP (pat, 0));
>>>        else if (track_loads && MEM_P (XEXP (pat, 1)))
>>> diff --git a/gcc/pair-fusion.h b/gcc/pair-fusion.h
>>> index 45e4edceecb..756357db794 100644
>>> --- a/gcc/pair-fusion.h
>>> +++ b/gcc/pair-fusion.h
>>> @@ -26,8 +26,11 @@ namespace rtl_ssa {
>>>    class insn_info;
>>>    class insn_range_info;
>>>    class bb_info;
>>> +  class insn_change;
>>>  }
>>>  
>>> +class obstack_watermark;
>>> +
>>>  // Information about a potential base candidate, used in try_fuse_pair.
>>>  // There may be zero, one, or two viable RTL bases for a given pair.
>>>  struct base_cand
>>> @@ -142,6 +145,19 @@ struct pair_fusion {
>>>    // true iff INSN is a load pair.
>>>    virtual bool pair_mem_insn_p (rtx_insn *insn, bool &load_p) = 0;
>>>  
>>> +  // Given INSN, change a multiword mode load or store to the
>>> +  // respective unspec instruction.
>>> +  virtual void change_existing_multword_mode (rtx_insn *insn) = 0;
>>> +
>>> +  // Given insn_info FIRST, watermark ATTEMPT and PAIR_CHANGE, set the
>>> +  // new rtx for the insn.  Remove all uses of the definitions that are
>>> +  // removed, given CHANGES.
>>> +  virtual void modify_new_rtx_insn (rtl_ssa::insn_info *first,
>>> +                               obstack_watermark *attempt,
>>> +                               rtl_ssa::insn_change **pair_change,
>>> +                               auto_vec<rtl_ssa::insn_change *> &changes)
>>> +                               = 0;
>>> +
>>>    // Return true if we should track loads.
>>>    virtual bool track_loads_p ()
>>>    {
>>> @@ -171,6 +187,24 @@ struct pair_fusion {
>>>    virtual rtx gen_promote_writeback_pair (rtx wb_effect, rtx mem,
>>>                                       rtx regs[2], bool load_p) = 0;
>>>  
>>> +  // Given insn_info pair I1 and I2, sets subreg with multiword registers
>>> +  // to assign register pairs by allocators.
>>> +  // LOAD_P is true iff the pair is a load.
>>> +  virtual void set_multiword_subreg (rtl_ssa::insn_info *i1,
>>> +                                rtl_ssa::insn_info *i2,
>>> +                                bool load_p) = 0;
>>> +
>>> +  // Given insn_info pair I1 and I2, check whether the pair is a feasible
>>> +  // candidate for forming a store mem pair.
>>> +  // Return true if it is feasible, otherwise false.
>>> +  virtual bool fuseable_store_p (rtl_ssa::insn_info *i1,
>>> +                            rtl_ssa::insn_info *i2) = 0;
>>> +
>>> +  // Given insn_info INFO, check whether the load is a feasible
>>> +  // candidate for forming a load mem pair.
>>> +  // Return true if it is feasible, otherwise false.
>>> +  virtual bool fuseable_load_p (rtl_ssa::insn_info *info) = 0;
>>> +
>>>    void process_block (rtl_ssa::bb_info *bb);
>>>    rtl_ssa::insn_info *find_trailing_add (rtl_ssa::insn_info *insns[2],
>>>                                      const rtl_ssa::insn_range_info
>>> diff --git a/gcc/rtl-ssa/functions.h b/gcc/rtl-ssa/functions.h
>>> index e2134621723..d5c5b80f8aa 100644
>>> --- a/gcc/rtl-ssa/functions.h
>>> +++ b/gcc/rtl-ssa/functions.h
>>> @@ -222,6 +222,13 @@ public:
>>>    template<typename T, typename... Ts>
>>>    T *change_alloc (obstack_watermark &wm, Ts... args);
>>>  
>>> +  auto_vec<access_info *> &get_m_temp_defs () { return m_temp_defs; }
>>> +
>>> +  template<typename T, typename... Ts>
>>> +  T *allocate (Ts... args);
>>> +
>>> +  void remove_use (use_info *);
>>> +
>>>  private:
>>>    class bb_phi_info;
>>>    class build_info;
>>> @@ -231,9 +238,6 @@ private:
>>>    // allocate_temp during its lifetime.
>>>    obstack_watermark temp_watermark () { return &m_temp_obstack; }
>>>  
>>> -  template<typename T, typename... Ts>
>>> -  T *allocate (Ts... args);
>>> -
>>>    template<typename T, typename... Ts>
>>>    T *allocate_temp (Ts... args);
>>>  
>>> @@ -269,7 +273,6 @@ private:
>>>    static void insert_use_after (use_info *, use_info *);
>>>  
>>>    void add_use (use_info *);
>>> -  void remove_use (use_info *);
>>>  
>>>    insn_info::order_node *need_order_node (insn_info *);
>>>  
>>> diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C 
>>> b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
>>> new file mode 100644
>>> index 00000000000..d10ff0cdf36
>>> --- /dev/null
>>> +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
>>> @@ -0,0 +1,22 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-require-effective-target power10_ok } */
>>> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
>>> +
>>> +#include <altivec.h>
>>> +   
>>> +void
>>> +foo2 ()
>>> +{
>>> +  __vector_quad *dst1;
>>> +  __vector_quad *dst2;
>>> +  vector unsigned char src;
>>> +  __vector_quad acc;
>>> +  vector unsigned char *ptr;
>>> +  __builtin_mma_xvf32ger(&acc, src, ptr[0]);
>>> +  __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
>>> +  *dst1 = acc;
>>> +  __builtin_mma_xvf32ger(&acc, src, ptr[2]);
>>> +  __builtin_mma_xvf32gerpp(&acc, src, ptr[3]);
>>> +  *dst2 = acc;
>>> +}
>>> +/* { dg-final { scan-assembler {\mlxvp\M} } } */
>>> diff --git a/gcc/testsuite/g++.target/powerpc/mem-fusion.C 
>>> b/gcc/testsuite/g++.target/powerpc/mem-fusion.C
>>> new file mode 100644
>>> index 00000000000..c523572cf3c
>>> --- /dev/null
>>> +++ b/gcc/testsuite/g++.target/powerpc/mem-fusion.C
>>> @@ -0,0 +1,15 @@
>>> +/* { dg-do compile } */ 
>>> +/* { dg-require-effective-target power10_ok } */
>>> +/* { dg-options "-mdejagnu-cpu=power10 -O2" } */ 
>>> +
>>> +#include <altivec.h>
>>> +
>>> +void
>>> +foo (__vector_quad *dst, vector unsigned char *ptr, vector unsigned char 
>>> src)
>>> +{
>>> +  __vector_quad acc;
>>> +  __builtin_mma_xvf32ger(&acc, src, ptr[0]);
>>> +  __builtin_mma_xvf32gerpp(&acc, src, ptr[1]);
>>> +  *dst = acc;
>>> +}
>>> +/* { dg-final { scan-assembler {\mlxvp\M} } } */
>>> diff --git a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c 
>>> b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
>>> index 69ee826e1be..ae29127f954 100644
>>> --- a/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
>>> +++ b/gcc/testsuite/gcc.target/powerpc/mma-builtin-1.c
>>> @@ -258,8 +258,8 @@ foo13b (__vector_quad *dst, __vector_quad *src, vec_t 
>>> *vec)
>>>    dst[13] = acc;
>>>  }
>>>  
>>> -/* { dg-final { scan-assembler-times {\mlxv\M} 40 } } */
>>> -/* { dg-final { scan-assembler-times {\mlxvp\M} 12 } } */
>>> +/* { dg-final { scan-assembler-times {\mlxv\M} 0 } } */
>>> +/* { dg-final { scan-assembler-times {\mlxvp\M} 32 } } */
>>>  /* { dg-final { scan-assembler-times {\mstxvp\M} 40 } } */
>>>  /* { dg-final { scan-assembler-times {\mxxmfacc\M} 20 } } */
>>>  /* { dg-final { scan-assembler-times {\mxxmtacc\M} 6 } } */

Reply via email to