Hi,

This v2 fixes a significant compile-time hog in the original patch
for the pass posted here:
https://gcc.gnu.org/pipermail/gcc-patches/2023-October/633355.html

After seeing a large compile-time regression when turning on the early
ldp pass while compiling 502.gcc_r from SPEC CPU 2017, I found that the
benchmark's insn-attrtab.c dominated the compile time, and moreover that
compile time for that file increased by 3.79x when enabling the early
ldp fusion pass at -O.

Running cc1 under a profiler revealed that ~44% of the entire profile
was in rtx_equal_p (inlined via cleanup_tombstones into ldp_fusion_bb).
I missed that we can skip running cleanup_tombstones entirely in the
(common) case that we never emitted a tombstone insn for a given bb.

This patch implements that optimization, reducing the overhead of the
early ldp pass on that file to around 1%, which seems like a more
acceptable overhead for an additional pass.

Incrementally, the change is as follows:

diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc b/gcc/config/aarch64/aarch64-ldp-fusion.cc
index e5de4bbb3f5..f1621c9a384 100644
--- a/gcc/config/aarch64/aarch64-ldp-fusion.cc
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -148,7 +148,7 @@ struct ldp_bb_info
   static const size_t obstack_alignment = sizeof (void *);
   bb_info *m_bb;
 
-  ldp_bb_info (bb_info *bb) : m_bb (bb)
+  ldp_bb_info (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
   {
     obstack_specify_allocation (&m_obstack, OBSTACK_CHUNK_SIZE,
                                obstack_alignment, obstack_chunk_alloc,
@@ -164,7 +164,10 @@ struct ldp_bb_info
   inline void cleanup_tombstones ();
 
 private:
+  // Did we emit a tombstone insn for this bb?
+  bool m_emitted_tombstone;
   obstack m_obstack;
+
   inline splay_tree_node<access_record *> *node_alloc (access_record *);
 
   template<typename Map>
@@ -1006,7 +1009,8 @@ fuse_pair (bool load_p,
           insn_info *i1,
           insn_info *i2,
           base_cand &base,
-          const insn_range_info &move_range)
+          const insn_range_info &move_range,
+          bool &emitted_tombstone_p)
 {
   auto attempt = crtl->ssa->new_change_attempt ();
 
@@ -1021,6 +1025,9 @@ fuse_pair (bool load_p,
                                                    insn_change::DELETE);
     };
 
+  // Are we using a tombstone insn for this pair?
+  bool have_tombstone_p = false;
+
   insn_info *first = (*i1 < *i2) ? i1 : i2;
   insn_info *second = (first == i1) ? i2 : i1;
 
@@ -1217,6 +1224,7 @@ fuse_pair (bool load_p,
                  gcc_assert (validate_change (rti, &REG_NOTES (rti),
                                               NULL_RTX, true));
                  change->new_uses = use_array (nullptr, 0);
+                 have_tombstone_p = true;
                }
              gcc_assert (change->new_defs.is_valid ());
              gcc_assert (change->new_uses.is_valid ());
@@ -1283,6 +1291,7 @@ fuse_pair (bool load_p,
 
   confirm_change_group ();
   crtl->ssa->change_insns (changes);
+  emitted_tombstone_p |= have_tombstone_p;
   return true;
 }
 
@@ -1702,7 +1711,8 @@ try_fuse_pair (bool load_p,
               unsigned access_size,
               insn_info *i1,
               insn_info *i2,
-              base_info binfo)
+              base_info binfo,
+              bool &emitted_tombstone_p)
 {
   if (dump_file)
     fprintf (dump_file, "analyzing pair (load=%d): (%d,%d)\n",
@@ -1991,7 +2001,7 @@ try_fuse_pair (bool load_p,
               range.first->uid (), range.last->uid ());
     }
 
-  return fuse_pair (load_p, i1, i2, *base, range);
+  return fuse_pair (load_p, i1, i2, *base, range, emitted_tombstone_p);
 }
 
 // Erase [l.begin (), i] inclusive, respecting iterator order.
@@ -2047,7 +2057,8 @@ merge_pairs (insn_iter_t l_begin,
             hash_set <insn_info *> &to_delete,
             bool load_p,
             unsigned access_size,
-            base_info binfo)
+            base_info binfo,
+            bool &emitted_tombstone_p)
 {
   auto iter_l = l_begin;
   auto iter_r = r_begin;
@@ -2076,7 +2087,8 @@ merge_pairs (insn_iter_t l_begin,
       bool update_r = false;
 
       result = try_fuse_pair (load_p, access_size,
-                             *iter_l, *iter_r, binfo);
+                             *iter_l, *iter_r, binfo,
+                             emitted_tombstone_p);
       if (result)
        {
          update_l = update_r = true;
@@ -2153,7 +2165,8 @@ ldp_bb_info::try_form_pairs (insn_list_t *left_orig,
   merge_pairs (left_orig->begin (), left_orig->end (),
               right_copy.begin (), right_copy.end (),
               *left_orig, right_copy,
-              to_delete, load_p, access_size, binfo);
+              to_delete, load_p, access_size, binfo,
+              m_emitted_tombstone);
 
   // If we formed all right candidates into pairs,
   // then we can skip the next iteration.
@@ -2206,6 +2219,10 @@ ldp_bb_info::transform_for_base (base_info binfo,
 void
 ldp_bb_info::cleanup_tombstones ()
 {
+  // No need to do anything if we didn't emit a tombstone insn for this bb.
+  if (!m_emitted_tombstone)
+    return;
+
   insn_info *insn = m_bb->head_insn ();
   while (insn)
     {

A v2 patch for the pass is attached. Bootstrapped/regtested as a series
on aarch64-linux-gnu, OK for trunk?

Thanks,
Alex
diff --git a/gcc/config.gcc b/gcc/config.gcc
index fa192637a52..9c1a604bd5e 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -349,8 +349,8 @@ aarch64*-*-*)
        c_target_objs="aarch64-c.o"
        cxx_target_objs="aarch64-c.o"
        d_target_objs="aarch64-d.o"
-       extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o falkor-tag-collision-avoidance.o aarch-bti-insert.o aarch64-cc-fusion.o"
-       target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.cc \$(srcdir)/config/aarch64/aarch64-sve-builtins.h \$(srcdir)/config/aarch64/aarch64-sve-builtins.cc"
+       extra_objs="aarch64-builtins.o aarch-common.o aarch64-sve-builtins.o aarch64-sve-builtins-shapes.o aarch64-sve-builtins-base.o aarch64-sve-builtins-sve2.o cortex-a57-fma-steering.o aarch64-speculation.o falkor-tag-collision-avoidance.o aarch-bti-insert.o aarch64-cc-fusion.o aarch64-ldp-fusion.o"
+       target_gtfiles="\$(srcdir)/config/aarch64/aarch64-builtins.cc \$(srcdir)/config/aarch64/aarch64-sve-builtins.h \$(srcdir)/config/aarch64/aarch64-sve-builtins.cc \$(srcdir)/config/aarch64/aarch64-ldp-fusion.cc"
        target_has_targetm_common=yes
        ;;
 alpha*-*-*)
diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc b/gcc/config/aarch64/aarch64-ldp-fusion.cc
new file mode 100644
index 00000000000..f1621c9a384
--- /dev/null
+++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc
@@ -0,0 +1,2395 @@
+// LoadPair fusion optimization pass for AArch64.
+// Copyright (C) 2023 Free Software Foundation, Inc.
+//
+// This file is part of GCC.
+//
+// GCC is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 3, or (at your option)
+// any later version.
+//
+// GCC is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+#define INCLUDE_ALGORITHM
+#define INCLUDE_FUNCTIONAL
+#define INCLUDE_LIST
+#define INCLUDE_TYPE_TRAITS
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "df.h"
+#include "rtl-ssa.h"
+#include "cfgcleanup.h"
+#include "tree-pass.h"
+#include "ordered-hash-map.h"
+#include "tree-dfa.h"
+#include "fold-const.h"
+#include "tree-hash-traits.h"
+#include "print-tree.h"
+
+using namespace rtl_ssa;
+
+enum
+{
+  LDP_IMM_BITS = 7,
+  LDP_IMM_MASK = (1 << LDP_IMM_BITS) - 1,
+  LDP_IMM_SIGN_BIT = (1 << (LDP_IMM_BITS - 1)),
+  LDP_MAX_IMM = LDP_IMM_SIGN_BIT - 1,
+  LDP_MIN_IMM = -LDP_MAX_IMM - 1,
+};
+
+// We pack these fields (load_p, fpsimd_p, and size) into an integer
+// (LFS) which we use as part of the key into the main hash tables.
+//
+// The idea is that we group candidates together only if they agree on
+// the fields below.  Candidates that disagree on any of these
+// properties shouldn't be merged together.
+struct lfs_fields
+{
+  bool load_p;
+  bool fpsimd_p;
+  unsigned size;
+};
+
+using insn_list_t = std::list <insn_info *>;
+using insn_iter_t = insn_list_t::iterator;
+
+// A tagged union representing either an RTL-SSA def_info base or a
+// tree decl base.
+class base_info
+{
+  pointer_mux <def_info, union tree_node> mux;
+
+public:
+  base_info (tree decl) : mux (decl) {}
+  base_info (def_info *def) : mux (def) {}
+
+  bool is_reg () const { return mux.is_first (); }
+  def_info *get_reg () const
+  {
+    gcc_checking_assert (mux.is_first ());
+    return mux.known_first ();
+  }
+};
+
+// Information about the accesses at a given offset from a particular
+// base.  Stored in an access_group, see below.
+struct access_record
+{
+  poly_int64 offset;
+  std::list<insn_info *> cand_insns;
+  std::list<access_record>::iterator place;
+
+  access_record (poly_int64 off) : offset (off) {}
+};
+
+// A group of accesses where adjacent accesses could be ldp/stp
+// candidates.  The splay tree supports efficient insertion,
+// while the list supports efficient iteration.
+struct access_group
+{
+  splay_tree <access_record *> tree;
+  std::list<access_record> list;
+
+  template<typename Alloc>
+  inline void track (Alloc node_alloc, poly_int64 offset, insn_info *insn);
+};
+
+// Information about a potential base candidate, used in try_fuse_pair.
+// There may be zero, one, or two viable RTL bases for a given pair.
+struct base_cand
+{
+  def_info *m_def;
+
+  // FROM_INSN is -1 if the base candidate is already shared by both
+  // candidate insns.  Otherwise it holds the index of the insn from
+  // which the base originated.
+  int from_insn;
+
+  // Initially: dataflow hazards that arise if we choose this base as
+  // the common base register for the pair.
+  //
+  // Later these get narrowed, taking alias hazards into account.
+  insn_info *hazards[2];
+
+  base_cand (def_info *def, int insn)
+    : m_def (def), from_insn (insn), hazards {nullptr, nullptr} {}
+
+  base_cand (def_info *def) : base_cand (def, -1) {}
+
+  bool viable () const
+  {
+    return !hazards[0] || !hazards[1] || (*hazards[0] > *hazards[1]);
+  }
+};
+
+// State used by the pass for a given basic block.
+struct ldp_bb_info
+{
+  using def_hash = nofree_ptr_hash <def_info>;
+  using expr_key_t = pair_hash <tree_operand_hash, int_hash <int, -1, -2>>;
+  using def_key_t = pair_hash <def_hash, int_hash <int, -1, -2>>;
+
+  // Map of <tree base, LFS> -> access_group.
+  ordered_hash_map <expr_key_t, access_group> expr_map;
+
+  // Map of <RTL-SSA def_info *, LFS> -> access_group.
+  ordered_hash_map <def_key_t, access_group> def_map;
+
+  static const size_t obstack_alignment = sizeof (void *);
+  bb_info *m_bb;
+
+  ldp_bb_info (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false)
+  {
+    obstack_specify_allocation (&m_obstack, OBSTACK_CHUNK_SIZE,
+                               obstack_alignment, obstack_chunk_alloc,
+                               obstack_chunk_free);
+  }
+  ~ldp_bb_info ()
+  {
+    obstack_free (&m_obstack, nullptr);
+  }
+
+  inline void track_access (insn_info *, bool load, rtx mem);
+  inline void transform ();
+  inline void cleanup_tombstones ();
+
+private:
+  // Did we emit a tombstone insn for this bb?
+  bool m_emitted_tombstone;
+  obstack m_obstack;
+
+  inline splay_tree_node<access_record *> *node_alloc (access_record *);
+
+  template<typename Map>
+  inline void traverse_base_map (Map &map);
+  inline void transform_for_base (base_info binfo, int load_size,
+                                 access_group &group);
+
+  inline bool try_form_pairs (insn_list_t *, insn_list_t *,
+                             bool load_p, unsigned access_size,
+                             base_info binfo);
+
+  inline bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs);
+};
+
+splay_tree_node<access_record *> *
+ldp_bb_info::node_alloc (access_record *access)
+{
+  using T = splay_tree_node<access_record *>;
+  void *addr = obstack_alloc (&m_obstack, sizeof (T));
+  return new (addr) T (access);
+}
+
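+// Return true if the address of MEM is a legitimate memory address for
+// its mode and address space.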
+static bool
+mem_valid_p (rtx mem)
+{
+  addr_space_t as = MEM_ADDR_SPACE (mem);
+  machine_mode mode = GET_MODE (mem);
+  rtx addr = XEXP (mem, 0);
+  return memory_address_addr_space_p (mode, addr, as);
+}
+
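+// Try to adjust MEM so that it has mode MODE and is offset by OFFSET
+// bytes, returning the adjusted mem if its address is still valid, or
+// NULL_RTX otherwise.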
+static rtx
+try_adjust_address (rtx mem, machine_mode mode, poly_int64 offset)
+{
+  gcc_checking_assert (mem_valid_p (mem));
+  rtx adjusted = adjust_address_nv (mem, mode, offset);
+  return mem_valid_p (adjusted) ? adjusted : NULL_RTX;
+}
+
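+// Return true if MODE is a mode for which we are prepared to form an
+// ldp/stp, taking the tuning policy on Q-register pairs into account.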
+static bool
+ldp_operand_mode_p (machine_mode mode)
+{
+  const bool allow_qregs
+    = !(aarch64_tune_params.extra_tuning_flags
+       & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
+
+  if (VECTOR_MODE_P (mode))
+    {
+      const auto poly_size = GET_MODE_SIZE (mode);
+      if (!poly_size.is_constant ())
+       return false;
+
+      HOST_WIDE_INT size = poly_size.to_constant ();
+      return size == 4
+       || size == 8
+       || (size == 16 && allow_qregs);
+    }
+
+  switch (mode)
+    {
+    case E_SImode:
+    case E_DImode:
+    case E_SFmode:
+    case E_SDmode:
+    case E_DFmode:
+    case E_DDmode:
+      return true;
+    case E_TFmode:
+    case E_TDmode:
+      return allow_qregs;
+    case E_TImode:
+      return allow_qregs && reload_completed;
+    default:
+      return false;
+    }
+}
+
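+// Encode FIELDS into an integer LFS value, for use as part of a hash
+// table key (see the comment above lfs_fields).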
+static int
+encode_lfs (lfs_fields fields)
+{
+  int size_log2 = exact_log2 (fields.size);
+  gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4);
+  return ((int)fields.load_p << 3)
+    | ((int)fields.fpsimd_p << 2)
+    | (size_log2 - 2);
+}
+
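+// Inverse of encode_lfs: decode an LFS integer back into its fields.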
+static lfs_fields
+decode_lfs (int lfs)
+{
+  bool load_p = (lfs & (1 << 3));
+  bool fpsimd_p = (lfs & (1 << 2));
+  unsigned size = 1U << ((lfs & 3) + 2);
+  return { load_p, fpsimd_p, size };
+}
+
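+// Track the candidate access INSN at OFFSET within this access group,
+// creating a new access_record (in offset order) if we haven't seen
+// this offset before.  ALLOC_NODE is used to allocate splay tree nodes.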
+template<typename Alloc>
+void
+access_group::track (Alloc alloc_node, poly_int64 offset, insn_info *insn)
+{
+  auto insert_before = [&](std::list<access_record>::iterator after)
+    {
+      auto it = list.emplace (after, offset);
+      it->cand_insns.push_back (insn);
+      it->place = it;
+      return &*it;
+    };
+
+  if (!list.size ())
+    {
+      auto access = insert_before (list.end ());
+      tree.insert_max_node (alloc_node (access));
+      return;
+    }
+
+  auto compare = [&](splay_tree_node<access_record *> *node)
+    {
+      return compare_sizes_for_sort (offset, node->value ()->offset);
+    };
+  auto result = tree.lookup (compare);
+  splay_tree_node<access_record *> *node = tree.root ();
+  if (result == 0)
+    node->value ()->cand_insns.push_back (insn);
+  else
+    {
+      auto it = node->value ()->place;
+      auto after = (result > 0) ? std::next (it) : it;
+      auto access = insert_before (after);
+      tree.insert_child (node, result > 0, alloc_node (access));
+    }
+}
+
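+// Try to track the access to MEM in INSN using its MEM_EXPR base.
+// Return true if we tracked the access that way, false if the caller
+// should fall back to tracking it via the RTL base.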
+bool
+ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
+{
+  if (!MEM_EXPR (mem) || !MEM_OFFSET_KNOWN_P (mem))
+    return false;
+
+  // Punt on auto-inc accesses for now so we don't have to deal
+  // with the complexity later on.  TODO: would be good to relax
+  // this eventually.
+  if (side_effects_p (XEXP (mem, 0)))
+    return false;
+
+  poly_int64 offset;
+  tree base_expr = get_addr_base_and_unit_offset (MEM_EXPR (mem),
+                                                 &offset);
+  if (!base_expr || !DECL_P (base_expr))
+    return false;
+
+  offset += MEM_OFFSET (mem);
+
+  const machine_mode mem_mode = GET_MODE (mem);
+  const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
+
+  // Punt on misaligned offsets.
+  if (offset.coeffs[0] & (mem_size - 1))
+    return false;
+
+  const auto key = std::make_pair (base_expr, encode_lfs (lfs));
+  access_group &group = expr_map.get_or_insert (key, NULL);
+  auto alloc = [&](access_record *access) { return node_alloc (access); };
+  group.track (alloc, offset, insn);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "[bb %u] tracking insn %d via ",
+              m_bb->index (), insn->uid ());
+      print_node_brief (dump_file, "mem expr", base_expr, 0);
+      fprintf (dump_file, " [L=%d FP=%d, %smode, off=",
+              lfs.load_p, lfs.fpsimd_p, mode_name[mem_mode]);
+      print_dec (offset, dump_file);
+      fprintf (dump_file, "]\n");
+    }
+
+  return true;
+}
+
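+// Main tracking hook: if the access to MEM in INSN (a load iff LOAD_P)
+// looks like a candidate for pairing, record it in the appropriate
+// hash table.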
+void
+ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem)
+{
+  // We can't combine volatile MEMs, so punt on these.
+  if (MEM_VOLATILE_P (mem))
+    return;
+
+  const machine_mode mem_mode = GET_MODE (mem);
+  if (!ldp_operand_mode_p (mem_mode))
+    return;
+
+  // Note ldp_operand_mode_p already rejected VL modes.
+  const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
+  const bool fpsimd_mode_p = GET_MODE_CLASS (mem_mode) != MODE_INT;
+
+  // For stores, validate the operand being stored.
+  if (!load_p)
+    {
+      rtx st_op = XEXP (PATTERN (insn->rtl ()), 1);
+      if (!register_operand (st_op, mem_mode)
+         && (fpsimd_mode_p || !aarch64_reg_or_zero (st_op, mem_mode)))
+       return;
+    }
+
+  // N.B. we only want to segregate FP/SIMD accesses from integer accesses
+  // before RA.
+  const bool fpsimd_bit_p = !reload_completed && fpsimd_mode_p;
+  const lfs_fields lfs = { load_p, fpsimd_bit_p, mem_size };
+
+  if (track_via_mem_expr (insn, mem, lfs))
+    return;
+
+  rtx addr = XEXP (mem, 0);
+
+  // TODO: handle auto-inc addressing modes.  We probably want to
+  // record these separately (from the main hash table) and find pair
+  // opportunities by looking for other load/store instructions that are
+  // related by dataflow on the base register.
+  poly_int64 poly_off;
+  rtx base = strip_offset (addr, &poly_off);
+  if (!REG_P (base))
+    return;
+
+  // Punt on accesses relative to the eliminable regs: since we don't
+  // know the elimination offset pre-RA, we should postpone forming
+  // pairs on such accesses until after RA.
+  if (!reload_completed
+      && (REGNO (base) == FRAME_POINTER_REGNUM
+         || REGNO (base) == ARG_POINTER_REGNUM))
+    return;
+
+  // No ldp instructions with variable-length addressing.
+  if (!poly_off.is_constant ())
+    return;
+
+  HOST_WIDE_INT offset = poly_off.to_constant ();
+
+  // ldp/stp only handle offsets that are a multiple of the access size.
+  if (offset % mem_size != 0)
+    return;
+
+  // Normalize the offset.
+  offset /= mem_size;
+
+  // Check if the offset is in range.
+  if (offset > LDP_MAX_IMM || offset < LDP_MIN_IMM)
+    return;
+
+  // Now need to find def of base register.
+  def_info *base_def;
+  use_info *base_use = nullptr;
+  for (auto use_info : insn->uses ())
+    if (use_info->is_reg () && use_info->regno () == REGNO (base))
+      {
+       base_use = use_info;
+       break;
+      }
+
+  gcc_assert (base_use);
+  base_def = base_use->def ();
+  if (!base_def)
+    {
+      if (dump_file)
+       fprintf (dump_file,
+                "base register (regno %d) of insn %d is undefined\n",
+                REGNO (base), insn->uid ());
+      return;
+    }
+
+  const auto key = std::make_pair (base_def, encode_lfs (lfs));
+  access_group &group = def_map.get_or_insert (key, NULL);
+  auto alloc = [&](access_record *access) { return node_alloc (access); };
+  group.track (alloc, poly_off, insn);
+
+  if (dump_file)
+    fprintf (dump_file,
+            "[bb %u] tracking insn %d [L=%d, FP=%d, %smode, off=%d]\n",
+            m_bb->index (), insn->uid (), lfs.load_p, lfs.fpsimd_p,
+            mode_name[mem_mode], (int)poly_off.to_constant ());
+}
+
+// Dummy predicate that never ignores any insns.
+static bool no_ignore (insn_info *) { return false; }
+
+// Return the latest dataflow hazard before INSN.
+//
+// If IGNORE is non-NULL, this points to a sub-rtx which we should
+// ignore for dataflow purposes.  This is needed when considering
+// changing the RTL base of an access discovered through a MEM_EXPR
+// base.
+//
+// N.B. we ignore any defs/uses of memory here as we deal with that
+// separately, making use of alias disambiguation.
+static insn_info *
+latest_hazard_before (insn_info *insn, rtx *ignore)
+{
+  insn_info *result = nullptr;
+  auto hazard = [insn, &result](insn_info *h)
+    {
+      gcc_checking_assert (*h < *insn);
+      if (!result || *h > *result)
+       result = h;
+    };
+
+  rtx pat = PATTERN (insn->rtl ());
+  auto ignore_use = [&](use_info *u)
+    {
+      if (u->is_mem ())
+       return true;
+
+      return !refers_to_regno_p (u->regno (), u->regno () + 1, pat, ignore);
+    };
+
+  // Find defs of uses in INSN (RaW).
+  for (auto use : insn->uses ())
+    if (!ignore_use (use) && use->def ())
+      hazard (use->def ()->insn ());
+
+  // Find previous defs (WaW) or previous uses (WaR) of defs in INSN.
+  for (auto def : insn->defs ())
+    {
+      if (def->is_mem ())
+       continue;
+
+      if (def->prev_def ())
+       {
+         hazard (def->prev_def ()->insn ()); // WaW
+
+         auto set = dyn_cast <set_info *> (def->prev_def ());
+         if (set && set->has_nondebug_insn_uses ())
+           for (auto use : set->reverse_nondebug_insn_uses ())
+             if (use->insn () != insn)
+               {
+                 hazard (use->insn ()); // WaR
+                 break;
+               }
+       }
+
+      if (!HARD_REGISTER_NUM_P (def->regno ()))
+       continue;
+
+      // Also need to check backwards for call clobbers (WaW).
+      for (auto call_group : def->ebb ()->call_clobbers ())
+       {
+         if (!call_group->clobbers (def->resource ()))
+           continue;
+
+         auto clobber_insn = prev_call_clobbers_ignoring (*call_group,
+                                                          def->insn (),
+                                                          no_ignore);
+         if (clobber_insn)
+           hazard (clobber_insn);
+       }
+
+    }
+
+  return result;
+}
+
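+// Return the first dataflow hazard after INSN.  IGNORE is as for
+// latest_hazard_before.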
+static insn_info *
+first_hazard_after (insn_info *insn, rtx *ignore)
+{
+  insn_info *result = nullptr;
+  auto hazard = [insn, &result](insn_info *h)
+    {
+      gcc_checking_assert (*h > *insn);
+      if (!result || *h < *result)
+       result = h;
+    };
+
+  rtx pat = PATTERN (insn->rtl ());
+  auto ignore_use = [&](use_info *u)
+    {
+      if (u->is_mem ())
+       return true;
+
+      return !refers_to_regno_p (u->regno (), u->regno () + 1, pat, ignore);
+    };
+
+  for (auto def : insn->defs ())
+    {
+      if (def->is_mem ())
+       continue;
+
+      if (def->next_def ())
+       hazard (def->next_def ()->insn ()); // WaW
+
+      auto set = dyn_cast <set_info *> (def);
+      if (set && set->has_nondebug_insn_uses ())
+       hazard (set->first_nondebug_insn_use ()->insn ()); // RaW
+
+      if (!HARD_REGISTER_NUM_P (def->regno ()))
+       continue;
+
+      // Also check for call clobbers of this def (WaW).
+      for (auto call_group : def->ebb ()->call_clobbers ())
+       {
+         if (!call_group->clobbers (def->resource ()))
+           continue;
+
+         auto clobber_insn = next_call_clobbers_ignoring (*call_group,
+                                                          def->insn (),
+                                                          no_ignore);
+         if (clobber_insn)
+           hazard (clobber_insn);
+       }
+    }
+
+  // Find any subsequent defs of uses in INSN (WaR).
+  for (auto use : insn->uses ())
+    {
+      if (ignore_use (use))
+       continue;
+
+      if (use->def ())
+       {
+         auto def = use->def ()->next_def ();
+         if (def && def->insn () == insn)
+           def = def->next_def ();
+
+         if (def)
+           hazard (def->insn ());
+       }
+
+      if (!HARD_REGISTER_NUM_P (use->regno ()))
+       continue;
+
+      // Also need to handle call clobbers of our uses (again WaR).
+      //
+      // See restrict_movement_for_uses_ignoring for why we don't
+      // need to check backwards for call clobbers.
+      for (auto call_group : use->ebb ()->call_clobbers ())
+       {
+         if (!call_group->clobbers (use->resource ()))
+           continue;
+
+         auto clobber_insn = next_call_clobbers_ignoring (*call_group,
+                                                          use->insn (),
+                                                          no_ignore);
+         if (clobber_insn)
+           hazard (clobber_insn);
+       }
+    }
+
+  return result;
+}
+
+
+enum change_strategy {
+  CHANGE,
+  DELETE,
+  TOMBSTONE,
+};
+
+// Given a change_strategy S, convert it to a string (for output in the
+// dump file).
+static const char *cs_to_string (change_strategy s)
+{
+#define C(x) case x: return #x
+  switch (s)
+    {
+      C (CHANGE);
+      C (DELETE);
+      C (TOMBSTONE);
+    }
+#undef C
+  gcc_unreachable ();
+}
+
+// TODO: should this live in RTL-SSA?
+static bool
+ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2)
+{
+  // If either range is empty, then their intersection is empty.
+  if (!r1 || !r2)
+    return false;
+
+  // When do they not overlap? When one range finishes before the other
+  // starts, i.e. (*r1.last < *r2.first || *r2.last < *r1.first).
+  // Inverting this, we get the below.
+  return *r1.last >= *r2.first && *r2.last >= *r1.first;
+}
+
+// Get the range of insns that def feeds.
+static insn_range_info get_def_range (def_info *def)
+{
+  insn_info *last = def->next_def ()->insn ()->prev_nondebug_insn ();
+  return { def->insn (), last };
+}
+
+// Given a def (of memory), return the downwards range within which we
+// can safely move this def.
+static insn_range_info
+def_downwards_move_range (def_info *def)
+{
+  auto range = get_def_range (def);
+
+  auto set = dyn_cast <set_info *> (def);
+  if (!set || !set->has_any_uses ())
+    return range;
+
+  auto use = set->first_nondebug_insn_use ();
+  if (use)
+    range = move_earlier_than (range, use->insn ());
+
+  return range;
+}
+
+// Given a def (of memory), return the upwards range within which we can
+// safely move this def.
+static insn_range_info
+def_upwards_move_range (def_info *def)
+{
+  def_info *prev = def->prev_def ();
+  insn_range_info range { prev->insn (), def->insn () };
+
+  auto set = dyn_cast <set_info *> (prev);
+  if (!set || !set->has_any_uses ())
+    return range;
+
+  auto use = set->last_nondebug_insn_use ();
+  if (use)
+    range = move_later_than (range, use->insn ());
+
+  return range;
+}
+
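+// Decide how to handle the stores FIRST and SECOND when forming an stp
+// that must be placed within MOVE_RANGE: for each store, STRATEGY
+// records whether to re-purpose it (CHANGE), delete it (DELETE), or
+// replace it with a tombstone (TOMBSTONE).  Return the mem def to
+// re-use for the stp, or null if neither store can be re-purposed.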
+static def_info *
+decide_stp_strategy (change_strategy strategy[2],
+                    insn_info *first,
+                    insn_info *second,
+                    const insn_range_info &move_range)
+{
+  strategy[0] = CHANGE;
+  strategy[1] = DELETE;
+
+  unsigned viable = 0;
+  viable |= move_range.includes (first);
+  viable |= ((unsigned) move_range.includes (second)) << 1;
+
+  def_info * const defs[2] = {
+    memory_access (first->defs ()),
+    memory_access (second->defs ())
+  };
+  if (defs[0] == defs[1])
+    viable = 3; // No intervening store, either is viable.
+
+  if (!(viable & 1)
+      && ranges_overlap_p (move_range, def_downwards_move_range (defs[0])))
+    viable |= 1;
+  if (!(viable & 2)
+      && ranges_overlap_p (move_range, def_upwards_move_range (defs[1])))
+    viable |= 2;
+
+  if (viable == 2)
+    std::swap (strategy[0], strategy[1]);
+  else if (!viable)
+    // Tricky case: need to delete both accesses.
+    strategy[0] = DELETE;
+
+  for (int i = 0; i < 2; i++)
+    {
+      if (strategy[i] != DELETE)
+       continue;
+
+      // See if we can get away without a tombstone.
+      auto set = dyn_cast <set_info *> (defs[i]);
+      if (!set || !set->has_any_uses ())
+       continue; // We can indeed.
+
+      // If both sides are viable for re-purposing, and the other store's
+      // def doesn't have any uses, then we can delete the other store
+      // and re-purpose this store instead.
+      if (viable == 3)
+       {
+         gcc_assert (strategy[!i] == CHANGE);
+         auto other_set = dyn_cast <set_info *> (defs[!i]);
+         if (!other_set || !other_set->has_any_uses ())
+           {
+             strategy[i] = CHANGE;
+             strategy[!i] = DELETE;
+             break;
+           }
+       }
+
+      // Alas, we need a tombstone after all.
+      strategy[i] = TOMBSTONE;
+    }
+
+  for (int i = 0; i < 2; i++)
+    if (strategy[i] == CHANGE)
+      return defs[i];
+
+  return nullptr;
+}
+
+static GTY(()) rtx tombstone = NULL_RTX;
+
+// Generate the RTL pattern for a "tombstone"; used temporarily
+// during this pass to replace stores that are marked for deletion
+// where we can't immediately delete the store (e.g. if there are uses
+// hanging off its def of memory).
+//
+// These are deleted at the end of the pass, and their uses are
+// re-parented appropriately at that point.
+static rtx
+gen_tombstone (void)
+{
+  if (!tombstone)
+    {
+      tombstone = gen_rtx_CLOBBER (VOIDmode,
+                                  gen_rtx_MEM (BLKmode,
+                                               gen_rtx_SCRATCH (Pmode)));
+      return tombstone;
+    }
+
+  return copy_rtx (tombstone);
+}
+
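+// Return true if INSN is a tombstone insn (see gen_tombstone above).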
+static bool
+tombstone_insn_p (insn_info *insn)
+{
+  rtx x = tombstone ? tombstone : gen_tombstone ();
+  return rtx_equal_p (PATTERN (insn->rtl ()), x);
+}
+
+// Given a mode MODE, return a mode of the same size which we will
+// attempt to use as a canonical load/store mode.
+static machine_mode
+canonical_mode_for_mode (machine_mode mode)
+{
+  switch (GET_MODE_SIZE (mode).to_constant ())
+    {
+    case 4:
+      return E_SImode;
+    case 8:
+      return E_DImode;
+    case 16:
+      return E_V16QImode;
+    }
+
+  gcc_unreachable ();
+}
+
+// Try to convert accesses to happen in a canonical mode where possible.
+static void
+canonicalize_access_modes (rtx pats[2], bool load_p)
+{
+  rtx regs[2];
+  rtx mems[2];
+  machine_mode modes[2];
+  machine_mode canon_mode;
+
+  auto update_pat = [&](int i, rtx mem, rtx reg)
+    {
+      if (load_p)
+       pats[i] = gen_rtx_SET (reg, mem);
+      else
+       pats[i] = gen_rtx_SET (mem, reg);
+    };
+
+  for (int i = 0; i < 2; i++)
+    {
+      mems[i] = XEXP (pats[i], load_p);
+      regs[i] = XEXP (pats[i], !load_p);
+      modes[i] = GET_MODE (mems[i]);
+    }
+
+  canon_mode = canonical_mode_for_mode (modes[0]);
+  gcc_checking_assert (canon_mode == canonical_mode_for_mode (modes[1]));
+
+  if (modes[0] == canon_mode && modes[1] == canon_mode)
+    return;
+
+  // See if we can use a punning subreg to convert the register's
+  // mode; try this up front as it may not be possible.
+  rtx punned_regs[2] = {};
+  rtx adjusted_mems[2] = {};
+  for (int i = 0; i < 2; i++)
+    {
+      punned_regs[i] = lowpart_subreg (canon_mode, regs[i], modes[i]);
+      if (!punned_regs[i])
+       {
+         if (dump_file)
+           fprintf (dump_file,
+                    "  failed to canonicalize reg op mode: %s -> %s\n",
+                    mode_name[modes[i]], mode_name[canon_mode]);
+         continue;
+       }
+
+      adjusted_mems[i] = try_adjust_address (mems[i], canon_mode, 0);
+      if (!adjusted_mems[i])
+       {
+         if (dump_file)
+           fprintf (dump_file, "  failed to canonicalize mem mode: %s -> %s\n",
+                    mode_name[modes[i]], mode_name[canon_mode]);
+       }
+    }
+
+  if (adjusted_mems[0] && adjusted_mems[1])
+    {
+      if (dump_file)
+       fprintf (dump_file,
+                "  successfully canonicalized from (%s,%s) -> %smode\n",
+                mode_name[modes[0]],
+                mode_name[modes[1]],
+                mode_name[canon_mode]);
+
+      for (int i = 0; i < 2; i++)
+       update_pat (i, adjusted_mems[i], punned_regs[i]);
+      return;
+    }
+
+  // If we failed to switch to a canonical mode, at least
+  // try and make sure the modes are the same.
+  if (modes[0] == modes[1])
+    return;
+
+  for (int i = 0; i < 2; i++)
+    {
+      rtx punned_reg = lowpart_subreg (modes[!i], regs[i], modes[i]);
+      if (!punned_reg)
+       continue;
+
+      rtx adjusted_mem = try_adjust_address (mems[i], modes[!i], 0);
+      if (!adjusted_mem)
+       continue;
+
+      if (dump_file)
+       fprintf (dump_file,
+                "  failed to canonicalize, but made modes agree (%s -> %s)\n",
+                mode_name[modes[i]], mode_name[modes[!i]]);
+      update_pat (i, adjusted_mem, punned_reg);
+      return;
+    }
+
+  // Worst case, recog will bail out if the modes are still
+  // incompatible.
+}
+
+// If we have vector x scalar modes, pun into the scalar mode.
+// Otherwise, leave the modes unchanged.
+static bool
+unify_access_modes (rtx pats[2], bool load_p)
+{
+  machine_mode modes[2];
+  for (int i = 0; i < 2; i++)
+    modes[i] = GET_MODE (XEXP (pats[i], load_p));
+
+  if (VECTOR_MODE_P (modes[0]) == VECTOR_MODE_P (modes[1]))
+    return true;
+
+  const int vector_i = VECTOR_MODE_P (modes[1]);
+  const int scalar_i = !vector_i;
+
+  rtx vector_mem = XEXP (pats[vector_i], load_p);
+  rtx vector_reg = XEXP (pats[vector_i], !load_p);
+
+  if (!try_adjust_address (vector_mem, modes[scalar_i], 0))
+    {
+      if (dump_file)
+       fprintf (dump_file,
+                "failed to unify %smode -> %smode, fall back "
+                "to canonicalize modes\n",
+                mode_name[modes[vector_i]], mode_name[modes[scalar_i]]);
+      canonicalize_access_modes (pats, load_p);
+      return true;
+    }
+
+  rtx adjusted_mem = adjust_address (vector_mem, modes[scalar_i], 0);
+  rtx punned_reg = lowpart_subreg (modes[scalar_i],
+                                  vector_reg,
+                                  modes[vector_i]);
+  if (!punned_reg)
+    {
+      if (dump_file)
+       fprintf (dump_file, "Failed to unify %smode -> %smode\n",
+                mode_name[modes[vector_i]], mode_name[modes[scalar_i]]);
+      return false;
+    }
+
+  pats[vector_i] = load_p
+    ? gen_rtx_SET (punned_reg, adjusted_mem)
+    : gen_rtx_SET (adjusted_mem, punned_reg);
+  return true;
+}
+
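+// Walk the REG_NOTES given by NOTE, appending those we want to keep
+// for the pair insn onto RESULT and dropping the rest.  *EH_REGION is
+// used to check that we see at most one REG_EH_REGION note.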
+static rtx
+filter_notes (rtx note, rtx result, bool *eh_region)
+{
+  for (; note; note = XEXP (note, 1))
+    {
+      switch (REG_NOTE_KIND (note))
+       {
+         case REG_EQUAL:
+         case REG_EQUIV:
+         case REG_DEAD:
+         case REG_UNUSED:
+         case REG_NOALIAS:
+           // These can all be dropped.  For REG_EQU{AL,IV} they
+           // cannot apply to non-single_set insns, and
+           // REG_{DEAD,UNUSED} are re-computed by RTL-SSA, see
+           // rtl-ssa/changes.cc:update_notes.
+           //
+           // Similarly, REG_NOALIAS cannot apply to a parallel.
+           break;
+         case REG_EH_REGION:
+           gcc_assert (!*eh_region);
+           *eh_region = true;
+           result = alloc_reg_note (REG_EH_REGION, XEXP (note, 0), result);
+           break;
+         case REG_CFA_OFFSET:
+         case REG_CFA_RESTORE:
+           result = alloc_reg_note (REG_NOTE_KIND (note),
+                                    copy_rtx (XEXP (note, 0)),
+                                    result);
+           break;
+         default:
+           // Unexpected REG_NOTE kind.
+           gcc_unreachable ();
+       }
+    }
+
+  return result;
+}
+
+// Ensure we have a sensible scheme for combining REG_NOTEs
+// given two candidate insns I1 and I2.
+static rtx
+combine_reg_notes (insn_info *i1, insn_info *i2)
+{
+  bool found_eh_region = false;
+  rtx result = NULL_RTX;
+  result = filter_notes (REG_NOTES (i1->rtl ()), result, &found_eh_region);
+  return filter_notes (REG_NOTES (i2->rtl ()), result, &found_eh_region);
+}
+
+// Try and actually fuse the pair given by insns I1 and I2.
+static bool
+fuse_pair (bool load_p,
+          insn_info *i1,
+          insn_info *i2,
+          base_cand &base,
+          const insn_range_info &move_range,
+          bool &emitted_tombstone_p)
+{
+  auto attempt = crtl->ssa->new_change_attempt ();
+
+  auto make_change = [&attempt](insn_info *insn)
+    {
+      return crtl->ssa->change_alloc <insn_change> (attempt, insn);
+    };
+  auto make_delete = [&attempt](insn_info *insn)
+    {
+      return crtl->ssa->change_alloc <insn_change> (attempt,
+                                                   insn,
+                                                   insn_change::DELETE);
+    };
+
+  // Are we using a tombstone insn for this pair?
+  bool have_tombstone_p = false;
+
+  insn_info *first = (*i1 < *i2) ? i1 : i2;
+  insn_info *second = (first == i1) ? i2 : i1;
+
+  insn_info *insns[2] = { first, second };
+
+  auto_vec <insn_change *> changes;
+  changes.reserve (3);
+
+  rtx pats[2] = {
+    PATTERN (i1->rtl ()),
+    PATTERN (i2->rtl ())
+  };
+
+  use_array input_uses[2] = { first->uses (), second->uses () };
+
+  if (base.from_insn != -1)
+    {
+      // If we're not already using a shared base, we need
+      // to re-write one of the accesses to use the base from
+      // the other insn.
+      gcc_checking_assert (base.from_insn == 0 || base.from_insn == 1);
+
+      const bool lower_base_p = (insns[base.from_insn] == i1);
+
+      rtx base_pat = PATTERN (insns[base.from_insn]->rtl ());
+      rtx change_pat = PATTERN (insns[!base.from_insn]->rtl ());
+      rtx base_mem = XEXP (base_pat, load_p);
+      rtx change_mem = XEXP (change_pat, load_p);
+
+      machine_mode mem_mode = GET_MODE (base_mem);
+      HOST_WIDE_INT adjust_amt = GET_MODE_SIZE (mem_mode).to_constant ();
+      if (!lower_base_p)
+       adjust_amt *= -1;
+
+      rtx change_reg = XEXP (change_pat, !load_p);
+      machine_mode mode_for_mem = GET_MODE (change_mem);
+      if (!try_adjust_address (base_mem, mode_for_mem, adjust_amt))
+       {
+         // We need to canonicalize the mode to make the adjustment.
+         // This should be guaranteed to work as we checked this in
+         // get_viable_bases.
+         mode_for_mem = canonical_mode_for_mode (mode_for_mem);
+         change_reg = lowpart_subreg (mode_for_mem,
+                                      change_reg,
+                                      GET_MODE (change_mem));
+         gcc_assert (change_reg);
+       }
+
+      rtx new_mem = adjust_address (base_mem, mode_for_mem, adjust_amt);
+      rtx new_set = load_p
+       ? gen_rtx_SET (change_reg, new_mem)
+       : gen_rtx_SET (new_mem, change_reg);
+
+      pats[lower_base_p] = new_set;
+
+      hash_set <use_info *> uses_to_drop;
+      use_array &uses_to_change = input_uses[!base.from_insn];
+
+      for (auto use : uses_to_change)
+       if (use->is_reg ()
+           && !refers_to_regno_p (use->regno (),
+                                  use->regno () + 1,
+                                  change_pat,
+                                  &XEXP (change_pat, load_p))
+           && uses_to_drop.add (use))
+         gcc_unreachable ();
+
+      if (!uses_to_drop.is_empty ())
+       {
+         access_array_builder builder (attempt);
+         gcc_checking_assert (uses_to_drop.elements ()
+                              <= uses_to_change.size ());
+         builder.reserve (uses_to_change.size () - uses_to_drop.elements ());
+         auto it = uses_to_change.begin ();
+         auto end = uses_to_change.end ();
+         for (; it != end; ++it)
+           if (!uses_to_drop.contains (*it))
+             builder.quick_push (*it);
+         uses_to_change = use_array (builder.finish ());
+       }
+    }
+
+  if (aarch64_ldp_canonicalize_modes)
+    canonicalize_access_modes (pats, load_p);
+  else if (!unify_access_modes (pats, load_p))
+    return false;
+
+  // Go through and drop uses that only occur in register notes,
+  // as we won't be preserving those.
+  for (int i = 0; i < 2; i++)
+    {
+      auto rti = insns[i]->rtl ();
+      if (!REG_NOTES (rti))
+       continue;
+
+      input_uses[i] = remove_note_accesses (attempt, input_uses[i]);
+    }
+
+  // Now that we know what base mem we're going to use, check if it's OK
+  // with the ldp/stp policy.
+  rtx first_mem = XEXP (pats[0], load_p);
+  if (!aarch64_mem_ok_with_ldpstp_policy_model (first_mem,
+                                               load_p,
+                                               GET_MODE (first_mem)))
+    {
+      if (dump_file)
+       fprintf (dump_file, "punting on pair (%d,%d), ldp/stp policy says no\n",
+                i1->uid (), i2->uid ());
+      return false;
+    }
+
+  rtx reg_notes = combine_reg_notes (i1, i2);
+
+  rtx pair_pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, pats[0], pats[1]));
+
+  insn_change *pair_change = nullptr;
+  auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
+      rtx_insn *rti = change->insn ()->rtl ();
+      gcc_assert (validate_unshare_change (rti, &PATTERN (rti), pair_pat,
+                                          true));
+      gcc_assert (validate_change (rti, &REG_NOTES (rti),
+                                  reg_notes, true));
+  };
+
+  if (load_p)
+    {
+      changes.quick_push (make_delete (first));
+      pair_change = make_change (second);
+      changes.quick_push (pair_change);
+
+      pair_change->move_range = move_range;
+      pair_change->new_defs = merge_access_arrays (attempt,
+                                             first->defs (),
+                                             second->defs ());
+      gcc_assert (pair_change->new_defs.is_valid ());
+
+      pair_change->new_uses
+       = merge_access_arrays (attempt,
+                              drop_memory_access (input_uses[0]),
+                              drop_memory_access (input_uses[1]));
+      gcc_assert (pair_change->new_uses.is_valid ());
+      set_pair_pat (pair_change);
+    }
+  else
+    {
+      change_strategy strategy[2];
+      def_info *stp_def = decide_stp_strategy (strategy, first, second,
+                                              move_range);
+      if (dump_file)
+       {
+         auto cs1 = cs_to_string (strategy[0]);
+         auto cs2 = cs_to_string (strategy[1]);
+         fprintf (dump_file,
+                  "  stp strategy for candidate insns (%d,%d): (%s,%s)\n",
+                  insns[0]->uid (), insns[1]->uid (), cs1, cs2);
+         if (stp_def)
+           fprintf (dump_file,
+                    "  re-using mem def from insn %d\n",
+                    stp_def->insn ()->uid ());
+       }
+
+      insn_change *change;
+      for (int i = 0; i < 2; i++)
+       {
+         switch (strategy[i])
+           {
+           case DELETE:
+             changes.quick_push (make_delete (insns[i]));
+             break;
+           case TOMBSTONE:
+           case CHANGE:
+             change = make_change (insns[i]);
+             if (strategy[i] == CHANGE)
+               {
+                 set_pair_pat (change);
+                 change->new_uses = merge_access_arrays (attempt,
+                                                         input_uses[0],
+                                                         input_uses[1]);
+                 auto d1 = drop_memory_access (first->defs ());
+                 auto d2 = drop_memory_access (second->defs ());
+                 change->new_defs = merge_access_arrays (attempt, d1, d2);
+                 gcc_assert (stp_def);
+                 change->new_defs = insert_access (attempt,
+                                                   stp_def,
+                                                   change->new_defs);
+                 change->move_range = move_range;
+                 pair_change = change;
+               }
+             else
+               {
+                 rtx_insn *rti = insns[i]->rtl ();
+                 gcc_assert (validate_change (rti, &PATTERN (rti),
+                                              gen_tombstone (), true));
+                 gcc_assert (validate_change (rti, &REG_NOTES (rti),
+                                              NULL_RTX, true));
+                 change->new_uses = use_array (nullptr, 0);
+                 have_tombstone_p = true;
+               }
+             gcc_assert (change->new_defs.is_valid ());
+             gcc_assert (change->new_uses.is_valid ());
+             changes.quick_push (change);
+             break;
+           }
+       }
+
+      if (!stp_def)
+       {
+         // Tricky case.  Cannot re-purpose existing insns for stp.
+         // Need to insert new insn.
+         if (dump_file)
+           fprintf (dump_file,
+                    "  stp fusion: cannot re-purpose candidate stores\n");
+
+         auto new_insn = crtl->ssa->create_insn (attempt, INSN, pair_pat);
+         change = make_change (new_insn);
+         change->move_range = move_range;
+         change->new_uses = merge_access_arrays (attempt,
+                                                 input_uses[0],
+                                                 input_uses[1]);
+         gcc_assert (change->new_uses.is_valid ());
+
+         auto d1 = drop_memory_access (first->defs ());
+         auto d2 = drop_memory_access (second->defs ());
+         change->new_defs = merge_access_arrays (attempt, d1, d2);
+         gcc_assert (change->new_defs.is_valid ());
+
+         auto new_set = crtl->ssa->create_set (attempt, new_insn, memory);
+         change->new_defs = insert_access (attempt, new_set,
+                                           change->new_defs);
+         gcc_assert (change->new_defs.is_valid ());
+         changes.safe_insert (1, change);
+         pair_change = change;
+       }
+    }
+
+  auto n_changes = changes.length ();
+  gcc_checking_assert (n_changes == 2 || n_changes == 3);
+
+  auto is_changing = insn_is_changing (changes);
+  for (unsigned i = 0; i < n_changes; i++)
+    gcc_assert (rtl_ssa::restrict_movement_ignoring (*changes[i], is_changing));
+
+  // Check the pair pattern is recog'd.
+  if (!rtl_ssa::recog_ignoring (attempt, *pair_change, is_changing))
+    {
+      if (dump_file)
+       fprintf (dump_file, "  failed to form pair, recog failed\n");
+
+      // Free any reg notes we allocated.
+      while (reg_notes)
+       {
+         rtx next = XEXP (reg_notes, 1);
+         free_EXPR_LIST_node (reg_notes);
+         reg_notes = next;
+       }
+      cancel_changes (0);
+      return false;
+    }
+
+  gcc_assert (crtl->ssa->verify_insn_changes (changes));
+
+  confirm_change_group ();
+  crtl->ssa->change_insns (changes);
+  emitted_tombstone_p |= have_tombstone_p;
+  return true;
+}
+
+// Return true if STORE_INSN may modify mem rtx MEM.  Make sure we keep
+// within our BUDGET for alias analysis.
+static bool
+store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget)
+{
+  if (tombstone_insn_p (store_insn))
+    return false;
+
+  if (!budget)
+    {
+      if (dump_file)
+       {
+         fprintf (dump_file,
+                  "exceeded budget, assuming store %d aliases with mem ",
+                  store_insn->uid ());
+         print_simple_rtl (dump_file, mem);
+         fprintf (dump_file, "\n");
+       }
+
+      return true;
+    }
+
+  budget--;
+  return memory_modified_in_insn_p (mem, store_insn->rtl ());
+}
+
+// Return true if LOAD may be modified by STORE.  Make sure we keep
+// within our BUDGET for alias analysis.
+static bool
+load_modified_by_store_p (insn_info *load,
+                         insn_info *store,
+                         int &budget)
+{
+  gcc_checking_assert (budget >= 0);
+
+  if (!budget)
+    {
+      if (dump_file)
+       {
+         fprintf (dump_file,
+                  "exceeded budget, assuming load %d aliases with store %d\n",
+                  load->uid (), store->uid ());
+       }
+      return true;
+    }
+
+  // It isn't safe to re-order stores over calls.
+  if (CALL_P (load->rtl ()))
+    return true;
+
+  budget--;
+  return modified_in_p (PATTERN (load->rtl ()), store->rtl ());
+}
+
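+// Abstract interface for walking over the insns that a candidate
+// access might alias with; used by do_alias_analysis below.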
+struct alias_walker
+{
+  virtual insn_info *insn () const = 0;
+  virtual bool valid () const = 0;
+  virtual bool conflict_p (int &budget) const = 0;
+  virtual void advance () = 0;
+};
+
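+// Implementation of alias_walker that walks over candidate stores
+// (i.e. defs of memory), checking whether each store might modify
+// the candidate mem CAND_MEM.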
+template<bool reverse>
+class store_walker : public alias_walker
+{
+  using def_iter_t = typename std::conditional <reverse,
+       reverse_def_iterator, def_iterator>::type;
+
+  def_iter_t def_iter;
+  rtx cand_mem;
+  insn_info *limit;
+
+public:
+  store_walker (def_info *mem_def, rtx mem, insn_info *limit_insn) :
+    def_iter (mem_def), cand_mem (mem), limit (limit_insn) {}
+
+  bool valid () const override
+    {
+      if (!*def_iter)
+       return false;
+
+      if (reverse)
+       return *((*def_iter)->insn ()) > *limit;
+      else
+       return *((*def_iter)->insn ()) < *limit;
+    }
+  insn_info *insn () const override { return (*def_iter)->insn (); }
+  void advance () override { def_iter++; }
+  bool conflict_p (int &budget) const override
+  {
+    return store_modifies_mem_p (cand_mem, insn (), budget);
+  }
+};
+
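+// Implementation of alias_walker that walks over the loads hanging
+// off each def of memory, checking whether the candidate store
+// CAND_STORE might modify each load.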
+template<bool reverse>
+class load_walker : public alias_walker
+{
+  using def_iter_t = typename std::conditional <reverse,
+       reverse_def_iterator, def_iterator>::type;
+  using use_iter_t = typename std::conditional <reverse,
+       reverse_use_iterator, nondebug_insn_use_iterator>::type;
+
+  def_iter_t def_iter;
+  use_iter_t use_iter;
+  insn_info *cand_store;
+  insn_info *limit;
+
+  static use_info *start_use_chain (def_iter_t &def_iter)
+  {
+    set_info *set = nullptr;
+    for (; *def_iter; def_iter++)
+      {
+       set = dyn_cast <set_info *> (*def_iter);
+       if (!set)
+         continue;
+
+       use_info *use = reverse
+         ? set->last_nondebug_insn_use ()
+         : set->first_nondebug_insn_use ();
+
+       if (use)
+         return use;
+      }
+
+    return nullptr;
+  }
+
+public:
+  void advance () override
+  {
+    use_iter++;
+    if (*use_iter)
+      return;
+    def_iter++;
+    use_iter = start_use_chain (def_iter);
+  }
+
+  insn_info *insn () const override
+  {
+    gcc_checking_assert (*use_iter);
+    return (*use_iter)->insn ();
+  }
+
+  bool valid () const override
+  {
+    if (!*use_iter)
+      return false;
+
+    if (reverse)
+      return *((*use_iter)->insn ()) > *limit;
+    else
+      return *((*use_iter)->insn ()) < *limit;
+  }
+
+  bool conflict_p (int &budget) const override
+  {
+    return load_modified_by_store_p (insn (), cand_store, budget);
+  }
+
+  load_walker (def_info *def, insn_info *store, insn_info *limit_insn)
+    : def_iter (def), use_iter (start_use_chain (def_iter)),
+      cand_store (store), limit (limit_insn) {}
+};
+
+// Process our alias_walkers in a round-robin fashion, proceeding until
+// nothing more can be learned from alias analysis.
+//
+// We try to maintain the invariant that if a walker becomes invalid, we
+// set its pointer to null.
+static void
+do_alias_analysis (insn_info *alias_hazards[4],
+                  alias_walker *walkers[4],
+                  bool load_p)
+{
+  const int n_walkers = 2 + (2 * !load_p);
+  int budget = aarch64_ldp_alias_check_limit;
+
+  auto next_walker = [walkers,n_walkers](int current) -> int {
+    for (int j = 1; j <= n_walkers; j++)
+      {
+       int idx = (current + j) % n_walkers;
+       if (walkers[idx])
+         return idx;
+      }
+    return -1;
+  };
+
+  int i = -1;
+  for (int j = 0; j < n_walkers; j++)
+    {
+      alias_hazards[j] = nullptr;
+      if (!walkers[j])
+       continue;
+
+      if (!walkers[j]->valid ())
+       walkers[j] = nullptr;
+      else if (i == -1)
+       i = j;
+    }
+
+  while (i >= 0)
+    {
+      int insn_i = i % 2;
+      int paired_i = (i & 2) + !insn_i;
+      int pair_fst = (i & 2);
+      int pair_snd = (i & 2) + 1;
+
+      if (walkers[i]->conflict_p (budget))
+       {
+         alias_hazards[i] = walkers[i]->insn ();
+
+         // We got an aliasing conflict for this {load,store} walker,
+         // so we don't need to walk any further.
+         walkers[i] = nullptr;
+
+         // If we have a pair of alias conflicts that prevent
+         // forming the pair, stop.  There's no need to do further
+         // analysis.
+         if (alias_hazards[paired_i]
+             && (*alias_hazards[pair_fst] <= *alias_hazards[pair_snd]))
+           return;
+
+         if (!load_p)
+           {
+             int other_pair_fst = (pair_fst ? 0 : 2);
+             int other_paired_i = other_pair_fst + !insn_i;
+
+             int x_pair_fst = (i == pair_fst) ? i : other_paired_i;
+             int x_pair_snd = (i == pair_fst) ? other_paired_i : i;
+
+             // Similarly, handle the case where we have a {load,store}
+             // or {store,load} alias hazard pair that prevents forming
+             // the pair.
+             if (alias_hazards[other_paired_i]
+                 && *alias_hazards[x_pair_fst] <= *alias_hazards[x_pair_snd])
+               return;
+           }
+       }
+
+      if (walkers[i])
+       {
+         walkers[i]->advance ();
+
+         if (!walkers[i]->valid ())
+           walkers[i] = nullptr;
+       }
+
+      i = next_walker (i);
+    }
+}
+
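+// Given a candidate pair INSNS (with mems CAND_MEMS and register
+// operands REG_OPS), work out which base register defs could be used
+// to form the pair and push any viable candidates onto BASE_CANDS.
+// ACCESS_SIZE is the size in bytes of each access.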
+static void
+get_viable_bases (insn_info *insns[2],
+                 vec <base_cand> &base_cands,
+                 rtx cand_mems[2],
+                 rtx reg_ops[2],
+                 unsigned access_size,
+                 bool reversed)
+{
+  // We discovered this pair through a common MEM_EXPR base.
+  // Need to ensure that we have a common base register def
+  // that is live at both locations.
+  def_info *base_defs[2] = {};
+  for (int i = 0; i < 2; i++)
+    {
+      const bool is_lower = (i == reversed);
+      poly_int64 poly_off;
+      rtx addr = strip_offset (XEXP (cand_mems[i], 0), &poly_off);
+
+      if (!REG_P (addr) || !poly_off.is_constant ())
+       continue;
+
+      // Punt on accesses relative to eliminable regs.  Since we don't know the
+      // elimination offset pre-RA, we should postpone forming pairs on such
+      // accesses until after RA.
+      if (!reload_completed
+         && (REGNO (addr) == FRAME_POINTER_REGNUM
+             || REGNO (addr) == ARG_POINTER_REGNUM))
+       continue;
+
+      HOST_WIDE_INT base_off = poly_off.to_constant ();
+
+      // It should be unlikely that we ever punt here, since MEM_EXPR offset
+      // alignment should be a good proxy for register offset alignment.
+      if (base_off % access_size != 0)
+       {
+         if (dump_file)
+           fprintf (dump_file,
+                    "MEM_EXPR offset aligned, reg offset unaligned "
+                    "(insn %d)\n",
+                    insns[i]->uid ());
+         continue;
+       }
+
+      base_off /= access_size;
+
+      if (!is_lower)
+       base_off--;
+
+      if (base_off < LDP_MIN_IMM || base_off > LDP_MAX_IMM)
+       continue;
+
+      for (auto use : insns[i]->uses ())
+       if (use->is_reg () && use->regno () == REGNO (addr))
+         {
+           base_defs[i] = use->def ();
+           break;
+         }
+    }
+
+  if (!base_defs[0] && !base_defs[1])
+    {
+      if (dump_file)
+       fprintf (dump_file, "no viable base register for pair (%d,%d)\n",
+                insns[0]->uid (), insns[1]->uid ());
+      return;
+    }
+
+  if (base_defs[0] == base_defs[1])
+    {
+      // Easy case: insns already share the same base reg def.
+      base_cands.quick_push (base_defs[0]);
+      return;
+    }
+
+  if (base_defs[0] && base_defs[1]
+      && base_defs[0]->regno () == base_defs[1]->regno ())
+    {
+      // Accesses see different versions of the same
+      // base register, i.e. the base register gets updated
+      // between the two accesses.
+      //
+      // For now, we punt on this, but TODO: we should try
+      // to use an auto-inc ldp/stp where possible here.
+      if (dump_file)
+       fprintf (dump_file, "punting on base register update (%d,%d)\n",
+                insns[0]->uid (), insns[1]->uid ());
+      return;
+    }
+
+  for (int i = 0; i < 2; i++)
+    {
+      if (!base_defs[i])
+       continue;
+
+      // We already know that the offset is such that a valid pair insn
+      // can be formed, but given that we're changing a base, we need to
+      // check that we can safely adjust the mem to get a suitable
+      // paired mem while still satisfying "m".
+      const bool is_lower = (i == reversed);
+      rtx mem = cand_mems[i];
+      rtx other_mem = cand_mems[!i];
+      HOST_WIDE_INT adjust_amt = access_size;
+      if (!is_lower)
+       adjust_amt *= -1;
+      machine_mode this_mode = GET_MODE (mem);
+      machine_mode new_mode = GET_MODE (other_mem);
+      if (!try_adjust_address (mem, new_mode, adjust_amt))
+       {
+         auto canon_mode = canonical_mode_for_mode (new_mode);
+         if (canon_mode == new_mode)
+           {
+             if (dump_file)
+               fprintf (dump_file,
+                        "insn (%d): base not viable, can't adjust mem by %"
+                        PRId64 " (from %smode -> %smode)\n",
+                        insns[i]->uid (), adjust_amt,
+                        mode_name[this_mode], mode_name[new_mode]);
+             continue;
+           }
+
+         if (!try_adjust_address (mem, canon_mode, adjust_amt))
+           {
+             if (dump_file)
+               fprintf (dump_file,
+                        "insn (%d): base not viable, can't adjust mem by %"
+                        PRId64 " (%s -> %s) or in canonical %smode\n",
+                        insns[i]->uid (), adjust_amt,
+                        mode_name[this_mode], mode_name[new_mode],
+                        mode_name[canon_mode]);
+             continue;
+           }
+
+         rtx reg_op = lowpart_subreg (canon_mode, reg_ops[!i], new_mode);
+         if (!reg_op)
+           {
+             if (dump_file)
+               fprintf (dump_file,
+                        "insn (%d): base not viable, can only adjust mem "
+                        "in %smode, but reg op can't be punned from %smode\n",
+                        insns[i]->uid (),
+                        mode_name[canon_mode], mode_name[new_mode]);
+             continue;
+           }
+
+         if (dump_file)
+           fprintf (dump_file,
+                    "insn (%d): can't adjust mem by %" PRId64
+                    " (%s -> %s) but can in (canonical) %smode\n",
+                    insns[i]->uid (), adjust_amt,
+                    mode_name[this_mode], mode_name[new_mode],
+                    mode_name[canon_mode]);
+       }
+
+      base_cands.quick_push (base_cand {base_defs[i], i});
+    }
+}
+
+// Given two adjacent memory accesses of the same size, I1 and I2, try
+// and see if we can merge them into a ldp or stp.
+static bool
+try_fuse_pair (bool load_p,
+              unsigned access_size,
+              insn_info *i1,
+              insn_info *i2,
+              base_info binfo,
+              bool &emitted_tombstone_p)
+{
+  if (dump_file)
+    fprintf (dump_file, "analyzing pair (load=%d): (%d,%d)\n",
+            load_p, i1->uid (), i2->uid ());
+
+  insn_info *insns[2];
+  bool reversed = false;
+  if (*i1 < *i2)
+    {
+      insns[0] = i1;
+      insns[1] = i2;
+    }
+  else
+    {
+      insns[0] = i2;
+      insns[1] = i1;
+      reversed = true;
+    }
+
+  rtx cand_mems[2];
+  rtx reg_ops[2];
+  for (int i = 0; i < 2; i++)
+    {
+      rtx pat = PATTERN (insns[i]->rtl ());
+      cand_mems[i] = XEXP (pat, load_p);
+      reg_ops[i] = XEXP (pat, !load_p);
+    }
+
+  if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
+    {
+      if (dump_file)
+       fprintf (dump_file,
+                "punting on ldp due to reg conflicts (%d,%d)\n",
+                insns[0]->uid (), insns[1]->uid ());
+      return false;
+    }
+
+  if (cfun->can_throw_non_call_exceptions
+      && insn_could_throw_p (insns[0]->rtl ())
+      && insn_could_throw_p (insns[1]->rtl ()))
+    {
+      if (dump_file)
+       fprintf (dump_file,
+                "can't combine insns with EH side effects (%d,%d)\n",
+                insns[0]->uid (), insns[1]->uid ());
+      return false;
+    }
+
+  auto_vec <base_cand> base_cands;
+  base_cands.reserve (2);
+  if (binfo.is_reg ())
+    // Simple case: both accesses using the same base register def.
+    base_cands.quick_push (binfo.get_reg ());
+  else
+    {
+      get_viable_bases (insns, base_cands, cand_mems,
+                       reg_ops, access_size, reversed);
+      if (base_cands.is_empty ())
+       return false;
+    }
+
+  for (auto use : insns[1]->uses ())
+    if (!use->is_mem () && use->def () && use->def ()->insn () == insns[0])
+      {
+       if (dump_file)
+         fprintf (dump_file, "%d has true dependence on %d, rejecting pair\n",
+                  insns[1]->uid (), insns[0]->uid ());
+       return false;
+      }
+
+  unsigned i = 0;
+  while (i < base_cands.length ())
+    {
+      base_cand &cand = base_cands[i];
+
+      rtx *ignore[2] = {};
+      for (int j = 0; j < 2; j++)
+       if (cand.from_insn == -1 || cand.from_insn == !j)
+         ignore[j] = &XEXP (cand_mems[j], 0);
+
+      insn_info *h = first_hazard_after (insns[0], ignore[0]);
+      if (h && *h <= *insns[1])
+       cand.hazards[0] = h;
+
+      h = latest_hazard_before (insns[1], ignore[1]);
+      if (h && *h >= *insns[0])
+       cand.hazards[1] = h;
+
+      if (!cand.viable ())
+       {
+         if (dump_file)
+           fprintf (dump_file,
+                    "pair (%d,%d): rejecting base %d due to dataflow "
+                    "hazards (%d,%d)\n",
+                    insns[0]->uid (),
+                    insns[1]->uid (),
+                    cand.m_def->regno (),
+                    cand.hazards[0]->uid (),
+                    cand.hazards[1]->uid ());
+
+         base_cands.ordered_remove (i);
+       }
+      else
+       i++;
+    }
+
+  if (base_cands.is_empty ())
+    {
+      if (dump_file)
+       fprintf (dump_file,
+                "can't form pair (%d,%d) due to dataflow hazards\n",
+                insns[0]->uid (), insns[1]->uid ());
+      return false;
+    }
+
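+  // alias_hazards[0]/[1] are the forward/backward store hazards; for
+  // store pairs, [2]/[3] collect the forward/backward load hazards and
+  // are folded back into [0]/[1] below.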
+  insn_info *alias_hazards[4] = {};
+
+  // First def of memory after the first insn, and last def of memory
+  // before the second insn, respectively.
+  def_info *mem_defs[2] = {};
+  if (load_p)
+    {
+      if (!MEM_READONLY_P (cand_mems[0]))
+       {
+         mem_defs[0] = memory_access (insns[0]->uses ())->def ();
+         gcc_checking_assert (mem_defs[0]);
+         mem_defs[0] = mem_defs[0]->next_def ();
+       }
+      if (!MEM_READONLY_P (cand_mems[1]))
+       {
+         mem_defs[1] = memory_access (insns[1]->uses ())->def ();
+         gcc_checking_assert (mem_defs[1]);
+       }
+    }
+  else
+    {
+      mem_defs[0] = memory_access (insns[0]->defs ())->next_def ();
+      mem_defs[1] = memory_access (insns[1]->defs ())->prev_def ();
+      gcc_checking_assert (mem_defs[0]);
+      gcc_checking_assert (mem_defs[1]);
+    }
+
+  store_walker<false> forward_store_walker (mem_defs[0],
+                                           cand_mems[0],
+                                           insns[1]);
+  store_walker<true> backward_store_walker (mem_defs[1],
+                                           cand_mems[1],
+                                           insns[0]);
+  alias_walker *walkers[4] = {};
+  if (mem_defs[0])
+    walkers[0] = &forward_store_walker;
+  if (mem_defs[1])
+    walkers[1] = &backward_store_walker;
+
+  if (load_p && (mem_defs[0] || mem_defs[1]))
+    do_alias_analysis (alias_hazards, walkers, load_p);
+  else
+    {
+      // We want to find any loads hanging off the first store.
+      mem_defs[0] = memory_access (insns[0]->defs ());
+      load_walker<false> forward_load_walker (mem_defs[0], insns[0], insns[1]);
+      load_walker<true> backward_load_walker (mem_defs[1], insns[1], insns[0]);
+      walkers[2] = &forward_load_walker;
+      walkers[3] = &backward_load_walker;
+      do_alias_analysis (alias_hazards, walkers, load_p);
+      // Now consolidate hazards back down.
+      if (alias_hazards[2]
+         && (!alias_hazards[0] || (*alias_hazards[2] < *alias_hazards[0])))
+       alias_hazards[0] = alias_hazards[2];
+
+      if (alias_hazards[3]
+         && (!alias_hazards[1] || (*alias_hazards[3] > *alias_hazards[1])))
+       alias_hazards[1] = alias_hazards[3];
+    }
+
+  if (alias_hazards[0] && alias_hazards[1]
+      && *alias_hazards[0] <= *alias_hazards[1])
+    {
+      if (dump_file)
+       fprintf (dump_file,
+                "cannot form pair (%d,%d) due to alias conflicts (%d,%d)\n",
+                i1->uid (), i2->uid (),
+                alias_hazards[0]->uid (), alias_hazards[1]->uid ());
+      return false;
+    }
+
+  // Now narrow the hazards on each base candidate using
+  // the alias hazards.
+  i = 0;
+  while (i < base_cands.length ())
+    {
+      base_cand &cand = base_cands[i];
+      if (alias_hazards[0] && (!cand.hazards[0]
+                              || *alias_hazards[0] < *cand.hazards[0]))
+       cand.hazards[0] = alias_hazards[0];
+      if (alias_hazards[1] && (!cand.hazards[1]
+                              || *alias_hazards[1] > *cand.hazards[1]))
+       cand.hazards[1] = alias_hazards[1];
+
+      if (cand.viable ())
+       i++;
+      else
+       {
+         if (dump_file)
+           fprintf (dump_file, "pair (%d,%d): rejecting base %d due to "
+                               "alias/dataflow hazards (%d,%d)\n",
+                               insns[0]->uid (), insns[1]->uid (),
+                               cand.m_def->regno (),
+                               cand.hazards[0]->uid (),
+                               cand.hazards[1]->uid ());
+
+         base_cands.ordered_remove (i);
+       }
+    }
+
+  if (base_cands.is_empty ())
+    {
+      if (dump_file)
+       fprintf (dump_file,
+                "cannot form pair (%d,%d) due to alias/dataflow hazards\n",
+                insns[0]->uid (), insns[1]->uid ());
+
+      return false;
+    }
+
+  base_cand *base = &base_cands[0];
+  if (base_cands.length () > 1)
+    {
+      // If there are still multiple viable bases, choose the one that
+      // allows us to reduce register pressure: for loads this means
+      // moving the pair further down, for stores it means moving it
+      // further up.
+      gcc_checking_assert (base_cands.length () == 2);
+      const int hazard_i = !load_p;
+      if (base->hazards[hazard_i])
+       {
+         if (!base_cands[1].hazards[hazard_i])
+           base = &base_cands[1];
+         else if (load_p
+                  && *base_cands[1].hazards[hazard_i]
+                     > *(base->hazards[hazard_i]))
+           base = &base_cands[1];
+         else if (!load_p
+                  && *base_cands[1].hazards[hazard_i]
+                     < *(base->hazards[hazard_i]))
+           base = &base_cands[1];
+       }
+    }
+
+  // At this point any hazards that are present satisfy
+  // hazards[0] > hazards[1], so the pair can be formed anywhere in
+  // (hazards[1], hazards[0]).
+  insn_range_info range (insns[0], insns[1]);
+  if (base->hazards[1])
+    range.first = base->hazards[1];
+  if (base->hazards[0])
+    range.last = base->hazards[0]->prev_nondebug_insn ();
+
+  // Placement strategy: push loads down and pull stores up; this should
+  // help register pressure by reducing live ranges.
+  if (load_p)
+    range.first = range.last;
+  else
+    range.last = range.first;
+
+  if (dump_file)
+    {
+      auto print_hazard = [](insn_info *i)
+       {
+         if (i)
+           fprintf (dump_file, "%d", i->uid ());
+         else
+           fprintf (dump_file, "-");
+       };
+      auto print_pair = [print_hazard](insn_info **i)
+       {
+         print_hazard (i[0]);
+         fprintf (dump_file, ",");
+         print_hazard (i[1]);
+       };
+
+      fprintf (dump_file, "fusing pair [L=%d] (%d,%d), base=%d, hazards: (",
+             load_p, insns[0]->uid (), insns[1]->uid (),
+             base->m_def->regno ());
+      print_pair (base->hazards);
+      fprintf (dump_file, "), move_range: (%d,%d)\n",
+              range.first->uid (), range.last->uid ());
+    }
+
+  return fuse_pair (load_p, i1, i2, *base, range, emitted_tombstone_p);
+}
+
+// Erase [l.begin (), i] inclusive, respecting iterator order.
+static insn_iter_t
+erase_prefix (insn_list_t &l, insn_iter_t i)
+{
+  l.erase (l.begin (), std::next (i));
+  return l.begin ();
+}
+
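+// Erase I from L, returning an iterator to resume iteration from: the
+// successor of I if I is at BEGIN, otherwise its predecessor.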
+static insn_iter_t
+erase_one (insn_list_t &l, insn_iter_t i, insn_iter_t begin)
+{
+  auto prev_or_next = (i == begin) ? std::next (i) : std::prev (i);
+  l.erase (i);
+  return prev_or_next;
+}
+
+static void
+dump_insn_list (FILE *f, const insn_list_t &l)
+{
+  fprintf (f, "(");
+
+  auto i = l.begin ();
+  auto end = l.end ();
+
+  if (i != end)
+    {
+      fprintf (f, "%d", (*i)->uid ());
+      i++;
+    }
+
+  for (; i != end; i++)
+    fprintf (f, ", %d", (*i)->uid ());
+
+  fprintf (f, ")");
+}
+
+DEBUG_FUNCTION void
+debug (const insn_list_t &l)
+{
+  dump_insn_list (stderr, l);
+  fprintf (stderr, "\n");
+}
+
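+// Walk LEFT_LIST and RIGHT_LIST in lockstep (in program order), calling
+// try_fuse_pair on each (left, right) pair that ends up adjacent in the
+// merged order.  Right-hand insns that get fused are recorded in
+// TO_DELETE; fused insns are also erased from both lists so that they
+// are not considered again.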
+void
+merge_pairs (insn_iter_t l_begin,
+            insn_iter_t l_end,
+            insn_iter_t r_begin,
+            insn_iter_t r_end,
+            insn_list_t &left_list,
+            insn_list_t &right_list,
+            hash_set <insn_info *> &to_delete,
+            bool load_p,
+            unsigned access_size,
+            base_info binfo,
+            bool &emitted_tombstone_p)
+{
+  auto iter_l = l_begin;
+  auto iter_r = r_begin;
+
+  bool result;
+  while (l_begin != l_end && r_begin != r_end)
+    {
+      auto next_l = std::next (iter_l);
+      auto next_r = std::next (iter_r);
+      if (**iter_l < **iter_r
+         && next_l != l_end
+         && **next_l < **iter_r)
+       {
+         iter_l = next_l;
+         continue;
+       }
+      else if (**iter_r < **iter_l
+              && next_r != r_end
+              && **next_r < **iter_l)
+       {
+         iter_r = next_r;
+         continue;
+       }
+
+      bool update_l = false;
+      bool update_r = false;
+
+      result = try_fuse_pair (load_p, access_size,
+                             *iter_l, *iter_r, binfo,
+                             emitted_tombstone_p);
+      if (result)
+       {
+         update_l = update_r = true;
+         if (to_delete.add (*iter_r))
+           gcc_unreachable (); // Shouldn't get added twice.
+
+         iter_l = erase_one (left_list, iter_l, l_begin);
+         iter_r = erase_one (right_list, iter_r, r_begin);
+       }
+      else
+       {
+         // Here we know that the entire prefix we skipped
+         // over cannot merge with anything further on
+         // in iteration order (there are aliasing hazards
+         // on both sides), so delete the entire prefix.
+         if (**iter_l < **iter_r)
+           {
+             // Delete everything from l_begin to iter_l, inclusive.
+             update_l = true;
+             iter_l = erase_prefix (left_list, iter_l);
+           }
+         else
+           {
+             // Delete everything from r_begin to iter_r, inclusive.
+             update_r = true;
+             iter_r = erase_prefix (right_list, iter_r);
+           }
+       }
+
+      if (update_l)
+       {
+         l_begin = left_list.begin ();
+         l_end = left_list.end ();
+       }
+      if (update_r)
+       {
+         r_begin = right_list.begin ();
+         r_end = right_list.end ();
+       }
+    }
+}
+
+// Given a list of insns LEFT_ORIG with all accesses adjacent to
+// those in RIGHT_ORIG, try and form them into pairs.
+//
+// Return true iff we formed all the RIGHT_ORIG candidates into
+// pairs.
+bool
+ldp_bb_info::try_form_pairs (insn_list_t *left_orig,
+                            insn_list_t *right_orig,
+                            bool load_p, unsigned access_size,
+                            base_info binfo)
+{
+  // Make a copy of the right list which we can modify to
+  // exclude candidates locally for this invocation.
+  insn_list_t right_copy (*right_orig);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "try_form_pairs [L=%d], cand vecs ", load_p);
+      dump_insn_list (dump_file, *left_orig);
+      fprintf (dump_file, " x ");
+      dump_insn_list (dump_file, right_copy);
+      fprintf (dump_file, "\n");
+    }
+
+  // Set of candidate insns to delete from the original right_list
+  // (because they were formed into a pair).
+  hash_set <insn_info *> to_delete;
+
+  // Now that we have a 2D matrix of candidates, traverse it to try and
+  // find a pair of insns that are already adjacent (within the
+  // merged list of accesses).
+  merge_pairs (left_orig->begin (), left_orig->end (),
+              right_copy.begin (), right_copy.end (),
+              *left_orig, right_copy,
+              to_delete, load_p, access_size, binfo,
+              m_emitted_tombstone);
+
+  // If we formed all right candidates into pairs,
+  // then we can skip the next iteration.
+  if (to_delete.elements () == right_orig->size ())
+    return true;
+
+  // Delete items from to_delete.
+  auto right_iter = right_orig->begin ();
+  auto right_end = right_orig->end ();
+  while (right_iter != right_end)
+    {
+      auto right_next = std::next (right_iter);
+
+      if (to_delete.contains (*right_iter))
+       {
+         right_orig->erase (right_iter);
+         right_end = right_orig->end ();
+       }
+
+      right_iter = right_next;
+    }
+
+  return false;
+}
+
+void
+ldp_bb_info::transform_for_base (base_info binfo,
+                                int encoded_lfs,
+                                access_group &group)
+{
+  const auto lfs = decode_lfs (encoded_lfs);
+  const unsigned access_size = lfs.size;
+
+  bool skip_next = true;
+  access_record *prev_access = nullptr;
+
+  for (auto &access : group.list)
+    {
+      if (skip_next)
+       skip_next = false;
+      else if (known_eq (access.offset, prev_access->offset + access_size))
+       skip_next = try_form_pairs (&prev_access->cand_insns,
+                                   &access.cand_insns,
+                                   lfs.load_p, access_size, binfo);
+
+      prev_access = &access;
+    }
+}
+
+void
+ldp_bb_info::cleanup_tombstones ()
+{
+  // No need to do anything if we didn't emit a tombstone insn for this bb.
+  if (!m_emitted_tombstone)
+    return;
+
+  insn_info *insn = m_bb->head_insn ();
+  while (insn)
+    {
+      insn_info *next = insn->next_nondebug_insn ();
+      if (!insn->is_real () || !tombstone_insn_p (insn))
+       {
+         insn = next;
+         continue;
+       }
+
+      auto def = memory_access (insn->defs ());
+      auto set = dyn_cast <set_info *> (def);
+      if (set && set->has_any_uses ())
+       {
+         def_info *prev_def = def->prev_def ();
+         auto prev_set = dyn_cast <set_info *> (prev_def);
+         if (!prev_set)
+           gcc_unreachable (); // TODO: handle this if needed.
+
+         while (set->first_use ())
+           crtl->ssa->reparent_use (set->first_use (), prev_set);
+       }
+
+      // Now set has no uses, we can delete it.
+      insn_change change (insn, insn_change::DELETE);
+      crtl->ssa->change_insn (change);
+      insn = next;
+    }
+}
+
+template<typename Map>
+void
+ldp_bb_info::traverse_base_map (Map &map)
+{
+  for (auto kv : map)
+    {
+      const auto &key = kv.first;
+      auto &value = kv.second;
+      const base_info binfo (key.first);
+      transform_for_base (binfo, key.second, value);
+    }
+}
+
+void
+ldp_bb_info::transform ()
+{
+  traverse_base_map (expr_map);
+  traverse_base_map (def_map);
+}
+
+static void
+ldp_fusion_init ()
+{
+  calculate_dominance_info (CDI_DOMINATORS);
+  df_analyze ();
+  crtl->ssa = new rtl_ssa::function_info (cfun);
+}
+
+static void
+ldp_fusion_destroy ()
+{
+  if (crtl->ssa->perform_pending_updates ())
+    cleanup_cfg (0);
+
+  free_dominance_info (CDI_DOMINATORS);
+
+  delete crtl->ssa;
+  crtl->ssa = nullptr;
+}
+
+void
+ldp_fusion_bb (bb_info *bb)
+{
+  const bool track_loads
+    = aarch64_tune_params.ldp_policy_model != AARCH64_LDP_STP_POLICY_NEVER;
+  const bool track_stores
+    = aarch64_tune_params.stp_policy_model != AARCH64_LDP_STP_POLICY_NEVER;
+
+  ldp_bb_info bb_state (bb);
+
+  for (auto insn : bb->nondebug_insns ())
+    {
+      rtx_insn *rti = insn->rtl ();
+
+      if (!rti || !INSN_P (rti))
+       continue;
+
+      rtx pat = PATTERN (rti);
+
+      if (GET_CODE (pat) != SET)
+       continue;
+
+      if (track_stores && MEM_P (XEXP (pat, 0)))
+       bb_state.track_access (insn, false, XEXP (pat, 0));
+      else if (track_loads && MEM_P (XEXP (pat, 1)))
+       bb_state.track_access (insn, true, XEXP (pat, 1));
+    }
+
+  bb_state.transform ();
+  bb_state.cleanup_tombstones ();
+}
+
+void
+ldp_fusion ()
+{
+  ldp_fusion_init ();
+
+  for (auto bb : crtl->ssa->bbs ())
+    ldp_fusion_bb (bb);
+
+  ldp_fusion_destroy ();
+}
+
+namespace {
+
+const pass_data pass_data_ldp_fusion =
+{
+  RTL_PASS, /* type */
+  "ldp_fusion", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_ldp_fusion : public rtl_opt_pass
+{
+public:
+  pass_ldp_fusion (gcc::context *ctx)
+    : rtl_opt_pass (pass_data_ldp_fusion, ctx)
+    {}
+
+  opt_pass *clone () override { return new pass_ldp_fusion (m_ctxt); }
+
+  bool gate (function *) final override
+    {
+      if (!optimize || optimize_debug)
+       return false;
+
+      // If the tuning policy says never to form ldps or stps, don't run
+      // the pass.
+      if ((aarch64_tune_params.ldp_policy_model
+          == AARCH64_LDP_STP_POLICY_NEVER)
+         && (aarch64_tune_params.stp_policy_model
+             == AARCH64_LDP_STP_POLICY_NEVER))
+       return false;
+
+      if (reload_completed)
+       return flag_aarch64_late_ldp_fusion;
+      else
+       return flag_aarch64_early_ldp_fusion;
+    }
+
+  unsigned execute (function *) final override
+    {
+      ldp_fusion ();
+      return 0;
+    }
+};
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_ldp_fusion (gcc::context *ctx)
+{
+  return new pass_ldp_fusion (ctx);
+}
+
+#include "gt-aarch64-ldp-fusion.h"
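
To give a concrete (hand-written, not part of the patch) picture of the
kind of code the pass targets, consider:

  void
  copy_two (long *dst, const long *src)
  {
    dst[0] = src[0];
    dst[1] = src[1];
  }

The two loads share a base register and their offsets differ by exactly
the access size, and likewise for the two stores, so with the pre-RA
pass enabled at -O2 one would hope to see a single ldp feeding a single
stp instead of two separate ldr/str sequences (assuming the dataflow
and alias checks above find no hazards).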
diff --git a/gcc/config/aarch64/aarch64-passes.def b/gcc/config/aarch64/aarch64-passes.def
index 6ace797b738..f38c642414e 100644
--- a/gcc/config/aarch64/aarch64-passes.def
+++ b/gcc/config/aarch64/aarch64-passes.def
@@ -23,3 +23,5 @@ INSERT_PASS_BEFORE (pass_reorder_blocks, 1, pass_track_speculation);
 INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance);
 INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti);
 INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion);
+INSERT_PASS_BEFORE (pass_early_remat, 1, pass_ldp_fusion);
+INSERT_PASS_BEFORE (pass_peephole2, 1, pass_ldp_fusion);
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 60a55f4bc19..1ccc516622b 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1049,6 +1049,7 @@ rtl_opt_pass *make_pass_track_speculation (gcc::context *);
 rtl_opt_pass *make_pass_tag_collision_avoidance (gcc::context *);
 rtl_opt_pass *make_pass_insert_bti (gcc::context *ctxt);
 rtl_opt_pass *make_pass_cc_fusion (gcc::context *ctxt);
+rtl_opt_pass *make_pass_ldp_fusion (gcc::context *);
 
 poly_uint64 aarch64_regmode_natural_size (machine_mode);
 
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index f5a518202a1..5d63b2ec8ac 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -271,6 +271,16 @@ mtrack-speculation
 Target Var(aarch64_track_speculation)
 Generate code to track when the CPU might be speculating incorrectly.
 
+mearly-ldp-fusion
+Target Var(flag_aarch64_early_ldp_fusion) Optimization Init(1)
+Enable the pre-RA AArch64-specific pass to fuse loads and stores into
+ldp and stp instructions.
+
+mlate-ldp-fusion
+Target Var(flag_aarch64_late_ldp_fusion) Optimization Init(1)
+Enable the post-RA AArch64-specific pass to fuse loads and stores into
+ldp and stp instructions.
+
 mstack-protector-guard=
Target RejectNegative Joined Enum(stack_protector_guard) Var(aarch64_stack_protector_guard) Init(SSP_GLOBAL)
 Use given stack-protector guard.
@@ -360,3 +370,13 @@ Enum(aarch64_ldp_stp_policy) String(never) Value(AARCH64_LDP_STP_POLICY_NEVER)
 
 EnumValue
Enum(aarch64_ldp_stp_policy) String(aligned) Value(AARCH64_LDP_STP_POLICY_ALIGNED)
+
+-param=aarch64-ldp-alias-check-limit=
+Target Joined UInteger Var(aarch64_ldp_alias_check_limit) Init(8) IntegerRange(0, 65536) Param
+Limit on number of alias checks performed when attempting to form an ldp/stp.
+
+-param=aarch64-ldp-canonicalize-modes=
+Target Joined UInteger Var(aarch64_ldp_canonicalize_modes) Init(0) IntegerRange(0,1) Param
+Debugging param to change the strategy for adjusting modes when forming load
+and store pairs.  When set to 0, we only ensure modes agree on VECTOR_MODE_P.
+When set to 1, we canonicalize to a single mode for a given size.
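
As a usage note (not part of the patch itself): both copies of the pass
are enabled by default, and the new options take the standard spellings,
so something like

  gcc -O2 -mno-early-ldp-fusion --param=aarch64-ldp-alias-check-limit=4 t.c

should disable just the pre-RA pass and lower the alias-check budget
from its default of 8, while leaving the post-RA pass enabled.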
diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64
index a9a244ab6d6..37917344a54 100644
--- a/gcc/config/aarch64/t-aarch64
+++ b/gcc/config/aarch64/t-aarch64
@@ -176,6 +176,13 @@ aarch64-cc-fusion.o: $(srcdir)/config/aarch64/aarch64-cc-fusion.cc \
        $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
                $(srcdir)/config/aarch64/aarch64-cc-fusion.cc
 
+aarch64-ldp-fusion.o: $(srcdir)/config/aarch64/aarch64-ldp-fusion.cc \
+    $(CONFIG_H) $(SYSTEM_H) $(CORETYPES_H) $(BACKEND_H) $(RTL_H) $(DF_H) \
+    $(RTL_SSA_H) cfgcleanup.h tree-pass.h ordered-hash-map.h tree-dfa.h \
+    fold-const.h tree-hash-traits.h print-tree.h
+       $(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+               $(srcdir)/config/aarch64/aarch64-ldp-fusion.cc
+
 comma=,
MULTILIB_OPTIONS    = $(subst $(comma),/, $(patsubst %, mabi=%, $(subst $(comma),$(comma)mabi=,$(TM_MULTILIB_CONFIG))))
 MULTILIB_DIRNAMES   = $(subst $(comma), ,$(TM_MULTILIB_CONFIG))
