Hello Richard: On 23/02/24 1:19 am, Richard Sandiford wrote: > Ajit Agarwal <aagar...@linux.ibm.com> writes: >> Hello Alex/Richard: >> >> I have placed target independent and target dependent code in >> aarch64-ldp-fusion for load store fusion. >> >> Common infrastructure of load store pair fusion is divided into >> target independent and target dependent code. >> >> Target independent code is the generic code, with pure virtual >> functions to interface between target independent and dependent >> code. >> >> Target dependent code is the implementation of the pure virtual >> functions for the aarch64 target and the call to target independent >> code. > > Thanks for the update. This is still quite hard to review though. > Sorry to ask for another round, but could you split it up further? > The ideal thing would be if patches that move code do nothing other > than move code, and if patches that change code do those changes > in-place. >
As per your suggestion, I have submitted a new patch with the above changes. Sorry for the inconvenience caused. Thanks & Regards Ajit > Richard > >> >> Bootstrapped on aarch64-linux-gnu. >> >> Thanks & Regards >> Ajit >> >> >> aarch64: Place target independent and dependent code in one file. >> >> Common infrastructure of load store pair fusion is divided into >> target independent and target dependent code. >> >> Target independent code is the generic code, with pure virtual >> functions to interface between target independent and dependent >> code. >> >> Target dependent code is the implementation of the pure virtual >> functions for the aarch64 target and the call to target independent >> code. >> >> 2024-02-15 Ajit Kumar Agarwal <aagar...@linux.ibm.com> >> >> gcc/ChangeLog: >> >> * config/aarch64/aarch64-ldp-fusion.cc: Place target >> independent and dependent code. >> --- >> gcc/config/aarch64/aarch64-ldp-fusion.cc | 3513 ++++++++++++---------- >> 1 file changed, 1842 insertions(+), 1671 deletions(-) >> >> diff --git a/gcc/config/aarch64/aarch64-ldp-fusion.cc >> b/gcc/config/aarch64/aarch64-ldp-fusion.cc >> index 22ed95eb743..0ab842e2bbb 100644 >> --- a/gcc/config/aarch64/aarch64-ldp-fusion.cc >> +++ b/gcc/config/aarch64/aarch64-ldp-fusion.cc >> @@ -17,6 +17,7 @@ >> // along with GCC; see the file COPYING3. If not see >> // <http://www.gnu.org/licenses/>. >> >> + >> #define INCLUDE_ALGORITHM >> #define INCLUDE_FUNCTIONAL >> #define INCLUDE_LIST >> @@ -37,13 +38,12 @@ >> #include "tree-hash-traits.h" >> #include "print-tree.h" >> #include "insn-attr.h" >> - >> using namespace rtl_ssa; >> >> -static constexpr HOST_WIDE_INT LDP_IMM_BITS = 7; >> -static constexpr HOST_WIDE_INT LDP_IMM_SIGN_BIT = (1 << (LDP_IMM_BITS - 1)); >> -static constexpr HOST_WIDE_INT LDP_MAX_IMM = LDP_IMM_SIGN_BIT - 1; >> -static constexpr HOST_WIDE_INT LDP_MIN_IMM = -LDP_MAX_IMM - 1; >> +static constexpr HOST_WIDE_INT PAIR_MEM_IMM_BITS = 7; >> +static constexpr HOST_WIDE_INT PAIR_MEM_IMM_SIGN_BIT = (1 << >> (PAIR_MEM_IMM_BITS - 1)); >> +static constexpr HOST_WIDE_INT PAIR_MEM_MAX_IMM = PAIR_MEM_IMM_SIGN_BIT - 1; >> +static constexpr HOST_WIDE_INT PAIR_MEM_MIN_IMM = -PAIR_MEM_MAX_IMM - 1; >> >> // We pack these fields (load_p, fpsimd_p, and size) into an integer >> // (LFS) which we use as part of the key into the main hash tables. >> @@ -138,8 +138,144 @@ struct alt_base >> poly_int64 offset; >> }; >> >> +// Class that implements a state machine for building the changes needed to >> form >> +// a store pair instruction. This allows us to easily build the changes in >> +// program order, as required by rtl-ssa. >> +struct stp_change_builder >> +{ >> + enum class state >> + { >> + FIRST, >> + INSERT, >> + FIXUP_USE, >> + LAST, >> + DONE >> + }; >> + >> + enum class action >> + { >> + TOMBSTONE, >> + CHANGE, >> + INSERT, >> + FIXUP_USE >> + }; >> + >> + struct change >> + { >> + action type; >> + insn_info *insn; >> + }; >> + >> + bool done () const { return m_state == state::DONE; } >> + >> + stp_change_builder (insn_info *insns[2], >> + insn_info *repurpose, >> + insn_info *dest) >> + : m_state (state::FIRST), m_insns { insns[0], insns[1] }, >> + m_repurpose (repurpose), m_dest (dest), m_use (nullptr) {} >> + >> + change get_change () const >> + { >> + switch (m_state) >> + { >> + case state::FIRST: >> + return { >> + m_insns[0] == m_repurpose ? action::CHANGE : action::TOMBSTONE, >> + m_insns[0] >> + }; >> + case state::LAST: >> + return { >> + m_insns[1] == m_repurpose ?
action::CHANGE : action::TOMBSTONE, >> + m_insns[1] >> + }; >> + case state::INSERT: >> + return { action::INSERT, m_dest }; >> + case state::FIXUP_USE: >> + return { action::FIXUP_USE, m_use->insn () }; >> + case state::DONE: >> + break; >> + } >> + >> + gcc_unreachable (); >> + } >> + >> + // Transition to the next state. >> + void advance () >> + { >> + switch (m_state) >> + { >> + case state::FIRST: >> + if (m_repurpose) >> + m_state = state::LAST; >> + else >> + m_state = state::INSERT; >> + break; >> + case state::INSERT: >> + { >> + def_info *def = memory_access (m_insns[0]->defs ()); >> + while (*def->next_def ()->insn () <= *m_dest) >> + def = def->next_def (); >> + >> + // Now we know DEF feeds the insertion point for the new stp. >> + // Look for any uses of DEF that will consume the new stp. >> + gcc_assert (*def->insn () <= *m_dest >> + && *def->next_def ()->insn () > *m_dest); >> + >> + auto set = as_a<set_info *> (def); >> + for (auto use : set->nondebug_insn_uses ()) >> + if (*use->insn () > *m_dest) >> + { >> + m_use = use; >> + break; >> + } >> + >> + if (m_use) >> + m_state = state::FIXUP_USE; >> + else >> + m_state = state::LAST; >> + break; >> + } >> + case state::FIXUP_USE: >> + m_use = m_use->next_nondebug_insn_use (); >> + if (!m_use) >> + m_state = state::LAST; >> + break; >> + case state::LAST: >> + m_state = state::DONE; >> + break; >> + case state::DONE: >> + gcc_unreachable (); >> + } >> + } >> + >> +private: >> + state m_state; >> + >> + // Original candidate stores. >> + insn_info *m_insns[2]; >> + >> + // If non-null, this is a candidate insn to change into an stp. >> Otherwise we >> + // are deleting both original insns and inserting a new insn for the stp. >> + insn_info *m_repurpose; >> + >> + // Destionation of the stp, it will be placed immediately after m_dest. >> + insn_info *m_dest; >> + >> + // Current nondebug use that needs updating due to stp insertion. >> + use_info *m_use; >> +}; >> + >> +// Virtual base class for load/store walkers used in alias analysis. >> +struct alias_walker >> +{ >> + virtual bool conflict_p (int &budget) const = 0; >> + virtual insn_info *insn () const = 0; >> + virtual bool valid () const = 0; >> + virtual void advance () = 0; >> +}; >> + >> // State used by the pass for a given basic block. 
>> -struct ldp_bb_info >> +struct pair_fusion >> { >> using def_hash = nofree_ptr_hash<def_info>; >> using expr_key_t = pair_hash<tree_operand_hash, int_hash<int, -1, -2>>; >> @@ -161,13 +297,13 @@ struct ldp_bb_info >> static const size_t obstack_alignment = sizeof (void *); >> bb_info *m_bb; >> >> - ldp_bb_info (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false) >> + pair_fusion (bb_info *bb) : m_bb (bb), m_emitted_tombstone (false) >> { >> obstack_specify_allocation (&m_obstack, OBSTACK_CHUNK_SIZE, >> obstack_alignment, obstack_chunk_alloc, >> obstack_chunk_free); >> } >> - ~ldp_bb_info () >> + ~pair_fusion () >> { >> obstack_free (&m_obstack, nullptr); >> >> @@ -177,10 +313,50 @@ struct ldp_bb_info >> bitmap_obstack_release (&m_bitmap_obstack); >> } >> } >> + void track_access (insn_info *, bool load, rtx mem); >> + void transform (); >> + void cleanup_tombstones (); >> + virtual void set_multiword_subreg (insn_info *i1, insn_info *i2, >> + bool load_p) = 0; >> + virtual rtx gen_load_store_pair (rtx *pats, rtx writeback, >> + bool load_p) = 0; >> + void merge_pairs (insn_list_t &, insn_list_t &, >> + bool load_p, unsigned access_size); >> + virtual void transform_for_base (int load_size, access_group &group) = 0; >> + >> + bool try_fuse_pair (bool load_p, unsigned access_size, >> + insn_info *i1, insn_info *i2); >> + >> + bool fuse_pair (bool load_p, unsigned access_size, >> + int writeback, >> + insn_info *i1, insn_info *i2, >> + base_cand &base, >> + const insn_range_info &move_range); >> + >> + void do_alias_analysis (insn_info *alias_hazards[4], >> + alias_walker *walkers[4], >> + bool load_p); >> + >> + void track_tombstone (int uid); >> + >> + bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs); >> >> - inline void track_access (insn_info *, bool load, rtx mem); >> - inline void transform (); >> - inline void cleanup_tombstones (); >> + virtual bool is_fpsimd_op_p (rtx reg_op, machine_mode mem_mode, >> + bool load_p) = 0; >> + >> + virtual bool pair_operand_mode_ok_p (machine_mode mode) = 0; >> + virtual bool pair_trailing_writeback_p () = 0; >> + virtual bool pair_check_register_operand (bool load_p, rtx reg_op, >> + machine_mode mem_mode) = 0; >> + virtual int pair_mem_alias_check_limit () = 0; >> + virtual bool pair_is_writeback () = 0 ; >> + virtual bool pair_mem_ok_policy (rtx first_mem, bool load_p, >> + machine_mode mode) = 0; >> + virtual bool fuseable_store_p (insn_info *i1, insn_info *i2) = 0; >> + virtual bool fuseable_load_p (insn_info *info) = 0; >> + >> + template<typename Map> >> + void traverse_base_map (Map &map); >> >> private: >> obstack m_obstack; >> @@ -191,100 +367,292 @@ private: >> bool m_emitted_tombstone; >> >> inline splay_tree_node<access_record *> *node_alloc (access_record *); >> - >> - template<typename Map> >> - inline void traverse_base_map (Map &map); >> - inline void transform_for_base (int load_size, access_group &group); >> - >> - inline void merge_pairs (insn_list_t &, insn_list_t &, >> - bool load_p, unsigned access_size); >> - >> - inline bool try_fuse_pair (bool load_p, unsigned access_size, >> - insn_info *i1, insn_info *i2); >> - >> - inline bool fuse_pair (bool load_p, unsigned access_size, >> - int writeback, >> - insn_info *i1, insn_info *i2, >> - base_cand &base, >> - const insn_range_info &move_range); >> - >> - inline void track_tombstone (int uid); >> - >> - inline bool track_via_mem_expr (insn_info *, rtx mem, lfs_fields lfs); >> }; >> - >> -splay_tree_node<access_record *> * >> -ldp_bb_info::node_alloc 
(access_record *access) >> -{ >> - using T = splay_tree_node<access_record *>; >> - void *addr = obstack_alloc (&m_obstack, sizeof (T)); >> - return new (addr) T (access); >> -} >> - >> -// Given a mem MEM, if the address has side effects, return a MEM that >> accesses >> -// the same address but without the side effects. Otherwise, return >> -// MEM unchanged. >> -static rtx >> -drop_writeback (rtx mem) >> +// Track the access INSN at offset OFFSET in this access group. >> +// ALLOC_NODE is used to allocate splay tree nodes. >> +template<typename Alloc> >> +void >> +access_group::track (Alloc alloc_node, poly_int64 offset, insn_info *insn) >> { >> - rtx addr = XEXP (mem, 0); >> + auto insert_before = [&](std::list<access_record>::iterator after) >> + { >> + auto it = list.emplace (after, offset); >> + it->cand_insns.push_back (insn); >> + it->place = it; >> + return &*it; >> + }; >> >> - if (!side_effects_p (addr)) >> - return mem; >> + if (!list.size ()) >> + { >> + auto access = insert_before (list.end ()); >> + tree.insert_max_node (alloc_node (access)); >> + return; >> + } >> >> - switch (GET_CODE (addr)) >> + auto compare = [&](splay_tree_node<access_record *> *node) >> { >> - case PRE_MODIFY: >> - addr = XEXP (addr, 1); >> - break; >> - case POST_MODIFY: >> - case POST_INC: >> - case POST_DEC: >> - addr = XEXP (addr, 0); >> - break; >> - case PRE_INC: >> - case PRE_DEC: >> + return compare_sizes_for_sort (offset, node->value ()->offset); >> + }; >> + auto result = tree.lookup (compare); >> + splay_tree_node<access_record *> *node = tree.root (); >> + if (result == 0) >> + node->value ()->cand_insns.push_back (insn); >> + else >> { >> - poly_int64 adjustment = GET_MODE_SIZE (GET_MODE (mem)); >> - if (GET_CODE (addr) == PRE_DEC) >> - adjustment *= -1; >> - addr = plus_constant (GET_MODE (addr), XEXP (addr, 0), adjustment); >> - break; >> - } >> - default: >> - gcc_unreachable (); >> + auto it = node->value ()->place; >> + auto after = (result > 0) ? std::next (it) : it; >> + auto access = insert_before (after); >> + tree.insert_child (node, result > 0, alloc_node (access)); >> } >> - >> - return change_address (mem, GET_MODE (mem), addr); >> } >> >> -// Convenience wrapper around strip_offset that can also look through >> -// RTX_AUTOINC addresses. The interface is like strip_offset except we >> take a >> -// MEM so that we know the mode of the access. >> -static rtx >> -ldp_strip_offset (rtx mem, poly_int64 *offset) >> +bool >> +store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget); >> +bool load_modified_by_store_p (insn_info *load, >> + insn_info *store, >> + int &budget); >> + >> +// Implement some common functionality used by both store_walker >> +// and load_walker. 
>> +template<bool reverse> >> +class def_walker : public alias_walker >> { >> - rtx addr = XEXP (mem, 0); >> +protected: >> + using def_iter_t = typename std::conditional<reverse, >> + reverse_def_iterator, def_iterator>::type; >> >> - switch (GET_CODE (addr)) >> - { >> - case PRE_MODIFY: >> - case POST_MODIFY: >> - addr = strip_offset (XEXP (addr, 1), offset); >> - gcc_checking_assert (REG_P (addr)); >> - gcc_checking_assert (rtx_equal_p (XEXP (XEXP (mem, 0), 0), addr)); >> - break; >> - case PRE_INC: >> - case POST_INC: >> - addr = XEXP (addr, 0); >> - *offset = GET_MODE_SIZE (GET_MODE (mem)); >> - gcc_checking_assert (REG_P (addr)); >> - break; >> - case PRE_DEC: >> - case POST_DEC: >> - addr = XEXP (addr, 0); >> - *offset = -GET_MODE_SIZE (GET_MODE (mem)); >> - gcc_checking_assert (REG_P (addr)); >> + static use_info *start_use_chain (def_iter_t &def_iter) >> + { >> + set_info *set = nullptr; >> + for (; *def_iter; def_iter++) >> + { >> + set = dyn_cast<set_info *> (*def_iter); >> + if (!set) >> + continue; >> + >> + use_info *use = reverse >> + ? set->last_nondebug_insn_use () >> + : set->first_nondebug_insn_use (); >> + >> + if (use) >> + return use; >> + } >> + >> + return nullptr; >> + } >> + >> + def_iter_t def_iter; >> + insn_info *limit; >> + def_walker (def_info *def, insn_info *limit) : >> + def_iter (def), limit (limit) {} >> + >> + virtual bool iter_valid () const { return *def_iter; } >> + >> +public: >> + insn_info *insn () const override { return (*def_iter)->insn (); } >> + void advance () override { def_iter++; } >> + bool valid () const override final >> + { >> + if (!iter_valid ()) >> + return false; >> + >> + if (reverse) >> + return *(insn ()) > *limit; >> + else >> + return *(insn ()) < *limit; >> + } >> +}; >> + >> +// alias_walker that iterates over stores. >> +template<bool reverse, typename InsnPredicate> >> +class store_walker : public def_walker<reverse> >> +{ >> + rtx cand_mem; >> + InsnPredicate tombstone_p; >> + >> +public: >> + store_walker (def_info *mem_def, rtx mem, insn_info *limit_insn, >> + InsnPredicate tombstone_fn) : >> + def_walker<reverse> (mem_def, limit_insn), >> + cand_mem (mem), tombstone_p (tombstone_fn) {} >> + bool conflict_p (int &budget) const override final >> + { >> + if (tombstone_p (this->insn ())) >> + return false; >> + >> + return store_modifies_mem_p (cand_mem, this->insn (), budget); >> + } >> +}; >> + >> +// alias_walker that iterates over loads. 
>> +template<bool reverse> >> +class load_walker : public def_walker<reverse> >> +{ >> + using Base = def_walker<reverse>; >> + using use_iter_t = typename std::conditional<reverse, >> + reverse_use_iterator, nondebug_insn_use_iterator>::type; >> + >> + use_iter_t use_iter; >> + insn_info *cand_store; >> + >> + bool iter_valid () const override final { return *use_iter; } >> + >> +public: >> + void advance () override final >> + { >> + use_iter++; >> + if (*use_iter) >> + return; >> + this->def_iter++; >> + use_iter = Base::start_use_chain (this->def_iter); >> + } >> + >> + insn_info *insn () const override final >> + { >> + return (*use_iter)->insn (); >> + } >> + bool conflict_p (int &budget) const override final >> + { >> + return load_modified_by_store_p (insn (), cand_store, budget); >> + } >> + load_walker (def_info *def, insn_info *store, insn_info *limit_insn) >> + : Base (def, limit_insn), >> + use_iter (Base::start_use_chain (this->def_iter)), >> + cand_store (store) {} >> +}; >> + >> +extern insn_info * >> +try_repurpose_store (insn_info *first, >> + insn_info *second, >> + const insn_range_info &move_range); >> + >> +void reset_debug_use (use_info *use); >> + >> +extern void >> +fixup_debug_uses (obstack_watermark &attempt, >> + insn_info *insns[2], >> + rtx orig_rtl[2], >> + insn_info *pair_dst, >> + insn_info *trailing_add, >> + bool load_p, >> + int writeback, >> + rtx writeback_effect, >> + unsigned base_regno); >> + >> +void >> +fixup_debug_uses_trailing_add (obstack_watermark &attempt, >> + insn_info *pair_dst, >> + insn_info *trailing_add, >> + rtx writeback_effect); >> + >> + >> +extern void >> +fixup_debug_use (obstack_watermark &attempt, >> + use_info *use, >> + def_info *def, >> + rtx base, >> + poly_int64 wb_offset); >> + >> +extern insn_info * >> +find_trailing_add (insn_info *insns[2], >> + const insn_range_info &pair_range, >> + int initial_writeback, >> + rtx *writeback_effect, >> + def_info **add_def, >> + def_info *base_def, >> + poly_int64 initial_offset, >> + unsigned access_size); >> + >> +rtx drop_writeback (rtx mem); >> +rtx pair_mem_strip_offset (rtx mem, poly_int64 *offset); >> +bool any_pre_modify_p (rtx x); >> +bool any_post_modify_p (rtx x); >> +int encode_lfs (lfs_fields fields); >> +extern insn_info * latest_hazard_before (insn_info *insn, rtx *ignore, >> + insn_info *ignore_insn = nullptr); >> +insn_info * first_hazard_after (insn_info *insn, rtx *ignore); >> +bool ranges_overlap_p (const insn_range_info &r1, const insn_range_info >> &r2); >> +insn_range_info get_def_range (def_info *def); >> +insn_range_info def_downwards_move_range (def_info *def); >> +insn_range_info def_upwards_move_range (def_info *def); >> +rtx gen_tombstone (void); >> +rtx filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr); >> +rtx combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p); >> +rtx extract_writebacks (bool load_p, rtx pats[2], int changed); >> +void do_alias_analysis (insn_info *alias_hazards[4], >> + alias_walker *walkers[4], >> + bool load_p); >> +int get_viable_bases (insn_info *insns[2], >> + vec<base_cand> &base_cands, >> + rtx cand_mems[2], >> + unsigned access_size, >> + bool reversed); >> +void dump_insn_list (FILE *f, const insn_list_t &l); >> + >> +// Given a mem MEM, if the address has side effects, return a MEM that >> accesses >> +// the same address but without the side effects. Otherwise, return >> +// MEM unchanged. 
>> +rtx >> +drop_writeback (rtx mem) >> +{ >> + rtx addr = XEXP (mem, 0); >> + >> + if (!side_effects_p (addr)) >> + return mem; >> + >> + switch (GET_CODE (addr)) >> + { >> + case PRE_MODIFY: >> + addr = XEXP (addr, 1); >> + break; >> + case POST_MODIFY: >> + case POST_INC: >> + case POST_DEC: >> + addr = XEXP (addr, 0); >> + break; >> + case PRE_INC: >> + case PRE_DEC: >> + { >> + poly_int64 adjustment = GET_MODE_SIZE (GET_MODE (mem)); >> + if (GET_CODE (addr) == PRE_DEC) >> + adjustment *= -1; >> + addr = plus_constant (GET_MODE (addr), XEXP (addr, 0), adjustment); >> + break; >> + } >> + default: >> + gcc_unreachable (); >> + } >> + >> + return change_address (mem, GET_MODE (mem), addr); >> +} >> + >> +// Convenience wrapper around strip_offset that can also look through >> +// RTX_AUTOINC addresses. The interface is like strip_offset except we >> take a >> +// MEM so that we know the mode of the access. >> +rtx >> +pair_mem_strip_offset (rtx mem, poly_int64 *offset) >> +{ >> + rtx addr = XEXP (mem, 0); >> + >> + switch (GET_CODE (addr)) >> + { >> + case PRE_MODIFY: >> + case POST_MODIFY: >> + addr = strip_offset (XEXP (addr, 1), offset); >> + gcc_checking_assert (REG_P (addr)); >> + gcc_checking_assert (rtx_equal_p (XEXP (XEXP (mem, 0), 0), addr)); >> + break; >> + case PRE_INC: >> + case POST_INC: >> + addr = XEXP (addr, 0); >> + *offset = GET_MODE_SIZE (GET_MODE (mem)); >> + gcc_checking_assert (REG_P (addr)); >> + break; >> + case PRE_DEC: >> + case POST_DEC: >> + addr = XEXP (addr, 0); >> + *offset = -GET_MODE_SIZE (GET_MODE (mem)); >> + gcc_checking_assert (REG_P (addr)); >> break; >> >> default: >> @@ -295,7 +663,7 @@ ldp_strip_offset (rtx mem, poly_int64 *offset) >> } >> >> // Return true if X is a PRE_{INC,DEC,MODIFY} rtx. >> -static bool >> +bool >> any_pre_modify_p (rtx x) >> { >> const auto code = GET_CODE (x); >> @@ -303,318 +671,42 @@ any_pre_modify_p (rtx x) >> } >> >> // Return true if X is a POST_{INC,DEC,MODIFY} rtx. >> -static bool >> +bool >> any_post_modify_p (rtx x) >> { >> const auto code = GET_CODE (x); >> return code == POST_INC || code == POST_DEC || code == POST_MODIFY; >> } >> >> -// Return true if we should consider forming ldp/stp insns from memory >> -// accesses with operand mode MODE at this stage in compilation. >> -static bool >> -ldp_operand_mode_ok_p (machine_mode mode) >> -{ >> - const bool allow_qregs >> - = !(aarch64_tune_params.extra_tuning_flags >> - & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS); >> - >> - if (!aarch64_ldpstp_operand_mode_p (mode)) >> - return false; >> - >> - const auto size = GET_MODE_SIZE (mode).to_constant (); >> - if (size == 16 && !allow_qregs) >> - return false; >> - >> - // We don't pair up TImode accesses before RA because TImode is >> - // special in that it can be allocated to a pair of GPRs or a single >> - // FPR, and the RA is best placed to make that decision. >> - return reload_completed || mode != TImode; >> -} >> - >> // Given LFS (load_p, fpsimd_p, size) fields in FIELDS, encode these >> // into an integer for use as a hash table key. >> -static int >> +int >> encode_lfs (lfs_fields fields) >> { >> int size_log2 = exact_log2 (fields.size); >> - gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4); >> + //gcc_checking_assert (size_log2 >= 2 && size_log2 <= 4); >> return ((int)fields.load_p << 3) >> | ((int)fields.fpsimd_p << 2) >> | (size_log2 - 2); >> } >> >> -// Inverse of encode_lfs. 
>> -static lfs_fields >> -decode_lfs (int lfs) >> -{ >> - bool load_p = (lfs & (1 << 3)); >> - bool fpsimd_p = (lfs & (1 << 2)); >> - unsigned size = 1U << ((lfs & 3) + 2); >> - return { load_p, fpsimd_p, size }; >> -} >> +// Dummy predicate that never ignores any insns. >> +static bool no_ignore (insn_info *) { return false; } >> >> -// Track the access INSN at offset OFFSET in this access group. >> -// ALLOC_NODE is used to allocate splay tree nodes. >> -template<typename Alloc> >> -void >> -access_group::track (Alloc alloc_node, poly_int64 offset, insn_info *insn) >> -{ >> - auto insert_before = [&](std::list<access_record>::iterator after) >> - { >> - auto it = list.emplace (after, offset); >> - it->cand_insns.push_back (insn); >> - it->place = it; >> - return &*it; >> - }; >> - >> - if (!list.size ()) >> - { >> - auto access = insert_before (list.end ()); >> - tree.insert_max_node (alloc_node (access)); >> - return; >> - } >> - >> - auto compare = [&](splay_tree_node<access_record *> *node) >> - { >> - return compare_sizes_for_sort (offset, node->value ()->offset); >> - }; >> - auto result = tree.lookup (compare); >> - splay_tree_node<access_record *> *node = tree.root (); >> - if (result == 0) >> - node->value ()->cand_insns.push_back (insn); >> - else >> - { >> - auto it = node->value ()->place; >> - auto after = (result > 0) ? std::next (it) : it; >> - auto access = insert_before (after); >> - tree.insert_child (node, result > 0, alloc_node (access)); >> - } >> -} >> - >> -// Given a candidate access INSN (with mem MEM), see if it has a suitable >> -// MEM_EXPR base (i.e. a tree decl) relative to which we can track the >> access. >> -// LFS is used as part of the key to the hash table, see track_access. >> -bool >> -ldp_bb_info::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs) >> -{ >> - if (!MEM_EXPR (mem) || !MEM_OFFSET_KNOWN_P (mem)) >> - return false; >> - >> - poly_int64 offset; >> - tree base_expr = get_addr_base_and_unit_offset (MEM_EXPR (mem), >> - &offset); >> - if (!base_expr || !DECL_P (base_expr)) >> - return false; >> - >> - offset += MEM_OFFSET (mem); >> - >> - const machine_mode mem_mode = GET_MODE (mem); >> - const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant (); >> - >> - // Punt on misaligned offsets. LDP/STP instructions require offsets to >> be a >> - // multiple of the access size, and we believe that misaligned offsets on >> - // MEM_EXPR bases are likely to lead to misaligned offsets w.r.t. RTL >> bases. >> - if (!multiple_p (offset, mem_size)) >> - return false; >> - >> - const auto key = std::make_pair (base_expr, encode_lfs (lfs)); >> - access_group &group = expr_map.get_or_insert (key, NULL); >> - auto alloc = [&](access_record *access) { return node_alloc (access); }; >> - group.track (alloc, offset, insn); >> - >> - if (dump_file) >> - { >> - fprintf (dump_file, "[bb %u] tracking insn %d via ", >> - m_bb->index (), insn->uid ()); >> - print_node_brief (dump_file, "mem expr", base_expr, 0); >> - fprintf (dump_file, " [L=%d FP=%d, %smode, off=", >> - lfs.load_p, lfs.fpsimd_p, mode_name[mem_mode]); >> - print_dec (offset, dump_file); >> - fprintf (dump_file, "]\n"); >> - } >> - >> - return true; >> -} >> - >> -// Main function to begin pair discovery. Given a memory access INSN, >> -// determine whether it could be a candidate for fusing into an ldp/stp, >> -// and if so, track it in the appropriate data structure for this basic >> -// block. 
LOAD_P is true if the access is a load, and MEM is the mem >> -// rtx that occurs in INSN. >> -void >> -ldp_bb_info::track_access (insn_info *insn, bool load_p, rtx mem) >> -{ >> - // We can't combine volatile MEMs, so punt on these. >> - if (MEM_VOLATILE_P (mem)) >> - return; >> - >> - // Ignore writeback accesses if the param says to do so. >> - if (!aarch64_ldp_writeback >> - && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC) >> - return; >> - >> - const machine_mode mem_mode = GET_MODE (mem); >> - if (!ldp_operand_mode_ok_p (mem_mode)) >> - return; >> - >> - rtx reg_op = XEXP (PATTERN (insn->rtl ()), !load_p); >> - >> - // Ignore the access if the register operand isn't suitable for ldp/stp. >> - if (load_p >> - ? !aarch64_ldp_reg_operand (reg_op, mem_mode) >> - : !aarch64_stp_reg_operand (reg_op, mem_mode)) >> - return; >> - >> - // We want to segregate FP/SIMD accesses from GPR accesses. >> - // >> - // Before RA, we use the modes, noting that stores of constant zero >> - // operands use GPRs (even in non-integer modes). After RA, we use >> - // the hard register numbers. >> - const bool fpsimd_op_p >> - = reload_completed >> - ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op))) >> - : (GET_MODE_CLASS (mem_mode) != MODE_INT >> - && (load_p || !aarch64_const_zero_rtx_p (reg_op))); >> - >> - // Note ldp_operand_mode_ok_p already rejected VL modes. >> - const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant (); >> - const lfs_fields lfs = { load_p, fpsimd_op_p, mem_size }; >> - >> - if (track_via_mem_expr (insn, mem, lfs)) >> - return; >> - >> - poly_int64 mem_off; >> - rtx addr = XEXP (mem, 0); >> - const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC; >> - rtx base = ldp_strip_offset (mem, &mem_off); >> - if (!REG_P (base)) >> - return; >> - >> - // Need to calculate two (possibly different) offsets: >> - // - Offset at which the access occurs. >> - // - Offset of the new base def. >> - poly_int64 access_off; >> - if (autoinc_p && any_post_modify_p (addr)) >> - access_off = 0; >> - else >> - access_off = mem_off; >> - >> - poly_int64 new_def_off = mem_off; >> - >> - // Punt on accesses relative to eliminable regs. Since we don't know the >> - // elimination offset pre-RA, we should postpone forming pairs on such >> - // accesses until after RA. >> - // >> - // As it stands, addresses with offsets in range for LDR but not >> - // in range for LDP/STP are currently reloaded inefficiently, >> - // ending up with a separate base register for each pair. >> - // >> - // In theory LRA should make use of >> - // targetm.legitimize_address_displacement to promote sharing of >> - // bases among multiple (nearby) address reloads, but the current >> - // LRA code returns early from process_address_1 for operands that >> - // satisfy "m", even if they don't satisfy the real (relaxed) address >> - // constraint; this early return means we never get to the code >> - // that calls targetm.legitimize_address_displacement. >> - // >> - // So for now, it's better to punt when we can't be sure that the >> - // offset is in range for LDP/STP. Out-of-range cases can then be >> - // handled after RA by the out-of-range LDP/STP peepholes. Eventually, it >> - // would be nice to handle known out-of-range opportunities in the >> - // pass itself (for stack accesses, this would be in the post-RA pass). 
>> - if (!reload_completed >> - && (REGNO (base) == FRAME_POINTER_REGNUM >> - || REGNO (base) == ARG_POINTER_REGNUM)) >> - return; >> - >> - // Now need to find def of base register. >> - use_info *base_use = find_access (insn->uses (), REGNO (base)); >> - gcc_assert (base_use); >> - def_info *base_def = base_use->def (); >> - if (!base_def) >> - { >> - if (dump_file) >> - fprintf (dump_file, >> - "base register (regno %d) of insn %d is undefined", >> - REGNO (base), insn->uid ()); >> - return; >> - } >> - >> - alt_base *canon_base = canon_base_map.get (base_def); >> - if (canon_base) >> - { >> - // Express this as the combined offset from the canonical base. >> - base_def = canon_base->base; >> - new_def_off += canon_base->offset; >> - access_off += canon_base->offset; >> - } >> - >> - if (autoinc_p) >> - { >> - auto def = find_access (insn->defs (), REGNO (base)); >> - gcc_assert (def); >> - >> - // Record that DEF = BASE_DEF + MEM_OFF. >> - if (dump_file) >> - { >> - pretty_printer pp; >> - pp_access (&pp, def, 0); >> - pp_string (&pp, " = "); >> - pp_access (&pp, base_def, 0); >> - fprintf (dump_file, "[bb %u] recording %s + ", >> - m_bb->index (), pp_formatted_text (&pp)); >> - print_dec (new_def_off, dump_file); >> - fprintf (dump_file, "\n"); >> - } >> - >> - alt_base base_rec { base_def, new_def_off }; >> - if (canon_base_map.put (def, base_rec)) >> - gcc_unreachable (); // Base defs should be unique. >> - } >> - >> - // Punt on misaligned offsets. LDP/STP require offsets to be a multiple >> of >> - // the access size. >> - if (!multiple_p (mem_off, mem_size)) >> - return; >> - >> - const auto key = std::make_pair (base_def, encode_lfs (lfs)); >> - access_group &group = def_map.get_or_insert (key, NULL); >> - auto alloc = [&](access_record *access) { return node_alloc (access); }; >> - group.track (alloc, access_off, insn); >> - >> - if (dump_file) >> - { >> - pretty_printer pp; >> - pp_access (&pp, base_def, 0); >> - >> - fprintf (dump_file, "[bb %u] tracking insn %d via %s", >> - m_bb->index (), insn->uid (), pp_formatted_text (&pp)); >> - fprintf (dump_file, >> - " [L=%d, WB=%d, FP=%d, %smode, off=", >> - lfs.load_p, autoinc_p, lfs.fpsimd_p, mode_name[mem_mode]); >> - print_dec (access_off, dump_file); >> - fprintf (dump_file, "]\n"); >> - } >> -} >> - >> -// Dummy predicate that never ignores any insns. >> -static bool no_ignore (insn_info *) { return false; } >> - >> -// Return the latest dataflow hazard before INSN. >> -// >> -// If IGNORE is non-NULL, this points to a sub-rtx which we should ignore >> for >> -// dataflow purposes. This is needed when considering changing the RTL >> base of >> -// an access discovered through a MEM_EXPR base. >> -// >> -// If IGNORE_INSN is non-NULL, we should further ignore any hazards arising >> -// from that insn. >> -// >> -// N.B. we ignore any defs/uses of memory here as we deal with that >> separately, >> -// making use of alias disambiguation. >> -static insn_info * >> -latest_hazard_before (insn_info *insn, rtx *ignore, >> - insn_info *ignore_insn = nullptr) >> +// Return the latest dataflow hazard before INSN. >> +// >> +// If IGNORE is non-NULL, this points to a sub-rtx which we should ignore >> for >> +// dataflow purposes. This is needed when considering changing the RTL >> base of >> +// an access discovered through a MEM_EXPR base. >> +// >> +// If IGNORE_INSN is non-NULL, we should further ignore any hazards arising >> +// from that insn. >> +// >> +// N.B. 
we ignore any defs/uses of memory here as we deal with that >> separately, >> +// making use of alias disambiguation. >> +insn_info * >> +latest_hazard_before (insn_info *insn, rtx *ignore, >> + insn_info *ignore_insn)// = nullptr) >> { >> insn_info *result = nullptr; >> >> @@ -698,7 +790,7 @@ latest_hazard_before (insn_info *insn, rtx *ignore, >> // >> // N.B. we ignore any defs/uses of memory here as we deal with that >> separately, >> // making use of alias disambiguation. >> -static insn_info * >> +insn_info * >> first_hazard_after (insn_info *insn, rtx *ignore) >> { >> insn_info *result = nullptr; >> @@ -787,7 +879,7 @@ first_hazard_after (insn_info *insn, rtx *ignore) >> } >> >> // Return true iff R1 and R2 overlap. >> -static bool >> +bool >> ranges_overlap_p (const insn_range_info &r1, const insn_range_info &r2) >> { >> // If either range is empty, then their intersection is empty. >> @@ -799,9 +891,8 @@ ranges_overlap_p (const insn_range_info &r1, const >> insn_range_info &r2) >> // Inverting this, we get the below. >> return *r1.last >= *r2.first && *r2.last >= *r1.first; >> } >> - >> // Get the range of insns that def feeds. >> -static insn_range_info get_def_range (def_info *def) >> + insn_range_info get_def_range (def_info *def) >> { >> insn_info *last = def->next_def ()->insn ()->prev_nondebug_insn (); >> return { def->insn (), last }; >> @@ -809,7 +900,7 @@ static insn_range_info get_def_range (def_info *def) >> >> // Given a def (of memory), return the downwards range within which we >> // can safely move this def. >> -static insn_range_info >> +insn_range_info >> def_downwards_move_range (def_info *def) >> { >> auto range = get_def_range (def); >> @@ -827,7 +918,7 @@ def_downwards_move_range (def_info *def) >> >> // Given a def (of memory), return the upwards range within which we can >> // safely move this def. >> -static insn_range_info >> +insn_range_info >> def_upwards_move_range (def_info *def) >> { >> def_info *prev = def->prev_def (); >> @@ -844,189 +935,18 @@ def_upwards_move_range (def_info *def) >> return range; >> } >> >> -// Class that implements a state machine for building the changes needed to >> form >> -// a store pair instruction. This allows us to easily build the changes in >> -// program order, as required by rtl-ssa. >> -struct stp_change_builder >> +// Generate the RTL pattern for a "tombstone"; used temporarily during this >> pass >> +// to replace stores that are marked for deletion where we can't immediately >> +// delete the store (since there are uses of mem hanging off the store). >> +// >> +// These are deleted at the end of the pass and uses re-parented >> appropriately >> +// at this point. >> +rtx >> +gen_tombstone (void) >> { >> - enum class state >> - { >> - FIRST, >> - INSERT, >> - FIXUP_USE, >> - LAST, >> - DONE >> - }; >> - >> - enum class action >> - { >> - TOMBSTONE, >> - CHANGE, >> - INSERT, >> - FIXUP_USE >> - }; >> - >> - struct change >> - { >> - action type; >> - insn_info *insn; >> - }; >> - >> - bool done () const { return m_state == state::DONE; } >> - >> - stp_change_builder (insn_info *insns[2], >> - insn_info *repurpose, >> - insn_info *dest) >> - : m_state (state::FIRST), m_insns { insns[0], insns[1] }, >> - m_repurpose (repurpose), m_dest (dest), m_use (nullptr) {} >> - >> - change get_change () const >> - { >> - switch (m_state) >> - { >> - case state::FIRST: >> - return { >> - m_insns[0] == m_repurpose ? 
action::CHANGE : action::TOMBSTONE, >> - m_insns[0] >> - }; >> - case state::LAST: >> - return { >> - m_insns[1] == m_repurpose ? action::CHANGE : action::TOMBSTONE, >> - m_insns[1] >> - }; >> - case state::INSERT: >> - return { action::INSERT, m_dest }; >> - case state::FIXUP_USE: >> - return { action::FIXUP_USE, m_use->insn () }; >> - case state::DONE: >> - break; >> - } >> - >> - gcc_unreachable (); >> - } >> - >> - // Transition to the next state. >> - void advance () >> - { >> - switch (m_state) >> - { >> - case state::FIRST: >> - if (m_repurpose) >> - m_state = state::LAST; >> - else >> - m_state = state::INSERT; >> - break; >> - case state::INSERT: >> - { >> - def_info *def = memory_access (m_insns[0]->defs ()); >> - while (*def->next_def ()->insn () <= *m_dest) >> - def = def->next_def (); >> - >> - // Now we know DEF feeds the insertion point for the new stp. >> - // Look for any uses of DEF that will consume the new stp. >> - gcc_assert (*def->insn () <= *m_dest >> - && *def->next_def ()->insn () > *m_dest); >> - >> - auto set = as_a<set_info *> (def); >> - for (auto use : set->nondebug_insn_uses ()) >> - if (*use->insn () > *m_dest) >> - { >> - m_use = use; >> - break; >> - } >> - >> - if (m_use) >> - m_state = state::FIXUP_USE; >> - else >> - m_state = state::LAST; >> - break; >> - } >> - case state::FIXUP_USE: >> - m_use = m_use->next_nondebug_insn_use (); >> - if (!m_use) >> - m_state = state::LAST; >> - break; >> - case state::LAST: >> - m_state = state::DONE; >> - break; >> - case state::DONE: >> - gcc_unreachable (); >> - } >> - } >> - >> -private: >> - state m_state; >> - >> - // Original candidate stores. >> - insn_info *m_insns[2]; >> - >> - // If non-null, this is a candidate insn to change into an stp. >> Otherwise we >> - // are deleting both original insns and inserting a new insn for the stp. >> - insn_info *m_repurpose; >> - >> - // Destionation of the stp, it will be placed immediately after m_dest. >> - insn_info *m_dest; >> - >> - // Current nondebug use that needs updating due to stp insertion. >> - use_info *m_use; >> -}; >> - >> -// Given candidate store insns FIRST and SECOND, see if we can re-purpose >> one >> -// of them (together with its def of memory) for the stp insn. If so, >> return >> -// that insn. Otherwise, return null. >> -static insn_info * >> -try_repurpose_store (insn_info *first, >> - insn_info *second, >> - const insn_range_info &move_range) >> -{ >> - def_info * const defs[2] = { >> - memory_access (first->defs ()), >> - memory_access (second->defs ()) >> - }; >> - >> - if (move_range.includes (first) >> - || ranges_overlap_p (move_range, def_downwards_move_range (defs[0]))) >> - return first; >> - >> - if (move_range.includes (second) >> - || ranges_overlap_p (move_range, def_upwards_move_range (defs[1]))) >> - return second; >> - >> - return nullptr; >> -} >> - >> -// Generate the RTL pattern for a "tombstone"; used temporarily during this >> pass >> -// to replace stores that are marked for deletion where we can't immediately >> -// delete the store (since there are uses of mem hanging off the store). >> -// >> -// These are deleted at the end of the pass and uses re-parented >> appropriately >> -// at this point. >> -static rtx >> -gen_tombstone (void) >> -{ >> - return gen_rtx_CLOBBER (VOIDmode, >> - gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode))); >> -} >> - >> -// Given a pair mode MODE, return a canonical mode to be used for a single >> -// operand of such a pair. 
Currently we only use this when promoting a >> -// non-writeback pair into a writeback pair, as it isn't otherwise clear >> -// which mode to use when storing a modeless CONST_INT. >> -static machine_mode >> -aarch64_operand_mode_for_pair_mode (machine_mode mode) >> -{ >> - switch (mode) >> - { >> - case E_V2x4QImode: >> - return SImode; >> - case E_V2x8QImode: >> - return DImode; >> - case E_V2x16QImode: >> - return V16QImode; >> - default: >> - gcc_unreachable (); >> - } >> -} >> + return gen_rtx_CLOBBER (VOIDmode, >> + gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode))); >> +} >> >> // Go through the reg notes rooted at NOTE, dropping those that we should >> drop, >> // and preserving those that we want to keep by prepending them to (and >> @@ -1034,7 +954,7 @@ aarch64_operand_mode_for_pair_mode (machine_mode mode) >> // REG_EH_REGION note in the resulting list. FR_EXPR is used to return any >> // REG_FRAME_RELATED_EXPR note we find, as these can need special handling >> in >> // combine_reg_notes. >> -static rtx >> +rtx >> filter_notes (rtx note, rtx result, bool *eh_region, rtx *fr_expr) >> { >> for (; note; note = XEXP (note, 1)) >> @@ -1084,7 +1004,7 @@ filter_notes (rtx note, rtx result, bool *eh_region, >> rtx *fr_expr) >> >> // Return the notes that should be attached to a combination of I1 and I2, >> where >> // *I1 < *I2. LOAD_P is true for loads. >> -static rtx >> +rtx >> combine_reg_notes (insn_info *i1, insn_info *i2, bool load_p) >> { >> // Temporary storage for REG_FRAME_RELATED_EXPR notes. >> @@ -1100,8 +1020,8 @@ combine_reg_notes (insn_info *i1, insn_info *i2, bool >> load_p) >> if (!load_p) >> { >> // Simple frame-related sp-relative saves don't need CFI notes, but >> when >> - // we combine them into an stp we will need a CFI note as dwarf2cfi >> can't >> - // interpret the unspec pair representation directly. >> + // we combine them into a pair mem store we will need a CFI note as >> + // dwarf2cfi can't interpret the unspec pair representation directly. >> if (RTX_FRAME_RELATED_P (i1->rtl ()) && !fr_expr[0]) >> fr_expr[0] = copy_rtx (PATTERN (i1->rtl ())); >> if (RTX_FRAME_RELATED_P (i2->rtl ()) && !fr_expr[1]) >> @@ -1133,7 +1053,7 @@ combine_reg_notes (insn_info *i1, insn_info *i2, bool >> load_p) >> // relative to the initial value of the base register, and output these >> // in PATS. Return an rtx that represents the overall change to the >> // base register. >> -static rtx >> +rtx >> extract_writebacks (bool load_p, rtx pats[2], int changed) >> { >> rtx base_reg = NULL_RTX; >> @@ -1150,7 +1070,7 @@ extract_writebacks (bool load_p, rtx pats[2], int >> changed) >> const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC; >> >> poly_int64 offset; >> - rtx this_base = ldp_strip_offset (mem, &offset); >> + rtx this_base = pair_mem_strip_offset (mem, &offset); >> gcc_assert (REG_P (this_base)); >> if (base_reg) >> gcc_assert (rtx_equal_p (base_reg, this_base)); >> @@ -1207,7 +1127,7 @@ extract_writebacks (bool load_p, rtx pats[2], int >> changed) >> // base register. If there is one, we choose the first such update after >> // PAIR_DST that is still in the same BB as our pair. We return the new >> def in >> // *ADD_DEF and the resulting writeback effect in *WRITEBACK_EFFECT. 
>> -static insn_info * >> +insn_info * >> find_trailing_add (insn_info *insns[2], >> const insn_range_info &pair_range, >> int initial_writeback, >> @@ -1286,7 +1206,7 @@ find_trailing_add (insn_info *insns[2], >> >> off_hwi /= access_size; >> >> - if (off_hwi < LDP_MIN_IMM || off_hwi > LDP_MAX_IMM) >> + if (off_hwi < PAIR_MEM_MIN_IMM || off_hwi > PAIR_MEM_MAX_IMM) >> return nullptr; >> >> auto dump_prefix = [&]() >> @@ -1325,26 +1245,93 @@ find_trailing_add (insn_info *insns[2], >> return nullptr; >> } >> >> -// We just emitted a tombstone with uid UID, track it in a bitmap for >> -// this BB so we can easily identify it later when cleaning up tombstones. >> -void >> -ldp_bb_info::track_tombstone (int uid) >> +// Return true if STORE_INSN may modify mem rtx MEM. Make sure we keep >> +// within our BUDGET for alias analysis. >> +bool >> +store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget) >> { >> - if (!m_emitted_tombstone) >> + if (!budget) >> { >> - // Lazily initialize the bitmap for tracking tombstone insns. >> - bitmap_obstack_initialize (&m_bitmap_obstack); >> - bitmap_initialize (&m_tombstone_bitmap, &m_bitmap_obstack); >> - m_emitted_tombstone = true; >> + if (dump_file) >> + { >> + fprintf (dump_file, >> + "exceeded budget, assuming store %d aliases with mem ", >> + store_insn->uid ()); >> + print_simple_rtl (dump_file, mem); >> + fprintf (dump_file, "\n"); >> + } >> + >> + return true; >> } >> >> - if (!bitmap_set_bit (&m_tombstone_bitmap, uid)) >> - gcc_unreachable (); // Bit should have changed. >> + budget--; >> + return memory_modified_in_insn_p (mem, store_insn->rtl ()); >> +} >> + >> +// Return true if LOAD may be modified by STORE. Make sure we keep >> +// within our BUDGET for alias analysis. >> +bool >> +load_modified_by_store_p (insn_info *load, >> + insn_info *store, >> + int &budget) >> +{ >> + gcc_checking_assert (budget >= 0); >> + >> + if (!budget) >> + { >> + if (dump_file) >> + { >> + fprintf (dump_file, >> + "exceeded budget, assuming load %d aliases with store %d\n", >> + load->uid (), store->uid ()); >> + } >> + return true; >> + } >> + >> + // It isn't safe to re-order stores over calls. >> + if (CALL_P (load->rtl ())) >> + return true; >> + >> + budget--; >> + >> + // Iterate over all MEMs in the load, seeing if any alias with >> + // our store. >> + subrtx_var_iterator::array_type array; >> + rtx pat = PATTERN (load->rtl ()); >> + FOR_EACH_SUBRTX_VAR (iter, array, pat, NONCONST) >> + if (MEM_P (*iter) && memory_modified_in_insn_p (*iter, store->rtl ())) >> + return true; >> + >> + return false; >> +} >> +// Given candidate store insns FIRST and SECOND, see if we can re-purpose >> one >> +// of them (together with its def of memory) for the stp insn. If so, >> return >> +// that insn. Otherwise, return null. >> +insn_info * >> +try_repurpose_store (insn_info *first, >> + insn_info *second, >> + const insn_range_info &move_range) >> +{ >> + def_info * const defs[2] = { >> + memory_access (first->defs ()), >> + memory_access (second->defs ()) >> + }; >> + >> + if (move_range.includes (first) >> + || ranges_overlap_p (move_range, def_downwards_move_range (defs[0]))) >> + return first; >> + >> + if (move_range.includes (second) >> + || ranges_overlap_p (move_range, def_upwards_move_range (defs[1]))) >> + return second; >> + >> + return nullptr; >> } >> >> + >> // Reset the debug insn containing USE (the debug insn has been >> // optimized away). 
>> -static void >> +void >> reset_debug_use (use_info *use) >> { >> auto use_insn = use->insn (); >> @@ -1355,12 +1342,43 @@ reset_debug_use (use_info *use) >> crtl->ssa->change_insn (change); >> } >> >> +// Update debug uses when folding in a trailing add insn to form a >> +// writeback pair. >> +// >> +// ATTEMPT is used to allocate RTL-SSA temporaries for the changes, >> +// the final pair is placed immediately after PAIR_DST, TRAILING_ADD >> +// is a trailing add insn which is being folded into the pair to make it >> +// use writeback addressing, and WRITEBACK_EFFECT is the pattern for >> +// TRAILING_ADD. >> +void >> +fixup_debug_uses_trailing_add (obstack_watermark &attempt, >> + insn_info *pair_dst, >> + insn_info *trailing_add, >> + rtx writeback_effect) >> +{ >> + rtx base = SET_DEST (writeback_effect); >> + >> + poly_int64 wb_offset; >> + rtx base2 = strip_offset (SET_SRC (writeback_effect), &wb_offset); >> + gcc_checking_assert (rtx_equal_p (base, base2)); >> + >> + auto defs = trailing_add->defs (); >> + gcc_checking_assert (defs.size () == 1); >> + def_info *def = defs[0]; >> + >> + if (auto set = safe_dyn_cast<set_info *> (def->prev_def ())) >> + for (auto use : iterate_safely (set->debug_insn_uses ())) >> + if (*use->insn () > *pair_dst) >> + // DEF is getting re-ordered above USE, fix up USE accordingly. >> + fixup_debug_use (attempt, use, def, base, wb_offset); >> +} >> + >> // USE is a debug use that needs updating because DEF (a def of the same >> // register) is being re-ordered over it. If BASE is non-null, then DEF >> // is an update of the register BASE by a constant, given by WB_OFFSET, >> // and we can preserve debug info by accounting for the change in side >> // effects. >> -static void >> +void >> fixup_debug_use (obstack_watermark &attempt, >> use_info *use, >> def_info *def, >> @@ -1455,37 +1473,6 @@ fixup_debug_use (obstack_watermark &attempt, >> } >> } >> >> -// Update debug uses when folding in a trailing add insn to form a >> -// writeback pair. >> -// >> -// ATTEMPT is used to allocate RTL-SSA temporaries for the changes, >> -// the final pair is placed immediately after PAIR_DST, TRAILING_ADD >> -// is a trailing add insn which is being folded into the pair to make it >> -// use writeback addressing, and WRITEBACK_EFFECT is the pattern for >> -// TRAILING_ADD. >> -static void >> -fixup_debug_uses_trailing_add (obstack_watermark &attempt, >> - insn_info *pair_dst, >> - insn_info *trailing_add, >> - rtx writeback_effect) >> -{ >> - rtx base = SET_DEST (writeback_effect); >> - >> - poly_int64 wb_offset; >> - rtx base2 = strip_offset (SET_SRC (writeback_effect), &wb_offset); >> - gcc_checking_assert (rtx_equal_p (base, base2)); >> - >> - auto defs = trailing_add->defs (); >> - gcc_checking_assert (defs.size () == 1); >> - def_info *def = defs[0]; >> - >> - if (auto set = safe_dyn_cast<set_info *> (def->prev_def ())) >> - for (auto use : iterate_safely (set->debug_insn_uses ())) >> - if (*use->insn () > *pair_dst) >> - // DEF is getting re-ordered above USE, fix up USE accordingly. >> - fixup_debug_use (attempt, use, def, base, wb_offset); >> -} >> - >> // Called from fuse_pair, fixes up any debug uses that will be affected >> // by the changes. >> // >> @@ -1500,7 +1487,7 @@ fixup_debug_uses_trailing_add (obstack_watermark >> &attempt, >> // writeback, and WRITEBACK_EFFECT is an rtx describing the overall update >> to >> // the base register in the final pair (if any). BASE_REGNO gives the >> register >> // number of the base register used in the final pair. 
>> -static void >> +void >> fixup_debug_uses (obstack_watermark &attempt, >> insn_info *insns[2], >> rtx orig_rtl[2], >> @@ -1528,7 +1515,7 @@ fixup_debug_uses (obstack_watermark &attempt, >> gcc_checking_assert (GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) >> == RTX_AUTOINC); >> >> - base = ldp_strip_offset (mem, &offset); >> + base = pair_mem_strip_offset (mem, &offset); >> gcc_checking_assert (REG_P (base) && REGNO (base) == base_regno); >> } >> fixup_debug_use (attempt, use, def, base, offset); >> @@ -1651,621 +1638,846 @@ fixup_debug_uses (obstack_watermark &attempt, >> writeback_effect); >> } >> >> -// Try and actually fuse the pair given by insns I1 and I2. >> -// >> -// Here we've done enough analysis to know this is safe, we only >> -// reject the pair at this stage if either the tuning policy says to, >> -// or recog fails on the final pair insn. >> -// >> -// LOAD_P is true for loads, ACCESS_SIZE gives the access size of each >> -// candidate insn. Bit i of WRITEBACK is set if the ith insn (in program >> -// order) uses writeback. >> +// Given INSNS (in program order) which are known to be adjacent, look >> +// to see if either insn has a suitable RTL (register) base that we can >> +// use to form a pair. Push these to BASE_CANDS if we find any. CAND_MEMs >> +// gives the relevant mems from the candidate insns, ACCESS_SIZE gives the >> +// size of a single candidate access, and REVERSED says whether the accesses >> +// are inverted in offset order. >> // >> -// BASE gives the chosen base candidate for the pair and MOVE_RANGE is >> -// a singleton range which says where to place the pair. >> -bool >> -ldp_bb_info::fuse_pair (bool load_p, >> - unsigned access_size, >> - int writeback, >> - insn_info *i1, insn_info *i2, >> - base_cand &base, >> - const insn_range_info &move_range) >> +// Returns an integer where bit (1 << i) is set if INSNS[i] uses writeback >> +// addressing. >> +int >> +get_viable_bases (insn_info *insns[2], >> + vec<base_cand> &base_cands, >> + rtx cand_mems[2], >> + unsigned access_size, >> + bool reversed) >> { >> - auto attempt = crtl->ssa->new_change_attempt (); >> - >> - auto make_change = [&attempt](insn_info *insn) >> - { >> - return crtl->ssa->change_alloc<insn_change> (attempt, insn); >> - }; >> - auto make_delete = [&attempt](insn_info *insn) >> - { >> - return crtl->ssa->change_alloc<insn_change> (attempt, >> - insn, >> - insn_change::DELETE); >> - }; >> - >> - insn_info *first = (*i1 < *i2) ? i1 : i2; >> - insn_info *second = (first == i1) ? i2 : i1; >> - >> - insn_info *pair_dst = move_range.singleton (); >> - gcc_assert (pair_dst); >> - >> - insn_info *insns[2] = { first, second }; >> - >> - auto_vec<insn_change *> changes; >> - auto_vec<int, 2> tombstone_uids (2); >> - >> - rtx pats[2] = { >> - PATTERN (first->rtl ()), >> - PATTERN (second->rtl ()) >> - }; >> - >> - // Make copies of the patterns as we might need to refer to the original >> RTL >> - // later, for example when updating debug uses (which is after we've >> updated >> - // one or both of the patterns in the candidate insns). >> - rtx orig_rtl[2]; >> + // We discovered this pair through a common base. Need to ensure that >> + // we have a common base register that is live at both locations. 
>> + def_info *base_defs[2] = {}; >> + int writeback = 0; >> for (int i = 0; i < 2; i++) >> - orig_rtl[i] = copy_rtx (pats[i]); >> - >> - use_array input_uses[2] = { first->uses (), second->uses () }; >> - def_array input_defs[2] = { first->defs (), second->defs () }; >> - >> - int changed_insn = -1; >> - if (base.from_insn != -1) >> { >> - // If we're not already using a shared base, we need >> - // to re-write one of the accesses to use the base from >> - // the other insn. >> - gcc_checking_assert (base.from_insn == 0 || base.from_insn == 1); >> - changed_insn = !base.from_insn; >> - >> - rtx base_pat = pats[base.from_insn]; >> - rtx change_pat = pats[changed_insn]; >> - rtx base_mem = XEXP (base_pat, load_p); >> - rtx change_mem = XEXP (change_pat, load_p); >> + const bool is_lower = (i == reversed); >> + poly_int64 poly_off; >> + rtx base = pair_mem_strip_offset (cand_mems[i], &poly_off); >> + if (GET_RTX_CLASS (GET_CODE (XEXP (cand_mems[i], 0))) == RTX_AUTOINC) >> + writeback |= (1 << i); >> >> - const bool lower_base_p = (insns[base.from_insn] == i1); >> - HOST_WIDE_INT adjust_amt = access_size; >> - if (!lower_base_p) >> - adjust_amt *= -1; >> + if (!REG_P (base) || !poly_off.is_constant ()) >> + continue; >> >> - rtx change_reg = XEXP (change_pat, !load_p); >> - machine_mode mode_for_mem = GET_MODE (change_mem); >> - rtx effective_base = drop_writeback (base_mem); >> - rtx new_mem = adjust_address_nv (effective_base, >> - mode_for_mem, >> - adjust_amt); >> - rtx new_set = load_p >> - ? gen_rtx_SET (change_reg, new_mem) >> - : gen_rtx_SET (new_mem, change_reg); >> + // Punt on accesses relative to eliminable regs. See the comment in >> + // pair_fusion::track_access for a detailed explanation of this. >> + if (!reload_completed >> + && (REGNO (base) == FRAME_POINTER_REGNUM >> + || REGNO (base) == ARG_POINTER_REGNUM)) >> + continue; >> >> - pats[changed_insn] = new_set; >> + HOST_WIDE_INT base_off = poly_off.to_constant (); >> >> - auto keep_use = [&](use_info *u) >> + // It should be unlikely that we ever punt here, since MEM_EXPR offset >> + // alignment should be a good proxy for register offset alignment. >> + if (base_off % access_size != 0) >> { >> - return refers_to_regno_p (u->regno (), u->regno () + 1, >> - change_pat, &XEXP (change_pat, load_p)); >> - }; >> - >> - // Drop any uses that only occur in the old address. >> - input_uses[changed_insn] = filter_accesses (attempt, >> - input_uses[changed_insn], >> - keep_use); >> - } >> - >> - rtx writeback_effect = NULL_RTX; >> - if (writeback) >> - writeback_effect = extract_writebacks (load_p, pats, changed_insn); >> + if (dump_file) >> + fprintf (dump_file, >> + "base not viable, offset misaligned (insn %d)\n", >> + insns[i]->uid ()); >> + continue; >> + } >> >> - const auto base_regno = base.def->regno (); >> + base_off /= access_size; >> >> - if (base.from_insn == -1 && (writeback & 1)) >> - { >> - // If the first of the candidate insns had a writeback form, we'll >> need to >> - // drop the use of the updated base register from the second insn's >> uses. >> - // >> - // N.B. we needn't worry about the base register occurring as a store >> - // operand, as we checked that there was no non-address true >> dependence >> - // between the insns in try_fuse_pair. 
>> - gcc_checking_assert (find_access (input_uses[1], base_regno)); >> - input_uses[1] = check_remove_regno_access (attempt, >> - input_uses[1], >> - base_regno); >> - } >> + if (!is_lower) >> + base_off--; >> >> - // Go through and drop uses that only occur in register notes, >> - // as we won't be preserving those. >> - for (int i = 0; i < 2; i++) >> - { >> - auto rti = insns[i]->rtl (); >> - if (!REG_NOTES (rti)) >> + if (base_off < PAIR_MEM_MIN_IMM || base_off > PAIR_MEM_MAX_IMM) >> continue; >> >> - input_uses[i] = remove_note_accesses (attempt, input_uses[i]); >> + use_info *use = find_access (insns[i]->uses (), REGNO (base)); >> + gcc_assert (use); >> + base_defs[i] = use->def (); >> } >> >> - // Edge case: if the first insn is a writeback load and the >> - // second insn is a non-writeback load which transfers into the base >> - // register, then we should drop the writeback altogether as the >> - // update of the base register from the second load should prevail. >> - // >> - // For example: >> - // ldr x2, [x1], #8 >> - // ldr x1, [x1] >> - // --> >> - // ldp x2, x1, [x1] >> - if (writeback == 1 >> - && load_p >> - && find_access (input_defs[1], base_regno)) >> + if (!base_defs[0] && !base_defs[1]) >> { >> if (dump_file) >> - fprintf (dump_file, >> - " ldp: i%d has wb but subsequent i%d has non-wb " >> - "update of base (r%d), dropping wb\n", >> - insns[0]->uid (), insns[1]->uid (), base_regno); >> - gcc_assert (writeback_effect); >> - writeback_effect = NULL_RTX; >> + fprintf (dump_file, "no viable base register for pair (%d,%d)\n", >> + insns[0]->uid (), insns[1]->uid ()); >> + return writeback; >> } >> >> - // So far the patterns have been in instruction order, >> - // now we want them in offset order. >> - if (i1 != first) >> - std::swap (pats[0], pats[1]); >> - >> - poly_int64 offsets[2]; >> for (int i = 0; i < 2; i++) >> - { >> - rtx mem = XEXP (pats[i], load_p); >> - gcc_checking_assert (MEM_P (mem)); >> - rtx base = strip_offset (XEXP (mem, 0), offsets + i); >> - gcc_checking_assert (REG_P (base)); >> - gcc_checking_assert (base_regno == REGNO (base)); >> + if ((writeback & (1 << i)) && !base_defs[i]) >> + { >> + if (dump_file) >> + fprintf (dump_file, "insn %d has writeback but base isn't viable\n", >> + insns[i]->uid ()); >> + return writeback; >> + } >> + >> + if (writeback == 3 >> + && base_defs[0]->regno () != base_defs[1]->regno ()) >> + { >> + if (dump_file) >> + fprintf (dump_file, >> + "pair (%d,%d): double writeback with distinct regs (%d,%d): " >> + "punting\n", >> + insns[0]->uid (), insns[1]->uid (), >> + base_defs[0]->regno (), base_defs[1]->regno ()); >> + return writeback; >> } >> >> - // If either of the original insns had writeback, but the resulting pair >> insn >> - // does not (can happen e.g. in the ldp edge case above, or if the >> writeback >> - // effects cancel out), then drop the def(s) of the base register as >> - // appropriate. >> + if (base_defs[0] && base_defs[1] >> + && base_defs[0]->regno () == base_defs[1]->regno ()) >> + { >> + // Easy case: insns already share the same base reg. >> + base_cands.quick_push (base_defs[0]); >> + return writeback; >> + } >> + >> + // Otherwise, we know that one of the bases must change. >> // >> - // Also drop the first def in the case that both of the original insns had >> - // writeback. The second def could well have uses, but the first def >> should >> - // only be used by the second insn (and we dropped that use above). 
>> +  // Note that if there is writeback we must use the writeback base
>> +  // (we know now there is exactly one).
>>    for (int i = 0; i < 2; i++)
>> -    if ((!writeback_effect && (writeback & (1 << i)))
>> -	|| (i == 0 && writeback == 3))
>> -      input_defs[i] = check_remove_regno_access (attempt,
>> -						 input_defs[i],
>> -						 base_regno);
>> +    if (base_defs[i] && (!writeback || (writeback & (1 << i))))
>> +      base_cands.quick_push (base_cand { base_defs[i], i });
>> +
>> +  return writeback;
>> +}
>> +
>> +void
>> +dump_insn_list (FILE *f, const insn_list_t &l)
>> +{
>> +  fprintf (f, "(");
>> +
>> +  auto i = l.begin ();
>> +  auto end = l.end ();
>> +
>> +  if (i != end)
>> +    fprintf (f, "%d", (*i)->uid ());
>> +  i++;
>> +
>> +  for (; i != end; i++)
>> +    fprintf (f, ", %d", (*i)->uid ());
>> +
>> +  fprintf (f, ")");
>> +}
>> +splay_tree_node<access_record *> *
>> +pair_fusion::node_alloc (access_record *access)
>> +{
>> +  using T = splay_tree_node<access_record *>;
>> +  void *addr = obstack_alloc (&m_obstack, sizeof (T));
>> +  return new (addr) T (access);
>> +}
>> +// Given a candidate access INSN (with mem MEM), see if it has a suitable
>> +// MEM_EXPR base (i.e. a tree decl) relative to which we can track the access.
>> +// LFS is used as part of the key to the hash table, see track_access.
>> +bool
>> +pair_fusion::track_via_mem_expr (insn_info *insn, rtx mem, lfs_fields lfs)
>> +{
>> +  if (!MEM_EXPR (mem) || !MEM_OFFSET_KNOWN_P (mem))
>> +    return false;
>> +
>> +  poly_int64 offset;
>> +  tree base_expr = get_addr_base_and_unit_offset (MEM_EXPR (mem),
>> +						  &offset);
>> +  if (!base_expr || !DECL_P (base_expr))
>> +    return false;
>> +
>> +  offset += MEM_OFFSET (mem);
>> +
>> +  const machine_mode mem_mode = GET_MODE (mem);
>> +  const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant ();
>> +
>> +  // Punt on misaligned offsets.  PAIR MEM instructions require offsets to be
>> +  // a multiple of the access size, and we believe that misaligned offsets on
>> +  // MEM_EXPR bases are likely to lead to misaligned offsets w.r.t. RTL bases.
>> +  if (!multiple_p (offset, mem_size))
>> +    return false;
>> +
>> +  const auto key = std::make_pair (base_expr, encode_lfs (lfs));
>> +  access_group &group = expr_map.get_or_insert (key, NULL);
>> +  auto alloc = [&](access_record *access) { return node_alloc (access); };
>> +  group.track (alloc, offset, insn);
>> +
>> +  if (dump_file)
>> +    {
>> +      fprintf (dump_file, "[bb %u] tracking insn %d via ",
>> +	       m_bb->index (), insn->uid ());
>> +      print_node_brief (dump_file, "mem expr", base_expr, 0);
>> +      fprintf (dump_file, " [L=%d FP=%d, %smode, off=",
>> +	       lfs.load_p, lfs.fpsimd_p, mode_name[mem_mode]);
>> +      print_dec (offset, dump_file);
>> +      fprintf (dump_file, "]\n");
>> +    }
>> +
>> +  return true;
>> +}
>> +// Main function to begin pair discovery.  Given a memory access INSN,
>> +// determine whether it could be a candidate for fusing into a pair mem,
>> +// and if so, track it in the appropriate data structure for this basic
>> +// block.  LOAD_P is true if the access is a load, and MEM is the mem
>> +// rtx that occurs in INSN.
>> +void
>> +pair_fusion::track_access (insn_info *insn, bool load_p, rtx mem)
>> +{
>> +  // We can't combine volatile MEMs, so punt on these.
>> + if (MEM_VOLATILE_P (mem)) >> + return; >> + >> + // Ignore writeback accesses if the param says to do so >> + if (pair_is_writeback () >> + && GET_RTX_CLASS (GET_CODE (XEXP (mem, 0))) == RTX_AUTOINC) >> + return; >> + >> + const machine_mode mem_mode = GET_MODE (mem); >> + >> + if (!pair_operand_mode_ok_p (mem_mode)) >> + return; >> + >> + rtx reg_op = XEXP (PATTERN (insn->rtl ()), !load_p); >> + >> + if (pair_check_register_operand (load_p, reg_op, mem_mode)) >> + return; >> + // We want to segregate FP/SIMD accesses from GPR accesses. >> + // >> + // Before RA, we use the modes, noting that stores of constant zero >> + // operands use GPRs (even in non-integer modes). After RA, we use >> + // the hard register numbers. >> + const bool fpsimd_op_p = is_fpsimd_op_p (reg_op, mem_mode, load_p); >> + // Note pair_operand_mode_ok_p already rejected VL modes. >> + const HOST_WIDE_INT mem_size = GET_MODE_SIZE (mem_mode).to_constant (); >> + const lfs_fields lfs = { load_p, fpsimd_op_p, mem_size }; >> + >> + if (track_via_mem_expr (insn, mem, lfs)) >> + return; >> + >> + poly_int64 mem_off; >> + rtx addr = XEXP (mem, 0); >> + const bool autoinc_p = GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC; >> + rtx base = pair_mem_strip_offset (mem, &mem_off); >> + if (!REG_P (base)) >> + return; >> + >> + // Need to calculate two (possibly different) offsets: >> + // - Offset at which the access occurs. >> + // - Offset of the new base def. >> + poly_int64 access_off; >> + if (autoinc_p && any_post_modify_p (addr)) >> + access_off = 0; >> + else >> + access_off = mem_off; >> + >> + poly_int64 new_def_off = mem_off; >> + >> + // Punt on accesses relative to eliminable regs. Since we don't know the >> + // elimination offset pre-RA, we should postpone forming pairs on such >> + // accesses until after RA. >> + // >> + // As it stands, addresses with offsets in range for LDR but not >> + // in range for PAIR MEM LOAD STORE are currently reloaded inefficiently, >> + // ending up with a separate base register for each pair. >> + // >> + // In theory LRA should make use of >> + // targetm.legitimize_address_displacement to promote sharing of >> + // bases among multiple (nearby) address reloads, but the current >> + // LRA code returns early from process_address_1 for operands that >> + // satisfy "m", even if they don't satisfy the real (relaxed) address >> + // constraint; this early return means we never get to the code >> + // that calls targetm.legitimize_address_displacement. >> + // >> + // So for now, it's better to punt when we can't be sure that the >> + // offset is in range for PAIR MEM LOAD STORE. Out-of-range cases can >> then be >> + // handled after RA by the out-of-range PAIR MEM peepholes. Eventually, >> it >> + // would be nice to handle known out-of-range opportunities in the >> + // pass itself (for stack accesses, this would be in the post-RA pass). >> + if (!reload_completed >> + && (REGNO (base) == FRAME_POINTER_REGNUM >> + || REGNO (base) == ARG_POINTER_REGNUM)) >> + return; >> + >> + // Now need to find def of base register. 
>> +  use_info *base_use = find_access (insn->uses (), REGNO (base));
>> +  gcc_assert (base_use);
>> +  def_info *base_def = base_use->def ();
>> +  if (!base_def)
>> +    {
>> +      if (dump_file)
>> +	fprintf (dump_file,
>> +		 "base register (regno %d) of insn %d is undefined\n",
>> +		 REGNO (base), insn->uid ());
>> +      return;
>> +    }
>> +
>> +  alt_base *canon_base = canon_base_map.get (base_def);
>> +  if (canon_base)
>> +    {
>> +      // Express this as the combined offset from the canonical base.
>> +      base_def = canon_base->base;
>> +      new_def_off += canon_base->offset;
>> +      access_off += canon_base->offset;
>> +    }
>> +
>> +  if (autoinc_p)
>> +    {
>> +      auto def = find_access (insn->defs (), REGNO (base));
>> +      gcc_assert (def);
>> +
>> +      // Record that DEF = BASE_DEF + MEM_OFF.
>> +      if (dump_file)
>> +	{
>> +	  pretty_printer pp;
>> +	  pp_access (&pp, def, 0);
>> +	  pp_string (&pp, " = ");
>> +	  pp_access (&pp, base_def, 0);
>> +	  fprintf (dump_file, "[bb %u] recording %s + ",
>> +		   m_bb->index (), pp_formatted_text (&pp));
>> +	  print_dec (new_def_off, dump_file);
>> +	  fprintf (dump_file, "\n");
>> +	}
>> +
>> +      alt_base base_rec { base_def, new_def_off };
>> +      if (canon_base_map.put (def, base_rec))
>> +	gcc_unreachable (); // Base defs should be unique.
>> +    }
>> +
>> +  // Punt on misaligned offsets.  PAIR MEM instructions require offsets to be
>> +  // a multiple of the access size.
>> +  if (!multiple_p (mem_off, mem_size))
>> +    return;
>> +
>> +  const auto key = std::make_pair (base_def, encode_lfs (lfs));
>> +  access_group &group = def_map.get_or_insert (key, NULL);
>> +  auto alloc = [&](access_record *access) { return node_alloc (access); };
>> +  group.track (alloc, access_off, insn);
>> +
>> +  if (dump_file)
>> +    {
>> +      pretty_printer pp;
>> +      pp_access (&pp, base_def, 0);
>> +
>> +      fprintf (dump_file, "[bb %u] tracking insn %d via %s",
>> +	       m_bb->index (), insn->uid (), pp_formatted_text (&pp));
>> +      fprintf (dump_file,
>> +	       " [L=%d, WB=%d, FP=%d, %smode, off=",
>> +	       lfs.load_p, autoinc_p, lfs.fpsimd_p, mode_name[mem_mode]);
>> +      print_dec (access_off, dump_file);
>> +      fprintf (dump_file, "]\n");
>> +    }
>> +}
>> +
>> +// We just emitted a tombstone with uid UID, track it in a bitmap for
>> +// this BB so we can easily identify it later when cleaning up tombstones.
>> +void
>> +pair_fusion::track_tombstone (int uid)
>> +{
>> +  if (!m_emitted_tombstone)
>> +    {
>> +      // Lazily initialize the bitmap for tracking tombstone insns.
>> +      bitmap_obstack_initialize (&m_bitmap_obstack);
>> +      bitmap_initialize (&m_tombstone_bitmap, &m_bitmap_obstack);
>> +      m_emitted_tombstone = true;
>> +    }
>> +
>> +  if (!bitmap_set_bit (&m_tombstone_bitmap, uid))
>> +    gcc_unreachable (); // Bit should have changed.
>> +}
>> +
>> +// Given two adjacent memory accesses of the same size, I1 and I2, try
>> +// and see if we can merge them into a pair mem load and store.
>> +//
>> +// ACCESS_SIZE gives the (common) size of a single access, LOAD_P is true
>> +// if the accesses are both loads, otherwise they are both stores.
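The canon_base_map logic in track_access above is easiest to see on a chain of post-increments. Here is a toy model (std::string stands in for rtl-ssa def_info nodes; resolve is an illustrative helper, not part of the patch):

#include <cstdio>
#include <map>
#include <string>

// Toy model of canon_base_map: each writeback def is recorded as
// (canonical base, combined offset), so a chain of post-increments
// keeps resolving to one canonical base.
struct toy_alt_base { std::string base; long offset; };
static std::map<std::string, toy_alt_base> canon_base_map;

static toy_alt_base
resolve (const std::string &def, long off)
{
  auto it = canon_base_map.find (def);
  if (it == canon_base_map.end ())
    return { def, off };
  return { it->second.base, it->second.offset + off };
}

int
main ()
{
  // ldr x2, [x1], #8   -- defines x1.1 = x1.0 + 8
  canon_base_map["x1.1"] = resolve ("x1.0", 8);
  // ldr x3, [x1], #8   -- defines x1.2 = x1.1 + 8 == x1.0 + 16
  canon_base_map["x1.2"] = resolve ("x1.1", 8);

  toy_alt_base b = canon_base_map["x1.2"];
  printf ("%s + %ld\n", b.base.c_str (), b.offset);   // x1.0 + 16
  return 0;
}

This is why both accesses of the chain end up in the same access_group keyed on the canonical base def.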
>> +bool
>> +pair_fusion::try_fuse_pair (bool load_p, unsigned access_size,
>> +			    insn_info *i1, insn_info *i2)
>> +{
>> +  if (dump_file)
>> +    fprintf (dump_file, "analyzing pair (load=%d): (%d,%d)\n",
>> +	     load_p, i1->uid (), i2->uid ());
>> +
>> +  insn_info *insns[2];
>> +  bool reversed = false;
>> +  if (*i1 < *i2)
>> +    {
>> +      insns[0] = i1;
>> +      insns[1] = i2;
>> +    }
>> +  else
>> +    {
>> +      insns[0] = i2;
>> +      insns[1] = i1;
>> +      reversed = true;
>> +    }
>> +
>> +  rtx cand_mems[2];
>> +  rtx reg_ops[2];
>> +  rtx pats[2];
>> +  for (int i = 0; i < 2; i++)
>> +    {
>> +      pats[i] = PATTERN (insns[i]->rtl ());
>> +      cand_mems[i] = XEXP (pats[i], load_p);
>> +      reg_ops[i] = XEXP (pats[i], !load_p);
>> +    }
>> +
>> +  if (!load_p && !fuseable_store_p (i1, i2))
>> +    {
>> +      if (dump_file)
>> +	fprintf (dump_file,
>> +		 "punting on store-mem-pairs due to non-fuseable candidates (%d,%d)\n",
>> +		 insns[0]->uid (), insns[1]->uid ());
>> +      return false;
>> +    }
>> +
>> +
>> +  if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1]))
>> +    {
>> +      if (dump_file)
>> +	fprintf (dump_file,
>> +		 "punting on pair mem load due to reg conflicts (%d,%d)\n",
>> +		 insns[0]->uid (), insns[1]->uid ());
>> +      return false;
>> +    }
>> +
>> +  if (cfun->can_throw_non_call_exceptions
>> +      && find_reg_note (insns[0]->rtl (), REG_EH_REGION, NULL_RTX)
>> +      && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX))
>> +    {
>> +      if (dump_file)
>> +	fprintf (dump_file,
>> +		 "can't combine insns with EH side effects (%d,%d)\n",
>> +		 insns[0]->uid (), insns[1]->uid ());
>> +      return false;
>> +    }
>> +
>> +  auto_vec<base_cand, 2> base_cands (2);
>> +
>> +  int writeback = get_viable_bases (insns, base_cands, cand_mems,
>> +				    access_size, reversed);
>> +  if (base_cands.is_empty ())
>> +    {
>> +      if (dump_file)
>> +	fprintf (dump_file, "no viable base for pair (%d,%d)\n",
>> +		 insns[0]->uid (), insns[1]->uid ());
>> +      return false;
>> +    }
>> +
>> +  // Punt on frame-related insns with writeback.  We probably won't see
>> +  // these in practice, but this is conservative and ensures we don't
>> +  // have to worry about these later on.
>> +  if (writeback && (RTX_FRAME_RELATED_P (i1->rtl ())
>> +		    || RTX_FRAME_RELATED_P (i2->rtl ())))
>> +    {
>> +      if (dump_file)
>> +	fprintf (dump_file,
>> +		 "rejecting pair (%d,%d): frame-related insn with writeback\n",
>> +		 i1->uid (), i2->uid ());
>> +      return false;
>> +    }
>> +
>> +  rtx *ignore = &XEXP (pats[1], load_p);
>> +  for (auto use : insns[1]->uses ())
>> +    if (!use->is_mem ()
>> +	&& refers_to_regno_p (use->regno (), use->regno () + 1, pats[1], ignore)
>> +	&& use->def () && use->def ()->insn () == insns[0])
>> +      {
>> +	// N.B. we allow a true dependence on the base address, as this
>> +	// happens in the case of auto-inc accesses.  Consider a post-increment
>> +	// load followed by a regular indexed load, for example.
>> +	if (dump_file)
>> +	  fprintf (dump_file,
>> +		   "%d has non-address true dependence on %d, rejecting pair\n",
>> +		   insns[1]->uid (), insns[0]->uid ());
>> +	return false;
>> +      }
>>
>> -  // If we don't currently have a writeback pair, and we don't have
>> -  // a load that clobbers the base register, look for a trailing destructive
>> -  // update of the base register and try and fold it in to make this into a
>> -  // writeback pair.
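The use-scanning loop just above, in miniature: a pair is rejected when the second insn reads, anywhere outside its address, a register that the first insn writes. A hedged standalone model (registers as plain ints; reject_pair_p is an illustrative name, the real check walks rtl-ssa uses with refers_to_regno_p):

#include <cstdio>
#include <set>

// Toy model of the dependence screen: reject if the second insn reads
// a register the first insn writes, outside the shared address.  Reads
// that occur only in the address are fine - the auto-inc case.
static bool
reject_pair_p (const std::set<int> &defs_i0,
               const std::set<int> &non_addr_uses_i1)
{
  for (int r : non_addr_uses_i1)
    if (defs_i0.count (r))
      return true;
  return false;
}

int
main ()
{
  // ldr x2, [x1], #8 ; ldr x3, [x1]
  // i1 reads the updated x1 only through its address: allowed.
  printf ("reject=%d\n", reject_pair_p ({ 1, 2 }, {}));

  // str x3, [x1], #8 ; str x1, [x1]
  // i1 stores the updated x1 as data: non-address dependence, reject.
  printf ("reject=%d\n", reject_pair_p ({ 1 }, { 1 }));
  return 0;
}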
>> - insn_info *trailing_add = nullptr; >> - if (aarch64_ldp_writeback > 1 >> - && !writeback_effect >> - && (!load_p || (!refers_to_regno_p (base_regno, base_regno + 1, >> - XEXP (pats[0], 0), nullptr) >> - && !refers_to_regno_p (base_regno, base_regno + 1, >> - XEXP (pats[1], 0), nullptr)))) >> + unsigned i = 0; >> + while (i < base_cands.length ()) >> { >> - def_info *add_def; >> - trailing_add = find_trailing_add (insns, move_range, writeback, >> - &writeback_effect, >> - &add_def, base.def, offsets[0], >> - access_size); >> - if (trailing_add) >> + base_cand &cand = base_cands[i]; >> + >> + rtx *ignore[2] = {}; >> + for (int j = 0; j < 2; j++) >> + if (cand.from_insn == !j) >> + ignore[j] = &XEXP (cand_mems[j], 0); >> + >> + insn_info *h = first_hazard_after (insns[0], ignore[0]); >> + if (h && *h < *insns[1]) >> + cand.hazards[0] = h; >> + >> + h = latest_hazard_before (insns[1], ignore[1]); >> + if (h && *h > *insns[0]) >> + cand.hazards[1] = h; >> + >> + if (!cand.viable ()) >> { >> - // The def of the base register from the trailing add should prevail. >> - input_defs[0] = insert_access (attempt, add_def, input_defs[0]); >> - gcc_assert (input_defs[0].is_valid ()); >> + if (dump_file) >> + fprintf (dump_file, >> + "pair (%d,%d): rejecting base %d due to dataflow " >> + "hazards (%d,%d)\n", >> + insns[0]->uid (), >> + insns[1]->uid (), >> + cand.def->regno (), >> + cand.hazards[0]->uid (), >> + cand.hazards[1]->uid ()); >> + >> + base_cands.ordered_remove (i); >> } >> + else >> + i++; >> } >> >> - // Now that we know what base mem we're going to use, check if it's OK >> - // with the ldp/stp policy. >> - rtx first_mem = XEXP (pats[0], load_p); >> - if (!aarch64_mem_ok_with_ldpstp_policy_model (first_mem, >> - load_p, >> - GET_MODE (first_mem))) >> + if (base_cands.is_empty ()) >> { >> if (dump_file) >> - fprintf (dump_file, "punting on pair (%d,%d), ldp/stp policy says no\n", >> - i1->uid (), i2->uid ()); >> + fprintf (dump_file, >> + "can't form pair (%d,%d) due to dataflow hazards\n", >> + insns[0]->uid (), insns[1]->uid ()); >> return false; >> } >> >> - rtx reg_notes = combine_reg_notes (first, second, load_p); >> + insn_info *alias_hazards[4] = {}; >> >> - rtx pair_pat; >> - if (writeback_effect) >> + // First def of memory after the first insn, and last def of memory >> + // before the second insn, respectively. 
>> + def_info *mem_defs[2] = {}; >> + if (load_p) >> { >> - auto patvec = gen_rtvec (3, writeback_effect, pats[0], pats[1]); >> - pair_pat = gen_rtx_PARALLEL (VOIDmode, patvec); >> + if (!MEM_READONLY_P (cand_mems[0])) >> + { >> + mem_defs[0] = memory_access (insns[0]->uses ())->def (); >> + gcc_checking_assert (mem_defs[0]); >> + mem_defs[0] = mem_defs[0]->next_def (); >> + } >> + if (!MEM_READONLY_P (cand_mems[1])) >> + { >> + mem_defs[1] = memory_access (insns[1]->uses ())->def (); >> + gcc_checking_assert (mem_defs[1]); >> + } >> } >> - else if (load_p) >> - pair_pat = aarch64_gen_load_pair (XEXP (pats[0], 0), >> - XEXP (pats[1], 0), >> - XEXP (pats[0], 1)); >> else >> - pair_pat = aarch64_gen_store_pair (XEXP (pats[0], 0), >> - XEXP (pats[0], 1), >> - XEXP (pats[1], 1)); >> + { >> + mem_defs[0] = memory_access (insns[0]->defs ())->next_def (); >> + mem_defs[1] = memory_access (insns[1]->defs ())->prev_def (); >> + gcc_checking_assert (mem_defs[0]); >> + gcc_checking_assert (mem_defs[1]); >> + } >> >> - insn_change *pair_change = nullptr; >> - auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) { >> - rtx_insn *rti = change->insn ()->rtl (); >> - validate_unshare_change (rti, &PATTERN (rti), pair_pat, true); >> - validate_change (rti, ®_NOTES (rti), reg_notes, true); >> + auto tombstone_p = [&](insn_info *insn) -> bool { >> + return m_emitted_tombstone >> + && bitmap_bit_p (&m_tombstone_bitmap, insn->uid ()); >> }; >> >> - if (load_p) >> - { >> - changes.safe_push (make_delete (first)); >> - pair_change = make_change (second); >> - changes.safe_push (pair_change); >> + store_walker<false, decltype(tombstone_p)> >> + forward_store_walker (mem_defs[0], cand_mems[0], insns[1], tombstone_p); >> >> - pair_change->move_range = move_range; >> - pair_change->new_defs = merge_access_arrays (attempt, >> - input_defs[0], >> - input_defs[1]); >> - gcc_assert (pair_change->new_defs.is_valid ()); >> + store_walker<true, decltype(tombstone_p)> >> + backward_store_walker (mem_defs[1], cand_mems[1], insns[0], >> tombstone_p); >> >> - pair_change->new_uses >> - = merge_access_arrays (attempt, >> - drop_memory_access (input_uses[0]), >> - drop_memory_access (input_uses[1])); >> - gcc_assert (pair_change->new_uses.is_valid ()); >> - set_pair_pat (pair_change); >> - } >> + alias_walker *walkers[4] = {}; >> + if (mem_defs[0]) >> + walkers[0] = &forward_store_walker; >> + if (mem_defs[1]) >> + walkers[1] = &backward_store_walker; >> + >> + if (load_p && (mem_defs[0] || mem_defs[1])) >> + do_alias_analysis (alias_hazards, walkers, load_p); >> else >> { >> - using Action = stp_change_builder::action; >> - insn_info *store_to_change = try_repurpose_store (first, second, >> - move_range); >> - stp_change_builder builder (insns, store_to_change, pair_dst); >> - insn_change *change; >> - set_info *new_set = nullptr; >> - for (; !builder.done (); builder.advance ()) >> - { >> - auto action = builder.get_change (); >> - change = (action.type == Action::INSERT) >> - ? 
nullptr : make_change (action.insn); >> - switch (action.type) >> - { >> - case Action::CHANGE: >> - { >> - set_pair_pat (change); >> - change->new_uses = merge_access_arrays (attempt, >> - input_uses[0], >> - input_uses[1]); >> - auto d1 = drop_memory_access (input_defs[0]); >> - auto d2 = drop_memory_access (input_defs[1]); >> - change->new_defs = merge_access_arrays (attempt, d1, d2); >> - gcc_assert (change->new_defs.is_valid ()); >> - def_info *stp_def = memory_access (change->insn ()->defs ()); >> - change->new_defs = insert_access (attempt, >> - stp_def, >> - change->new_defs); >> - gcc_assert (change->new_defs.is_valid ()); >> - change->move_range = move_range; >> - pair_change = change; >> - break; >> - } >> - case Action::TOMBSTONE: >> - { >> - tombstone_uids.quick_push (change->insn ()->uid ()); >> - rtx_insn *rti = change->insn ()->rtl (); >> - validate_change (rti, &PATTERN (rti), gen_tombstone (), true); >> - validate_change (rti, ®_NOTES (rti), NULL_RTX, true); >> - change->new_uses = use_array (nullptr, 0); >> - break; >> - } >> - case Action::INSERT: >> - { >> - if (dump_file) >> - fprintf (dump_file, >> - " stp: cannot re-purpose candidate stores\n"); >> - >> - auto new_insn = crtl->ssa->create_insn (attempt, INSN, pair_pat); >> - change = make_change (new_insn); >> - change->move_range = move_range; >> - change->new_uses = merge_access_arrays (attempt, >> - input_uses[0], >> - input_uses[1]); >> - gcc_assert (change->new_uses.is_valid ()); >> + // We want to find any loads hanging off the first store. >> + mem_defs[0] = memory_access (insns[0]->defs ()); >> + load_walker<false> forward_load_walker (mem_defs[0], insns[0], >> insns[1]); >> + load_walker<true> backward_load_walker (mem_defs[1], insns[1], >> insns[0]); >> + walkers[2] = &forward_load_walker; >> + walkers[3] = &backward_load_walker; >> + do_alias_analysis (alias_hazards, walkers, load_p); >> + // Now consolidate hazards back down. >> + if (alias_hazards[2] >> + && (!alias_hazards[0] || (*alias_hazards[2] < *alias_hazards[0]))) >> + alias_hazards[0] = alias_hazards[2]; >> >> - auto d1 = drop_memory_access (input_defs[0]); >> - auto d2 = drop_memory_access (input_defs[1]); >> - change->new_defs = merge_access_arrays (attempt, d1, d2); >> - gcc_assert (change->new_defs.is_valid ()); >> + if (alias_hazards[3] >> + && (!alias_hazards[1] || (*alias_hazards[3] > *alias_hazards[1]))) >> + alias_hazards[1] = alias_hazards[3]; >> + } >> >> - new_set = crtl->ssa->create_set (attempt, new_insn, memory); >> - change->new_defs = insert_access (attempt, new_set, >> - change->new_defs); >> - gcc_assert (change->new_defs.is_valid ()); >> - pair_change = change; >> - break; >> - } >> - case Action::FIXUP_USE: >> - { >> - // This use now needs to consume memory from our stp. 
>> - if (dump_file) >> - fprintf (dump_file, >> - " stp: changing i%d to use mem from new stp " >> - "(after i%d)\n", >> - action.insn->uid (), pair_dst->uid ()); >> - change->new_uses = drop_memory_access (change->new_uses); >> - gcc_assert (new_set); >> - auto new_use = crtl->ssa->create_use (attempt, action.insn, >> - new_set); >> - change->new_uses = insert_access (attempt, new_use, >> - change->new_uses); >> - break; >> - } >> - } >> - changes.safe_push (change); >> - } >> + if (alias_hazards[0] && alias_hazards[1] >> + && *alias_hazards[0] <= *alias_hazards[1]) >> + { >> + if (dump_file) >> + fprintf (dump_file, >> + "cannot form pair (%d,%d) due to alias conflicts (%d,%d)\n", >> + i1->uid (), i2->uid (), >> + alias_hazards[0]->uid (), alias_hazards[1]->uid ()); >> + return false; >> } >> >> - if (trailing_add) >> - changes.safe_push (make_delete (trailing_add)); >> - else if ((writeback & 2) && !writeback_effect) >> + // Now narrow the hazards on each base candidate using >> + // the alias hazards. >> + i = 0; >> + while (i < base_cands.length ()) >> { >> - // The second insn initially had writeback but now the pair does not, >> - // need to update any nondebug uses of the base register def in the >> - // second insn. We'll take care of debug uses later. >> - auto def = find_access (insns[1]->defs (), base_regno); >> - gcc_assert (def); >> - auto set = dyn_cast<set_info *> (def); >> - if (set && set->has_nondebug_uses ()) >> - { >> - auto orig_use = find_access (insns[0]->uses (), base_regno); >> - for (auto use : set->nondebug_insn_uses ()) >> - { >> - auto change = make_change (use->insn ()); >> - change->new_uses = check_remove_regno_access (attempt, >> - change->new_uses, >> - base_regno); >> - change->new_uses = insert_access (attempt, >> - orig_use, >> - change->new_uses); >> - changes.safe_push (change); >> - } >> + base_cand &cand = base_cands[i]; >> + if (alias_hazards[0] && (!cand.hazards[0] >> + || *alias_hazards[0] < *cand.hazards[0])) >> + cand.hazards[0] = alias_hazards[0]; >> + if (alias_hazards[1] && (!cand.hazards[1] >> + || *alias_hazards[1] > *cand.hazards[1])) >> + cand.hazards[1] = alias_hazards[1]; >> + >> + if (cand.viable ()) >> + i++; >> + else >> + { >> + if (dump_file) >> + fprintf (dump_file, "pair (%d,%d): rejecting base %d due to " >> + "alias/dataflow hazards (%d,%d)", >> + insns[0]->uid (), insns[1]->uid (), >> + cand.def->regno (), >> + cand.hazards[0]->uid (), >> + cand.hazards[1]->uid ()); >> + >> + base_cands.ordered_remove (i); >> } >> } >> >> - auto is_changing = insn_is_changing (changes); >> - for (unsigned i = 0; i < changes.length (); i++) >> - gcc_assert (rtl_ssa::restrict_movement_ignoring (*changes[i], >> is_changing)); >> - >> - // Check the pair pattern is recog'd. >> - if (!rtl_ssa::recog_ignoring (attempt, *pair_change, is_changing)) >> + if (base_cands.is_empty ()) >> { >> if (dump_file) >> - fprintf (dump_file, " failed to form pair, recog failed\n"); >> + fprintf (dump_file, >> + "cannot form pair (%d,%d) due to alias/dataflow hazards", >> + insns[0]->uid (), insns[1]->uid ()); >> >> - // Free any reg notes we allocated. >> - while (reg_notes) >> - { >> - rtx next = XEXP (reg_notes, 1); >> - free_EXPR_LIST_node (reg_notes); >> - reg_notes = next; >> - } >> - cancel_changes (0); >> return false; >> } >> >> - gcc_assert (crtl->ssa->verify_insn_changes (changes)); >> - >> - // Fix up any debug uses that will be affected by the changes. 
>> - if (MAY_HAVE_DEBUG_INSNS) >> - fixup_debug_uses (attempt, insns, orig_rtl, pair_dst, trailing_add, >> - load_p, writeback, writeback_effect, base_regno); >> - >> - confirm_change_group (); >> - crtl->ssa->change_insns (changes); >> - >> - gcc_checking_assert (tombstone_uids.length () <= 2); >> - for (auto uid : tombstone_uids) >> - track_tombstone (uid); >> - >> - return true; >> -} >> - >> -// Return true if STORE_INSN may modify mem rtx MEM. Make sure we keep >> -// within our BUDGET for alias analysis. >> -static bool >> -store_modifies_mem_p (rtx mem, insn_info *store_insn, int &budget) >> -{ >> - if (!budget) >> + base_cand *base = &base_cands[0]; >> + if (base_cands.length () > 1) >> { >> - if (dump_file) >> + // If there are still multiple viable bases, it makes sense >> + // to choose one that allows us to reduce register pressure, >> + // for loads this means moving further down, for stores this >> + // means moving further up. >> + gcc_checking_assert (base_cands.length () == 2); >> + const int hazard_i = !load_p; >> + if (base->hazards[hazard_i]) >> { >> - fprintf (dump_file, >> - "exceeded budget, assuming store %d aliases with mem ", >> - store_insn->uid ()); >> - print_simple_rtl (dump_file, mem); >> - fprintf (dump_file, "\n"); >> + if (!base_cands[1].hazards[hazard_i]) >> + base = &base_cands[1]; >> + else if (load_p >> + && *base_cands[1].hazards[hazard_i] >> + > *(base->hazards[hazard_i])) >> + base = &base_cands[1]; >> + else if (!load_p >> + && *base_cands[1].hazards[hazard_i] >> + < *(base->hazards[hazard_i])) >> + base = &base_cands[1]; >> } >> - >> - return true; >> } >> >> - budget--; >> - return memory_modified_in_insn_p (mem, store_insn->rtl ()); >> -} >> - >> -// Return true if LOAD may be modified by STORE. Make sure we keep >> -// within our BUDGET for alias analysis. >> -static bool >> -load_modified_by_store_p (insn_info *load, >> - insn_info *store, >> - int &budget) >> -{ >> - gcc_checking_assert (budget >= 0); >> + // Otherwise, hazards[0] > hazards[1]. >> + // Pair can be formed anywhere in (hazards[1], hazards[0]). >> + insn_range_info range (insns[0], insns[1]); >> + if (base->hazards[1]) >> + range.first = base->hazards[1]; >> + if (base->hazards[0]) >> + range.last = base->hazards[0]->prev_nondebug_insn (); >> >> - if (!budget) >> + // If the second insn can throw, narrow the move range to exactly that >> insn. >> + // This prevents us trying to move the second insn from the end of the BB. >> + if (cfun->can_throw_non_call_exceptions >> + && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX)) >> { >> - if (dump_file) >> - { >> - fprintf (dump_file, >> - "exceeded budget, assuming load %d aliases with store %d\n", >> - load->uid (), store->uid ()); >> - } >> - return true; >> + gcc_assert (range.includes (insns[1])); >> + range = insn_range_info (insns[1]); >> } >> >> - // It isn't safe to re-order stores over calls. >> - if (CALL_P (load->rtl ())) >> - return true; >> + // Placement strategy: push loads down and pull stores up, this should >> + // help register pressure by reducing live ranges. 
>> + if (load_p) >> + range.first = range.last; >> + else >> + range.last = range.first; >> >> - budget--; >> + if (dump_file) >> + { >> + auto print_hazard = [](insn_info *i) >> + { >> + if (i) >> + fprintf (dump_file, "%d", i->uid ()); >> + else >> + fprintf (dump_file, "-"); >> + }; >> + auto print_pair = [print_hazard](insn_info **i) >> + { >> + print_hazard (i[0]); >> + fprintf (dump_file, ","); >> + print_hazard (i[1]); >> + }; >> >> - // Iterate over all MEMs in the load, seeing if any alias with >> - // our store. >> - subrtx_var_iterator::array_type array; >> - rtx pat = PATTERN (load->rtl ()); >> - FOR_EACH_SUBRTX_VAR (iter, array, pat, NONCONST) >> - if (MEM_P (*iter) && memory_modified_in_insn_p (*iter, store->rtl ())) >> - return true; >> + fprintf (dump_file, "fusing pair [L=%d] (%d,%d), base=%d, hazards: (", >> + load_p, insns[0]->uid (), insns[1]->uid (), >> + base->def->regno ()); >> + print_pair (base->hazards); >> + fprintf (dump_file, "), move_range: (%d,%d)\n", >> + range.first->uid (), range.last->uid ()); >> + } >> >> - return false; >> + return fuse_pair (load_p, access_size, writeback, >> + i1, i2, *base, range); >> } >> >> -// Virtual base class for load/store walkers used in alias analysis. >> -struct alias_walker >> -{ >> - virtual bool conflict_p (int &budget) const = 0; >> - virtual insn_info *insn () const = 0; >> - virtual bool valid () const = 0; >> - virtual void advance () = 0; >> -}; >> - >> -// Implement some common functionality used by both store_walker >> -// and load_walker. >> -template<bool reverse> >> -class def_walker : public alias_walker >> -{ >> -protected: >> - using def_iter_t = typename std::conditional<reverse, >> - reverse_def_iterator, def_iterator>::type; >> - >> - static use_info *start_use_chain (def_iter_t &def_iter) >> - { >> - set_info *set = nullptr; >> - for (; *def_iter; def_iter++) >> - { >> - set = dyn_cast<set_info *> (*def_iter); >> - if (!set) >> - continue; >> - >> - use_info *use = reverse >> - ? set->last_nondebug_insn_use () >> - : set->first_nondebug_insn_use (); >> - >> - if (use) >> - return use; >> - } >> - >> - return nullptr; >> - } >> - >> - def_iter_t def_iter; >> - insn_info *limit; >> - def_walker (def_info *def, insn_info *limit) : >> - def_iter (def), limit (limit) {} >> - >> - virtual bool iter_valid () const { return *def_iter; } >> - >> -public: >> - insn_info *insn () const override { return (*def_iter)->insn (); } >> - void advance () override { def_iter++; } >> - bool valid () const override final >> - { >> - if (!iter_valid ()) >> - return false; >> - >> - if (reverse) >> - return *(insn ()) > *limit; >> - else >> - return *(insn ()) < *limit; >> - } >> -}; >> >> -// alias_walker that iterates over stores. >> -template<bool reverse, typename InsnPredicate> >> -class store_walker : public def_walker<reverse> >> +// LEFT_LIST and RIGHT_LIST are lists of candidate instructions where all >> insns >> +// in LEFT_LIST are known to be adjacent to those in RIGHT_LIST. >> +// >> +// This function traverses the resulting 2D matrix of possible pair >> candidates >> +// and attempts to merge them into pairs. >> +// >> +// The algorithm is straightforward: if we consider a combined list of >> +// candidates X obtained by merging LEFT_LIST and RIGHT_LIST in program >> order, >> +// then we advance through X until we reach a crossing point (where X[i] and >> +// X[i+1] come from different source lists). 
>> +// >> +// At this point we know X[i] and X[i+1] are adjacent accesses, and we try >> to >> +// fuse them into a pair. If this succeeds, we remove X[i] and X[i+1] from >> +// their original lists and continue as above. >> +// >> +// In the failure case, we advance through the source list containing X[i] >> and >> +// continue as above (proceeding to the next crossing point). >> +// >> +// The rationale for skipping over groups of consecutive candidates from the >> +// same source list is as follows: >> +// >> +// In the store case, the insns in the group can't be re-ordered over each >> +// other as they are guaranteed to store to the same location, so we're >> +// guaranteed not to lose opportunities by doing this. >> +// >> +// In the load case, subsequent loads from the same location are either >> +// redundant (in which case they should have been cleaned up by an earlier >> +// optimization pass) or there is an intervening aliasing hazard, in which >> case >> +// we can't re-order them anyway, so provided earlier passes have cleaned up >> +// redundant loads, we shouldn't miss opportunities by doing this. >> +void >> +pair_fusion::merge_pairs (insn_list_t &left_list, >> + insn_list_t &right_list, >> + bool load_p, >> + unsigned access_size) >> { >> - rtx cand_mem; >> - InsnPredicate tombstone_p; >> - >> -public: >> - store_walker (def_info *mem_def, rtx mem, insn_info *limit_insn, >> - InsnPredicate tombstone_fn) : >> - def_walker<reverse> (mem_def, limit_insn), >> - cand_mem (mem), tombstone_p (tombstone_fn) {} >> - >> - bool conflict_p (int &budget) const override final >> - { >> - if (tombstone_p (this->insn ())) >> - return false; >> + if (dump_file) >> + { >> + fprintf (dump_file, "merge_pairs [L=%d], cand vecs ", load_p); >> + dump_insn_list (dump_file, left_list); >> + fprintf (dump_file, " x "); >> + dump_insn_list (dump_file, right_list); >> + fprintf (dump_file, "\n"); >> + } >> >> - return store_modifies_mem_p (cand_mem, this->insn (), budget); >> - } >> -}; >> + auto iter_l = left_list.begin (); >> + auto iter_r = right_list.begin (); >> >> -// alias_walker that iterates over loads. >> -template<bool reverse> >> -class load_walker : public def_walker<reverse> >> + while (iter_l != left_list.end () && iter_r != right_list.end ()) >> + { >> + auto next_l = std::next (iter_l); >> + auto next_r = std::next (iter_r); >> + if (**iter_l < **iter_r >> + && next_l != left_list.end () >> + && **next_l < **iter_r) >> + iter_l = next_l; >> + else if (**iter_r < **iter_l >> + && next_r != right_list.end () >> + && **next_r < **iter_l) >> + iter_r = next_r; >> + else if (try_fuse_pair (load_p, access_size, *iter_l, *iter_r)) >> + { >> + left_list.erase (iter_l); >> + iter_l = next_l; >> + right_list.erase (iter_r); >> + iter_r = next_r; >> + } >> + else if (**iter_l < **iter_r) >> + iter_l = next_l; >> + else >> + iter_r = next_r; >> + } >> +} >> +// If we emitted tombstone insns for this BB, iterate through the BB >> +// and remove all the tombstone insns, being sure to reparent any uses >> +// of mem to previous defs when we do this. >> +void >> +pair_fusion::cleanup_tombstones () >> { >> - using Base = def_walker<reverse>; >> - using use_iter_t = typename std::conditional<reverse, >> - reverse_use_iterator, nondebug_insn_use_iterator>::type; >> + // No need to do anything if we didn't emit a tombstone insn for this BB. 
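The crossing-point traversal in merge_pairs above can be exercised standalone. The sketch below mirrors that loop with plain std::list<int> (program-order positions) in place of insn_list_t, and a stub try_fuse in place of try_fuse_pair (illustrative only):

#include <cstdio>
#include <iterator>
#include <list>

// Stand-in for try_fuse_pair: always succeeds in this model.
static bool
try_fuse (int l, int r)
{
  printf ("fusing (%d,%d)\n", l, r);
  return true;
}

static void
merge_pairs (std::list<int> &left, std::list<int> &right)
{
  auto iter_l = left.begin ();
  auto iter_r = right.begin ();

  while (iter_l != left.end () && iter_r != right.end ())
    {
      auto next_l = std::next (iter_l);
      auto next_r = std::next (iter_r);
      if (*iter_l < *iter_r && next_l != left.end () && *next_l < *iter_r)
        iter_l = next_l;        // skip run of consecutive left candidates
      else if (*iter_r < *iter_l && next_r != right.end () && *next_r < *iter_l)
        iter_r = next_r;        // skip run of consecutive right candidates
      else if (try_fuse (*iter_l, *iter_r))
        {
          left.erase (iter_l);  // next_l stays valid in a std::list
          iter_l = next_l;
          right.erase (iter_r);
          iter_r = next_r;
        }
      else if (*iter_l < *iter_r)
        iter_l = next_l;
      else
        iter_r = next_r;
    }
}

int
main ()
{
  std::list<int> left = { 1, 2, 5 };    // e.g. accesses at offset 0
  std::list<int> right = { 3, 6 };      // e.g. accesses at offset 8
  merge_pairs (left, right);            // fuses (2,3), then (5,6)
  return 0;
}

Note how the first two branches implement the "skip to the crossing point" rationale described in the comment above.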
>> +  if (!m_emitted_tombstone)
>> +    return;
>> +
>> +  insn_info *insn = m_bb->head_insn ();
>> +  while (insn)
>> +    {
>> +      insn_info *next = insn->next_nondebug_insn ();
>> +      if (!insn->is_real ()
>> +	  || !bitmap_bit_p (&m_tombstone_bitmap, insn->uid ()))
>> +	{
>> +	  insn = next;
>> +	  continue;
>> +	}
>> +
>> +      auto def = memory_access (insn->defs ());
>> +      auto set = dyn_cast<set_info *> (def);
>> +      if (set && set->has_any_uses ())
>> +	{
>> +	  def_info *prev_def = def->prev_def ();
>> +	  auto prev_set = dyn_cast<set_info *> (prev_def);
>> +	  if (!prev_set)
>> +	    gcc_unreachable ();
>> +
>> +	  while (set->first_use ())
>> +	    crtl->ssa->reparent_use (set->first_use (), prev_set);
>> +	}
>> +
>> +      // Now set has no uses, we can delete it.
>> +      insn_change change (insn, insn_change::DELETE);
>> +      crtl->ssa->change_insn (change);
>> +      insn = next;
>> +    }
>> +}
>> +
>> +template<typename Map>
>> +void
>> +pair_fusion::traverse_base_map (Map &map)
>> +{
>> +  for (auto kv : map)
>> +    {
>> +      const auto &key = kv.first;
>> +      auto &value = kv.second;
>> +      transform_for_base (key.second, value);
>> +    }
>> +}
>> +
>> +void
>> +pair_fusion::transform ()
>> +{
>> +  traverse_base_map (expr_map);
>> +  traverse_base_map (def_map);
>> +}
>>
>>  // Process our alias_walkers in a round-robin fashion, proceeding until
>>  // nothing more can be learned from alias analysis.
>>  //
>>  // We try to maintain the invariant that if a walker becomes invalid, we
>>  // set its pointer to null.
>> -static void
>> -do_alias_analysis (insn_info *alias_hazards[4],
>> +void
>> +pair_fusion::do_alias_analysis (insn_info *alias_hazards[4],
>>  		   alias_walker *walkers[4],
>>  		   bool load_p)
>>  {
>>    const int n_walkers = 2 + (2 * !load_p);
>> -  int budget = aarch64_ldp_alias_check_limit;
>> +  int budget = pair_mem_alias_check_limit ();
>>
>>    auto next_walker = [walkers,n_walkers](int current) -> int {
>>      for (int j = 1; j <= n_walkers; j++)
>> @@ -2341,548 +2553,554 @@ do_alias_analysis (insn_info *alias_hazards[4],
>>      }
>>    }
>>
>> -// Given INSNS (in program order) which are known to be adjacent, look
>> -// to see if either insn has a suitable RTL (register) base that we can
>> -// use to form a pair.  Push these to BASE_CANDS if we find any.  CAND_MEMs
>> -// gives the relevant mems from the candidate insns, ACCESS_SIZE gives the
>> -// size of a single candidate access, and REVERSED says whether the accesses
>> -// are inverted in offset order.
>> +// Try and actually fuse the pair given by insns I1 and I2.
>>  //
>> -// Returns an integer where bit (1 << i) is set if INSNS[i] uses writeback
>> -// addressing.
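For intuition on the budgeted round-robin in do_alias_analysis above: each walker is polled in turn and every alias query spends from a shared budget (the pair_mem_alias_check_limit hook in this patch). A toy model follows, with the caveat that the real pass treats an exhausted budget as "assume a conflict" rather than simply stopping:

#include <cstdio>

// Sketch of the budgeted round-robin driver.  Walkers are modelled as
// simple counters; the real ones scan rtl-ssa def/use chains.
struct toy_walker
{
  int remaining;                // insns left to inspect
  bool valid () const { return remaining > 0; }
  void advance () { remaining--; }
};

int
main ()
{
  toy_walker walkers[2] = { { 5 }, { 3 } };
  int budget = 6;               // stand-in for the target hook's value

  int current = 0;
  while (budget > 0 && (walkers[0].valid () || walkers[1].valid ()))
    {
      if (walkers[current].valid ())
        {
          walkers[current].advance ();  // one (budgeted) alias query
          budget--;
        }
      current = !current;               // round-robin between walkers
    }

  printf ("left over: %d and %d (budget spent: %d)\n",
          walkers[0].remaining, walkers[1].remaining, 6 - budget);
  return 0;
}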
>> -static int >> -get_viable_bases (insn_info *insns[2], >> - vec<base_cand> &base_cands, >> - rtx cand_mems[2], >> - unsigned access_size, >> - bool reversed) >> +// Here we've done enough analysis to know this is safe, we only >> +// reject the pair at this stage if either the tuning policy says to, >> +// or recog fails on the final pair insn. >> +// >> +// LOAD_P is true for loads, ACCESS_SIZE gives the access size of each >> +// candidate insn. Bit i of WRITEBACK is set if the ith insn (in program >> +// order) uses writeback. >> +// >> +// BASE gives the chosen base candidate for the pair and MOVE_RANGE is >> +// a singleton range which says where to place the pair. >> +bool >> +pair_fusion::fuse_pair (bool load_p, >> + unsigned access_size, >> + int writeback, >> + insn_info *i1, insn_info *i2, >> + base_cand &base, >> + const insn_range_info &move_range) >> { >> - // We discovered this pair through a common base. Need to ensure that >> - // we have a common base register that is live at both locations. >> - def_info *base_defs[2] = {}; >> - int writeback = 0; >> - for (int i = 0; i < 2; i++) >> - { >> - const bool is_lower = (i == reversed); >> - poly_int64 poly_off; >> - rtx base = ldp_strip_offset (cand_mems[i], &poly_off); >> - if (GET_RTX_CLASS (GET_CODE (XEXP (cand_mems[i], 0))) == RTX_AUTOINC) >> - writeback |= (1 << i); >> - >> - if (!REG_P (base) || !poly_off.is_constant ()) >> - continue; >> - >> - // Punt on accesses relative to eliminable regs. See the comment in >> - // ldp_bb_info::track_access for a detailed explanation of this. >> - if (!reload_completed >> - && (REGNO (base) == FRAME_POINTER_REGNUM >> - || REGNO (base) == ARG_POINTER_REGNUM)) >> - continue; >> - >> - HOST_WIDE_INT base_off = poly_off.to_constant (); >> - >> - // It should be unlikely that we ever punt here, since MEM_EXPR offset >> - // alignment should be a good proxy for register offset alignment. 
>> - if (base_off % access_size != 0) >> - { >> - if (dump_file) >> - fprintf (dump_file, >> - "base not viable, offset misaligned (insn %d)\n", >> - insns[i]->uid ()); >> - continue; >> - } >> - >> - base_off /= access_size; >> - >> - if (!is_lower) >> - base_off--; >> - >> - if (base_off < LDP_MIN_IMM || base_off > LDP_MAX_IMM) >> - continue; >> - >> - use_info *use = find_access (insns[i]->uses (), REGNO (base)); >> - gcc_assert (use); >> - base_defs[i] = use->def (); >> - } >> + auto attempt = crtl->ssa->new_change_attempt (); >> >> - if (!base_defs[0] && !base_defs[1]) >> + auto make_change = [&attempt](insn_info *insn) >> { >> - if (dump_file) >> - fprintf (dump_file, "no viable base register for pair (%d,%d)\n", >> - insns[0]->uid (), insns[1]->uid ()); >> - return writeback; >> - } >> - >> - for (int i = 0; i < 2; i++) >> - if ((writeback & (1 << i)) && !base_defs[i]) >> - { >> - if (dump_file) >> - fprintf (dump_file, "insn %d has writeback but base isn't viable\n", >> - insns[i]->uid ()); >> - return writeback; >> - } >> - >> - if (writeback == 3 >> - && base_defs[0]->regno () != base_defs[1]->regno ()) >> + return crtl->ssa->change_alloc<insn_change> (attempt, insn); >> + }; >> + auto make_delete = [&attempt](insn_info *insn) >> { >> - if (dump_file) >> - fprintf (dump_file, >> - "pair (%d,%d): double writeback with distinct regs (%d,%d): " >> - "punting\n", >> - insns[0]->uid (), insns[1]->uid (), >> - base_defs[0]->regno (), base_defs[1]->regno ()); >> - return writeback; >> - } >> + return crtl->ssa->change_alloc<insn_change> (attempt, >> + insn, >> + insn_change::DELETE); >> + }; >> >> - if (base_defs[0] && base_defs[1] >> - && base_defs[0]->regno () == base_defs[1]->regno ()) >> - { >> - // Easy case: insns already share the same base reg. >> - base_cands.quick_push (base_defs[0]); >> - return writeback; >> - } >> + if (*i1 > *i2) >> + return false; >> >> - // Otherwise, we know that one of the bases must change. >> - // >> - // Note that if there is writeback we must use the writeback base >> - // (we know now there is exactly one). >> - for (int i = 0; i < 2; i++) >> - if (base_defs[i] && (!writeback || (writeback & (1 << i)))) >> - base_cands.quick_push (base_cand { base_defs[i], i }); >> + insn_info *first = (*i1 < *i2) ? i1 : i2; >> + insn_info *second = (first == i1) ? i2 : i1; >> >> - return writeback; >> -} >> + insn_info *pair_dst = move_range.singleton (); >> + gcc_assert (pair_dst); >> + >> + insn_info *insns[2] = { first, second }; >> >> -// Given two adjacent memory accesses of the same size, I1 and I2, try >> -// and see if we can merge them into a ldp or stp. >> -// >> -// ACCESS_SIZE gives the (common) size of a single access, LOAD_P is true >> -// if the accesses are both loads, otherwise they are both stores. 
>> -bool >> -ldp_bb_info::try_fuse_pair (bool load_p, unsigned access_size, >> - insn_info *i1, insn_info *i2) >> -{ >> - if (dump_file) >> - fprintf (dump_file, "analyzing pair (load=%d): (%d,%d)\n", >> - load_p, i1->uid (), i2->uid ()); >> + auto_vec<insn_change *> changes; >> + auto_vec<int, 2> tombstone_uids (2); >> >> - insn_info *insns[2]; >> - bool reversed = false; >> - if (*i1 < *i2) >> - { >> - insns[0] = i1; >> - insns[1] = i2; >> - } >> - else >> - { >> - insns[0] = i2; >> - insns[1] = i1; >> - reversed = true; >> - } >> + rtx pats[2] = { >> + PATTERN (first->rtl ()), >> + PATTERN (second->rtl ()) >> + }; >> >> - rtx cand_mems[2]; >> - rtx reg_ops[2]; >> - rtx pats[2]; >> + // Make copies of the patterns as we might need to refer to the original >> RTL >> + // later, for example when updating debug uses (which is after we've >> updated >> + // one or both of the patterns in the candidate insns). >> + rtx orig_rtl[2]; >> for (int i = 0; i < 2; i++) >> - { >> - pats[i] = PATTERN (insns[i]->rtl ()); >> - cand_mems[i] = XEXP (pats[i], load_p); >> - reg_ops[i] = XEXP (pats[i], !load_p); >> - } >> + orig_rtl[i] = copy_rtx (pats[i]); >> >> - if (load_p && reg_overlap_mentioned_p (reg_ops[0], reg_ops[1])) >> - { >> - if (dump_file) >> - fprintf (dump_file, >> - "punting on ldp due to reg conflcits (%d,%d)\n", >> - insns[0]->uid (), insns[1]->uid ()); >> - return false; >> - } >> + use_array input_uses[2] = { first->uses (), second->uses () }; >> + def_array input_defs[2] = { first->defs (), second->defs () }; >> >> - if (cfun->can_throw_non_call_exceptions >> - && find_reg_note (insns[0]->rtl (), REG_EH_REGION, NULL_RTX) >> - && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX)) >> + int changed_insn = -1; >> + if (base.from_insn != -1) >> { >> - if (dump_file) >> - fprintf (dump_file, >> - "can't combine insns with EH side effects (%d,%d)\n", >> - insns[0]->uid (), insns[1]->uid ()); >> - return false; >> - } >> + // If we're not already using a shared base, we need >> + // to re-write one of the accesses to use the base from >> + // the other insn. >> + gcc_checking_assert (base.from_insn == 0 || base.from_insn == 1); >> + changed_insn = !base.from_insn; >> >> - auto_vec<base_cand, 2> base_cands (2); >> + rtx base_pat = pats[base.from_insn]; >> + rtx change_pat = pats[changed_insn]; >> + rtx base_mem = XEXP (base_pat, load_p); >> + rtx change_mem = XEXP (change_pat, load_p); >> >> - int writeback = get_viable_bases (insns, base_cands, cand_mems, >> - access_size, reversed); >> - if (base_cands.is_empty ()) >> - { >> - if (dump_file) >> - fprintf (dump_file, "no viable base for pair (%d,%d)\n", >> - insns[0]->uid (), insns[1]->uid ()); >> - return false; >> - } >> + const bool lower_base_p = (insns[base.from_insn] == i1); >> + HOST_WIDE_INT adjust_amt = access_size; >> + if (!lower_base_p) >> + adjust_amt *= -1; >> >> - // Punt on frame-related insns with writeback. We probably won't see >> - // these in practice, but this is conservative and ensures we don't >> - // have to worry about these later on. 
>> - if (writeback && (RTX_FRAME_RELATED_P (i1->rtl ()) >> - || RTX_FRAME_RELATED_P (i2->rtl ()))) >> - { >> - if (dump_file) >> - fprintf (dump_file, >> - "rejecting pair (%d,%d): frame-related insn with writeback\n", >> - i1->uid (), i2->uid ()); >> - return false; >> - } >> + rtx change_reg = XEXP (change_pat, !load_p); >> + machine_mode mode_for_mem = GET_MODE (change_mem); >> + rtx effective_base = drop_writeback (base_mem); >> + rtx new_mem = adjust_address_nv (effective_base, >> + mode_for_mem, >> + adjust_amt); >> + rtx new_set = load_p >> + ? gen_rtx_SET (change_reg, new_mem) >> + : gen_rtx_SET (new_mem, change_reg); >> >> - rtx *ignore = &XEXP (pats[1], load_p); >> - for (auto use : insns[1]->uses ()) >> - if (!use->is_mem () >> - && refers_to_regno_p (use->regno (), use->regno () + 1, pats[1], ignore) >> - && use->def () && use->def ()->insn () == insns[0]) >> - { >> - // N.B. we allow a true dependence on the base address, as this >> - // happens in the case of auto-inc accesses. Consider a post-increment >> - // load followed by a regular indexed load, for example. >> - if (dump_file) >> - fprintf (dump_file, >> - "%d has non-address true dependence on %d, rejecting pair\n", >> - insns[1]->uid (), insns[0]->uid ()); >> - return false; >> - } >> + pats[changed_insn] = new_set; >> >> - unsigned i = 0; >> - while (i < base_cands.length ()) >> - { >> - base_cand &cand = base_cands[i]; >> + auto keep_use = [&](use_info *u) >> + { >> + return refers_to_regno_p (u->regno (), u->regno () + 1, >> + change_pat, &XEXP (change_pat, load_p)); >> + }; >> >> - rtx *ignore[2] = {}; >> - for (int j = 0; j < 2; j++) >> - if (cand.from_insn == !j) >> - ignore[j] = &XEXP (cand_mems[j], 0); >> + // Drop any uses that only occur in the old address. >> + input_uses[changed_insn] = filter_accesses (attempt, >> + input_uses[changed_insn], >> + keep_use); >> + } >> >> - insn_info *h = first_hazard_after (insns[0], ignore[0]); >> - if (h && *h < *insns[1]) >> - cand.hazards[0] = h; >> + rtx writeback_effect = NULL_RTX; >> + if (writeback) >> + writeback_effect = extract_writebacks (load_p, pats, changed_insn); >> >> - h = latest_hazard_before (insns[1], ignore[1]); >> - if (h && *h > *insns[0]) >> - cand.hazards[1] = h; >> + const auto base_regno = base.def->regno (); >> >> - if (!cand.viable ()) >> - { >> - if (dump_file) >> - fprintf (dump_file, >> - "pair (%d,%d): rejecting base %d due to dataflow " >> - "hazards (%d,%d)\n", >> - insns[0]->uid (), >> - insns[1]->uid (), >> - cand.def->regno (), >> - cand.hazards[0]->uid (), >> - cand.hazards[1]->uid ()); >> + if (base.from_insn == -1 && (writeback & 1)) >> + { >> + // If the first of the candidate insns had a writeback form, we'll >> need to >> + // drop the use of the updated base register from the second insn's >> uses. >> + // >> + // N.B. we needn't worry about the base register occurring as a store >> + // operand, as we checked that there was no non-address true >> dependence >> + // between the insns in try_fuse_pair. >> + gcc_checking_assert (find_access (input_uses[1], base_regno)); >> + input_uses[1] = check_remove_regno_access (attempt, >> + input_uses[1], >> + base_regno); >> + } >> >> - base_cands.ordered_remove (i); >> - } >> - else >> - i++; >> + // Go through and drop uses that only occur in register notes, >> + // as we won't be preserving those. 
>> + for (int i = 0; i < 2; i++) >> + { >> + auto rti = insns[i]->rtl (); >> + if (!REG_NOTES (rti)) >> + continue; >> + >> + input_uses[i] = remove_note_accesses (attempt, input_uses[i]); >> } >> >> - if (base_cands.is_empty ()) >> + // Edge case: if the first insn is a writeback load and the >> + // second insn is a non-writeback load which transfers into the base >> + // register, then we should drop the writeback altogether as the >> + // update of the base register from the second load should prevail. >> + // >> + // For example: >> + // ldr x2, [x1], #8 >> + // ldr x1, [x1] >> + // --> >> + // ldp x2, x1, [x1] >> + if (writeback == 1 >> + && load_p >> + && find_access (input_defs[1], base_regno)) >> { >> if (dump_file) >> fprintf (dump_file, >> - "can't form pair (%d,%d) due to dataflow hazards\n", >> - insns[0]->uid (), insns[1]->uid ()); >> - return false; >> + " pair_mem: i%d has wb but subsequent i%d has non-wb " >> + "update of base (r%d), dropping wb\n", >> + insns[0]->uid (), insns[1]->uid (), base_regno); >> + gcc_assert (writeback_effect); >> + writeback_effect = NULL_RTX; >> } >> >> - insn_info *alias_hazards[4] = {}; >> + // So far the patterns have been in instruction order, >> + // now we want them in offset order. >> + if (i1 != first) >> + std::swap (pats[0], pats[1]); >> >> - // First def of memory after the first insn, and last def of memory >> - // before the second insn, respectively. >> - def_info *mem_defs[2] = {}; >> - if (load_p) >> + poly_int64 offsets[2]; >> + for (int i = 0; i < 2; i++) >> { >> - if (!MEM_READONLY_P (cand_mems[0])) >> - { >> - mem_defs[0] = memory_access (insns[0]->uses ())->def (); >> - gcc_checking_assert (mem_defs[0]); >> - mem_defs[0] = mem_defs[0]->next_def (); >> - } >> - if (!MEM_READONLY_P (cand_mems[1])) >> + rtx mem = XEXP (pats[i], load_p); >> + gcc_checking_assert (MEM_P (mem)); >> + rtx base = strip_offset (XEXP (mem, 0), offsets + i); >> + gcc_checking_assert (REG_P (base)); >> + gcc_checking_assert (base_regno == REGNO (base)); >> + } >> + >> + // If either of the original insns had writeback, but the resulting pair >> insn >> + // does not (can happen e.g. in the pair mem edge case above, or if the >> writeback >> + // effects cancel out), then drop the def(s) of the base register as >> + // appropriate. >> + // >> + // Also drop the first def in the case that both of the original insns had >> + // writeback. The second def could well have uses, but the first def >> should >> + // only be used by the second insn (and we dropped that use above). >> + for (int i = 0; i < 2; i++) >> + if ((!writeback_effect && (writeback & (1 << i))) >> + || (i == 0 && writeback == 3)) >> + input_defs[i] = check_remove_regno_access (attempt, >> + input_defs[i], >> + base_regno); >> + >> + // If we don't currently have a writeback pair, and we don't have >> + // a load that clobbers the base register, look for a trailing destructive >> + // update of the base register and try and fold it in to make this into a >> + // writeback pair. 
>> +  insn_info *trailing_add = nullptr;
>> +  if (pair_trailing_writeback_p ()
>> +      && !writeback_effect
>> +      && (!load_p || (!refers_to_regno_p (base_regno, base_regno + 1,
>> +					  XEXP (pats[0], 0), nullptr)
>> +		      && !refers_to_regno_p (base_regno, base_regno + 1,
>> +					     XEXP (pats[1], 0), nullptr))))
>> +    {
>> +      def_info *add_def;
>> +      trailing_add = find_trailing_add (insns, move_range, writeback,
>> +					&writeback_effect,
>> +					&add_def, base.def, offsets[0],
>> +					access_size);
>> +      if (trailing_add)
>> 	{
>> -	  mem_defs[1] = memory_access (insns[1]->uses ())->def ();
>> -	  gcc_checking_assert (mem_defs[1]);
>> +	  // The def of the base register from the trailing add should prevail.
>> +	  input_defs[0] = insert_access (attempt, add_def, input_defs[0]);
>> +	  gcc_assert (input_defs[0].is_valid ());
>> 	}
>>      }
>> -  else
>> +
>> +  // Now that we know what base mem we're going to use, check if it's OK
>> +  // with the pair mem policy.
>> +  rtx first_mem = XEXP (pats[0], load_p);
>> +  if (!pair_mem_ok_policy (first_mem,
>> +			   load_p,
>> +			   GET_MODE (first_mem)))
>>      {
>> -      mem_defs[0] = memory_access (insns[0]->defs ())->next_def ();
>> -      mem_defs[1] = memory_access (insns[1]->defs ())->prev_def ();
>> -      gcc_checking_assert (mem_defs[0]);
>> -      gcc_checking_assert (mem_defs[1]);
>> +      if (dump_file)
>> +	fprintf (dump_file, "punting on pair (%d,%d), pair mem policy says no\n",
>> +		 i1->uid (), i2->uid ());
>> +      return false;
>>      }
>>
>> -  auto tombstone_p = [&](insn_info *insn) -> bool {
>> -    return m_emitted_tombstone
>> -	   && bitmap_bit_p (&m_tombstone_bitmap, insn->uid ());
>> -  };
>> +  rtx reg_notes = combine_reg_notes (first, second, load_p);
>>
>> -  store_walker<false, decltype(tombstone_p)>
>> -    forward_store_walker (mem_defs[0], cand_mems[0], insns[1], tombstone_p);
>> +  rtx pair_pat;
>>
>> -  store_walker<true, decltype(tombstone_p)>
>> -    backward_store_walker (mem_defs[1], cand_mems[1], insns[0], tombstone_p);
>> +  set_multiword_subreg (first, second, load_p);
>>
>> -  alias_walker *walkers[4] = {};
>> -  if (mem_defs[0])
>> -    walkers[0] = &forward_store_walker;
>> -  if (mem_defs[1])
>> -    walkers[1] = &backward_store_walker;
>> +  pair_pat = gen_load_store_pair (pats, writeback_effect, load_p);
>> +  if (pair_pat == NULL_RTX)
>> +    return false;
>> +  insn_change *pair_change = nullptr;
>> +  auto set_pair_pat = [pair_pat,reg_notes](insn_change *change) {
>> +    rtx_insn *rti = change->insn ()->rtl ();
>> +    validate_unshare_change (rti, &PATTERN (rti), pair_pat, true);
>> +    validate_change (rti, &REG_NOTES (rti), reg_notes, true);
>> +  };
>>
>> -  if (load_p && (mem_defs[0] || mem_defs[1]))
>> -    do_alias_analysis (alias_hazards, walkers, load_p);
>> -  else
>> +  if (load_p)
>>      {
>> -      // We want to find any loads hanging off the first store.
>> -      mem_defs[0] = memory_access (insns[0]->defs ());
>> -      load_walker<false> forward_load_walker (mem_defs[0], insns[0], insns[1]);
>> -      load_walker<true> backward_load_walker (mem_defs[1], insns[1], insns[0]);
>> -      walkers[2] = &forward_load_walker;
>> -      walkers[3] = &backward_load_walker;
>> -      do_alias_analysis (alias_hazards, walkers, load_p);
>> -      // Now consolidate hazards back down.
>> - if (alias_hazards[2] >> - && (!alias_hazards[0] || (*alias_hazards[2] < *alias_hazards[0]))) >> - alias_hazards[0] = alias_hazards[2]; >> + changes.safe_push (make_delete (first)); >> + pair_change = make_change (second); >> + changes.safe_push (pair_change); >> >> - if (alias_hazards[3] >> - && (!alias_hazards[1] || (*alias_hazards[3] > *alias_hazards[1]))) >> - alias_hazards[1] = alias_hazards[3]; >> - } >> + pair_change->move_range = move_range; >> + pair_change->new_defs = merge_access_arrays (attempt, >> + input_defs[0], >> + input_defs[1]); >> + gcc_assert (pair_change->new_defs.is_valid ()); >> >> - if (alias_hazards[0] && alias_hazards[1] >> - && *alias_hazards[0] <= *alias_hazards[1]) >> - { >> - if (dump_file) >> - fprintf (dump_file, >> - "cannot form pair (%d,%d) due to alias conflicts (%d,%d)\n", >> - i1->uid (), i2->uid (), >> - alias_hazards[0]->uid (), alias_hazards[1]->uid ()); >> - return false; >> + pair_change->new_uses >> + = merge_access_arrays (attempt, >> + drop_memory_access (input_uses[0]), >> + drop_memory_access (input_uses[1])); >> + gcc_assert (pair_change->new_uses.is_valid ()); >> + set_pair_pat (pair_change); >> } >> - >> - // Now narrow the hazards on each base candidate using >> - // the alias hazards. >> - i = 0; >> - while (i < base_cands.length ()) >> + else >> { >> - base_cand &cand = base_cands[i]; >> - if (alias_hazards[0] && (!cand.hazards[0] >> - || *alias_hazards[0] < *cand.hazards[0])) >> - cand.hazards[0] = alias_hazards[0]; >> - if (alias_hazards[1] && (!cand.hazards[1] >> - || *alias_hazards[1] > *cand.hazards[1])) >> - cand.hazards[1] = alias_hazards[1]; >> - >> - if (cand.viable ()) >> - i++; >> - else >> + using Action = stp_change_builder::action; >> + insn_info *store_to_change = try_repurpose_store (first, second, >> + move_range); >> + stp_change_builder builder (insns, store_to_change, pair_dst); >> + insn_change *change; >> + set_info *new_set = nullptr; >> + for (; !builder.done (); builder.advance ()) >> { >> - if (dump_file) >> - fprintf (dump_file, "pair (%d,%d): rejecting base %d due to " >> - "alias/dataflow hazards (%d,%d)", >> - insns[0]->uid (), insns[1]->uid (), >> - cand.def->regno (), >> - cand.hazards[0]->uid (), >> - cand.hazards[1]->uid ()); >> - >> - base_cands.ordered_remove (i); >> - } >> - } >> + auto action = builder.get_change (); >> + change = (action.type == Action::INSERT) >> + ? 
>> +        ? nullptr : make_change (action.insn);
>> +      switch (action.type)
>> +        {
>> +        case Action::CHANGE:
>> +          {
>> +            set_pair_pat (change);
>> +            change->new_uses = merge_access_arrays (attempt,
>> +                                                    input_uses[0],
>> +                                                    input_uses[1]);
>> +            auto d1 = drop_memory_access (input_defs[0]);
>> +            auto d2 = drop_memory_access (input_defs[1]);
>> +            change->new_defs = merge_access_arrays (attempt, d1, d2);
>> +            gcc_assert (change->new_defs.is_valid ());
>> +            def_info *stp_def = memory_access (change->insn ()->defs ());
>> +            change->new_defs = insert_access (attempt,
>> +                                              stp_def,
>> +                                              change->new_defs);
>> +            gcc_assert (change->new_defs.is_valid ());
>> +            change->move_range = move_range;
>> +            pair_change = change;
>> +            break;
>> +          }
>> +        case Action::TOMBSTONE:
>> +          {
>> +            tombstone_uids.quick_push (change->insn ()->uid ());
>> +            rtx_insn *rti = change->insn ()->rtl ();
>> +            validate_change (rti, &PATTERN (rti), gen_tombstone (), true);
>> +            validate_change (rti, &REG_NOTES (rti), NULL_RTX, true);
>> +            change->new_uses = use_array (nullptr, 0);
>> +            break;
>> +          }
>> +        case Action::INSERT:
>> +          {
>> +            if (dump_file)
>> +              fprintf (dump_file,
>> +                       "  stp: cannot re-purpose candidate stores\n");
>>
>> -  if (base_cands.is_empty ())
>> -    {
>> -      if (dump_file)
>> -        fprintf (dump_file,
>> -                 "cannot form pair (%d,%d) due to alias/dataflow hazards",
>> -                 insns[0]->uid (), insns[1]->uid ());
>> +            auto new_insn = crtl->ssa->create_insn (attempt, INSN, pair_pat);
>> +            change = make_change (new_insn);
>> +            change->move_range = move_range;
>> +            change->new_uses = merge_access_arrays (attempt,
>> +                                                    input_uses[0],
>> +                                                    input_uses[1]);
>> +            gcc_assert (change->new_uses.is_valid ());
>>
>> -      return false;
>> -    }
>> +            auto d1 = drop_memory_access (input_defs[0]);
>> +            auto d2 = drop_memory_access (input_defs[1]);
>> +            change->new_defs = merge_access_arrays (attempt, d1, d2);
>> +            gcc_assert (change->new_defs.is_valid ());
>>
>> -  base_cand *base = &base_cands[0];
>> -  if (base_cands.length () > 1)
>> -    {
>> -      // If there are still multiple viable bases, it makes sense
>> -      // to choose one that allows us to reduce register pressure,
>> -      // for loads this means moving further down, for stores this
>> -      // means moving further up.
>> -      gcc_checking_assert (base_cands.length () == 2);
>> -      const int hazard_i = !load_p;
>> -      if (base->hazards[hazard_i])
>> -        {
>> -          if (!base_cands[1].hazards[hazard_i])
>> -            base = &base_cands[1];
>> -          else if (load_p
>> -                   && *base_cands[1].hazards[hazard_i]
>> -                      > *(base->hazards[hazard_i]))
>> -            base = &base_cands[1];
>> -          else if (!load_p
>> -                   && *base_cands[1].hazards[hazard_i]
>> -                      < *(base->hazards[hazard_i]))
>> -            base = &base_cands[1];
>> +            new_set = crtl->ssa->create_set (attempt, new_insn, memory);
>> +            change->new_defs = insert_access (attempt, new_set,
>> +                                              change->new_defs);
>> +            gcc_assert (change->new_defs.is_valid ());
>> +            pair_change = change;
>> +            break;
>> +          }
>> +        case Action::FIXUP_USE:
>> +          {
>> +            // This use now needs to consume memory from our stp.
>> +            if (dump_file)
>> +              fprintf (dump_file,
>> +                       "  stp: changing i%d to use mem from new stp "
>> +                       "(after i%d)\n",
>> +                       action.insn->uid (), pair_dst->uid ());
>> +            change->new_uses = drop_memory_access (change->new_uses);
>> +            gcc_assert (new_set);
>> +            auto new_use = crtl->ssa->create_use (attempt, action.insn,
>> +                                                  new_set);
>> +            change->new_uses = insert_access (attempt, new_use,
>> +                                              change->new_uses);
>> +            break;
>> +          }
>> +        }
>> +      changes.safe_push (change);
>>      }
>>     }
>>
>> -  // Otherwise, hazards[0] > hazards[1].
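For anyone new to the tombstone trick in the Action::TOMBSTONE case above:
rtl-ssa will not let us simply delete a store whose memory def may still
have uses at this point, so the dead store is first rewritten into a
harmless placeholder that still provides a memory def; cleanup_tombstones
later reparents any remaining uses to the previous def and deletes the
insn.  The pattern produced by gen_tombstone is roughly of this shape (a
sketch from memory, not quoted from this patch):

  // A no-op memory def, kept only so the mem use/def chains stay
  // well-formed until cleanup_tombstones runs.
  static rtx
  gen_tombstone (void)
  {
    return gen_rtx_CLOBBER (VOIDmode,
                            gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)));
  }
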
>> -  // Pair can be formed anywhere in (hazards[1], hazards[0]).
>> -  insn_range_info range (insns[0], insns[1]);
>> -  if (base->hazards[1])
>> -    range.first = base->hazards[1];
>> -  if (base->hazards[0])
>> -    range.last = base->hazards[0]->prev_nondebug_insn ();
>> -
>> -  // If the second insn can throw, narrow the move range to exactly that insn.
>> -  // This prevents us trying to move the second insn from the end of the BB.
>> -  if (cfun->can_throw_non_call_exceptions
>> -      && find_reg_note (insns[1]->rtl (), REG_EH_REGION, NULL_RTX))
>> +  if (trailing_add)
>> +    changes.safe_push (make_delete (trailing_add));
>> +  else if ((writeback & 2) && !writeback_effect)
>>     {
>> -      gcc_assert (range.includes (insns[1]));
>> -      range = insn_range_info (insns[1]);
>> +      // The second insn initially had writeback but now the pair does not,
>> +      // need to update any nondebug uses of the base register def in the
>> +      // second insn.  We'll take care of debug uses later.
>> +      auto def = find_access (insns[1]->defs (), base_regno);
>> +      gcc_assert (def);
>> +      auto set = dyn_cast<set_info *> (def);
>> +      if (set && set->has_nondebug_uses ())
>> +        {
>> +          auto orig_use = find_access (insns[0]->uses (), base_regno);
>> +          for (auto use : set->nondebug_insn_uses ())
>> +            {
>> +              auto change = make_change (use->insn ());
>> +              change->new_uses = check_remove_regno_access (attempt,
>> +                                                            change->new_uses,
>> +                                                            base_regno);
>> +              change->new_uses = insert_access (attempt,
>> +                                                orig_use,
>> +                                                change->new_uses);
>> +              changes.safe_push (change);
>> +            }
>> +        }
>>     }
>>
>> -  // Placement strategy: push loads down and pull stores up, this should
>> -  // help register pressure by reducing live ranges.
>> -  if (load_p)
>> -    range.first = range.last;
>> -  else
>> -    range.last = range.first;
>> +  auto is_changing = insn_is_changing (changes);
>> +  for (unsigned i = 0; i < changes.length (); i++)
>> +    gcc_assert (rtl_ssa::restrict_movement_ignoring (*changes[i],
>> +                                                     is_changing));
>>
>> -  if (dump_file)
>> +  // Check the pair pattern is recog'd.
>> +  if (!rtl_ssa::recog_ignoring (attempt, *pair_change, is_changing))
>>     {
>> -      auto print_hazard = [](insn_info *i)
>> -        {
>> -          if (i)
>> -            fprintf (dump_file, "%d", i->uid ());
>> -          else
>> -            fprintf (dump_file, "-");
>> -        };
>> -      auto print_pair = [print_hazard](insn_info **i)
>> -        {
>> -          print_hazard (i[0]);
>> -          fprintf (dump_file, ",");
>> -          print_hazard (i[1]);
>> -        };
>> +      if (dump_file)
>> +        fprintf (dump_file, "  failed to form pair, recog failed\n");
>>
>> -      fprintf (dump_file, "fusing pair [L=%d] (%d,%d), base=%d, hazards: (",
>> -               load_p, insns[0]->uid (), insns[1]->uid (),
>> -               base->def->regno ());
>> -      print_pair (base->hazards);
>> -      fprintf (dump_file, "), move_range: (%d,%d)\n",
>> -               range.first->uid (), range.last->uid ());
>> +      // Free any reg notes we allocated.
>> +      while (reg_notes)
>> +        {
>> +          rtx next = XEXP (reg_notes, 1);
>> +          free_EXPR_LIST_node (reg_notes);
>> +          reg_notes = next;
>> +        }
>> +      cancel_changes (0);
>> +      return false;
>>     }
>>
>> -  return fuse_pair (load_p, access_size, writeback,
>> -                    i1, i2, *base, range);
>> +  gcc_assert (crtl->ssa->verify_insn_changes (changes));
>> +
>> +  // Fix up any debug uses that will be affected by the changes.
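Stepping back: the recog failure path above is the unwind half of recog's
grouped-change protocol.  Every pattern and reg-note edit so far was queued
with in_group = true, which is what makes the wholesale cancel_changes (0)
possible; the success path commits with confirm_change_group () just below.
Outside rtl-ssa (which layers restrict_movement_ignoring / recog_ignoring
on top of this), the bare idiom looks like the following minimal sketch,
where insn, new_pat and new_notes are placeholders:

  // Queue two edits to INSN as one atomic group.
  validate_change (insn, &PATTERN (insn), new_pat, /*in_group=*/true);
  validate_change (insn, &REG_NOTES (insn), new_notes, /*in_group=*/true);

  if (verify_changes (0))       // do the queued edits still recog?
    confirm_change_group ();    // yes: commit them
  else
    cancel_changes (0);         // no: restore the original insns
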
>> +  if (MAY_HAVE_DEBUG_INSNS)
>> +    fixup_debug_uses (attempt, insns, orig_rtl, pair_dst, trailing_add,
>> +                      load_p, writeback, writeback_effect, base_regno);
>> +
>> +  confirm_change_group ();
>> +  crtl->ssa->change_insns (changes);
>> +
>> +  gcc_checking_assert (tombstone_uids.length () <= 2);
>> +  for (auto uid : tombstone_uids)
>> +    track_tombstone (uid);
>> +
>> +  return true;
>>  }
>>
>> -static void
>> -dump_insn_list (FILE *f, const insn_list_t &l)
>> +struct aarch64_pair_fusion : public pair_fusion
>>  {
>> -  fprintf (f, "(");
>> +public:
>> +  aarch64_pair_fusion (bb_info *bb) : pair_fusion (bb) {}
>> +  bool is_fpsimd_op_p (rtx reg_op, machine_mode mem_mode, bool load_p)
>> +  {
>> +    const bool fpsimd_op_p
>> +      = reload_completed
>> +      ? (REG_P (reg_op) && FP_REGNUM_P (REGNO (reg_op)))
>> +      : (GET_MODE_CLASS (mem_mode) != MODE_INT
>> +         && (load_p || !aarch64_const_zero_rtx_p (reg_op)));
>> +    return fpsimd_op_p;
>> +  }
>>
>> -  auto i = l.begin ();
>> -  auto end = l.end ();
>> +  // Note the inverted sense: true means the policy model REJECTS the pair.
>> +  bool pair_mem_ok_policy (rtx first_mem, bool load_p, machine_mode mode)
>> +  {
>> +    return !aarch64_mem_ok_with_ldpstp_policy_model (first_mem,
>> +                                                     load_p,
>> +                                                     mode);
>> +  }
>> +  bool pair_operand_mode_ok_p (machine_mode mode);
>>
>> -  if (i != end)
>> -    fprintf (f, "%d", (*i)->uid ());
>> -  i++;
>> +  void transform_for_base (int encoded_lfs,
>> +                           access_group &group);
>> +  rtx gen_load_store_pair (rtx *pats,
>> +                           rtx writeback,
>> +                           bool load_p)
>> +  {
>> +    rtx pair_pat;
>>
>> -  for (; i != end; i++)
>> -    fprintf (f, ", %d", (*i)->uid ());
>> +    if (writeback)
>> +      {
>> +        auto patvec = gen_rtvec (3, writeback, pats[0], pats[1]);
>> +        pair_pat = gen_rtx_PARALLEL (VOIDmode, patvec);
>> +      }
>> +    else if (load_p)
>> +      pair_pat = aarch64_gen_load_pair (XEXP (pats[0], 0),
>> +                                        XEXP (pats[1], 0),
>> +                                        XEXP (pats[0], 1));
>> +    else
>> +      pair_pat = aarch64_gen_store_pair (XEXP (pats[0], 0),
>> +                                         XEXP (pats[0], 1),
>> +                                         XEXP (pats[1], 1));
>> +    return pair_pat;
>> +  }
>>
>> -  fprintf (f, ")");
>> -}
>> +  // Multi-word subregs need no special handling on aarch64, so this
>> +  // hook is deliberately a no-op.
>> +  void set_multiword_subreg (insn_info *, insn_info *, bool)
>> +  {
>> +  }
>> +  bool pair_trailing_writeback_p ()
>> +  {
>> +    return aarch64_ldp_writeback > 1;
>> +  }
>> +  bool pair_check_register_operand (bool load_p, rtx reg_op,
>> +                                    machine_mode mem_mode)
>> +  {
>> +    return (load_p
>> +            ? !aarch64_ldp_reg_operand (reg_op, mem_mode)
>> +            : !aarch64_stp_reg_operand (reg_op, mem_mode));
>> +  }
>> +  int pair_mem_alias_check_limit ()
>> +  {
>> +    return aarch64_ldp_alias_check_limit;
>> +  }
>> +  bool fuseable_store_p (insn_info *i1, insn_info *i2) { return i1 || i2; }
>> +  bool fuseable_load_p (insn_info *insn) { return insn; }
>> +  bool pair_is_writeback ()
>> +  {
>> +    return !aarch64_ldp_writeback;
>> +  }
>> +private:
>> +  int num_pairs;
>> +  rtx_insn *reg_ops[3];
>> +};
>>
>> -DEBUG_FUNCTION void
>> -debug (const insn_list_t &l)
>> +// Decode an LFS key back into its (load_p, fpsimd_p, size) fields;
>> +// see the comment above the main hash tables for the encoding.
>> +static lfs_fields
>> +decode_lfs (int lfs)
>>  {
>> -  dump_insn_list (stderr, l);
>> -  fprintf (stderr, "\n");
>> +  bool load_p = (lfs & (1 << 3));
>> +  bool fpsimd_p = (lfs & (1 << 2));
>> +  unsigned size = 1U << ((lfs & 3) + 2);
>> +  return { load_p, fpsimd_p, size };
>>  }
>>
>> -// LEFT_LIST and RIGHT_LIST are lists of candidate instructions where all insns
>> -// in LEFT_LIST are known to be adjacent to those in RIGHT_LIST.
>> -//
>> -// This function traverses the resulting 2D matrix of possible pair candidates
>> -// and attempts to merge them into pairs.
>> -//
>> -// The algorithm is straightforward: if we consider a combined list of
>> -// candidates X obtained by merging LEFT_LIST and RIGHT_LIST in program order,
>> -// then we advance through X until we reach a crossing point (where X[i] and
>> -// X[i+1] come from different source lists).
>> -//
>> -// At this point we know X[i] and X[i+1] are adjacent accesses, and we try to
>> -// fuse them into a pair.  If this succeeds, we remove X[i] and X[i+1] from
>> -// their original lists and continue as above.
>> -//
>> -// In the failure case, we advance through the source list containing X[i] and
>> -// continue as above (proceeding to the next crossing point).
>> -//
>> -// The rationale for skipping over groups of consecutive candidates from the
>> -// same source list is as follows:
>> -//
>> -// In the store case, the insns in the group can't be re-ordered over each
>> -// other as they are guaranteed to store to the same location, so we're
>> -// guaranteed not to lose opportunities by doing this.
>> -//
>> -// In the load case, subsequent loads from the same location are either
>> -// redundant (in which case they should have been cleaned up by an earlier
>> -// optimization pass) or there is an intervening aliasing hazard, in which case
>> -// we can't re-order them anyway, so provided earlier passes have cleaned up
>> -// redundant loads, we shouldn't miss opportunities by doing this.
>> -void
>> -ldp_bb_info::merge_pairs (insn_list_t &left_list,
>> -                          insn_list_t &right_list,
>> -                          bool load_p,
>> -                          unsigned access_size)
>> +// Return true if we should consider forming ldp/stp insns from memory
>> +// accesses with operand mode MODE at this stage in compilation.
>> +static bool
>> +ldp_operand_mode_ok_p (machine_mode mode)
>>  {
>> -  if (dump_file)
>> -    {
>> -      fprintf (dump_file, "merge_pairs [L=%d], cand vecs ", load_p);
>> -      dump_insn_list (dump_file, left_list);
>> -      fprintf (dump_file, " x ");
>> -      dump_insn_list (dump_file, right_list);
>> -      fprintf (dump_file, "\n");
>> -    }
>> +  const bool allow_qregs
>> +    = !(aarch64_tune_params.extra_tuning_flags
>> +        & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
>>
>> -  auto iter_l = left_list.begin ();
>> -  auto iter_r = right_list.begin ();
>> +  if (!aarch64_ldpstp_operand_mode_p (mode))
>> +    return false;
>>
>> -  while (iter_l != left_list.end () && iter_r != right_list.end ())
>> +  const auto size = GET_MODE_SIZE (mode).to_constant ();
>> +  if (size == 16 && !allow_qregs)
>> +    return false;
>> +
>> +  // We don't pair up TImode accesses before RA because TImode is
>> +  // special in that it can be allocated to a pair of GPRs or a single
>> +  // FPR, and the RA is best placed to make that decision.
>> +  return reload_completed || mode != TImode;
>> +}
>> +
>> +bool
>> +aarch64_pair_fusion::pair_operand_mode_ok_p (machine_mode mode)
>> +{
>> +  return ldp_operand_mode_ok_p (mode);
>> +}
>> +
>> +// Given a pair mode MODE, return a canonical mode to be used for a single
>> +// operand of such a pair.  Currently we only use this when promoting a
>> +// non-writeback pair into a writeback pair, as it isn't otherwise clear
>> +// which mode to use when storing a modeless CONST_INT.
>> +static machine_mode
>> +aarch64_operand_mode_for_pair_mode (machine_mode mode)
>> +{
>> +  switch (mode)
>>     {
>> -      auto next_l = std::next (iter_l);
>> -      auto next_r = std::next (iter_r);
>> -      if (**iter_l < **iter_r
>> -          && next_l != left_list.end ()
>> -          && **next_l < **iter_r)
>> -        iter_l = next_l;
>> -      else if (**iter_r < **iter_l
>> -               && next_r != right_list.end ()
>> -               && **next_r < **iter_l)
>> -        iter_r = next_r;
>> -      else if (try_fuse_pair (load_p, access_size, *iter_l, *iter_r))
>> -        {
>> -          left_list.erase (iter_l);
>> -          iter_l = next_l;
>> -          right_list.erase (iter_r);
>> -          iter_r = next_r;
>> -        }
>> -      else if (**iter_l < **iter_r)
>> -        iter_l = next_l;
>> -      else
>> -        iter_r = next_r;
>> +    case E_V2x4QImode:
>> +      return SImode;
>> +    case E_V2x8QImode:
>> +      return DImode;
>> +    case E_V2x16QImode:
>> +      return V16QImode;
>> +    default:
>> +      gcc_unreachable ();
>>     }
>>  }
>>
>> @@ -2890,8 +3108,8 @@ ldp_bb_info::merge_pairs (insn_list_t &left_list,
>>  // of accesses.  If we find two sets of adjacent accesses, call
>>  // merge_pairs.
>>  void
>> -ldp_bb_info::transform_for_base (int encoded_lfs,
>> -                                 access_group &group)
>> +aarch64_pair_fusion::transform_for_base (int encoded_lfs,
>> +                                         access_group &group)
>>  {
>>    const auto lfs = decode_lfs (encoded_lfs);
>>    const unsigned access_size = lfs.size;
>> @@ -2915,55 +3133,6 @@ ldp_bb_info::transform_for_base (int encoded_lfs,
>>      }
>>  }
>>
>> -// If we emitted tombstone insns for this BB, iterate through the BB
>> -// and remove all the tombstone insns, being sure to reparent any uses
>> -// of mem to previous defs when we do this.
>> -void
>> -ldp_bb_info::cleanup_tombstones ()
>> -{
>> -  // No need to do anything if we didn't emit a tombstone insn for this BB.
>> -  if (!m_emitted_tombstone)
>> -    return;
>> -
>> -  for (auto insn : iterate_safely (m_bb->nondebug_insns ()))
>> -    {
>> -      if (!insn->is_real ()
>> -          || !bitmap_bit_p (&m_tombstone_bitmap, insn->uid ()))
>> -        continue;
>> -
>> -      auto set = as_a<set_info *> (memory_access (insn->defs ()));
>> -      if (set->has_any_uses ())
>> -        {
>> -          auto prev_set = as_a<set_info *> (set->prev_def ());
>> -          while (set->first_use ())
>> -            crtl->ssa->reparent_use (set->first_use (), prev_set);
>> -        }
>> -
>> -      // Now set has no uses, we can delete it.
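To make the crossing-point walk in the merge_pairs loop quoted a little
above concrete, here is a small trace with hypothetical insn uids, where
left holds the candidates for the lower half of a pair and right those for
the upper half:

  // left = (1, 3, 4)   right = (2, 6)    (uids, in program order)
  //
  // (1,2): 1 < 2 and 3 is not also < 2, so 1 and 2 form a crossing
  //        point: try_fuse_pair (1, 2).  On success both are erased
  //        and the iterators advance to 3 and 6.
  // (3,6): 3 < 6 but 4 < 6 as well, so 3 and 4 are consecutive
  //        same-list candidates: skip ahead to 4 without trying (3,6).
  // (4,6): 4 is the last left candidate below 6: try_fuse_pair (4, 6).

The loop itself is presumably unchanged by this patch, just rehomed
(together with its comment) in the target-independent side of the split.
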
>> -      insn_change change (insn, insn_change::DELETE);
>> -      crtl->ssa->change_insn (change);
>> -    }
>> -}
>> -
>> -template<typename Map>
>> -void
>> -ldp_bb_info::traverse_base_map (Map &map)
>> -{
>> -  for (auto kv : map)
>> -    {
>> -      const auto &key = kv.first;
>> -      auto &value = kv.second;
>> -      transform_for_base (key.second, value);
>> -    }
>> -}
>> -
>> -void
>> -ldp_bb_info::transform ()
>> -{
>> -  traverse_base_map (expr_map);
>> -  traverse_base_map (def_map);
>> -}
>> -
>>  static void
>>  ldp_fusion_init ()
>>  {
>> @@ -3174,7 +3343,9 @@ void ldp_fusion_bb (bb_info *bb)
>>    const bool track_stores
>>      = aarch64_tune_params.stp_policy_model != AARCH64_LDP_STP_POLICY_NEVER;
>>
>> -  ldp_bb_info bb_state (bb);
>> +  pair_fusion *bb_state;
>> +  aarch64_pair_fusion derived (bb);
>> +  bb_state = &derived;
>>
>>    for (auto insn : bb->nondebug_insns ())
>>      {
>> @@ -3194,13 +3365,13 @@ void ldp_fusion_bb (bb_info *bb)
>>        continue;
>>
>>        if (track_stores && MEM_P (XEXP (pat, 0)))
>> -        bb_state.track_access (insn, false, XEXP (pat, 0));
>> +        bb_state->track_access (insn, false, XEXP (pat, 0));
>>        else if (track_loads && MEM_P (XEXP (pat, 1)))
>> -        bb_state.track_access (insn, true, XEXP (pat, 1));
>> +        bb_state->track_access (insn, true, XEXP (pat, 1));
>>      }
>>
>> -  bb_state.transform ();
>> -  bb_state.cleanup_tombstones ();
>> +  bb_state->transform ();
>> +  bb_state->cleanup_tombstones ();
>>  }
>>
>>  void ldp_fusion ()
>> @@ -3263,7 +3434,7 @@ public:
>>    }
>>  };
>>
>> -} // anon namespace
>> +}// anon namespace
>>
>>  rtl_opt_pass *
>>  make_pass_ldp_fusion (gcc::context *ctx)
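Finally, to show the intent of the pair_fusion split: a second target would
subclass it and supply only the target-specific hooks.  A hedged sketch
follows (hypothetical target and placeholder bodies; it assumes the hooks
shown in aarch64_pair_fusion above are declared virtual in pair_fusion, as
the cover letter's description of a pure virtual interface suggests):

  struct xyz_pair_fusion : public pair_fusion
  {
    xyz_pair_fusion (bb_info *bb) : pair_fusion (bb) {}

    // Emit this target's paired load/store pattern (placeholder body).
    rtx gen_load_store_pair (rtx *, rtx, bool) override
    { return NULL_RTX; }

    // True iff the target's policy model REJECTS the pair (note the
    // inverted sense inherited from the aarch64 implementation).
    bool pair_mem_ok_policy (rtx, bool, machine_mode) override
    { return false; }

    // Modes this target is willing to pair (placeholder body).
    bool pair_operand_mode_ok_p (machine_mode mode) override
    { return mode == DImode; }

    void transform_for_base (int encoded_lfs, access_group &group) override;
    // ... remaining hooks as in aarch64_pair_fusion above.
  };

Two nits in the hunks just above while here: the three-statement dance
"pair_fusion *bb_state; aarch64_pair_fusion derived (bb); bb_state =
&derived;" could simply be "aarch64_pair_fusion derived (bb);" with a
"pair_fusion *bb_state = &derived;", and the "}// anon namespace" hunk is a
whitespace-only change that is better dropped from the patch.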