Several weeks ago I was looking at SATD and realized that we had loads of permutation constants which could be implemented as a trivial adjustment to a previously loaded permutation constant.
For example, if we had loaded a permutation constant like 1, 3, 1, 3, 5, 7, 5, 7 and we needed 0, 2, 0, 2, 4, 6, 4, 6, we could use vadd.vi to adjust the already-loaded constant into the new constant we wanted.
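To make that concrete, the codegen difference looks roughly like this (register choices and the exact load sequence are illustrative, not actual compiler output):

	lla		a5,.LC0
	vl1re16.v	v1,(a5)		# 1, 3, 1, 3, 5, 7, 5, 7
	lla		a5,.LC1
	vl1re16.v	v2,(a5)		# 0, 2, 0, 2, 4, 6, 4, 6

becomes:

	lla		a5,.LC0
	vl1re16.v	v1,(a5)		# 1, 3, 1, 3, 5, 7, 5, 7
	vadd.vi		v2,v1,-1	# 0, 2, 0, 2, 4, 6, 4, 6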
This has a lot of similarities to SLSR and reload_cse_move2add, but with the added complexity that we're dealing with vectors, and whether or not we can profitably derive one constant from another is a target-dependent decision.
So this is implemented as a mini pass after CSE2. If other targets are interested, we can certainly look to make this more generic. I'm sure we could use a hook to help with the profitability question.
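To give a flavor of what I mean, the hook would just answer the derivability/profitability question given the two constants. Something like the following, where the name and signature are purely invented for illustration -- nothing like this exists in the tree today:

	/* Hypothetical target hook, invented for illustration only.  Return
	   true if the target can cheaply derive the permutation constant
	   DERIVED from the already-loaded permutation constant BASE; for
	   RISC-V that would mean a single vadd.vi suffices.  */
	bool derive_vec_perm_const_p (rtx base, rtx derived);

A generic version of the pass would then query targetm rather than hardwiring the vadd.vi immediate range check you see in the patch below.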
The implementation works by normalizing permutation constants so that the first element is zero, and hashing based on the normalized form. So in the case above the normalized form is 0, 2, 0, 2, 4, 6, 4, 6; that's what gets entered into the hash table for 1, 3, 1, 3, 5, 7, 5, 7, allowing us to realize that all the elements differ by the same value when we later encounter 0, 2, 0, 2, 4, 6, 4, 6.
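For anyone who wants to play with the scheme outside the compiler, here is a minimal standalone sketch of the normalization (plain C++ rather than the pass's rtx-based code; the names are mine, not from the patch):

	#include <cstdint>
	#include <vector>

	struct normalized_const
	{
	  std::vector<int64_t> key;  // normalized elements; key[0] is always 0
	  int64_t bias;              // the original first element
	};

	// Subtract the first element from every element so that related
	// permutation constants collapse to the same hash key.
	static normalized_const
	normalize (const std::vector<int64_t> &v)
	{
	  normalized_const n;
	  n.bias = v[0];
	  for (int64_t e : v)
	    n.key.push_back (e - n.bias);
	  return n;
	}

	// normalize ({1, 3, 1, 3, 5, 7, 5, 7}) -> key 0, 2, 0, 2, 4, 6, 4, 6; bias 1
	// normalize ({0, 2, 0, 2, 4, 6, 4, 6}) -> key 0, 2, 0, 2, 4, 6, 4, 6; bias 0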
After we hit in the hash table, we verify that a simple vadd.vi is sufficient to generate the second constant from the first, then adjust the code appropriately.
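Continuing the sketch above, the verification is just a range check on the difference of the two biases, since vadd.vi takes a 5-bit signed immediate:

	// The second constant is derivable iff the bias delta fits in
	// vadd.vi's 5-bit signed immediate range [-16, 15].
	static bool
	derivable_by_vadd_vi (const normalized_const &prev,
	                      const normalized_const &cur)
	{
	  int64_t delta = cur.bias - prev.bias;
	  return delta >= -16 && delta <= 15;
	}

In the pass itself there is the additional wrinkle that the earlier destination register must not have been modified in between; see the reg_set_between_p test in the code below.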
I tested it on the BPI at the time with good results, though the details have escaped me. At the time it performed poorly on our design; I had a strong suspicion that was due to a particularly poor scheduler model. With our scheduler model in much better shape, I recently retested and it's consistently 1c better for the SATD code. Big win! Wahoo!
Bootstrapped and regression tested on riscv64-linux-gnu, regression tested on riscv64-elf and riscv32-elf as well.
Anyway, RFC for now since it introduces a new target-dependent pass. Comments, criticism, etc. all welcome. The goal would be to try to go forward with something in the gcc-15 timeframe.
Thanks, Jeff
gcc/
	* config.gcc (riscv*): Add riscv-vect-permconst.o to extra_objs.
	* config/riscv/t-riscv: Add rules for riscv-vect-permconst.o.
	* config/riscv/riscv-passes.def: Add pass_vector_permconst after cse2.
	* config/riscv/riscv-protos.h: Add prototype for new pass.
	* config/riscv/riscv-vect-permconst.cc: New file.

commit 7a62350fbddc903e554457b07f72300536134618
Author: Jeff Law <j...@ventanamicro.com>
Date:   Wed Sep 25 16:27:32 2024 -0600

    Optimize related vector permutation constants to avoid memory references.

    Given two permutation constants, say 1, 3, 1, 3, 5, 7, 5, 7 and
    0, 2, 0, 2, 4, 6, 4, 6, normally each would be loaded from the constant
    pool individually.  But we can trivially derive the second from the
    first with vadd.vi <dest>, <src>, -1, which saves a memory reference.

diff --git a/gcc/config.gcc b/gcc/config.gcc
index a3566f5c77d..1365cb507a4 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -549,7 +549,7 @@ pru-*-*)
 riscv*)
 	cpu_type=riscv
 	extra_objs="riscv-builtins.o riscv-c.o riscv-sr.o riscv-shorten-memrefs.o riscv-selftests.o riscv-string.o"
-	extra_objs="${extra_objs} riscv-v.o riscv-vsetvl.o riscv-vector-costs.o riscv-avlprop.o"
+	extra_objs="${extra_objs} riscv-v.o riscv-vsetvl.o riscv-vector-costs.o riscv-avlprop.o riscv-vect-permconst.o"
 	extra_objs="${extra_objs} riscv-vector-builtins.o riscv-vector-builtins-shapes.o riscv-vector-builtins-bases.o"
 	extra_objs="${extra_objs} thead.o riscv-target-attr.o"
 	d_target_objs="riscv-d.o"
diff --git a/gcc/config/riscv/riscv-passes.def b/gcc/config/riscv/riscv-passes.def
index 32b79a78f0b..045ce59ad7d 100644
--- a/gcc/config/riscv/riscv-passes.def
+++ b/gcc/config/riscv/riscv-passes.def
@@ -20,3 +20,4 @@
 INSERT_PASS_AFTER (pass_rtl_store_motion, 1, pass_shorten_memrefs);
 INSERT_PASS_AFTER (pass_split_all_insns, 1, pass_avlprop);
 INSERT_PASS_BEFORE (pass_fast_rtl_dce, 1, pass_vsetvl);
+INSERT_PASS_AFTER (pass_cse2, 1, pass_vector_permconst);
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 860db094e18..29687cfd453 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -196,6 +196,7 @@ extern bool riscv_hard_regno_rename_ok (unsigned, unsigned);
 rtl_opt_pass * make_pass_shorten_memrefs (gcc::context *ctxt);
 rtl_opt_pass * make_pass_avlprop (gcc::context *ctxt);
 rtl_opt_pass * make_pass_vsetvl (gcc::context *ctxt);
+rtl_opt_pass * make_pass_vector_permconst (gcc::context *ctxt);
 
 /* Routines implemented in riscv-string.c.  */
 extern bool riscv_expand_block_compare (rtx, rtx, rtx, rtx);
diff --git a/gcc/config/riscv/riscv-vect-permconst.cc b/gcc/config/riscv/riscv-vect-permconst.cc
new file mode 100644
index 00000000000..f4ea2f525da
--- /dev/null
+++ b/gcc/config/riscv/riscv-vect-permconst.cc
@@ -0,0 +1,299 @@
+/* Copyright (C) 2024 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#define IN_TARGET_CODE 1
+#define INCLUDE_ALGORITHM
+#define INCLUDE_FUNCTIONAL
+#define INCLUDE_MEMORY
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "backend.h"
+#include "rtl.h"
+#include "target.h"
+#include "tree-pass.h"
+#include "df.h"
+#include "rtl-ssa.h"
+#include "cfgcleanup.h"
+#include "insn-attr.h"
+#include "tm-constrs.h"
+#include "insn-opinit.h"
+#include "cfgrtl.h"
+
+/* So the basic idea of this pass is to identify loads of permutation
+   constants from the constant pool which could instead be trivially
+   derived from some earlier vector permutation constant.  This will
+   replace a memory load from the constant pool with a vadd.vi
+   instruction.
+
+   Conceptually this is much like the related_values optimization in
+   CSE, reload_cse_move2add or using SLSR to optimize constant synthesis.
+   If we wanted to make this generic I would suggest putting it into CSE
+   and providing target hooks to determine if particular permutation
+   constants could be derived from earlier permutation constants.  */
+
+const pass_data pass_data_vect_permconst =
+{
+  RTL_PASS, /* type */
+  "vect_permconst", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_NONE, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+/* Entry in the hash table.  We "normalize" the permutation constant
+   by adjusting all entries by the value in the first element.  This
+   allows simple hashing to discover permutation constants that differ
+   by a single constant across all their elements and may be derived
+   from each other with a vadd.vi.  */
+
+struct vector_permconst_entry
+{
+  /* The CONST_VECTOR in normalized form (first entry is zero).
+     We could avoid copying the vector with a more customized hash
+     routine which took care of normalization.  */
+  rtx normalized_vec;
+
+  /* The destination register holding the CONST_VECTOR.  When the
+     optimization applies this will be used as a source operand in
+     the vadd.vi.  */
+  rtx dest;
+
+  /* The insn generating DEST.  The only reason we need this is because
+     we do not invalidate entries, which implies we have to verify that
+     DEST is unchanged between INSN and the point where we want to use
+     DEST to derive a new permutation constant.  */
+  rtx_insn *insn;
+
+  /* The bias of this entry used for normalization.  If this value is
+     added to each element in NORMALIZED_VEC we would have the original
+     permutation constant.  */
+  HOST_WIDE_INT bias;
+};
+
+struct const_vector_hasher : nofree_ptr_hash <vector_permconst_entry>
+{
+  static inline hashval_t hash (const vector_permconst_entry *);
+  static inline bool equal (const vector_permconst_entry *,
+			    const vector_permconst_entry *);
+};
+
+inline bool
+const_vector_hasher::equal (const vector_permconst_entry *vpe1,
+			    const vector_permconst_entry *vpe2)
+{
+  /* Do the cheap tests first, namely that the mode and number of entries
+     match between the two entries.  */
+  if (GET_MODE (vpe1->normalized_vec) != GET_MODE (vpe2->normalized_vec))
+    return false;
+
+  if (CONST_VECTOR_NUNITS (vpe1->normalized_vec).to_constant ()
+      != CONST_VECTOR_NUNITS (vpe2->normalized_vec).to_constant ())
+    return false;
+
+  /* Check the value of each entry in the vector.  We violate structure
+     sharing rules inside this pass, so while pointer equality would
+     normally be OK, it isn't here.  */
+  for (int i = 0;
+       i < CONST_VECTOR_NUNITS (vpe1->normalized_vec).to_constant ();
+       i++)
+    if (!rtx_equal_p (CONST_VECTOR_ELT (vpe1->normalized_vec, i),
+		      CONST_VECTOR_ELT (vpe2->normalized_vec, i)))
+      return false;
+
+  return true;
+}
+
+inline hashval_t
+const_vector_hasher::hash (const vector_permconst_entry *vpe)
+{
+  int do_not_record;
+  return hash_rtx (vpe->normalized_vec, Pmode, &do_not_record, NULL, false);
+}
+
+class vector_permconst : public rtl_opt_pass
+{
+public:
+  vector_permconst (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_vect_permconst, ctxt) {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *) final override
+  {
+    return TARGET_VECTOR && optimize > 0;
+  }
+  virtual unsigned int execute (function *) final override;
+
+private:
+  void process_bb (basic_block);
+  hash_table<const_vector_hasher> *vector_permconst_table;
+}; // class pass_vector_permconst
+
+/* Try to optimize vector permutation constants in BB.  */
+void
+vector_permconst::process_bb (basic_block bb)
+{
+  vector_permconst_table = new hash_table<const_vector_hasher> (11);
+
+  /* Walk the insns in BB searching for vector loads from the constant
+     pool which can be satisfied by adjusting an earlier load with
+     trivial arithmetic.  */
+  rtx_insn *insn;
+  FOR_BB_INSNS (bb, insn)
+    {
+      if (!INSN_P (insn))
+	continue;
+
+      rtx set = single_set (insn);
+      if (!set)
+	continue;
+
+      rtx dest = SET_DEST (set);
+      if (GET_MODE_CLASS (GET_MODE (dest)) != MODE_VECTOR_INT)
+	continue;
+
+      rtx src = SET_SRC (set);
+      if (!MEM_P (src))
+	continue;
+
+      /* A load from the constant pool should have a REG_EQUAL
+	 note with the vector constant in the note.  */
+      rtx note = find_reg_equal_equiv_note (insn);
+      if (!note
+	  || REG_NOTE_KIND (note) != REG_EQUAL
+	  || GET_CODE (XEXP (note, 0)) != CONST_VECTOR)
+	continue;
+
+      if (!CONST_VECTOR_NUNITS (XEXP (note, 0)).is_constant ())
+	continue;
+
+      /* XXX Do we need to consider other forms of constants?  */
+
+      /* We want to be selective about what gets past this point since
+	 we make a copy of the vector and possibly enter it into the
+	 hash table.  So reject cases that are not likely a permutation
+	 constant, i.e., negative bias and large biases.  We arbitrarily
+	 use 16k as the largest vector size in bits we try to optimize.
+
+	 It may seem like a bias outside the range of vadd.vi should
+	 be rejected, but what really matters is the difference of
+	 biases across the two permutation constants.  */
+      rtx cvec = XEXP (note, 0);
+      HOST_WIDE_INT bias = INTVAL (CONST_VECTOR_ELT (cvec, 0));
+      if (bias < 0 || bias > 16384 / 8)
+	continue;
+
+      /* At this point we have a load of a constant integer vector from
+	 the constant pool.  That constant integer vector is hopefully a
+	 permutation constant.  We need to make a copy of the vector and
+	 normalize it to zero.
+
+	 XXX This violates structure sharing conventions.  */
+      rtvec_def *nvec = gen_rtvec (CONST_VECTOR_NUNITS (cvec).to_constant ());
+
+      for (int i = 0; i < CONST_VECTOR_NUNITS (cvec).to_constant (); i++)
+	nvec->elem[i] = GEN_INT (INTVAL (CONST_VECTOR_ELT (cvec, i)) - bias);
+
+      rtx copy = gen_rtx_CONST_VECTOR (GET_MODE (cvec), nvec);
+
+      /* Now that we have a normalized vector, look it up in the hash
+	 table, inserting it if it wasn't already in the table.  */
+      struct vector_permconst_entry tmp;
+      tmp.normalized_vec = copy;
+      struct vector_permconst_entry **slot
+	= vector_permconst_table->find_slot (&tmp, INSERT);
+      if (*slot == NULL)
+	{
+	  /* This constant was not in the table, so initialize the hash
+	     table entry.  */
+	  *slot = XNEW (vector_permconst_entry);
+	  (*slot)->normalized_vec = copy;
+	  (*slot)->dest = dest;
+	  (*slot)->bias = bias;
+	  (*slot)->insn = insn;
+	}
+      else
+	{
+	  /* A hit in the hash table.  We may be able to optimize this
+	     case.
+
+	     If the difference in biases fits in the immediate range
+	     for vadd.vi, then we may optimize.  */
+	  HOST_WIDE_INT adjustment = bias - (*slot)->bias;
+	  if (IN_RANGE (adjustment, -16, 15))
+	    {
+	      /* We also need to make sure the destination register was
+		 not modified.  I've chosen to test for that at
+		 optimization time rather than invalidate entries in the
+		 table.  This could be changed to use REG_TICK like
+		 schemes or true invalidation if this proves too
+		 compile-time costly.  */
+	      if (!reg_set_between_p ((*slot)->dest, (*slot)->insn, insn))
+		{
+		  /* Instead of loading from the constant pool, adjust the
+		     output of the earlier insn into our destination.  */
+		  rtx x = gen_const_vec_duplicate (GET_MODE (copy),
+						   GEN_INT (adjustment));
+		  rtx plus = gen_rtx_PLUS (GET_MODE (copy), (*slot)->dest, x);
+		  rtx set = gen_rtx_SET (dest, plus);
+		  emit_insn_after (set, insn);
+		  /* XXX Should we copy over the REG_EQUAL note first?  */
+		  delete_insn (insn);
+		}
+	    }
+
+	  /* We always keep the hash table entry pointing to the most
+	     recent INSN that could generate the normalized entry.  We
+	     can adjust in the future if data says it's useful to do so.
+	     This just keeps things simple for now.
+
+	     For example, we might want to keep multiple entries if they
+	     have different biases.  */
+	  (*slot)->dest = dest;
+	  (*slot)->bias = bias;
+	  (*slot)->insn = insn;
+	}
+    }
+
+  /* We construct and tear down the table for each block.  This may
+     be overly expensive.  */
+  vector_permconst_table->empty ();
+}
+
+/* Main entry point for this pass.  */
+unsigned int
+vector_permconst::execute (function *fn)
+{
+  /* Handle each block independently.  While this should work nicely on
+     EBBs, let's wait for real world cases where it matters before adding
+     that complexity.  */
+  basic_block bb;
+  FOR_EACH_BB_FN (bb, fn)
+    process_bb (bb);
+
+  return 0;
+}
+
+rtl_opt_pass *
+make_pass_vector_permconst (gcc::context *ctxt)
+{
+  return new vector_permconst (ctxt);
+}
diff --git a/gcc/config/riscv/t-riscv b/gcc/config/riscv/t-riscv
index 38494320d8b..2a64c28fe49 100644
--- a/gcc/config/riscv/t-riscv
+++ b/gcc/config/riscv/t-riscv
@@ -86,6 +86,13 @@ riscv-avlprop.o: $(srcdir)/config/riscv/riscv-avlprop.cc \
 	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
 		$(srcdir)/config/riscv/riscv-avlprop.cc
 
+riscv-vect-permconst.o: $(srcdir)/config/riscv/riscv-vect-permconst.cc \
+  $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(RTL_H) $(REGS_H) \
+  $(TARGET_H) tree-pass.h df.h rtl-ssa.h cfgcleanup.h insn-attr.h \
+  tm-constrs.h insn-opinit.h cfgrtl.h
+	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
+		$(srcdir)/config/riscv/riscv-vect-permconst.cc
+
 riscv-d.o: $(srcdir)/config/riscv/riscv-d.cc \
 	$(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H)
 	$(COMPILE) $<