https://gcc.gnu.org/g:742f55622690d35c6cc95f2b8722307699731571
commit r16-2172-g742f55622690d35c6cc95f2b8722307699731571 Author: Daniel Barboza <dbarb...@ventanamicro.com> Date: Thu Jul 10 07:28:38 2025 -0600 [RISC-V] Detect new fusions for RISC-V This is primarily Daniel's work... He's chasing things in QEMU & LLVM right now so I'm doing a bit of clean-up and shepherding this patch forward. -- Instruction fusion is a reasonably common way to improve the performance of code on many architectures/designs. A few years ago we submitted (via VRULL I suspect) fusion support for a number of cases in the RISC-V space. We made each type of fusion selectable independently in the tuning structure so that designs which implemented some particular set of fusions could select just the ones their design implemented. This patch adds to that generic infrastructure. In particular we're introducing additional load fusions, store pair fusions, bitfield extractions and a few B extension related fusions. Conceptually for the new load fusions we're adding the ability to fuse most add/shNadd instructions with a subsequent load. There's a couple of exceptions, but in general the expectation is that if we have add/shNadd for address computation, then they can potentially fuse with the load where the address gets used. We've had limited forms of store pair fusion for a while. Essentially we required both stores to be 64 bits wide and land on opposite sides of a 128 bit cache line. That was enough to help prologues and a few other things, but was fairly restrictive. The new cases capture store pairs where the two stores have the same size and hit consecutive memory locations. For example, storing consecutive bytes with sb+sb is fusible. For bitfield extractions we can fuse together a shift left followed by a shift right for arbitrary shift counts, whereas previously we restricted the shift counts to those implementing sign/zero extensions of 8 and 16 bit objects. Finally, some B extension fusions: 
orc.b+not which shows up in string comparisons, ctz+andi (deepsjeng?), neg+max (synthesized abs). I hope these prove to be useful to other RISC-V designs. I wouldn't be surprised if we have to break down the new load fusions further for some designs. If we need to do that it wouldn't be hard. FWIW, our data indicates the generalized store fusions followed by the expanded load fusions are the most important cases for the new code. These have been tested with crosses and bootstrapped on the BPI. Waiting on pre-commit CI before moving forward (though it has been failing to pick up some patches recently...) gcc/ * config/riscv/riscv.cc (riscv_fusion_pairs): Add new cases. (riscv_set_is_add): New function. (riscv_set_is_addi, riscv_set_is_adduw, riscv_set_is_shNadd): Likewise. (riscv_set_is_shNadduw): Likewise. (riscv_macro_fusion_pair_p): Add new fusion cases. Co-authored-by: Jeff Law <j...@ventanamicro.com> Diff: --- gcc/config/riscv/riscv.cc | 383 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 382 insertions(+), 1 deletion(-) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index b868a503a35f..023adc3284df 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -283,6 +283,10 @@ enum riscv_fusion_pairs RISCV_FUSE_AUIPC_LD = (1 << 7), RISCV_FUSE_LDPREINCREMENT = (1 << 8), RISCV_FUSE_ALIGNED_STD = (1 << 9), + RISCV_FUSE_CACHE_ALIGNED_STD = (1 << 10), + RISCV_FUSE_BFEXT = (1 << 11), + RISCV_FUSE_EXPANDED_LD = (1 << 12), + RISCV_FUSE_B_ALUI = (1 << 13), }; /* Costs of various operations on the different architectures. 
*/ @@ -10205,6 +10209,81 @@ riscv_fusion_enabled_p(enum riscv_fusion_pairs op) return tune_param->fusible_ops & op; } +/* Matches an add: + (set (reg:DI rd) (plus:SI (reg:SI rs1) (reg:SI rs2))) */ + +static bool +riscv_set_is_add (rtx set) +{ + return (GET_CODE (SET_SRC (set)) == PLUS + && REG_P (XEXP (SET_SRC (set), 0)) + && REG_P (XEXP (SET_SRC (set), 1)) + && REG_P (SET_DEST (set))); +} + +/* Matches an addi: + (set (reg:DI rd) (plus:SI (reg:SI rs1) (const_int imm))) */ + +static bool +riscv_set_is_addi (rtx set) +{ + return (GET_CODE (SET_SRC (set)) == PLUS + && REG_P (XEXP (SET_SRC (set), 0)) + && CONST_INT_P (XEXP (SET_SRC (set), 1)) + && REG_P (SET_DEST (set))); +} + +/* Matches an add.uw: + (set (reg:DI rd) + (plus:DI (zero_extend:DI (reg:SI rs1)) (reg:DI rs2))) */ + +static bool +riscv_set_is_adduw (rtx set) +{ + return (GET_CODE (SET_SRC (set)) == PLUS + && GET_CODE (XEXP (SET_SRC (set), 0)) == ZERO_EXTEND + && REG_P (XEXP (XEXP (SET_SRC (set), 0), 0)) + && REG_P (XEXP (SET_SRC (set), 1)) + && REG_P (SET_DEST (set))); +} + +/* Matches a shNadd: + (set (reg:DI rd) + (plus:DI (ashift:DI (reg:DI rs1) (const_int N)) (reg:DI rs2))) */ + +static bool +riscv_set_is_shNadd (rtx set) +{ + return (GET_CODE (SET_SRC (set)) == PLUS + && GET_CODE (XEXP (SET_SRC (set), 0)) == ASHIFT + && REG_P (XEXP (XEXP (SET_SRC (set), 0), 0)) + && CONST_INT_P (XEXP (XEXP (SET_SRC (set), 0), 1)) + && (INTVAL (XEXP (XEXP (SET_SRC (set), 0), 1)) == 1 + || INTVAL (XEXP (XEXP (SET_SRC (set), 0), 1)) == 2 + || INTVAL (XEXP (XEXP (SET_SRC (set), 0), 1)) == 3) + && REG_P (SET_DEST (set))); +} + +/* Matches a shNadd.uw: + (set (reg:DI rd) + (plus:DI (and:DI (ashift:DI (reg:DI rs1) (const_int N)) + (const_int MASK)) + (reg:DI rs2))) */ + +static bool +riscv_set_is_shNadduw (rtx set) +{ + return (GET_CODE (SET_SRC (set)) == PLUS + && GET_CODE (XEXP (SET_SRC (set), 0)) == AND + && GET_CODE (XEXP (XEXP (SET_SRC (set), 0), 0)) == ASHIFT + && REG_P (XEXP (XEXP (XEXP (SET_SRC (set), 0), 0), 0)) + && 
CONST_INT_P (XEXP (XEXP (XEXP (SET_SRC (set), 0), 0), 1)) + && (INTVAL (XEXP (XEXP (XEXP (SET_SRC (set), 0), 0), 1)) == 1 + || INTVAL (XEXP (XEXP (XEXP (SET_SRC (set), 0), 0), 1)) == 2 + || INTVAL (XEXP (XEXP (XEXP (SET_SRC (set), 0), 0), 1)) == 3) + && REG_P (SET_DEST (set))); +} + /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR should be kept together during scheduling. */ @@ -10334,6 +10413,139 @@ riscv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) } } + if (simple_sets_p && riscv_fusion_enabled_p (RISCV_FUSE_EXPANDED_LD) + && (sched1 || prev_dest_regno == curr_dest_regno)) + { + /* For the "expanded add/load fusion" family we have 2 main + categories: memory loads with displacement (i.e. with imm offset) + and loads without displacement (i.e. with offset = x0). + + For loads without displacement we'll need: + - add + ld (done in RISCV_FUSE_LDINDEXED) + - addi + ld (done in RISCV_FUSE_LDPREINCREMENT) + - shNadd + ld + - add.uw + lw + - shNadd.uw + lw + + For loads with displacement/immediates: + with lw with immediate): + - add + ld with displacement + - addi + ld with displacement + - shNadd + ld with displacement + - add.uw + lw with displacement + - shNadd.uw + lw with displacement */ + + /* We're trying to match a curr_set ld with displacement: + prev (add|addi) = (set (reg:DI rd) (...)) + curr (ld) == (set (reg:DI rD) + (mem:DI (plus:DI (reg:DI rD) (const_int IMM12)))) */ + if (MEM_P (SET_SRC (curr_set)) + && SCALAR_INT_MODE_P (GET_MODE (SET_DEST (curr_set))) + && GET_CODE (XEXP (SET_SRC (curr_set), 0)) == PLUS + && REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)) == prev_dest_regno) + { + if (riscv_set_is_add (prev_set)) + { + if (dump_file) + fprintf (dump_file, "RISCV_FUSE_EXPANDED_LD\n"); + return true; + } + + if (riscv_set_is_addi (prev_set)) + { + if (dump_file) + fprintf (dump_file, "RISCV_FUSE_EXPANDED_LD\n"); + return true; + } + + if (riscv_set_is_shNadd (prev_set)) + { + if (dump_file) + fprintf (dump_file, 
"RISCV_FUSE_EXPANDED_LD\n"); + return true; + } + } + + /* We're trying to match a ld without displacement: + prev (addi|shNadd) = (reg:DI rD) (...)) + curr (ld) == (set (reg:DI rD) + (mem:DI (reg:DI rD))) */ + if (MEM_P (SET_SRC (curr_set)) + && SCALAR_INT_MODE_P (GET_MODE (SET_DEST (curr_set))) + && REG_P (XEXP (SET_SRC (curr_set), 0)) + && REGNO (XEXP (SET_SRC (curr_set), 0)) == prev_dest_regno) + { + if (riscv_set_is_addi (prev_set)) + { + if (dump_file) + fprintf (dump_file, "RISCV_FUSE_EXPANDED_LD\n"); + return true; + } + + if (riscv_set_is_shNadd (prev_set)) + { + if (dump_file) + fprintf (dump_file, "RISCV_FUSE_EXPANDED_LD\n"); + return true; + } + } + + /* We're trying to match a curr_set lw with displacement: + prev (add.uw|shNadd.uw) = (set (reg:DI rd) (...)) + curr (lw) == (set (reg:DI rd) + (any_extend:DI (mem:SUBX (plus:DI ((reg:DI rd) + (const_int IMM)))) */ + if ((GET_CODE (SET_SRC (curr_set)) == SIGN_EXTEND + || (GET_CODE (SET_SRC (curr_set)) == ZERO_EXTEND)) + && MEM_P (XEXP (SET_SRC (curr_set), 0)) + && SCALAR_INT_MODE_P (GET_MODE (SET_DEST (curr_set))) + && GET_CODE (XEXP (XEXP (SET_SRC (curr_set), 0), 0)) == PLUS + && REG_P (XEXP (XEXP (XEXP (SET_SRC (curr_set), 0), 0),0)) + && (REGNO (XEXP (XEXP (XEXP (SET_SRC (curr_set), 0), 0),0)) + == prev_dest_regno)) + { + if (riscv_set_is_adduw (prev_set)) + { + if (dump_file) + fprintf (dump_file, "RISCV_FUSE_EXPANDED_LD\n"); + return true; + } + + if (riscv_set_is_shNadduw (prev_set)) + { + if (dump_file) + fprintf (dump_file, "RISCV_FUSE_EXPANDED_LD\n"); + return true; + } + } + + /* We're trying to match a curr_set lw without displacement: + prev (add.uw|shNadd.uw) = (set (reg:DI rd) (...)) + curr (ld|lh|lw) == (set (reg:DI rd) + (any_extend:DI (mem:SUBX (reg:DI rsd)))) */ + if ((GET_CODE (SET_SRC (curr_set)) == SIGN_EXTEND + || (GET_CODE (SET_SRC (curr_set)) == ZERO_EXTEND)) + && MEM_P (XEXP (SET_SRC (curr_set), 0)) + && SCALAR_INT_MODE_P (GET_MODE (SET_DEST (curr_set))) + && REG_P (XEXP (XEXP 
(SET_SRC (curr_set), 0), 0)) + && REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)) == prev_dest_regno) + { + if (riscv_set_is_adduw (prev_set)) + { + if (dump_file) + fprintf (dump_file, "RISCV_FUSE_EXPANDED_LD\n"); + return true; + } + + if (riscv_set_is_shNadduw (prev_set)) + { + if (dump_file) + fprintf (dump_file, "RISCV_FUSE_EXPANDED_LD\n"); + return true; + } + } + } + if (simple_sets_p && riscv_fusion_enabled_p (RISCV_FUSE_LDPREINCREMENT) && (sched1 || prev_dest_regno == curr_dest_regno)) { @@ -10474,7 +10686,7 @@ riscv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) } } - if (simple_sets_p && riscv_fusion_enabled_p (RISCV_FUSE_ALIGNED_STD)) + if (simple_sets_p && riscv_fusion_enabled_p (RISCV_FUSE_CACHE_ALIGNED_STD)) { /* We are trying to match the following: prev (sd) == (set (mem (plus (reg sp|fp) (const_int))) @@ -10528,6 +10740,175 @@ riscv_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) } } + /* More general form of the RISCV_FUSE_CACHE_ALIGNED_STD. The + major difference is the dependency on the stores being opposite + halves of a cache line is dropped. Instead the lowest address + needs 2X the alignment of the object and the higher address + immediately followed the first object. 
*/ + if (simple_sets_p && riscv_fusion_enabled_p (RISCV_FUSE_ALIGNED_STD)) + { + /* We are trying to match the following: + prev (sd) == (set (mem (plus (reg rS1) (const_int))) + (reg rS2)) + curr (sd) == (set (mem (plus (reg rS1) (const_int))) + (reg rS3)) */ + + if (MEM_P (SET_DEST (prev_set)) + && SCALAR_INT_MODE_P (GET_MODE (SET_DEST (curr_set))) + && MEM_P (SET_DEST (curr_set)) + /* Stores must have the same width */ + && GET_MODE (SET_DEST (curr_set)) == GET_MODE (SET_DEST (prev_set))) + { + rtx base_prev, base_curr, offset_prev, offset_curr; + unsigned mode_size; + + extract_base_offset_in_addr (SET_DEST (prev_set), + &base_prev, &offset_prev); + extract_base_offset_in_addr (SET_DEST (curr_set), + &base_curr, &offset_curr); + + /* Proceed only if we find both bases, both bases + are registers and bases are the same register. */ + if (base_prev != NULL_RTX && base_curr != NULL_RTX + && REG_P (base_prev) && REG_P (base_curr) + && REGNO (base_prev) == REGNO (base_curr)) + { + machine_mode mode = GET_MODE (SET_DEST (curr_set)); + mode_size = estimated_poly_value (GET_MODE_SIZE (mode)); + + HOST_WIDE_INT offset_prev_int = INTVAL (offset_prev); + HOST_WIDE_INT offset_curr_int = INTVAL (offset_curr); + + /* Get the smaller offset into OFFSET_PREV_INT. */ + if (offset_prev_int > offset_curr_int) + std::swap (offset_prev_int, offset_curr_int); + + /* We've normalized, so we need to check that the lower + address is aligned to 2X the size of the object. The + higher address must be the lower address plus the + size of the object. 
*/ + if (((offset_prev_int % (2 * mode_size)) == 0) + && offset_prev_int + mode_size == offset_curr_int) + { + if (dump_file) + fprintf (dump_file, "RISCV_FUSE_ALIGNED_STD\n"); + return true; + } + } + } + } + + if (simple_sets_p && riscv_fusion_enabled_p (RISCV_FUSE_BFEXT) + && (sched1 || prev_dest_regno == curr_dest_regno)) + { + /* We are trying to match the following: + prev (slli) == (set (reg:DI rD) + (ashift:DI (reg:DI rS) (const_int))) + curr (srli) == (set (reg:DI rD) + (lshiftrt:DI (reg:DI rD) (const_int))) */ + + if (GET_CODE (SET_SRC (prev_set)) == ASHIFT + && (GET_CODE (SET_SRC (curr_set)) == LSHIFTRT + || GET_CODE (SET_SRC (curr_set)) == ASHIFTRT) + && REG_P (SET_DEST (prev_set)) + && REG_P (SET_DEST (curr_set)) + && REGNO (XEXP (SET_SRC (curr_set), 0)) == prev_dest_regno + && CONST_INT_P (XEXP (SET_SRC (prev_set), 1)) + && CONST_INT_P (XEXP (SET_SRC (curr_set), 1))) + { + if (dump_file) + fprintf (dump_file, "RISCV_FUSE_BFEXT\n"); + return true; + } + } + + if (simple_sets_p && riscv_fusion_enabled_p (RISCV_FUSE_B_ALUI) + && (sched1 || prev_dest_regno == curr_dest_regno)) + { + /* We are trying to match the following: + prev (orc.b) == (set (reg rD) + (unspec (reg rS1))) + curr (not) == (set (reg rD2) (not (reg rD))) */ + + if (GET_CODE (SET_SRC (prev_set)) == UNSPEC + && GET_CODE (SET_SRC (curr_set)) == NOT + && XINT (SET_SRC (prev_set), 1) == UNSPEC_ORC_B + && REG_P (SET_DEST (prev_set)) + && REG_P (SET_DEST (curr_set)) + && REG_P (XEXP (SET_SRC (curr_set), 0)) + && REGNO (XEXP (SET_SRC (curr_set), 0)) == prev_dest_regno) + { + if (dump_file) + fprintf (dump_file, "RISCV_FUSE_B_ALUI\n"); + return true; + } + + /* We are trying to match the following: + prev (ctz) == (set (reg rD) (ctz (reg rS1))) + curr (andi) == (set (reg rD) + (and (reg rD) (const_int 63))) */ + + if (GET_CODE (SET_SRC (prev_set)) == CTZ + && GET_CODE (SET_SRC (curr_set)) == AND + && CONST_INT_P (XEXP (SET_SRC (curr_set), 1)) + && INTVAL (XEXP (SET_SRC (curr_set), 1)) == 63 + && 
REG_P (SET_DEST (prev_set)) + && REG_P (SET_DEST (curr_set)) + && REG_P (XEXP (SET_SRC (curr_set), 0)) + && REGNO (XEXP (SET_SRC (curr_set), 0)) == prev_dest_regno) + { + if (dump_file) + fprintf (dump_file, "RISCV_FUSE_B_ALUI\n"); + return true; + } + + /* We are trying to match the following: + prev (sub) == (set (reg rD) + (minus (const_int 0) (reg rS2)) + curr (max) == (set (reg rD) + (smax (reg rD) (reg rS2))) */ + + if (GET_CODE (SET_SRC (prev_set)) == MINUS + && (XEXP (SET_SRC (prev_set), 0) + == CONST0_RTX (GET_MODE (SET_SRC (prev_set)))) + && CONST_INT_P (XEXP (SET_SRC (prev_set), 0)) + && GET_CODE (SET_SRC (curr_set)) == SMAX + && REG_P (SET_DEST (prev_set)) + && REG_P (SET_DEST (curr_set)) + && REG_P (XEXP (SET_SRC (curr_set), 0)) + && REGNO (XEXP (SET_SRC (curr_set), 0)) == prev_dest_regno + && REG_P (XEXP (SET_SRC (prev_set), 1)) + && REG_P (XEXP (SET_SRC (curr_set), 1)) + && (REGNO (XEXP (SET_SRC (prev_set), 1)) + == REGNO (XEXP (SET_SRC (curr_set), 1)))) + { + if (dump_file) + fprintf (dump_file, "RISCV_FUSE_B_ALUI\n"); + return true; + } + + /* We are trying to match the following: + prev (neg) == (set (reg rD) (neg (reg rS1))) + curr (max) == (set (reg rD) + (smax (reg rD) (reg rS1))) */ + + if (GET_CODE (SET_SRC (prev_set)) == NEG + && GET_CODE (SET_SRC (curr_set)) == SMAX + && REG_P (SET_DEST (prev_set)) + && REG_P (SET_DEST (curr_set)) + && REG_P (XEXP (SET_SRC (curr_set), 0)) + && REGNO (XEXP (SET_SRC (curr_set), 0)) == prev_dest_regno + && REG_P (XEXP (SET_SRC (prev_set), 0)) + && REG_P (XEXP (SET_SRC (curr_set), 1)) + && (REGNO (XEXP (SET_SRC (prev_set), 0)) + == REGNO (XEXP (SET_SRC (curr_set), 1)))) + { + if (dump_file) + fprintf (dump_file, "RISCV_FUSE_B_ALUI\n"); + return true; + } + } + return false; }