https://gcc.gnu.org/g:38d15216dbff426747b860e5e2e12f8f9ec60de2
commit r16-6284-g38d15216dbff426747b860e5e2e12f8f9ec60de2 Author: Jeff Law <[email protected]> Date: Fri Dec 19 10:13:24 2025 -0700 [committed] Improve shift loops on the H8 Inspired by Georg-Johann's work on the AVR to convert the shift loops to a sentinel approach and a rough work week, I revisited the shift patterns on the H8 to see if we could improve things on that port as well. It also serves as a good verification that things are working in my environment. The basic idea of Georg-Johann's patch is to clear the bits that are going to be shifted away, then turn on a sentinel bit (the last shifted away bit). This is done outside the loop. The loop then iterates until the sentinel bit shows up in C. This eliminates decrementing the loop counter and gives better performance. It turns out to be fairly easy to implement on the H8. The first implementation did the clearing and setting in the most simplistic way possible, but to avoid significant code size regressions the clearing and setting really needed to be handled by output_logical_op which has several shortcuts. So a bit of adjustment was necessary to make output_logical_op callable from other contexts. Second, the H8/S and newer parts have shift-by-2 instructions. These aren't normally used in shift loops unless we're optimizing for size. This requires slight adjustment of the sentinel location for odd shift counts. The residual single bit shift for that case is handled outside the loop. Otherwise it's an uneventful patch. My hope is that it will save a minuscule amount of testing time as the H8 continues to be the slowest cross target for testing. Hard to judge that right now -- while the latest run on the H8 was about 30 minutes faster than any run in the last month, the machine was unloaded for that run while it was fully loaded for the standard nightly runs. If this even approaches 1% I'll jump for joy. Anyway, tested on the H8 with no regressions. 
Given the H8 is a dead ISA with very few users, I'm going to go ahead and commit even though we're in stage3. gcc/ * config/h8300/h8300.cc (output_logical_op): Adjust last argument to be a pattern, not an insn. Corresponding implementation changes. (output_shift_loop): Extracted from output_a_shift and improved to use a sentinel to indicate when to stop the loop. (output_a_shift): Use output_shift_loop. (compute_a_shift_length): Handle adjusted shift loop code. * config/h8300/logical.md (logicals): Pass pattern to output_logical_op rather than the full insn. * config/h8300/h8300-protos.h (output_logical_op): Update prototype. Diff: --- gcc/config/h8300/h8300-protos.h | 3 +- gcc/config/h8300/h8300.cc | 99 +++++++++++++++++++++++++++++++---------- gcc/config/h8300/logical.md | 2 +- 3 files changed, 78 insertions(+), 26 deletions(-) diff --git a/gcc/config/h8300/h8300-protos.h b/gcc/config/h8300/h8300-protos.h index a2b8427c4eec..838780f856fe 100644 --- a/gcc/config/h8300/h8300-protos.h +++ b/gcc/config/h8300/h8300-protos.h @@ -36,8 +36,7 @@ extern const char *output_simode_bld (int, rtx[]); extern void final_prescan_insn (rtx_insn *, rtx *, int); extern int h8300_expand_movsi (rtx[]); extern machine_mode h8300_select_cc_mode (RTX_CODE, rtx, rtx); -extern const char *output_logical_op (machine_mode, rtx_code code, - rtx *, rtx_insn *); +extern const char *output_logical_op (machine_mode, rtx_code, rtx *, rtx); extern unsigned int compute_logical_op_length (machine_mode, rtx_code, rtx *, rtx_insn *); diff --git a/gcc/config/h8300/h8300.cc b/gcc/config/h8300/h8300.cc index c5935f602e24..f2bfa31534b6 100644 --- a/gcc/config/h8300/h8300.cc +++ b/gcc/config/h8300/h8300.cc @@ -2945,7 +2945,8 @@ compute_plussi_cc (rtx *operands) /* Output a logical insn. 
*/ const char * -output_logical_op (machine_mode mode, rtx_code code, rtx *operands, rtx_insn *insn) +output_logical_op (machine_mode mode, rtx_code code, + rtx *operands, rtx pattern) { /* Pretend that every byte is affected if both operands are registers. */ const unsigned HOST_WIDE_INT intval = @@ -2978,7 +2979,6 @@ output_logical_op (machine_mode mode, rtx_code code, rtx *operands, rtx_insn *in The key is to look at the second object in the PARALLEL. If it is not a CLOBBER, then we care about the condition codes. */ - rtx pattern = PATTERN (insn); gcc_assert (GET_CODE (pattern) == PARALLEL); rtx second_op = XVECEXP (pattern, 0, 1); bool cc_meaningful = (GET_CODE (second_op) != CLOBBER); @@ -4053,20 +4053,65 @@ h8300_shift_needs_scratch_p (int count, machine_mode mode, enum rtx_code type) gcc_unreachable (); } +/* Output a shift loop where the shift count is known. We use a + sentinel bit to know when to stop the loop so that we don't + have to decrement a loop counter. MASK is the bits to leave + alone while SET is the bit to set when setting up the sentinel. + + OPERANDS are the original operands of the shift. + + SHIFT1 is a string for the shift instruction. This routine does + not care about what kind of shift is needed. */ +void +output_shift_loop (rtx operands[3], rtx mask, rtx set, const char *shift1, const char *shift2) +{ + static int loopend_lab; + loopend_lab++; + machine_mode mode = GET_MODE (operands[0]); + + rtx xoperands[3]; + xoperands[0] = operands[0]; + xoperands[1] = operands[0]; + xoperands[2] = mask; + rtx x = gen_rtx_AND (mode, xoperands[1], xoperands[2]); + x = gen_rtx_SET (xoperands[0], x); + rtvec vec = rtvec_alloc (2); + RTVEC_ELT (vec, 0) = x; + RTVEC_ELT (vec, 1) = gen_rtx_CLOBBER (VOIDmode, + gen_rtx_SCRATCH (QImode)); + + output_logical_op (mode, AND, xoperands, + gen_rtx_PARALLEL (VOIDmode, vec)); + + /* Now set the sentinel bit. When this bit shifts into C, + we are done. That avoids the need for the decrement in the + loop. 
We reuse RTL here in a way that would normally be + unsafe, but these never actually appear in the IL. */ + xoperands[2] = set; + x = gen_rtx_IOR (mode, xoperands[1], xoperands[2]); + x = gen_rtx_SET (xoperands[0], x); + RTVEC_ELT (vec, 0) = x; + output_logical_op (mode, IOR, xoperands, + gen_rtx_PARALLEL (VOIDmode, vec)); + + fprintf (asm_out_file, ".Llt%d:\n", loopend_lab); + output_asm_insn (shift2 ? shift2 : shift1, operands); + fprintf (asm_out_file, "\tbcc .Llt%d\n", loopend_lab); + if (shift2 && INTVAL (operands[2]) % 2) + output_asm_insn (shift1, operands); +} + /* Output the assembler code for doing shifts. */ const char * output_a_shift (rtx operands[4], rtx_code code) { - static int loopend_lab; machine_mode mode = GET_MODE (operands[0]); enum shift_type shift_type; enum shift_mode shift_mode; struct shift_info info; int n; - loopend_lab++; - switch (mode) { case E_QImode: @@ -4177,28 +4222,24 @@ output_a_shift (rtx operands[4], rtx_code code) return ""; } + /* SHIFT_LOOP is not used for H8/S or newer which have stronger + shifters. */ case SHIFT_LOOP: - /* A loop to shift by a "large" constant value. - If we have shift-by-2 insns, use them. 
*/ - if (info.shift2 != NULL) + if (code == ASHIFT) { - fprintf (asm_out_file, "\tmov.b #%d,%sl\n", n / 2, - names_big[REGNO (operands[3])]); - fprintf (asm_out_file, ".Llt%d:\n", loopend_lab); - output_asm_insn (info.shift2, operands); - output_asm_insn ("add #0xff,%X3", operands); - fprintf (asm_out_file, "\tbne .Llt%d\n", loopend_lab); - if (n % 2) - output_asm_insn (info.shift1, operands); - } + unsigned HOST_WIDE_INT mask = (1 << (GET_MODE_BITSIZE (mode) - n)) - 1; + if (info.shift2 && (n & 1)) + n -= 1; + unsigned HOST_WIDE_INT set = (1 << (GET_MODE_BITSIZE (mode) - n)); + output_shift_loop (operands, GEN_INT (mask), GEN_INT (set), info.shift1, info.shift2); + } else { - fprintf (asm_out_file, "\tmov.b #%d,%sl\n", n, - names_big[REGNO (operands[3])]); - fprintf (asm_out_file, ".Llt%d:\n", loopend_lab); - output_asm_insn (info.shift1, operands); - output_asm_insn ("add #0xff,%X3", operands); - fprintf (asm_out_file, "\tbne .Llt%d\n", loopend_lab); + unsigned HOST_WIDE_INT mask =~((HOST_WIDE_INT_1U << n) - 1); + if (info.shift2 && (n & 1)) + n -= 1; + unsigned HOST_WIDE_INT set = HOST_WIDE_INT_1U << (n - 1); + output_shift_loop (operands, GEN_INT (mask), GEN_INT (set), info.shift1, info.shift2); } return ""; @@ -4358,10 +4399,22 @@ compute_a_shift_length (rtx operands[3], rtx_code code) wlength += 3 + h8300_asm_insn_count (info.shift2); if (n % 2) wlength += h8300_asm_insn_count (info.shift1); + if (mode == E_HImode) + wlength += 2; + else if (mode == E_SImode) + wlength += 4; } else { + /* The loop uses a sentinel as a stop point. That requires + two setup instructions before the loop, but they can be + longer based on the mode. The loop itself is just two + two byte instructions. 
*/ wlength += 3 + h8300_asm_insn_count (info.shift1); + if (mode == E_HImode) + wlength += 2; + else if (mode == E_SImode) + wlength += 4; } return 2 * wlength; diff --git a/gcc/config/h8300/logical.md b/gcc/config/h8300/logical.md index f848242ac873..9550943fbcbb 100644 --- a/gcc/config/h8300/logical.md +++ b/gcc/config/h8300/logical.md @@ -228,7 +228,7 @@ (match_operand:QHSI 2 "h8300_src_operand" "rQi"))) (clobber (reg:CC CC_REG))] "h8300_operands_match_p (operands)" - { return output_logical_op (<MODE>mode, <CODE>, operands, insn); } + { return output_logical_op (<MODE>mode, <CODE>, operands, PATTERN (insn)); } [(set (attr "length") (symbol_ref "compute_logical_op_length (<MODE>mode, <CODE>, operands, insn)"))])
