Hello,
This patch adds a few instructions to the inlined builtin_strlen to
unroll the remaining bytes after the word-at-a-time loop. This gives
2 distinct execution paths (no fall-through into the byte-at-a-time loop),
which allows block alignment to be assigned. This partially addresses the
problem reported by Oleg in [Bug target/0539] New: [SH] builtin
string functions ignore loop and label alignment.
The test now expands (-O2 -m4) as:
mov r4,r0
tst #3,r0
mov r4,r2
bf/s .L12
mov r4,r3
mov #0,r2
.L4:
mov.l @r4+,r1
cmp/str r2,r1
bf .L4
add #-4,r4
mov.b @r4,r1
tst r1,r1
bt .L2
add #1,r4
mov.b @r4,r1
tst r1,r1
bt .L2
add #1,r4
mov.b @r4,r1
tst r1,r1
mov #-1,r1
negc r1,r1
add r1,r4
.L2:
mov r4,r0
rts
sub r3,r0
.align 1
.L12:
mov.b @r4+,r1
tst r1,r1
bf/s .L12
mov r2,r3
add #1,r3
mov r4,r0
rts
sub r3,r0
The best improvement I measured compared to the "compact" version is ~1% on a
C++ regular expression benchmark, but in any case the code looks best this way.
Regression tested for -m2, -m4.
OK for trunk?
2014-03-20 Christian Bruel <[email protected]>
* config/sh/sh-mem.cc (sh_expand_strlen): Unroll last word.
Index: gcc/config/sh/sh-mem.cc
===================================================================
--- gcc/config/sh/sh-mem.cc (revision 208745)
+++ gcc/config/sh/sh-mem.cc (working copy)
@@ -586,9 +586,35 @@ sh_expand_strlen (rtx *operands)
emit_move_insn (current_addr, plus_constant (Pmode, current_addr, -4));
- /* start byte loop. */
addr1 = adjust_address (addr1, QImode, 0);
+ /* unroll remaining bytes. */
+ emit_insn (gen_extendqisi2 (tmp1, addr1));
+ emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
+ jump = emit_jump_insn (gen_branch_true (L_return));
+ add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+
+ emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
+
+ emit_insn (gen_extendqisi2 (tmp1, addr1));
+ emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
+ jump = emit_jump_insn (gen_branch_true (L_return));
+ add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+
+ emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
+
+ emit_insn (gen_extendqisi2 (tmp1, addr1));
+ emit_insn (gen_cmpeqsi_t (tmp1, const0_rtx));
+ jump = emit_jump_insn (gen_branch_true (L_return));
+ add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+
+ emit_move_insn (current_addr, plus_constant (Pmode, current_addr, 1));
+
+ emit_insn (gen_extendqisi2 (tmp1, addr1));
+ jump = emit_jump_insn (gen_jump_compact (L_return));
+ emit_barrier_after (jump);
+
+ /* start byte loop. */
emit_label (L_loop_byte);
emit_insn (gen_extendqisi2 (tmp1, addr1));
@@ -600,11 +626,12 @@ sh_expand_strlen (rtx *operands)
/* end loop. */
+ emit_insn (gen_addsi3 (start_addr, start_addr, GEN_INT (1)));
+
emit_label (L_return);
- emit_insn (gen_addsi3 (start_addr, start_addr, GEN_INT (1)));
-
emit_insn (gen_subsi3 (operands[0], current_addr, start_addr));
return true;
}
+