Hello,

This patch adds support for inlining an optimized version of strcmp when not optimizing for size. The generated code uses the cmp/str instruction to test 4 bytes at a time when the operands are correctly aligned.

Note that a new pattern was added to match the cmp/str instruction, but no attempt was made to catch it from combine.

This gives cycle improvements in general (against both the newlib and glibc implementations); one example is a 10% cycle improvement for a famous strcmp-biased "benchmark" starting with a D...., which is nonetheless a standard one. The optimization can be disabled with -fno-builtin-strcmp.

No regressions on sh4 in big and little endian, nor on sh2 (sh3 and sh4a are still running in big and little endian for sanity).

OK for trunk?

Thanks,

Christian
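For reviewers, the inlined sequence is roughly equivalent to the C sketch below (illustrative only, not part of the patch; the function name is made up, the real expansion computes the result for a mismatching word directly from the byte-swapped words instead of re-running the byte loop, and the aligned 4-byte loads may read past the terminating NUL within the same word, just as the emitted code does):

#include <stdint.h>
#include <string.h>

/* Rough C model of the inlined strcmp strategy.  */
static int
strcmp_sketch (const char *s1, const char *s2)
{
  const unsigned char *p1 = (const unsigned char *) s1;
  const unsigned char *p2 = (const unsigned char *) s2;

  /* Word loop only when both pointers are 4-byte aligned, mirroring
     the "(s1_addr | s2_addr) & 3" test in the expander.  */
  if ((((uintptr_t) p1 | (uintptr_t) p2) & 3) == 0)
    for (;;)
      {
        uint32_t w1, w2;
        memcpy (&w1, p1, 4);
        memcpy (&w2, p2, 4);

        /* cmp/str-style test on w1 & w2: is any byte zero?  A NUL in
           either string forces this true, so it is a conservative
           "maybe end of string" check.  */
        uint32_t x = w1 & w2;
        int maybe_nul = ((x & 0xff000000u) == 0 || (x & 0x00ff0000u) == 0
                         || (x & 0x0000ff00u) == 0 || (x & 0x000000ffu) == 0);
        if (maybe_nul || w1 != w2)
          break;                /* finish with the byte loop */
        p1 += 4;
        p2 += 4;
      }

  /* Byte loop: handles misaligned inputs, the tail of the strings,
     and any differing word found above.  */
  while (*p1 != '\0' && *p1 == *p2)
    {
      p1++;
      p2++;
    }
  return *p1 - *p2;
}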
2013-10-17  Christian Bruel  <christian.br...@st.com>

	* gcc/config/sh/sh-mem.c (sh4_expand_cmpstr): New function.
	* gcc/config/sh/sh-protos.h (sh4_expand_cmpstr): Declare.
	* gcc/config/sh/sh.md (cmpstrsi, cmpstr_t): New patterns.
	(rotlhi3_8): Rename.

--- gcc/config/sh/sh.md	2013-10-17 15:14:18.000000000 +0200
+++ gcc-new/config/sh/sh.md	2013-10-16 16:13:49.000000000 +0200
@@ -31,9 +31,6 @@
 ;; ??? The MAC.W and MAC.L instructions are not supported.  There is no
 ;; way to generate them.
 
-;; ??? The cmp/str instruction is not supported.  Perhaps it can be used
-;; for a str* inline function.
-
 ;; BSR is not generated by the compiler proper, but when relaxing, it
 ;; generates .uses pseudo-ops that allow linker relaxation to create
 ;; BSR.  This is actually implemented in bfd/{coff,elf32}-sh.c
@@ -4037,7 +4034,7 @@
       DONE;
 })
 
-(define_insn "*rotlhi3_8"
+(define_insn "rotlhi3_8"
   [(set (match_operand:HI 0 "arith_reg_dest" "=r")
 	(rotate:HI (match_operand:HI 1 "arith_reg_operand" "r")
 		   (const_int 8)))]
@@ -11912,6 +11909,41 @@
   "jsr	@%0%#"
   [(set_attr "type" "sfunc")
    (set_attr "needs_delay_slot" "yes")])
+
+;; byte compare pattern
+;; temp = a ^ b;
+;; !((temp & 0xF000) && (temp & 0x0F00) && (temp & 0x00F0) && (temp & 0x000F))
+(define_insn "cmpstr_t"
+  [(set (reg:SI T_REG)
+	(eq:SI (and:SI
+		(and:SI
+		 (and:SI
+		  (zero_extract:SI (xor:SI (match_operand:SI 0 "arith_reg_operand" "r")
+					   (match_operand:SI 1 "arith_reg_operand" "r"))
+				   (const_int 8) (const_int 0))
+		  (zero_extract:SI (xor:SI (match_dup 0) (match_dup 1))
+				   (const_int 8) (const_int 8)))
+		 (zero_extract:SI (xor:SI (match_dup 0) (match_dup 1))
+				  (const_int 8) (const_int 16)))
+		(zero_extract:SI (xor:SI (match_dup 0) (match_dup 1))
+				 (const_int 8) (const_int 24))) (const_int 0)))]
+  "TARGET_SH1"
+  "cmp/str	%0,%1"
+  [(set_attr "type" "mt_group")])
+
+(define_expand "cmpstrsi"
+  [(set (match_operand:SI 0 "register_operand" "")
+	(compare:SI (match_operand:BLK 1 "memory_operand" "")
+		    (match_operand:BLK 2 "memory_operand" "")))
+   (use (match_operand 3 "immediate_operand" ""))]
+  "TARGET_SH1"
+  "
+{
+  if (! optimize_insn_for_size_p () && sh4_expand_cmpstr (operands))
+    DONE;
+  else FAIL;
+}")
+
 
 ;; -------------------------------------------------------------------------
 ;; Floating point instructions.
diff -ru gcc/config/sh/sh-mem.c gcc-new/config/sh/sh-mem.c
--- gcc/config/sh/sh-mem.c	2013-10-17 14:59:02.000000000 +0200
+++ gcc-new/config/sh/sh-mem.c	2013-10-17 14:57:57.000000000 +0200
@@ -23,6 +23,7 @@
 #include "tm.h"
 #include "expr.h"
 #include "tm_p.h"
+#include "basic-block.h"
 
 /* Like force_operand, but guarantees that VALUE ends up in TARGET.  */
 static void
@@ -174,3 +175,130 @@
 
   return false;
 }
+
+/* Emit code to perform a strcmp.
+
+   OPERANDS[0] is the destination.
+   OPERANDS[1] is the first string.
+   OPERANDS[2] is the second string.
+   OPERANDS[3] is the alignment.  */
+bool
+sh4_expand_cmpstr (rtx *operands)
+{
+  rtx s1 = copy_rtx (operands[1]);
+  rtx s2 = copy_rtx (operands[2]);
+  rtx s1_addr = copy_addr_to_reg (XEXP (s1, 0));
+  rtx s2_addr = copy_addr_to_reg (XEXP (s2, 0));
+  rtx tmp0 = gen_reg_rtx (SImode);
+  rtx tmp1 = gen_reg_rtx (SImode);
+  rtx tmp2 = gen_reg_rtx (SImode);
+  rtx tmp3 = gen_reg_rtx (SImode);
+
+  rtx L_return = gen_label_rtx ();
+  rtx L_loop_byte = gen_label_rtx ();
+  rtx L_end_loop_byte = gen_label_rtx ();
+  rtx L_loop_long = gen_label_rtx ();
+  rtx L_end_loop_long = gen_label_rtx ();
+
+  rtx jump, addr1, addr2;
+  int prob_unlikely = REG_BR_PROB_BASE / 10;
+  int prob_likely = REG_BR_PROB_BASE / 4;
+
+  emit_insn (gen_iorsi3 (tmp1, s1_addr, s2_addr));
+  emit_move_insn (tmp0, GEN_INT (3));
+
+  emit_insn (gen_tstsi_t (tmp0, tmp1));
+
+  emit_move_insn (tmp0, const0_rtx);
+
+  jump = emit_jump_insn (gen_branch_false (L_loop_byte));
+  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+
+  addr1 = adjust_automodify_address (s1, SImode, s1_addr, 0);
+  addr2 = adjust_automodify_address (s2, SImode, s2_addr, 0);
+
+  /* tmp2 is aligned, OK to load.  */
+  emit_move_insn (tmp3, addr2);
+  emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, 4));
+
+  /* Start long loop.  */
+  emit_label (L_loop_long);
+
+  emit_move_insn (tmp2, tmp3);
+
+  /* tmp1 is aligned, OK to load.  */
+  emit_move_insn (tmp1, addr1);
+  emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, 4));
+
+  /* Is there a 0 byte?  */
+  emit_insn (gen_andsi3 (tmp3, tmp3, tmp1));
+
+  emit_insn (gen_cmpstr_t (tmp0, tmp3));
+  jump = emit_jump_insn (gen_branch_true (L_end_loop_long));
+  add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
+
+  emit_insn (gen_cmpeqsi_t (tmp1, tmp2));
+
+  /* tmp2 is aligned, OK to load.  */
+  emit_move_insn (tmp3, addr2);
+  emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, 4));
+
+  jump = emit_jump_insn (gen_branch_true (L_loop_long));
+  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+  /* End loop.  */
+
+  /* Fallthru, check if one of the words is greater.  */
+  if (TARGET_LITTLE_ENDIAN)
+    {
+      rtx low_1 = gen_lowpart (HImode, tmp1);
+      rtx low_2 = gen_lowpart (HImode, tmp2);
+
+      emit_insn (gen_rotlhi3_8 (low_1, low_1));
+      emit_insn (gen_rotlhi3_8 (low_2, low_2));
+      emit_insn (gen_rotlsi3_16 (tmp1, tmp1));
+      emit_insn (gen_rotlsi3_16 (tmp2, tmp2));
+      emit_insn (gen_rotlhi3_8 (low_1, low_1));
+      emit_insn (gen_rotlhi3_8 (low_2, low_2));
+    }
+
+  jump = emit_jump_insn (gen_jump_compact (L_return));
+  emit_barrier_after (jump);
+
+  /* Start byte loop.  */
+  addr1 = adjust_automodify_address (s1, QImode, s1_addr, 0);
+  addr2 = adjust_automodify_address (s2, QImode, s2_addr, 0);
+
+  emit_label (L_end_loop_long);
+
+  emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, -4));
+  emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, -4));
+
+  emit_label (L_loop_byte);
+
+  emit_insn (gen_extendqisi2 (tmp2, addr2));
+  emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, 1));
+
+  emit_insn (gen_extendqisi2 (tmp1, addr1));
+  emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, 1));
+
+  emit_insn (gen_cmpeqsi_t (tmp2, const0_rtx));
+  jump = emit_jump_insn (gen_branch_true (L_end_loop_byte));
+  add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
+
+  emit_insn (gen_cmpeqsi_t (tmp1, tmp2));
+  jump = emit_jump_insn (gen_branch_true (L_loop_byte));
+  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+  /* End loop.  */
+
+  emit_label (L_end_loop_byte);
+
+  emit_insn (gen_zero_extendqisi2 (tmp2, gen_lowpart (QImode, tmp2)));
+  emit_insn (gen_zero_extendqisi2 (tmp1, gen_lowpart (QImode, tmp1)));
+
+  emit_label (L_return);
+
+  emit_insn (gen_subsi3 (operands[0], tmp1, tmp2));
+
+  return true;
+}
+
diff -ru gcc/config/sh/sh-protos.h gcc-new/config/sh/sh-protos.h
--- gcc/config/sh/sh-protos.h	2013-10-17 15:13:46.000000000 +0200
+++ gcc-new/config/sh/sh-protos.h	2013-10-07 14:38:08.000000000 +0200
@@ -116,6 +116,7 @@
 extern void output_pic_addr_const (FILE *, rtx);
 extern bool expand_block_move (rtx *);
 extern void prepare_move_operands (rtx[], enum machine_mode mode);
+extern bool sh4_expand_cmpstr (rtx *);
 extern enum rtx_code prepare_cbranch_operands (rtx *, enum machine_mode mode,
					       enum rtx_code comparison);
 extern void expand_cbranchsi4 (rtx *operands, enum rtx_code comparison, int);