Hello,

This patch adds support for inlining an optimized version of strcmp when
not optimizing for size. The generated code uses the cmp/str
instruction to test 4 bytes at a time when the strings are suitably aligned.
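
For reference, the cmp/str T-bit result can be modeled in C roughly as
follows (an illustrative sketch only, not part of the patch):

  /* T = 1 if any byte of A equals the corresponding byte of B.  */
  static int cmp_str_t (unsigned int a, unsigned int b)
  {
    unsigned int t = a ^ b;   /* equal bytes become 0x00 */
    return !((t & 0xff000000u) && (t & 0x00ff0000u)
             && (t & 0x0000ff00u) && (t & 0x000000ffu));
  }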

Note that a new pattern was added to match the cmp/str instruction, but
no attempt was made to have combine generate it.
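
In C terms, the expanded code is roughly equivalent to the sketch below
(illustrative only: zero_byte_p models the cmp/str test against zero, and
the little-endian byte-swap done before the final word comparison is
omitted):

  #include <string.h>

  /* True if any byte of X is zero.  May be spuriously true when two
     nonzero bytes AND to zero, which only means we finish the compare
     in the byte loop.  */
  static int zero_byte_p (unsigned int x)
  {
    return !(x & 0xff000000u) || !(x & 0x00ff0000u)
           || !(x & 0x0000ff00u) || !(x & 0x000000ffu);
  }

  static int strcmp_model (const unsigned char *s1, const unsigned char *s2)
  {
    /* Word loop, entered only when both pointers are 4-byte aligned.  */
    if ((((unsigned long) s1 | (unsigned long) s2) & 3) == 0)
      for (;;)
        {
          unsigned int w1, w2;
          memcpy (&w1, s1, 4);
          memcpy (&w2, s2, 4);
          if (zero_byte_p (w1 & w2))  /* possible NUL in either word */
            break;                    /* resolve byte by byte */
          if (w1 != w2)               /* words differ, no NUL candidate */
            return w1 < w2 ? -1 : 1;
          s1 += 4;
          s2 += 4;
        }

    /* Byte loop.  */
    for (;; s1++, s2++)
      if (*s2 == 0 || *s1 != *s2)
        return *s1 - *s2;
  }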

This results in general cycle improvements (against both the newlib and
glibc implementations), including a 10% cycle improvement on a famous
strcmp-biased "benchmark" starting with a D...., but still standard.

This optimization can be disabled with -fno-builtin-strcmp.
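
For example, with an sh-elf toolchain (triplet name illustrative):

  sh-elf-gcc -O2 test.c                       # strcmp may be inlined
  sh-elf-gcc -O2 -fno-builtin-strcmp test.c   # always calls the library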

No regressions on sh4 (big and little endian) or sh2; sh3 and sh4a runs
are still in progress, in both endiannesses, as a sanity check.

OK for trunk?

Thanks

Christian

2013-10-17  Christian Bruel  <christian.br...@st.com>

	* config/sh/sh-mem.c (sh4_expand_cmpstr): New function.
	* config/sh/sh-protos.h (sh4_expand_cmpstr): Declare.
	* config/sh/sh.md (cmpstrsi, cmpstr_t): New patterns.
	(*rotlhi3_8): Rename to rotlhi3_8.

--- gcc/config/sh/sh.md	2013-10-17 15:14:18.000000000 +0200
+++ gcc-new/config/sh/sh.md	2013-10-16 16:13:49.000000000 +0200
@@ -31,9 +31,6 @@
 ;; ??? The MAC.W and MAC.L instructions are not supported.  There is no
 ;; way to generate them.
 
-;; ??? The cmp/str instruction is not supported.  Perhaps it can be used
-;; for a str* inline function.
-
 ;; BSR is not generated by the compiler proper, but when relaxing, it
 ;; generates .uses pseudo-ops that allow linker relaxation to create
 ;; BSR.  This is actually implemented in bfd/{coff,elf32}-sh.c
@@ -4037,7 +4034,7 @@
   DONE;
 })
 
-(define_insn "*rotlhi3_8"
+(define_insn "rotlhi3_8"
   [(set (match_operand:HI 0 "arith_reg_dest" "=r")
 	(rotate:HI (match_operand:HI 1 "arith_reg_operand" "r")
 		   (const_int 8)))]
@@ -11912,6 +11909,41 @@
   "jsr	@%0%#"
   [(set_attr "type" "sfunc")
    (set_attr "needs_delay_slot" "yes")])
+
+;; Byte-compare pattern: T is set if any byte of A equals the
+;; corresponding byte of B.  With temp = a ^ b, the condition is
+;; !((temp & 0xFF000000) && (temp & 0xFF0000) && (temp & 0xFF00) && (temp & 0xFF))
+(define_insn "cmpstr_t"
+  [(set (reg:SI T_REG)
+	(eq:SI (and:SI
+		(and:SI
+		 (and:SI
+		  (zero_extract:SI (xor:SI (match_operand:SI 0 "arith_reg_operand" "r")
+					   (match_operand:SI 1 "arith_reg_operand" "r"))
+				   (const_int 8) (const_int 0))
+		  (zero_extract:SI (xor:SI (match_dup 0) (match_dup 1))
+				   (const_int 8) (const_int 8)))
+		  (zero_extract:SI (xor:SI (match_dup 0) (match_dup 1))
+				   (const_int 8) (const_int 16)))
+		(zero_extract:SI (xor:SI (match_dup 0) (match_dup 1))
+				 (const_int 8) (const_int 24))) (const_int 0)))]
+  "TARGET_SH1"
+  "cmp/str	%0,%1"
+  [(set_attr "type" "mt_group")])
+
+(define_expand "cmpstrsi"
+  [(set (match_operand:SI 0 "register_operand" "")
+	(compare:SI (match_operand:BLK 1 "memory_operand" "")
+		    (match_operand:BLK 2 "memory_operand" "")))
+   (use (match_operand 3 "immediate_operand" ""))]
+  "TARGET_SH1"
+  "
+{
+   if (! optimize_insn_for_size_p () && sh4_expand_cmpstr (operands))
+      DONE;
+   else FAIL;
+}")
+
 
 ;; -------------------------------------------------------------------------
 ;; Floating point instructions.
diff -ru gcc/config/sh/sh-mem.c gcc-new/config/sh/sh-mem.c
--- gcc/config/sh/sh-mem.c	2013-10-17 14:59:02.000000000 +0200
+++ gcc-new/config/sh/sh-mem.c	2013-10-17 14:57:57.000000000 +0200
@@ -23,6 +23,7 @@
 #include "tm.h"
 #include "expr.h"
 #include "tm_p.h"
+#include "basic-block.h"
 
 /* Like force_operand, but guarantees that VALUE ends up in TARGET.  */
 static void
@@ -174,3 +175,130 @@
 
   return false;
 }
+
+/* Emit code to perform a strcmp.
+
+   OPERANDS[0] is the destination.
+   OPERANDS[1] is the first string.
+   OPERANDS[2] is the second string.
+   OPERANDS[3] is the alignment of the strings.  */
+bool
+sh4_expand_cmpstr (rtx *operands)
+{
+  rtx s1 = copy_rtx (operands[1]);
+  rtx s2 = copy_rtx (operands[2]);
+  rtx s1_addr = copy_addr_to_reg (XEXP (s1, 0));
+  rtx s2_addr = copy_addr_to_reg (XEXP (s2, 0));
+  rtx tmp0 = gen_reg_rtx (SImode);
+  rtx tmp1 = gen_reg_rtx (SImode);
+  rtx tmp2 = gen_reg_rtx (SImode);
+  rtx tmp3 = gen_reg_rtx (SImode);
+
+  rtx L_return = gen_label_rtx ();
+  rtx L_loop_byte = gen_label_rtx ();
+  rtx L_end_loop_byte = gen_label_rtx ();
+  rtx L_loop_long = gen_label_rtx ();
+  rtx L_end_loop_long = gen_label_rtx ();
+
+  rtx jump, addr1, addr2;
+  int prob_unlikely = REG_BR_PROB_BASE / 10;
+  int prob_likely = REG_BR_PROB_BASE / 4;
+
+  emit_insn (gen_iorsi3 (tmp1, s1_addr, s2_addr));
+  emit_move_insn (tmp0, GEN_INT (3));
+
+  emit_insn (gen_tstsi_t (tmp0, tmp1));
+
+  emit_move_insn (tmp0, const0_rtx);
+
+  jump = emit_jump_insn (gen_branch_false (L_loop_byte));
+  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+
+  addr1 = adjust_automodify_address (s1, SImode, s1_addr, 0);
+  addr2 = adjust_automodify_address (s2, SImode, s2_addr, 0);
+
+  /* s2 is word-aligned here, so an SImode load is safe.  */
+  emit_move_insn (tmp3, addr2);
+  emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, 4));
+
+  /* Start of the long-word loop.  */
+  emit_label (L_loop_long);
+
+  emit_move_insn (tmp2, tmp3);
+
+  /* s1 is word-aligned here, so an SImode load is safe.  */
+  emit_move_insn (tmp1, addr1);
+  emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, 4));
+
+  /* Is there a 0 byte?  */
+  emit_insn (gen_andsi3 (tmp3, tmp3, tmp1));
+
+  emit_insn (gen_cmpstr_t (tmp0, tmp3));
+  jump = emit_jump_insn (gen_branch_true (L_end_loop_long));
+  add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
+
+  emit_insn (gen_cmpeqsi_t (tmp1, tmp2));
+
+  /* s2 is word-aligned here, so an SImode load is safe.  */
+  emit_move_insn (tmp3, addr2);
+  emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, 4));
+
+  jump = emit_jump_insn (gen_branch_true (L_loop_long));
+  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+  /* End of the long-word loop.  */
+
+  /* Fall through: the words differ; find which one is greater.  */
+  if (TARGET_LITTLE_ENDIAN)
+    {
+      rtx low_1 = gen_lowpart (HImode, tmp1);
+      rtx low_2 = gen_lowpart (HImode, tmp2);
+
+      emit_insn (gen_rotlhi3_8 (low_1, low_1));
+      emit_insn (gen_rotlhi3_8 (low_2, low_2));
+      emit_insn (gen_rotlsi3_16 (tmp1, tmp1));
+      emit_insn (gen_rotlsi3_16 (tmp2, tmp2));
+      emit_insn (gen_rotlhi3_8 (low_1, low_1));
+      emit_insn (gen_rotlhi3_8 (low_2, low_2));
+    }
+
+  jump = emit_jump_insn (gen_jump_compact (L_return));
+  emit_barrier_after (jump);
+
+  /* Start of the byte loop.  */
+  addr1 = adjust_automodify_address (s1, QImode, s1_addr, 0);
+  addr2 = adjust_automodify_address (s2, QImode, s2_addr, 0);
+
+  emit_label (L_end_loop_long);
+
+  emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, -4));
+  emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, -4));
+
+  emit_label (L_loop_byte);
+
+  emit_insn (gen_extendqisi2 (tmp2, addr2));
+  emit_move_insn (s2_addr, plus_constant (Pmode, s2_addr, 1));
+
+  emit_insn (gen_extendqisi2 (tmp1, addr1));
+  emit_move_insn (s1_addr, plus_constant (Pmode, s1_addr, 1));
+
+  emit_insn (gen_cmpeqsi_t (tmp2, const0_rtx));
+  jump = emit_jump_insn (gen_branch_true (L_end_loop_byte));
+  add_int_reg_note (jump, REG_BR_PROB, prob_unlikely);
+
+  emit_insn (gen_cmpeqsi_t (tmp1, tmp2));
+  jump = emit_jump_insn (gen_branch_true (L_loop_byte));
+  add_int_reg_note (jump, REG_BR_PROB, prob_likely);
+  /* End of the byte loop.  */
+
+  emit_label (L_end_loop_byte);
+
+  emit_insn (gen_zero_extendqisi2 (tmp2, gen_lowpart (QImode, tmp2)));
+  emit_insn (gen_zero_extendqisi2 (tmp1, gen_lowpart (QImode, tmp1)));
+
+  emit_label (L_return);
+
+  emit_insn (gen_subsi3 (operands[0], tmp1, tmp2));
+
+  return true;
+}
+
diff -ru gcc/config/sh/sh-protos.h gcc-new/config/sh/sh-protos.h
--- gcc/config/sh/sh-protos.h	2013-10-17 15:13:46.000000000 +0200
+++ gcc-new/config/sh/sh-protos.h	2013-10-07 14:38:08.000000000 +0200
@@ -116,6 +116,7 @@
 extern void output_pic_addr_const (FILE *, rtx);
 extern bool expand_block_move (rtx *);
 extern void prepare_move_operands (rtx[], enum machine_mode mode);
+extern bool sh4_expand_cmpstr (rtx *);
 extern enum rtx_code prepare_cbranch_operands (rtx *, enum machine_mode mode,
 					       enum rtx_code comparison);
 extern void expand_cbranchsi4 (rtx *operands, enum rtx_code comparison, int);
