The patch inlines strlen for 8-byte aligned strings on
AARCH64 like it's done on other platforms (power, s390).
The implementation falls back to the library call if the
string is not aligned. Synthetic testing on Cavium T88
and Cavium T99 showed the following performance gains:

T99: up to 8 bytes: +100%, 100+ bytes: +20%
T88: up to 8 bytes: +100%, 100+ bytes: 0%

which seems to be OK, as most strings in practice are short.

SPEC performance testing on T99 and T88 did not show any
statistically significant differences.

Bootstrapped and regression-tested on aarch64-linux-gnu.
No new failures found. OK for trunk?

2016-08-10 Anton Youdkevitch <anton.youdkevi...@bell-sw.com>

	* config/aarch64/aarch64.md (strlen<mode>): New pattern.
	(UNSPEC_BUILTIN_STRLEN): New unspec.
	* config/aarch64/aarch64.c (aarch64_expand_strlen): New function.
	Expand inline only in the 8-byte-aligned case; do not attempt
	to adjust the address.
	* config/aarch64/aarch64-protos.h (aarch64_expand_strlen):
	Declare.

gcc/testsuite/
	* gcc.target/aarch64/strlen_aligned.c: New test.

---

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index cda2895..9beb289 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -358,6 +358,7 @@ bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 void aarch64_expand_call (rtx, rtx, bool);
 bool aarch64_expand_movmem (rtx *);
+void aarch64_expand_strlen (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
 bool aarch64_float_const_rtx_p (rtx);
 bool aarch64_function_arg_regno_p (unsigned);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4b5183b..d12fb6b 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -16107,6 +16107,81 @@ aarch64_expand_movmem (rtx *operands)
   return true;
 }
 
+/* Emit RTL to perform an inline strlen.
+
+   OPERANDS[0] is the destination (the resulting length, DImode).
+   OPERANDS[1] is the string (a BLKmode MEM; its address must be
+   8-byte aligned -- the expander in aarch64.md guarantees this).
+   OPERANDS[2] is the char to search for (always zero for strlen).
+   OPERANDS[3] is the known alignment of the string.  */
+
+void
+aarch64_expand_strlen (rtx *operands)
+{
+  rtx result = operands[0];
+  rtx src = operands[1];
+  rtx loop_label = gen_label_rtx ();
+  rtx end_label = gen_label_rtx ();
+  rtx end_loop_label = gen_label_rtx ();
+  rtx preloop_label = gen_label_rtx ();
+  rtx str = gen_reg_rtx (DImode);
+  rtx addr = force_reg (DImode, XEXP (src, 0));
+  rtx start_addr = gen_reg_rtx (DImode);
+  rtx tmp1 = gen_reg_rtx (DImode);
+  rtx tmp2 = gen_reg_rtx (DImode);
+  rtx tmp3 = gen_reg_rtx (DImode);
+  rtx mask1 = gen_reg_rtx (DImode);
+  rtx mask2 = gen_reg_rtx (DImode);
+  rtx mem;
+
+  emit_move_insn (start_addr, addr);
+
+  /* If fewer than 16 bytes remain before the end of the page, go
+     straight to the byte loop so that the 8-byte load below cannot
+     fault on a following unmapped page.  The original submission
+     built this comparison but never emitted the branch, so the
+     guard was dead code; emit_cmp_and_jump_insns both performs the
+     compare and emits a matchable conditional jump.
+     TODO: 4096 is a hard-coded assumed minimum page size -- confirm
+     against the supported aarch64 configurations.  */
+  emit_insn (gen_anddi3 (tmp1, addr, GEN_INT (4096 - 1)));
+  emit_cmp_and_jump_insns (tmp1, GEN_INT (4096 - 16), GT, NULL_RTX,
+			   DImode, 0, preloop_label);
+
+  emit_move_insn (str, gen_rtx_MEM (DImode, addr));
+  emit_move_insn (mask1, GEN_INT (0x0101010101010101));
+  emit_move_insn (mask2, GEN_INT (0x7f7f7f7f7f7f7f7f));
+
+  /* Process the chunk with the classic NUL-byte detection trick:
+     (x - 0x01..01) & ~x & 0x80..80 is nonzero iff some byte of X
+     is zero.  ~(x | 0x7f..7f) computes ~x & 0x80..80.  */
+  emit_insn (gen_subdi3 (tmp1, str, mask1));
+  emit_insn (gen_iordi3 (tmp2, str, mask2));
+  emit_insn (gen_rtx_SET (tmp2, gen_rtx_NOT (DImode, tmp2)));
+  emit_insn (gen_anddi3 (tmp3, tmp1, tmp2));
+
+  /* If a NUL was found, jump to calculate its exact position.  */
+  emit_cmp_and_jump_insns (tmp3, const0_rtx, NE, NULL_RTX,
+			   DImode, 0, end_loop_label);
+
+  /* No NUL in the first chunk: skip past it, then finish byte-wise.  */
+  emit_insn (gen_adddi3 (addr, addr, GEN_INT (8)));
+  emit_label (preloop_label);
+  mem = gen_rtx_POST_MODIFY (DImode, addr, plus_constant (DImode, addr, 1));
+
+  /* Simple byte loop; ADDR is post-incremented on every load.  */
+  emit_label (loop_label);
+  emit_move_insn (str,
+		  gen_rtx_ZERO_EXTEND (DImode, gen_rtx_MEM (QImode, mem)));
+  emit_cmp_and_jump_insns (str, const0_rtx, NE, NULL_RTX,
+			   DImode, 0, loop_label);
+
+  emit_insn (gen_subdi3 (result, addr, start_addr));
+  /* Adjust for the post-increment of the final (NUL) byte load.  */
+  emit_insn (gen_adddi3 (result, result, GEN_INT (-1)));
+  emit_jump_insn (gen_jump (end_label));
+  emit_barrier ();
+
+  /* A NUL was detected in the 8-byte chunk.  After byte-swapping the
+     marker word, CLZ counts 8 bits per non-NUL byte preceding it, so
+     CLZ >> 3 is the byte index of the NUL, i.e. the string length.  */
+  emit_label (end_loop_label);
+  emit_insn (gen_bswapdi2 (tmp3, tmp3));
+  emit_insn (gen_clzdi2 (tmp3, tmp3));
+  emit_insn (gen_ashrdi3 (tmp3, tmp3, GEN_INT (3)));
+  emit_move_insn (result, tmp3);
+
+  emit_label (end_label);
+}
+
 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
    SImode stores.  Handle the case when the constant has identical
    bottom and top halves.  This is beneficial when the two stores can be
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 10fcde6..7c60b69 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -189,6 +189,7 @@
     UNSPEC_CLASTB
     UNSPEC_FADDA
     UNSPEC_REV_SUBREG
+    UNSPEC_BUILTIN_STRLEN
 ])
 
 (define_c_enum "unspecv" [
@@ -395,6 +396,19 @@
   [(set_attr "type" "fccmp<s>")]
 )
 
+(define_expand "strlen<mode>"
+  [(set (match_operand:P 0 "register_operand")
+        (unspec:P [(match_operand:BLK 1 "memory_operand")
+                   (match_operand 2 "immediate_operand")
+                   (match_operand 3 "immediate_operand")]
+                  UNSPEC_BUILTIN_STRLEN))]
+  ""
+{
+  /* Operand 3 is the known alignment of the string.  The inline
+     expansion performs unconditional 8-byte loads, so only expand
+     when the string is at least 8-byte aligned; otherwise FAIL so
+     the compiler falls back to the library call, as the cover
+     letter promises.  */
+  if (!CONST_INT_P (operands[3]) || INTVAL (operands[3]) < 8)
+    FAIL;
+  aarch64_expand_strlen (operands);
+  DONE;
+})
+
+
 ;; Expansion of signed mod by a power of 2 using CSNEG.
 ;; For x0 % n where n is a power of 2 produce:
 ;; negs   x1, x0

Reply via email to