Hi.

As mentioned in the PR, strlen expansion leading to rep-scasb leads
to a slow code. The expansion happens for -Os and -O1, with -O2 we
expand into 4B loop if pointer alignment is known to be at least 4.
When using -minline-all-stringops and -O2, the 4B loop happens also
for an unknown alignment (equal to 1).

I'm suggesting to adjust that to:
- -Os will keep using rep-scasb as -Os means optimize for size
 no matter what speed impact is
- otherwise use call to strlen
- when -minline-all-stringops is enabled and -O2+ is used, then
  expand to 4B loop for all possible alignments

Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
I've also tested SPEC2006 and SPEC2017 and I don't see any significant
speed or (and) size change.

Ready to be installed?
Thanks,
Martin

gcc/ChangeLog:

2019-04-23  Martin Liska  <mli...@suse.cz>

        PR target/88809
        * config/i386/i386.c (ix86_expand_strlen): For -Os use
        rep-scasb. Otherwise use strlen call. With
        -minline-all-stringops use inline expansion using 4B loop.
        * doc/invoke.texi: Document the change of
        -minline-all-stringops.

gcc/testsuite/ChangeLog:

2019-04-23  Martin Liska  <mli...@suse.cz>

        PR target/88809
        * gcc.target/i386/pr88809.c: New test.
        * gcc.target/i386/pr88809-2.c: New test.
---
 gcc/config/i386/i386.c                    | 67 +++++++++++------------
 gcc/doc/invoke.texi                       |  5 +-
 gcc/testsuite/gcc.target/i386/pr88809-2.c |  9 +++
 gcc/testsuite/gcc.target/i386/pr88809.c   |  9 +++
 4 files changed, 54 insertions(+), 36 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88809-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr88809.c


diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 498a35d8aea..fe6eff78996 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -28270,41 +28270,11 @@ ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
 {
   rtx addr, scratch1, scratch2, scratch3, scratch4;
 
-  /* The generic case of strlen expander is long.  Avoid it's
-     expanding unless TARGET_INLINE_ALL_STRINGOPS.  */
-
-  if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
-      && !TARGET_INLINE_ALL_STRINGOPS
-      && !optimize_insn_for_size_p ()
-      && (!CONST_INT_P (align) || INTVAL (align) < 4))
-    return false;
-
-  addr = force_reg (Pmode, XEXP (src, 0));
-  scratch1 = gen_reg_rtx (Pmode);
-
-  if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
-      && !optimize_insn_for_size_p ())
-    {
-      /* Well it seems that some optimizer does not combine a call like
-         foo(strlen(bar), strlen(bar));
-         when the move and the subtraction is done here.  It does calculate
-         the length just once when these instructions are done inside of
-         output_strlen_unroll().  But I think since &bar[strlen(bar)] is
-         often used and I use one fewer register for the lifetime of
-         output_strlen_unroll() this is better.  */
-
-      emit_move_insn (out, addr);
-
-      ix86_expand_strlensi_unroll_1 (out, src, align);
-
-      /* strlensi_unroll_1 returns the address of the zero at the end of
-         the string, like memchr(), so compute the length by subtracting
-         the start address.  */
-      emit_insn (ix86_gen_sub3 (out, out, addr));
-    }
-  else
+  if (optimize_insn_for_size_p ())
     {
       rtx unspec;
+      addr = force_reg (Pmode, XEXP (src, 0));
+      scratch1 = gen_reg_rtx (Pmode);
 
       /* Can't use this if the user has appropriated eax, ecx, or edi.  */
       if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
@@ -28328,8 +28298,37 @@ ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
       emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
       emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
       emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
+      return true;
     }
-  return true;
+  else if (TARGET_UNROLL_STRLEN
+	   && TARGET_INLINE_ALL_STRINGOPS
+	   && eoschar == const0_rtx
+	   && optimize > 1)
+    {
+      /* The generic case of strlen expander is long.  Avoid it's
+	 expanding unless TARGET_INLINE_ALL_STRINGOPS.  */
+      addr = force_reg (Pmode, XEXP (src, 0));
+      scratch1 = gen_reg_rtx (Pmode);
+      /* Well it seems that some optimizer does not combine a call like
+	 foo(strlen(bar), strlen(bar));
+	 when the move and the subtraction is done here.  It does calculate
+	 the length just once when these instructions are done inside of
+	 output_strlen_unroll().  But I think since &bar[strlen(bar)] is
+	 often used and I use one fewer register for the lifetime of
+	 output_strlen_unroll() this is better.  */
+
+      emit_move_insn (out, addr);
+
+      ix86_expand_strlensi_unroll_1 (out, src, align);
+
+      /* strlensi_unroll_1 returns the address of the zero at the end of
+	 the string, like memchr(), so compute the length by subtracting
+	 the start address.  */
+      emit_insn (ix86_gen_sub3 (out, out, addr));
+      return true;
+    }
+  else
+    return false;
 }
 
 /* For given symbol (function) construct code to compute address of it's PLT
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index f4aa9e53de8..c87ee742ce2 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -28499,8 +28499,9 @@ By default GCC inlines string operations only when the destination is
 known to be aligned to least a 4-byte boundary.  
 This enables more inlining and increases code
 size, but may improve performance of code that depends on fast
-@code{memcpy}, @code{strlen},
-and @code{memset} for short lengths.
+@code{memcpy} and @code{memset} for short lengths.
+The option enables inline expansion of @code{strlen} for all
+pointer alignments.
 
 @item -minline-stringops-dynamically
 @opindex minline-stringops-dynamically
diff --git a/gcc/testsuite/gcc.target/i386/pr88809-2.c b/gcc/testsuite/gcc.target/i386/pr88809-2.c
new file mode 100644
index 00000000000..24377791d1c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88809-2.c
@@ -0,0 +1,9 @@
+/* PR target/88809 */
+/* { dg-options "-Os" } */
+
+unsigned int foo (const char *ptr)
+{
+  return __builtin_strlen (ptr);
+}
+
+/* { dg-final { scan-assembler "repnz\[ \t\]scasb" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr88809.c b/gcc/testsuite/gcc.target/i386/pr88809.c
new file mode 100644
index 00000000000..20844ddb921
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr88809.c
@@ -0,0 +1,9 @@
+/* PR target/88809 */
+/* { dg-options "-O" } */
+
+unsigned int foo (const char *ptr)
+{
+  return __builtin_strlen (ptr);
+}
+
+/* { dg-final { scan-assembler "call\[ \t\]strlen" } } */

Reply via email to