Hi. As mentioned in the PR, strlen expansion leading to rep-scasb leads to a slow code. The expansion happens for -Os and -O1, with -O2 we expand into 4B loop if pointer alignment is known to be at least 4. When using -minline-all-stringops and -O2, the 4B loop happens also for an unknown alignment (equal to 1).
I'm suggesting to adjust that to: - -Os will keep using rep-scasb as -Os means optimize for size no matter what speed impact is - otherwise use call to strlen - when -minline-all-stringops is enabled and -O2+ is used, then expand to 4B loop for all possible alignments Patch can bootstrap on x86_64-linux-gnu and survives regression tests. I've also tested SPEC2006 and SPEC2017 and I don't see any significant speed or (and) size change. Ready to be installed? Thanks, Martin gcc/ChangeLog: 2019-04-23 Martin Liska <mli...@suse.cz> PR target/88809 * config/i386/i386.c (ix86_expand_strlen): For -Os use rep-scasb. Otherwise use strlen call. With -minline-all-stringops use inline expansion using 4B loop. * doc/invoke.texi: Document the change of -minline-all-stringops. gcc/testsuite/ChangeLog: 2019-04-23 Martin Liska <mli...@suse.cz> PR target/88809 * gcc.target/i386/pr88809.c: New test. * gcc.target/i386/pr88809-2.c: New test. --- gcc/config/i386/i386.c | 67 +++++++++++------------ gcc/doc/invoke.texi | 5 +- gcc/testsuite/gcc.target/i386/pr88809-2.c | 9 +++ gcc/testsuite/gcc.target/i386/pr88809.c | 9 +++ 4 files changed, 54 insertions(+), 36 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr88809-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr88809.c
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 498a35d8aea..fe6eff78996 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -28270,41 +28270,11 @@ ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align) { rtx addr, scratch1, scratch2, scratch3, scratch4; - /* The generic case of strlen expander is long. Avoid it's - expanding unless TARGET_INLINE_ALL_STRINGOPS. */ - - if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1 - && !TARGET_INLINE_ALL_STRINGOPS - && !optimize_insn_for_size_p () - && (!CONST_INT_P (align) || INTVAL (align) < 4)) - return false; - - addr = force_reg (Pmode, XEXP (src, 0)); - scratch1 = gen_reg_rtx (Pmode); - - if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1 - && !optimize_insn_for_size_p ()) - { - /* Well it seems that some optimizer does not combine a call like - foo(strlen(bar), strlen(bar)); - when the move and the subtraction is done here. It does calculate - the length just once when these instructions are done inside of - output_strlen_unroll(). But I think since &bar[strlen(bar)] is - often used and I use one fewer register for the lifetime of - output_strlen_unroll() this is better. */ - - emit_move_insn (out, addr); - - ix86_expand_strlensi_unroll_1 (out, src, align); - - /* strlensi_unroll_1 returns the address of the zero at the end of - the string, like memchr(), so compute the length by subtracting - the start address. */ - emit_insn (ix86_gen_sub3 (out, out, addr)); - } - else + if (optimize_insn_for_size_p ()) { rtx unspec; + addr = force_reg (Pmode, XEXP (src, 0)); + scratch1 = gen_reg_rtx (Pmode); /* Can't use this if the user has appropriated eax, ecx, or edi. */ if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG]) @@ -28328,8 +28298,37 @@ ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align) emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec)); emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1)); emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx)); + return true; } - return true; + else if (TARGET_UNROLL_STRLEN + && TARGET_INLINE_ALL_STRINGOPS + && eoschar == const0_rtx + && optimize > 1) + { + /* The generic case of strlen expander is long. Avoid it's + expanding unless TARGET_INLINE_ALL_STRINGOPS. */ + addr = force_reg (Pmode, XEXP (src, 0)); + scratch1 = gen_reg_rtx (Pmode); + /* Well it seems that some optimizer does not combine a call like + foo(strlen(bar), strlen(bar)); + when the move and the subtraction is done here. It does calculate + the length just once when these instructions are done inside of + output_strlen_unroll(). But I think since &bar[strlen(bar)] is + often used and I use one fewer register for the lifetime of + output_strlen_unroll() this is better. */ + + emit_move_insn (out, addr); + + ix86_expand_strlensi_unroll_1 (out, src, align); + + /* strlensi_unroll_1 returns the address of the zero at the end of + the string, like memchr(), so compute the length by subtracting + the start address. */ + emit_insn (ix86_gen_sub3 (out, out, addr)); + return true; + } + else + return false; } /* For given symbol (function) construct code to compute address of it's PLT diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index f4aa9e53de8..c87ee742ce2 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -28499,8 +28499,9 @@ By default GCC inlines string operations only when the destination is known to be aligned to least a 4-byte boundary. This enables more inlining and increases code size, but may improve performance of code that depends on fast -@code{memcpy}, @code{strlen}, -and @code{memset} for short lengths. +@code{memcpy} and @code{memset} for short lengths. +The option enables inline expansion of @code{strlen} for all +pointer alignments. @item -minline-stringops-dynamically @opindex minline-stringops-dynamically diff --git a/gcc/testsuite/gcc.target/i386/pr88809-2.c b/gcc/testsuite/gcc.target/i386/pr88809-2.c new file mode 100644 index 00000000000..24377791d1c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88809-2.c @@ -0,0 +1,9 @@ +/* PR target/88809 */ +/* { dg-options "-Os" } */ + +unsigned int foo (const char *ptr) +{ + return __builtin_strlen (ptr); +} + +/* { dg-final { scan-assembler "repnz\[ \t\]scasb" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr88809.c b/gcc/testsuite/gcc.target/i386/pr88809.c new file mode 100644 index 00000000000..20844ddb921 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr88809.c @@ -0,0 +1,9 @@ +/* PR target/88809 */ +/* { dg-options "-O" } */ + +unsigned int foo (const char *ptr) +{ + return __builtin_strlen (ptr); +} + +/* { dg-final { scan-assembler "call\[ \t\]strlen" } } */