Hello! The attached patch introduces asm flag outputs in the search_line_sse42 main loop to handle the carry flag value from the pcmpestri insn. The slightly improved old code that uses the asm loop compiles to:
  96:	66 0f 6f 05 00 00 00	movdqa 0x0(%rip),%xmm0
  9d:	00
  9e:	48 83 ef 10          	sub    $0x10,%rdi
  a2:	ba 10 00 00 00       	mov    $0x10,%edx
  a7:	b8 04 00 00 00       	mov    $0x4,%eax
  ac:	0f 1f 40 00          	nopl   0x0(%rax)
  b0:	48 83 c7 10          	add    $0x10,%rdi
  b4:	66 0f 3a 61 07 00    	pcmpestri $0x0,(%rdi),%xmm0
  ba:	73 f4                	jae    b0 <_ZL17search_line_sse42PKhS0_+0x20>
  bc:	48 8d 04 0f          	lea    (%rdi,%rcx,1),%rax
  c0:	c3                   	retq

and new code results in:

  96:	66 0f 6f 05 00 00 00	movdqa 0x0(%rip),%xmm0
  9d:	00
  9e:	ba 10 00 00 00       	mov    $0x10,%edx
  a3:	b8 04 00 00 00       	mov    $0x4,%eax
  a8:	66 0f 3a 61 07 00    	pcmpestri $0x0,(%rdi),%xmm0
  ae:	72 0c                	jb     bc <_ZL17search_line_sse42PKhS0_+0x2c>
  b0:	48 83 c7 10          	add    $0x10,%rdi
  b4:	66 0f 3a 61 07 00    	pcmpestri $0x0,(%rdi),%xmm0
  ba:	73 f4                	jae    b0 <_ZL17search_line_sse42PKhS0_+0x20>
  bc:	48 8d 04 0f          	lea    (%rdi,%rcx,1),%rax
  c0:	c3                   	retq

which looks like an improvement to me.

2015-06-29  Uros Bizjak  <ubiz...@gmail.com>

	* lex.c (search_line_sse42) [__GCC_ASM_FLAG_OUTPUTS__]: New
	main loop using asm flag outputs.

The patch was bootstrapped and regression tested on x86_64-linux-gnu
{,-m32} (ivybridge), so both code paths were exercised.

Since this is a new feature - does the approach look OK?

Uros.
Index: lex.c =================================================================== --- lex.c (revision 225138) +++ lex.c (working copy) @@ -450,15 +450,30 @@ search_line_sse42 (const uchar *s, const uchar *en s = (const uchar *)((si + 16) & -16); } - /* Main loop, processing 16 bytes at a time. By doing the whole loop - in inline assembly, we can make proper use of the flags set. */ - __asm ( "sub $16, %1\n" - " .balign 16\n" + /* Main loop, processing 16 bytes at a time. */ +#ifdef __GCC_ASM_FLAG_OUTPUTS__ + while (1) + { + char f; + __asm ("%vpcmpestri\t$0, %2, %3" + : "=c"(index), "=@ccc"(f) + : "m"(*s), "x"(search), "a"(4), "d"(16)); + if (f) + break; + + s += 16; + } +#else + s -= 16; + /* By doing the whole loop in inline assembly, + we can make proper use of the flags set. */ + __asm ( ".balign 16\n" "0: add $16, %1\n" - " %vpcmpestri $0, (%1), %2\n" + " %vpcmpestri\t$0, (%1), %2\n" " jnc 0b" : "=&c"(index), "+r"(s) : "x"(search), "a"(4), "d"(16)); +#endif found: return s + index;