Hello!

The attached patch introduces asm flag outputs in the search_line_sse42 main
loop to handle the carry flag value from the pcmpestri insn. The slightly
improved old code, which uses an asm loop, compiles to:

      96:    66 0f 6f 05 00 00 00     movdqa 0x0(%rip),%xmm0
      9d:    00
      9e:    48 83 ef 10              sub    $0x10,%rdi
      a2:    ba 10 00 00 00           mov    $0x10,%edx
      a7:    b8 04 00 00 00           mov    $0x4,%eax
      ac:    0f 1f 40 00              nopl   0x0(%rax)
      b0:    48 83 c7 10              add    $0x10,%rdi
      b4:    66 0f 3a 61 07 00        pcmpestri $0x0,(%rdi),%xmm0
      ba:    73 f4                    jae    b0 <_ZL17search_line_sse42PKhS0_+0x20>
      bc:    48 8d 04 0f              lea    (%rdi,%rcx,1),%rax
      c0:    c3                       retq

and new code results in:

      96:    66 0f 6f 05 00 00 00     movdqa 0x0(%rip),%xmm0
      9d:    00
      9e:    ba 10 00 00 00           mov    $0x10,%edx
      a3:    b8 04 00 00 00           mov    $0x4,%eax
      a8:    66 0f 3a 61 07 00        pcmpestri $0x0,(%rdi),%xmm0
      ae:    72 0c                    jb     bc <_ZL17search_line_sse42PKhS0_+0x2c>
      b0:    48 83 c7 10              add    $0x10,%rdi
      b4:    66 0f 3a 61 07 00        pcmpestri $0x0,(%rdi),%xmm0
      ba:    73 f4                    jae    b0 <_ZL17search_line_sse42PKhS0_+0x20>
      bc:    48 8d 04 0f              lea    (%rdi,%rcx,1),%rax
      c0:    c3                       retq

which looks like an improvement to me.

2015-06-29  Uros Bizjak  <ubiz...@gmail.com>

    * lex.c (search_line_sse42) [__GCC_ASM_FLAG_OUTPUTS__]: New main
    loop using asm flag outputs.

The patch was bootstrapped and regression tested on x86_64-linux-gnu
{,-m32} (Ivy Bridge), so both code paths were exercised.

Since this is a new feature - does the approach look OK?

Uros.
Index: lex.c
===================================================================
--- lex.c       (revision 225138)
+++ lex.c       (working copy)
@@ -450,15 +450,30 @@ search_line_sse42 (const uchar *s, const uchar *en
       s = (const uchar *)((si + 16) & -16);
     }
 
-  /* Main loop, processing 16 bytes at a time.  By doing the whole loop
-     in inline assembly, we can make proper use of the flags set.  */
-  __asm (      "sub $16, %1\n"
-       "       .balign 16\n"
+  /* Main loop, processing 16 bytes at a time.  */
+#ifdef __GCC_ASM_FLAG_OUTPUTS__
+  while (1)
+    {
+      char f;
+      __asm ("%vpcmpestri\t$0, %2, %3"
+            : "=c"(index), "=@ccc"(f)
+            : "m"(*s), "x"(search), "a"(4), "d"(16));
+      if (f)
+       break;
+      
+      s += 16;
+    }
+#else
+  s -= 16;
+  /* By doing the whole loop in inline assembly,
+     we can make proper use of the flags set.  */
+  __asm (      ".balign 16\n"
        "0:     add $16, %1\n"
-       "       %vpcmpestri $0, (%1), %2\n"
+       "       %vpcmpestri\t$0, (%1), %2\n"
        "       jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));
+#endif
 
  found:
   return s + index;

Reply via email to