Hi,

As I wrote at

[PATCH, libcpp]: Use asm flag outputs in search_line_sse42 main loop

https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg113610.html

I won't repeat the reasons here; the summary is that the current sse4.2 code is
redundant, as it has the same performance as the sse2 one.
This improves sse2 performance by around 10%, matching the sse4.2 code, by
using a better loop header.

An updated benchmark is attached. It counts the number of lines of a given C
source file; I selected the benchmark's own source for reproducible results.
On Sandy Bridge the runtimes are as follows; FX-10 and Nehalem are similar.


time ./a.out line.c 1 100000;  time ./a.out line.c 2 100000;  time
./a.out line.c 3 100000;  time ./a.out line.c 4 100000;  time ./a.out
line.c 5 100000

# strpbrk
real    0m0.507s
user    0m0.505s
sys     0m0.000s
# current sse2
real    0m0.490s
user    0m0.490s
sys     0m0.000s
# current sse4.2
real    0m0.423s
user    0m0.420s
sys     0m0.003s
# improved header
real    0m0.450s
user    0m0.451s
sys     0m0.000s
# proposed version
real    0m0.426s
user    0m0.426s
sys     0m0.000s



        * lex.c (search_line_sse2): Improve performance by using
        proper header.
        (search_line_sse42): Delete.

diff --git a/libcpp/lex.c b/libcpp/lex.c
index 0ad9660..8032e6e 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -373,36 +373,110 @@ search_line_sse2 (const uchar *s, const uchar *end 
ATTRIBUTE_UNUSED)
   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
 
   unsigned int misalign, found, mask;
+
   const v16qi *p;
-  v16qi data, t;
+  v16qi data, t, tx;
+ 
+  if (s + 80 < end)
+    {
+      v16qi x0 = __builtin_ia32_loaddqu ((char const *) s);
+      tx =  __builtin_ia32_pcmpeqb128 (x0, repl_nl);
+      tx |= __builtin_ia32_pcmpeqb128 (x0, repl_cr);
+      tx |= __builtin_ia32_pcmpeqb128 (x0, repl_bs);
+      tx |= __builtin_ia32_pcmpeqb128 (x0, repl_qm);
+
+      found =  __builtin_ia32_pmovmskb128 (tx);
+      if (found)
+        {
+          found = __builtin_ctz (found);
+          return (const uchar *) s + found;
+        }
+      v16qi x1 = __builtin_ia32_loaddqu ((char const *) (s + 16));
+      v16qi x2 = __builtin_ia32_loaddqu ((char const *) (s + 32));
+      v16qi x3 = __builtin_ia32_loaddqu ((char const *) (s + 48));
+      v16qi x4 = __builtin_ia32_loaddqu ((char const *) (s + 64));
+
+      tx =  __builtin_ia32_pcmpeqb128 (x1, repl_nl);
+      tx |= __builtin_ia32_pcmpeqb128 (x1, repl_cr);
+      tx |= __builtin_ia32_pcmpeqb128 (x1, repl_bs);
+      tx |= __builtin_ia32_pcmpeqb128 (x1, repl_qm);
+
+      found =  __builtin_ia32_pmovmskb128 (tx);
+
+      if (found)
+        {
+          found = __builtin_ctz (found);
+          return (const uchar *) s + 16 + found;
+        }
+
+      tx =  __builtin_ia32_pcmpeqb128 (x2, repl_nl);
+      tx |= __builtin_ia32_pcmpeqb128 (x2, repl_cr);
+      tx |= __builtin_ia32_pcmpeqb128 (x2, repl_bs);
+      tx |= __builtin_ia32_pcmpeqb128 (x2, repl_qm);
+
+      found = __builtin_ia32_pmovmskb128 (tx);
+
+      if (found)
+        {
+          found = __builtin_ctz (found);
+          return (const uchar *) s + 32 + found;
+        }
+
+
+      tx =  __builtin_ia32_pcmpeqb128 (x3, repl_nl);
+      tx |= __builtin_ia32_pcmpeqb128 (x3, repl_cr);
+      tx |= __builtin_ia32_pcmpeqb128 (x3, repl_bs);
+      tx |= __builtin_ia32_pcmpeqb128 (x3, repl_qm);
+
+      found =  __builtin_ia32_pmovmskb128 (tx);
+
+      if (found)
+        {
+          found = __builtin_ctz (found);
+          return (const uchar *) s + 48 + found;
+        }
+
+      tx =  __builtin_ia32_pcmpeqb128 (x4, repl_nl);
+      tx |= __builtin_ia32_pcmpeqb128 (x4, repl_cr);
+      tx |= __builtin_ia32_pcmpeqb128 (x4, repl_bs);
+      tx |= __builtin_ia32_pcmpeqb128 (x4, repl_qm);
+
+      found =  __builtin_ia32_pmovmskb128 (tx);
+
+      if (found)
+        {
+          found = __builtin_ctz (found);
+          return (const uchar *) s + 64 + found;
+        }
+
+      s += 80;
+    }
 
   /* Align the source pointer.  */
   misalign = (uintptr_t)s & 15;
   p = (const v16qi *)((uintptr_t)s & -16);
   data = *p;
 
-  /* Create a mask for the bytes that are valid within the first
-     16-byte block.  The Idea here is that the AND with the mask
-     within the loop is "free", since we need some AND or TEST
-     insn in order to set the flags for the branch anyway.  */
   mask = -1u << misalign;
 
-  /* Main loop processing 16 bytes at a time.  */
-  goto start;
-  do
+  t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
+  t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
+  t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
+  t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
+  found = __builtin_ia32_pmovmskb128 (t);
+  found &= mask;
+
+  while (!found)
     {
       data = *++p;
-      mask = -1;
 
-    start:
       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
       found = __builtin_ia32_pmovmskb128 (t);
-      found &= mask;
     }
-  while (!found);
+
 
   /* FOUND contains 1 in bits for which we matched a relevant
      character.  Conversion to the byte index is trivial.  */
@@ -410,65 +484,7 @@ search_line_sse2 (const uchar *s, const uchar *end 
ATTRIBUTE_UNUSED)
   return (const uchar *)p + found;
 }
 
-#ifdef HAVE_SSE4
-/* A version of the fast scanner using SSE 4.2 vectorized string insns.  */
-
-static const uchar *
-#ifndef __SSE4_2__
-__attribute__((__target__("sse4.2")))
-#endif
-search_line_sse42 (const uchar *s, const uchar *end)
-{
-  typedef char v16qi __attribute__ ((__vector_size__ (16)));
-  static const v16qi search = { '\n', '\r', '?', '\\' };
-
-  uintptr_t si = (uintptr_t)s;
-  uintptr_t index;
-
-  /* Check for unaligned input.  */
-  if (si & 15)
-    {
-      if (__builtin_expect (end - s < 16, 0)
-         && __builtin_expect ((si & 0xfff) > 0xff0, 0))
-       {
-         /* There are less than 16 bytes left in the buffer, and less
-            than 16 bytes left on the page.  Reading 16 bytes at this
-            point might generate a spurious page fault.  Defer to the
-            SSE2 implementation, which already handles alignment.  */
-         return search_line_sse2 (s, end);
-       }
-
-      /* ??? The builtin doesn't understand that the PCMPESTRI read from
-        memory need not be aligned.  */
-      __asm ("%vpcmpestri $0, (%1), %2"
-            : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16));
-      if (__builtin_expect (index < 16, 0))
-       goto found;
-
-      /* Advance the pointer to an aligned address.  We will re-scan a
-        few bytes, but we no longer need care for reading past the
-        end of a page, since we're guaranteed a match.  */
-      s = (const uchar *)((si + 16) & -16);
-    }
 
-  /* Main loop, processing 16 bytes at a time.  By doing the whole loop
-     in inline assembly, we can make proper use of the flags set.  */
-  __asm (      "sub $16, %1\n"
-       "       .balign 16\n"
-       "0:     add $16, %1\n"
-       "       %vpcmpestri $0, (%1), %2\n"
-       "       jnc 0b"
-       : "=&c"(index), "+r"(s)
-       : "x"(search), "a"(4), "d"(16));
-
- found:
-  return s + index;
-}
-
-#else
-/* Work around out-dated assemblers without sse4 support.  */
-#define search_line_sse42 search_line_sse2
-#endif
 
 /* Check the CPU capabilities.  */
 
@@ -485,21 +501,15 @@ init_vectorized_lexer (void)
   search_line_fast_type impl = search_line_acc_char;
   int minimum = 0;
 
-#if defined(__SSE4_2__)
-  minimum = 3;
-#elif defined(__SSE2__)
+#if defined(__SSE2__)
   minimum = 2;
 #elif defined(__SSE__)
   minimum = 1;
 #endif
 
-  if (minimum == 3)
-    impl = search_line_sse42;
-  else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
+  if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
     {
-      if (minimum == 3 || (ecx & bit_SSE4_2))
-        impl = search_line_sse42;
-      else if (minimum == 2 || (edx & bit_SSE2))
+      if (minimum == 2 || (edx & bit_SSE2))
        impl = search_line_sse2;
       else if (minimum == 1 || (edx & bit_SSE))
        impl = search_line_mmx;
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#define handle_error(msg) \
    do { perror(msg); exit(EXIT_FAILURE); } while (0)

/* Baseline scanner: return a pointer to the first '\r', '\n', '?' or
   '\\' in X, or NULL when none occurs before the terminating NUL.
   The Y (end) argument is unused; it only exists so the signature
   matches the vectorized scanners.  */
char *next_line(char *x, char *y)
{
  (void) y;  /* unused; kept for signature compatibility */
  return strpbrk(x, "\r\n?\\");
}

#include <emmintrin.h>
#define __v16qi v16qi
#define uchar unsigned char


/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.  */
/* Index order used by all scanners below:
   [0] = '\n', [1] = '\r', [2] = '\\', [3] = '?'.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};

/* A version of the fast scanner using SSE2 vectorized byte compare insns.  */

/* Scan forward from S for the first '\n', '\r', '\\' or '?'.
   NOTE(review): END is never checked; the scan relies on the caller
   having placed a matching sentinel character at or past END — confirm
   against callers (main appends a '\n' sentinel).  Aligned 16-byte
   loads never cross a page boundary, so reading a few bytes before S
   or past the match is safe.  */
static const uchar *
#ifndef __SSE2__
__attribute__((__target__("sse2")))
#endif
search_line_sse2 (const uchar *s, const uchar *end )
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));

  const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  const v16qi repl_qm = *(const v16qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v16qi *p;
  v16qi data, t;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const v16qi *)((uintptr_t)s & -16);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 16 bytes at a time.  The first iteration
     enters at START so the pre-loaded, mask-restricted block is
     handled by the same compare sequence.  */
  goto start;
  do
    {
      data = *++p;
      mask = -1;

    start:
      /* OR together the equality masks for the four characters;
         PMOVMSKB collapses the byte mask to one bit per byte.  */
      t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
      t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
      t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
      t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
      found = __builtin_ia32_pmovmskb128 (t);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}
#define OR(x,y) ((x)|(y))
/* Proposed SSE2 scanner ("improved header").  When at least 80 bytes
   remain it checks five unaligned 16-byte chunks (bytes 0..79 of S,
   all strictly before END thanks to the s + 80 < end guard) before
   falling back to the aligned main loop.  Like search_line_sse2, the
   main loop relies on a sentinel separator past END to terminate —
   NOTE(review): confirm callers guarantee this.  */
static const uchar *
#ifndef __SSE2__
__attribute__((__target__("sse2")))
#endif
search_line_sse2v3 (const uchar *s, const uchar *end )
{

 typedef char v16qi __attribute__ ((__vector_size__ (16)));

  const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  const v16qi repl_qm = *(const v16qi *)repl_chars[3];

  unsigned int misalign, found, mask;

  const v16qi *p;
  v16qi data, t, tx;
 
  /* Fast path: a hit in the first 16 bytes avoids loading the other
     four chunks entirely.  */
  if (__builtin_expect (s + 80 < end, 1))
    {
      v16qi x0 = __builtin_ia32_loaddqu ((char const *) s);
      tx =  __builtin_ia32_pcmpeqb128 (x0, repl_nl);
      tx |= __builtin_ia32_pcmpeqb128 (x0, repl_cr);
      tx |= __builtin_ia32_pcmpeqb128 (x0, repl_bs);
      tx |= __builtin_ia32_pcmpeqb128 (x0, repl_qm);

      found =  __builtin_ia32_pmovmskb128 (tx);
      if (found)
        {
          found = __builtin_ctz (found);
          return (const uchar *) s + found;
        }
      /* Issue the remaining four unaligned loads together so they can
         overlap; the compare chains below consume them in order.  */
      v16qi x1 = __builtin_ia32_loaddqu ((char const *) (s + 16));
      v16qi x2 = __builtin_ia32_loaddqu ((char const *) (s + 32));
      v16qi x3 = __builtin_ia32_loaddqu ((char const *) (s + 48));
      v16qi x4 = __builtin_ia32_loaddqu ((char const *) (s + 64));

      tx =  __builtin_ia32_pcmpeqb128 (x1, repl_nl);
      tx |= __builtin_ia32_pcmpeqb128 (x1, repl_cr);
      tx |= __builtin_ia32_pcmpeqb128 (x1, repl_bs);
      tx |= __builtin_ia32_pcmpeqb128 (x1, repl_qm);

      found =  __builtin_ia32_pmovmskb128 (tx);

      if (found)
        {
          found = __builtin_ctz (found);
          return (const uchar *) s + 16 + found;
        }

      tx =  __builtin_ia32_pcmpeqb128 (x2, repl_nl);
      tx |= __builtin_ia32_pcmpeqb128 (x2, repl_cr);
      tx |= __builtin_ia32_pcmpeqb128 (x2, repl_bs);
      tx |= __builtin_ia32_pcmpeqb128 (x2, repl_qm);

      found = __builtin_ia32_pmovmskb128 (tx);

      if (found)
        {
          found = __builtin_ctz (found);
          return (const uchar *) s + 32 + found;
        }


      tx =  __builtin_ia32_pcmpeqb128 (x3, repl_nl);
      tx |= __builtin_ia32_pcmpeqb128 (x3, repl_cr);
      tx |= __builtin_ia32_pcmpeqb128 (x3, repl_bs);
      tx |= __builtin_ia32_pcmpeqb128 (x3, repl_qm);

      found =  __builtin_ia32_pmovmskb128 (tx);

      if (found)
        {
          found = __builtin_ctz (found);
          return (const uchar *) s + 48 + found;
        }

      tx =  __builtin_ia32_pcmpeqb128 (x4, repl_nl);
      tx |= __builtin_ia32_pcmpeqb128 (x4, repl_cr);
      tx |= __builtin_ia32_pcmpeqb128 (x4, repl_bs);
      tx |= __builtin_ia32_pcmpeqb128 (x4, repl_qm);

      found =  __builtin_ia32_pmovmskb128 (tx);

      if (found)
        {
          found = __builtin_ctz (found);
          return (const uchar *) s + 64 + found;
        }

      /* All 80 bytes clear; skip them before the aligned loop.  */
      s += 80;
    }

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const v16qi *)((uintptr_t)s & -16);
  data = *p;

  /* Mask off match bits for bytes preceding S in the first block.  */
  mask = -1u << misalign;

  t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
  t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
  t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
  t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
  found = __builtin_ia32_pmovmskb128 (t);
  found &= mask;

  /* Main loop processing 16 aligned bytes at a time.  */
  while (!found)
    {
      data = *++p;

      t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
      t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
      t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
      t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
      found = __builtin_ia32_pmovmskb128 (t);
    }

  
  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz (found);
  return (const uchar *) p + found;
}




/* A variant of search_line_sse2 that first probes 16 bytes with an
   unaligned load before entering the aligned main loop.
   NOTE(review): the original contained a large unrolled scan of bytes
   16..95 that was unreachable — a "goto next" jumped straight over it
   — and whose return statements used inconsistent offsets
   ("s + 16 + found" even for the +32/+48/+64 chunks).  That dead code
   has been deleted; observable behavior is unchanged.  Like the other
   scanners, termination relies on a sentinel separator at or past END
   — confirm against callers.  */
static const uchar *
#ifndef __SSE2__
__attribute__((__target__("sse2")))
#endif
search_line_sse2v2 (const uchar *s, const uchar *end )
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));

  const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  const v16qi repl_qm = *(const v16qi *)repl_chars[3];

  unsigned long misalign, found, mask;
  const v16qi *p;
  v16qi data, t;

  /* If at least 96 bytes remain, check the first 16 with an unaligned
     load; a hit here returns without any alignment setup.  */
  if (s + 96 < end)
    {
      v16qi x0 = (v16qi) _mm_loadu_si128 ((__m128i *) s);
      v16qi tx;

      tx = __builtin_ia32_pcmpeqb128 (x0, repl_nl);
      tx = OR (__builtin_ia32_pcmpeqb128 (x0, repl_cr), tx);
      tx = OR (__builtin_ia32_pcmpeqb128 (x0, repl_bs), tx);
      tx = OR (__builtin_ia32_pcmpeqb128 (x0, repl_qm), tx);

      found = __builtin_ia32_pmovmskb128 (tx);
      if (found)
        {
          found = __builtin_ctz (found);
          return (const uchar *) s + found;
        }

      /* First 16 bytes are clear; skip them before aligning.  */
      s += 16;
    }

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const v16qi *)((uintptr_t)s & -16);
  data = *p;

  /* Mask off match bits for bytes preceding S in the first block.  */
  mask = -1u << misalign;

  t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
  t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
  t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
  t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
  found = __builtin_ia32_pmovmskb128 (t);
  found &= mask;

  /* Main loop processing 16 aligned bytes at a time.  */
  while (!found)
    {
      data = *++p;

      t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
      t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
      t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
      t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
      found = __builtin_ia32_pmovmskb128 (t);
    }

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}

#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.  */

/* Scan forward from S using PCMPESTRI with the 4-character needle in
   SEARCH ("a"(4) is the needle length, "d"(16) the haystack chunk
   length).  Returns a pointer to the first matching character; like
   the SSE2 scanners, the main loop assumes a matching sentinel exists
   at or past END.  */
static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      __asm ("%vpcmpestri $0, (%1), %2"
             : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16));
      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 16) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  By doing the whole loop
     in inline assembly, we can make proper use of the flags set.
     The JNC loops while the instruction reports no match (carry
     clear); ECX holds the match index on exit.  */
  __asm (      "sub $16, %1\n"
        "       .balign 16\n"
        "0:     add $16, %1\n"
        "       %vpcmpestri $0, (%1), %2\n"
        "       jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));

 found:
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif




/* Count separator characters in [START, END) using the strpbrk
   baseline scanner.  */
int line_count(char *start, char *end)
{
  int total;

  for (total = 0; start != end; total++)
    start = next_line(start + 1, end);

  return total;
}
/* Count separator characters in [START, END) using the current
   SSE2 scanner.  */
int line_count2(char *start, char *end)
{
  int total;

  for (total = 0; start != end; total++)
    start = (char *) search_line_sse2(start + 1, end);

  return total;
}
/* Count separator characters in [START, END) using the SSE4.2
   scanner.  */
int line_count3(char *start, char *end)
{
  int total;

  for (total = 0; start != end; total++)
    start = (char *) search_line_sse42(start + 1, end);

  return total;
}
/* Count separator characters in [START, END) using the v2 SSE2
   scanner.  */
int line_count4(char *start, char *end)
{
  int total;

  for (total = 0; start != end; total++)
    start = (char *) search_line_sse2v2(start + 1, end);

  return total;
}
/* Count separator characters in [START, END) using the proposed v3
   SSE2 scanner.  */
int line_count5(char *start, char *end)
{
  int total;

  for (total = 0; start != end; total++)
    start = (char *) search_line_sse2v3(start + 1, end);

  return total;
}
int
main(int argc, char *argv[])
{
    char *addr;
    int fd;
    struct stat sb;
    off_t offset, pa_offset;
    size_t length;
    ssize_t s;

    fd = open(argv[1], O_RDONLY);
    if (fd == -1)
        handle_error("open");
    if (fstat(fd, &sb) == -1)           /* To obtain file size */
        handle_error("fstat");


    addr = mmap(NULL, sb.st_size, PROT_READ | PROT_WRITE,
                MAP_PRIVATE, fd, 0);
    if (addr == MAP_FAILED)
        handle_error("mmap");
    addr[sb.st_size] = '\n';
    int sum = 0;
    int i;
    if (atoi(argv[2]) == 1)
      for (i=0;i<atoi(argv[3]);i++)
        sum += line_count(addr, addr+sb.st_size);
    if (atoi(argv[2]) == 2)
      for (i=0;i<atoi(argv[3]);i++)
        sum += line_count2(addr, addr+sb.st_size);
    if (atoi(argv[2]) == 3)
      for (i=0;i<atoi(argv[3]);i++)
        sum += line_count3(addr, addr+sb.st_size);
    if (atoi(argv[2]) == 4)
      for (i=0;i<atoi(argv[3]);i++)
        sum += line_count4(addr, addr+sb.st_size);
    if (atoi(argv[2]) == 5)
      for (i=0;i<atoi(argv[3]);i++)
        sum += line_count5(addr, addr+sb.st_size);


    return sum;
}

Reply via email to