This patchset fixes some latent problems in cmpstrn* patterns for x86,
and introduces cmpmemsi for short fixed-size memcmp.

I've verified that repz cmpsb is not a good choice for memcmp,
performance-wise, so I turned to movbe/bswapped loads and unsigned
compares instead.  Those performed better than glibc's CPU-optimized
memcmp for all short counts I threw at them, for aligned and
misaligned inputs, on very old and quite new CPUs, both Intel and
AMD.  Since I don't expect this result to carry over to other arches,
I kept the code x86-specific.
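
To illustrate the approach, here is a hand-written plain-C sketch of
what the inlined compare amounts to for an 8-byte block.  It is not
the actual expansion the patches emit, and the memcmp8_sketch name is
made up; it just assumes the byte-reversed unaligned loads map to
movbe or mov+bswap:

/* Sketch only: byte-reversed loads and an unsigned compare implement
   memcmp's lexicographic byte order for a fixed 8-byte length.  */
static inline int
memcmp8_sketch (const void *p, const void *q)
{
  unsigned long long a, b;
  __builtin_memcpy (&a, p, 8);	/* unaligned 8-byte loads */
  __builtin_memcpy (&b, q, 8);
  a = __builtin_bswap64 (a);	/* compare in memory (big-endian) order */
  b = __builtin_bswap64 (b);
  return (a > b) - (a < b);	/* sign of the unsigned compare */
}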

The fixed-size inlined memcmps come at a significant code-size cost
per compare sequence, so rather than trying to find a per-CPU
break-even point, I put in a small cut-off limit that varies with the
optimization level.
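
To make the cut-off concrete, here is the size heuristic from the
third patch, restated as a stand-alone C function; the function and
parameter names here are mine, for illustration only:

/* Returns nonzero if a memcmp of LEN constant bytes is inlined at
   level OPTIMIZE_LEVEL, on a 64-bit target if TARGET_64BIT.  Per the
   comment in the patch, this allows up to 4 bytes or 1 word at -O1,
   4 words or 1 word plus 3 compares at -O2, and 7 words or 4 words
   plus 3 compares at -O3.  */
static int
inline_memcmp_p (unsigned long long len, int optimize_level,
		 int target_64bit)
{
  long long size = target_64bit ? len / 8 + !!(len & 4) : len / 4;
  if (size)
    size++;
  size += !!(len & 1) + !!(len & 2);
  if (size > 1)
    size++;
  return size <= optimize_level * 3;
}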

I considered adding a param for tuning, but params wouldn't work in
the md file, and it seemed like overkill anyway, so I'd rather ask
for feedback before proceeding any further down that path.


In order to compare the performance of memcmp, I used the attached
test program, which uses the x86 CPU cycle counter and repeats each
operation to seek its minimum cycle count, rather than an average,
which is not so reliable given all the self-tuning, caching and
whatnot a CPU is capable of.

The test program contains three kinds of tests: comparing identical
buffers with the same alignment, comparing misaligned uncached
identical buffers, and averaging over misaligned buffers whose
contents are about twice as likely to first differ at byte N as at
byte N+1.


The patchset has 3 patches.  The first fixes latent issues in the
existing cmpstr patterns that became apparent with the introduction
of cmpmem.  The second introduces a very simple cmpmemsi pattern that
only expands memcmp when the length is a power of two small enough
that the whole block can be loaded from memory into a non-vector
register with a single instruction, regardless of alignment.  The
third extends it to expand up to a certain number of compare
sequences, depending on the optimization level, so that memcmp gets
inlined for several common small block sizes without growing the code
too much at low optimization levels.

The set was regstrapped on x86_64-linux-gnu.  In all tests, at both
-m64 and -m32, on various old and new CPUs, the test program
performed better when memcmp was inlined than when it wasn't.

Ok to install?

Would it make sense to install the test program somewhere?

/* memcmp inlining benchmarking program
   Copyright (C) 2018-2019 Free Software Foundation, Inc.
   Contributed by Alexandre Oliva <ol...@adacore.com>, with fragments
   copied from GNU libc's sysdeps/x86/hp-timing.h.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* From hp-timing.h  */
typedef unsigned long long int hp_timing_t;
#define HP_TIMING_DIFF_INIT() \
  do {									      \
    if (hp_timing_overhead == 0)				      \
      {									      \
	int __cnt = 5;							      \
	hp_timing_overhead = ~0ull;				      \
	do								      \
	  {								      \
	    hp_timing_t __t1, __t2;					      \
	    HP_TIMING_NOW (__t1);					      \
	    HP_TIMING_NOW (__t2);					      \
	    if (__t2 - __t1 < hp_timing_overhead)		      \
	      hp_timing_overhead = __t2 - __t1;		      \
	  }								      \
	while (--__cnt > 0);						      \
      }									      \
  } while (0)
#define HP_TIMING_DIFF(Diff, Start, End)	(Diff) = ((End) - (Start))
#if 0
#define HP_TIMING_NOW(Var) \
  (__extension__ ({                             \
    unsigned int __aux;				\
    (Var) = __builtin_ia32_rdtscp (&__aux);     \
  }))
#else
#define HP_TIMING_NOW(Var) \
  (__extension__ ({                             \
    (Var) = __builtin_ia32_rdtsc ();		\
  }))
#endif

static char *
init_buf (size_t len)
{
  char *buf = malloc (len);
  if (!buf)
    abort ();
  memset (buf, 0, len);
  return buf;
}

/* This test times memcmp calls of length L at the beginning of a
   buffer BUF of length LEN.  It is repeated so as to get the most
   benefit from caching and branch prediction and whatnot; identical
   inputs force memcmp to go through the entire block, unless the
   pointers are tested for equality.  Inlined versions may infer a
   zero result without explicit pointer equality testing, even though
   we attempt to hide from the compiler the fact that we're using the
   same pointer.  */

static void
test_samebuf (const int L, const char *buf, size_t len)
{
  if (L > len)
    return;

  printf ("samebuf %i: ", L);

  hp_timing_t hp_timing_overhead = 0, begin, end, diff, mindiff;
  HP_TIMING_DIFF_INIT ();

  size_t minrep = 16384;
  for (int m = 0; m < minrep; m++)
    {
      char *buf0, *buf1;
      asm ("" : "=X" (buf0) : "0" (buf));
      asm ("" : "=X" (buf1) : "0" (buf));

      HP_TIMING_NOW (begin);

      int r = memcmp (buf0, buf1, L);
      asm ("" : : "rm" (r));

      HP_TIMING_NOW (end);
      HP_TIMING_DIFF (diff, begin, end);

      if (!m || diff < mindiff)
	{
	  if (0 && !m)
	    printf ("%lli - ", (long long)diff);
	  mindiff = diff;
	  m = 0;
	}
    }

  printf ("%lli\n", (long long)mindiff);
}

/* Like test_samebuf, this repeatedly calls memcmp to obtain a lower
   bound for compares of identical blocks of L bytes, but unlike
   test_samebuf, it compares distinct memory blocks that are known to
   be identical but misaligned, cycling over a large buffer BUF of
   length LEN.  It attempts to avoid using cached memory, so BUFP
   should point to the first byte of BUF to be used in the next
   iteration.  */

static void
test_eqbufs (const int L, char *buf, size_t len, char **bufp)
{
  size_t minrep = 16384;
  size_t delta = 257;
  if (L > delta)
    delta = L - L % delta + delta;
  if (len < L)
    return;

  printf ("eqbufs %i: ", L);

  hp_timing_t hp_timing_overhead = 0, begin, end, diff, mindiff;
  HP_TIMING_DIFF_INIT ();

  char *buf0 = *bufp;
  for (int m = 0; m < minrep; m++)
    {
      for (;;)
	if (buf0 + L >= buf + len)
	  buf0 -= len;
	else if (buf0 < buf)
	  buf0 += delta;
	else
	  break;

      char *buf1 = buf0 + delta;
      if (buf1 + L >= buf + len)
        for (;;)
          if (buf1 + L >= buf + len)
            buf1 -= len;
          else if (buf1 < buf)
            buf1 += delta;
          else
            break;

      if (0)
	printf ("%+li %+li ", buf0 - buf, buf1 - buf);

      HP_TIMING_NOW (begin);

      int r = memcmp (buf0, buf1, L);
      asm ("" : : "rm" (r));

      HP_TIMING_NOW (end);
      HP_TIMING_DIFF (diff, begin, end);

      if (!m || diff < mindiff)
	{
	  if (0 && !m)
	    printf ("%lli - ", (long long)diff);
	  mindiff = diff;
	  m = 0;
	}

      buf0 = buf1 + delta;
    }

  *bufp = buf0;

  printf ("%lli\n", (long long)mindiff);
}

/* This test attempts to estimate the average performance of memcmp
   over buffers with different alignments and contents, such that
   their alignments match only by chance, and their contents are
   about twice as likely to first differ at the Nth byte as at the
   (N+1)th byte.

   It creates its own buffer with repetitions of the following
   sequence:

    L    '\000', 1     '\001',
   (L-1) '\000', 1     '\001',
   ...
      1  '\000', 1     '\001',
   ...
      1  '\000', (L-1) '\001',
      1  '\000',  L    '\001'
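
   For instance, with L == 2 this is the 6-byte sequence 0 0 1 0 1 1.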

   Such blocks are copied over a buffer that spans about 2*C bytes,
   and then we memcmp each contiguous subsequence of length L,
   starting in the first sequence of the first half of the buffer (but
   possibly ending in the first few bytes of the second sequence),
   with each such subsequence in the first sequence of the second half
   of the buffer.  This amounts to averaging memcmp over all
   combinations of alignment, comparing buffers that are about twice
   as likely to have their first difference at byte [B] as at byte
   [B+1].  This measurement is repeated for each pair of sequences,
   one taken from each half of the buffer, and the whole thing is
   repeated N times.  The lowest measured total, divided twice by V
   (the length of a sequence), is the result.

 */

/* T is the length of one repeating sequence as constructed in
   test_varbuf below; V is the number of starting offsets compared,
   which happens to equal T.  */
#define T (L > 1 ? L * (L + 3) - 4 : 2)
#define V T

/* C is roughly half the size of the buffer; S is the number of
   sequence copies in each half; N is the number of outer
   repetitions.  */
#define C (8 * 128 * 128)
#if 0
#undef C
#define C (4 * L * T)
#endif

#define S (C / T)
#define N 32

static void
test_varbuf (const int L)
{
  char (*buf)[S][T] = malloc (2 * S * T + (L - 1));

  if (1)
    {
      int n = 0, m = T;
      for (int i = L; i > 1; i--)
	{
	  for (int j = 0; j < i; j++)
	    {
	      buf[0][0][n++] = 0;
	      buf[0][0][--m] = 1;
	    }
	  buf[0][0][n++] = 1;
	  buf[0][0][--m] = 0;
	}
      if (n * 2 != T || m != n)
	abort ();
    }
  for (int n = 1; n < S; n++)
    memcpy (&buf[0][n], buf[0][0], sizeof (buf[0][0]));
  memcpy (&buf[1], &buf[0], sizeof (buf[0]));
  memcpy (&buf[2], &buf[0], L - 1);

  hp_timing_t hp_timing_overhead = 0, begin, end, diff, mindiff;
  HP_TIMING_DIFF_INIT ();

  for (int n = 0; n < N; n++)
    for (int s = 0; s < S; s++)
      {
	HP_TIMING_NOW (begin);
	for (int i = 0; i < V; i++)
	  for (int j = 0; j < V; j++)
	    {
	      int r = memcmp (&buf[0][s][i], &buf[1][s][j], L);
	      asm ("" : : "rm" (r));
	    }
	HP_TIMING_NOW (end);
	HP_TIMING_DIFF (diff, begin, end);
	if ((!n && !s) || diff < mindiff)
	  mindiff = diff;
      }

  printf ("%f = %lli / %lli.0 repeated %i times\n",
	  (double)mindiff / V / V,
	  (long long)mindiff, (long long)V * V, S * N);

  free (buf);
}

/* This buffer, used mainly for eqbufs, is supposed to be larger than
   the memory caches.  It should be divisible by 2, by 256, and also
   by 257 (the minimum delta in test_eqbufs).  */

#define ALL (2 * 256 * 2 /* = 1024 */ * 1024 * 257)

/* Select one or more of the tests by tweaking the if ()s below.
   Compile with optimization, both with and without
   -fno-builtin-memcmp, run both binaries and compare the outputs.  */

int
main(int argc, char *argv[])
{
  char *buf = init_buf (ALL), *cbuf = buf;
  if (1)
    {
#ifdef SZ
      test_samebuf (SZ, buf, ALL);
      test_eqbufs (SZ, buf, ALL, &cbuf);
      test_varbuf (SZ);
#endif
    }

  if (0)
    for (int i = 0; i <= 16; i++)
      test_samebuf (1 << i, buf, ALL);
  if (0)
    for (int i = 0; i <= 14; i++)
      test_eqbufs (1 << i, buf, ALL, &cbuf);
  free (buf);

  if (0)
    for (int i = 1; i <= 8; i++)
      test_varbuf (1 << i);
}
make cmpstrnsi patterns safer

If cmpstrnqi_1 is given a length in a non-immediate operand, which
cmpstrnsi accepts as an input, that operand is modified rather than
preserved.  Turn the clobbered match_dup into a separate output
operand to avoid this undesirable effect.  The problem never came up
in existing uses of cmpstrnsi, but the upcoming uses of cmpmemsi run
into it.

While at it, adjust the cmpstrnqi_1 patterns so that FLAGS_REG is
visibly propagated when the length is zero, rather than being merely
used.


for  gcc/ChangeLog

        * config/i386/i386.md (cmpstrnsi): Create separate output
        count to pass to cmpstrnqi_nz_1 and ...
        (cmpstrnqi_1): ... this.  Do not use a match_dup of the count
        input as an output.  Preserve FLAGS_REG when length is zero.
        (*cmpstrnqi_1): Preserve FLAGS_REG when length is zero.
---
 gcc/config/i386/i386.md |   25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 7ad9788241988..2b7469991d837 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16974,7 +16974,7 @@
    (use (match_operand 4 "immediate_operand"))]
   ""
 {
-  rtx addr1, addr2, countreg, align, out;
+  rtx addr1, addr2, countreg, countout, align, out;
 
   if (optimize_insn_for_size_p () && !TARGET_INLINE_ALL_STRINGOPS)
     FAIL;
@@ -17006,6 +17006,7 @@
     operands[2] = replace_equiv_address_nv (operands[2], addr2);
 
   countreg = ix86_zero_extend_to_Pmode (operands[3]);
+  countout = gen_reg_rtx (Pmode);
 
   /* %%% Iff we are testing strict equality, we can use known alignment
      to good advantage.  This may be possible with combine, particularly
@@ -17019,14 +17020,14 @@
          emit_move_insn (operands[0], const0_rtx);
          DONE;
        }
-      emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, countreg, align,
-                                    operands[1], operands[2]));
+      emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, countout, align,
+                                    operands[1], operands[2], countreg));
     }
   else
     {
       emit_insn (gen_cmp_1 (Pmode, countreg, countreg));
-      emit_insn (gen_cmpstrnqi_1 (addr1, addr2, countreg, align,
-                                 operands[1], operands[2]));
+      emit_insn (gen_cmpstrnqi_1 (addr1, addr2, countout, align,
+                                 operands[1], operands[2], countreg));
     }
 
   out = gen_lowpart (QImode, operands[0]);
@@ -17060,11 +17061,11 @@
   [(parallel [(set (reg:CC FLAGS_REG)
                   (compare:CC (match_operand 4 "memory_operand")
                               (match_operand 5 "memory_operand")))
-             (use (match_operand 2 "register_operand"))
+             (use (match_operand 6 "register_operand"))
              (use (match_operand:SI 3 "immediate_operand"))
              (clobber (match_operand 0 "register_operand"))
              (clobber (match_operand 1 "register_operand"))
-             (clobber (match_dup 2))])]
+             (clobber (match_operand 2 "register_operand"))])]
   ""
 {
   if (TARGET_CLD)
@@ -17096,16 +17097,15 @@
 
 (define_expand "cmpstrnqi_1"
   [(parallel [(set (reg:CC FLAGS_REG)
-               (if_then_else:CC (ne (match_operand 2 "register_operand")
+               (if_then_else:CC (ne (match_operand 6 "register_operand")
                                     (const_int 0))
                  (compare:CC (match_operand 4 "memory_operand")
                              (match_operand 5 "memory_operand"))
-                 (const_int 0)))
+                 (reg:CC FLAGS_REG)))
              (use (match_operand:SI 3 "immediate_operand"))
-             (use (reg:CC FLAGS_REG))
              (clobber (match_operand 0 "register_operand"))
              (clobber (match_operand 1 "register_operand"))
-             (clobber (match_dup 2))])]
+             (clobber (match_operand 2 "register_operand"))])]
   ""
 {
   if (TARGET_CLD)
@@ -17118,9 +17118,8 @@
                             (const_int 0))
          (compare:CC (mem:BLK (match_operand:P 4 "register_operand" "0"))
                      (mem:BLK (match_operand:P 5 "register_operand" "1")))
-         (const_int 0)))
+         (reg:CC FLAGS_REG)))
    (use (match_operand:SI 3 "immediate_operand" "i"))
-   (use (reg:CC FLAGS_REG))
    (clobber (match_operand:P 0 "register_operand" "=S"))
    (clobber (match_operand:P 1 "register_operand" "=D"))
    (clobber (match_operand:P 2 "register_operand" "=c"))]


x86 cmpmemsi pattern - single compare

This patch introduces a cmpmemsi pattern that expands to a single
compare insn sequence, involving one bswapped load from each input
mem block.  It disregards alignment entirely, leaving it up to the
CPU to deal with it.


for  gcc/ChangeLog

        * config/i386/i386.md (cmpmemsi): New pattern.
---
 gcc/config/i386/i386.md |  114 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 2b7469991d837..b72b94a1fd51d 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16966,6 +16966,120 @@
          (const_string "*")))
    (set_attr "mode" "QI")])
 
+(define_expand "cmpmemsi"
+  [(set (match_operand:SI 0 "register_operand")
+       (compare:SI (match_operand:BLK 1 "general_operand")
+                   (match_operand:BLK 2 "general_operand")))
+   (use (match_operand 3 "immediate_operand"))
+   (use (match_operand 4 "immediate_operand"))]
+  ""
+{
+  rtx op1, op2, tmp;
+
+  if (!CONST_INT_P (operands[3]))
+    FAIL;
+
+  if (optimize_insn_for_size_p () && !TARGET_INLINE_ALL_STRINGOPS)
+    FAIL;
+
+  switch (INTVAL (operands[3]))
+    {
+    case 0:
+      emit_move_insn (operands[0], const0_rtx);
+      DONE;
+
+    default:
+      FAIL;
+
+    case 8:
+      if (!TARGET_64BIT)
+        FAIL;
+
+      op1 = gen_rtx_MEM (DImode, XEXP (operands[1], 0));
+      MEM_COPY_ATTRIBUTES (op1, operands[1]);
+
+      tmp = gen_reg_rtx (DImode);
+      emit_insn (gen_bswapdi2 (tmp, op1));
+      op1 = tmp;
+
+      op2 = gen_rtx_MEM (DImode, XEXP (operands[2], 0));
+      MEM_COPY_ATTRIBUTES (op2, operands[2]);
+
+      tmp = gen_reg_rtx (DImode);
+      emit_insn (gen_bswapdi2 (tmp, op2));
+      op2 = tmp;
+
+      emit_insn (gen_cmp_1 (DImode, op1, op2));
+
+      tmp = gen_lowpart (QImode, operands[0]);
+      emit_insn (gen_cmpintqi (tmp));
+      emit_move_insn (operands[0], gen_rtx_SIGN_EXTEND (SImode, tmp));
+      DONE;
+
+    case 4:
+      op1 = gen_rtx_MEM (SImode, XEXP (operands[1], 0));
+      MEM_COPY_ATTRIBUTES (op1, operands[1]);
+
+      tmp = gen_reg_rtx (SImode);
+      emit_insn (gen_bswapsi2 (tmp, op1));
+      op1 = tmp;
+
+      op2 = gen_rtx_MEM (SImode, XEXP (operands[2], 0));
+      MEM_COPY_ATTRIBUTES (op2, operands[2]);
+
+      tmp = gen_reg_rtx (SImode);
+      emit_insn (gen_bswapsi2 (tmp, op2));
+      op2 = tmp;
+
+      emit_insn (gen_cmp_1 (SImode, op1, op2));
+
+      tmp = gen_lowpart (QImode, operands[0]);
+      emit_insn (gen_cmpintqi (tmp));
+      emit_move_insn (operands[0], gen_rtx_SIGN_EXTEND (SImode, tmp));
+      DONE;
+
+    case 2:
+      op1 = gen_rtx_MEM (HImode, XEXP (operands[1], 0));
+      MEM_COPY_ATTRIBUTES (op1, operands[1]);
+
+      tmp = gen_reg_rtx (SImode);
+      emit_insn (gen_zero_extendhisi2 (tmp, op1));
+      emit_insn (gen_bswaphi_lowpart (gen_lowpart (HImode, tmp)));
+      op1 = tmp;
+
+      op2 = gen_rtx_MEM (HImode, XEXP (operands[2], 0));
+      MEM_COPY_ATTRIBUTES (op2, operands[2]);
+
+      tmp = gen_reg_rtx (SImode);
+      emit_insn (gen_zero_extendhisi2 (tmp, op2));
+      emit_insn (gen_bswaphi_lowpart (gen_lowpart (HImode, tmp)));
+      op2 = tmp;
+
+      emit_insn (gen_sub3_insn (operands[0], op1, op2));
+      DONE;
+
+    case 1:
+      op1 = gen_rtx_MEM (QImode, XEXP (operands[1], 0));
+      MEM_COPY_ATTRIBUTES (op1, operands[1]);
+
+      tmp = gen_reg_rtx (SImode);
+      emit_insn (gen_zero_extendqisi2 (tmp, op1));
+      op1 = tmp;
+
+      op2 = gen_rtx_MEM (QImode, XEXP (operands[2], 0));
+      MEM_COPY_ATTRIBUTES (op2, operands[2]);
+
+      tmp = gen_reg_rtx (SImode);
+      emit_insn (gen_zero_extendqisi2 (tmp, op2));
+      op2 = tmp;
+
+      emit_insn (gen_sub3_insn (operands[0], op1, op2));
+      DONE;
+    }
+
+  FAIL;
+})
+
 (define_expand "cmpstrnsi"
   [(set (match_operand:SI 0 "register_operand")
        (compare:SI (match_operand:BLK 1 "general_operand")


extend x86 cmpmemsi to use loops

This patch extends the cmpmemsi expander introduced in the previous
patch to handle lengths that span multiple words: the expander loops
over the length, emitting one word compare fragment at a time (plus
sub-word compares for any tail bytes), with early-exit branches
between fragments.


for  gcc/ChangeLog

        * config/i386/i386.md (cmpmemsi): Expand more than one
        fragment compare sequence depending on optimization level.
        (subcmpsi3): New expand pattern.
---
 gcc/config/i386/i386.md |  204 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 154 insertions(+), 50 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index b72b94a1fd51d..088a591dd5c17 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16974,112 +16974,216 @@
    (use (match_operand 4 "immediate_operand"))]
   ""
 {
-  rtx op1, op2, tmp;
-
   if (!CONST_INT_P (operands[3]))
     FAIL;
 
+  unsigned HOST_WIDE_INT todo = UINTVAL (operands[3]);
+
+  /* Balance size expansion with optimization level.  This will inline
+     memcmp of up to 4 bytes or 1 word at -O1, 4 words or 1 word plus
+     3 compares at -O2, and 7 words or 4 words plus 3 compares at -O3.
+     These are not so much related with the combinations that make
+     individual memcmp calls faster, but with the significant extra
+     code cache use that each additional sequence of loads, byte
+     swapping and compare incurs.  */
+
+  HOST_WIDE_INT size = (TARGET_64BIT ? todo / 8 + !!(todo & 4) : todo / 4);
+  if (size)
+    size++;
+  size += !!(todo & 1) + !!(todo & 2);
+  if (size > 1)
+    size++;
+  if (size > optimize * 3)
+    FAIL;
+
   if (optimize_insn_for_size_p () && !TARGET_INLINE_ALL_STRINGOPS)
     FAIL;
 
-  switch (INTVAL (operands[3]))
+  if (!todo)
     {
-    case 0:
       emit_move_insn (operands[0], const0_rtx);
       DONE;
+    }
 
-    default:
-      FAIL;
-
-    case 8:
-      if (!TARGET_64BIT)
-        FAIL;
-
-      op1 = gen_rtx_MEM (DImode, XEXP (operands[1], 0));
-      MEM_COPY_ATTRIBUTES (op1, operands[1]);
-
-      tmp = gen_reg_rtx (DImode);
-      emit_insn (gen_bswapdi2 (tmp, op1));
-      op1 = tmp;
-
-      op2 = gen_rtx_MEM (DImode, XEXP (operands[2], 0));
-      MEM_COPY_ATTRIBUTES (op2, operands[2]);
-
-      tmp = gen_reg_rtx (DImode);
-      emit_insn (gen_bswapdi2 (tmp, op2));
-      op2 = tmp;
+  rtx tmpout = operands[0];
+  if (reg_overlap_mentioned_p (operands[0], XEXP (operands[1], 0))
+      || reg_overlap_mentioned_p (operands[0], XEXP (operands[2], 0)))
+    tmpout = gen_reg_rtx (SImode);
 
-      emit_insn (gen_cmp_1 (DImode, op1, op2));
+  rtx_code_label *labnz = 0, *labfv = 0;
+  unsigned HOST_WIDE_INT done = 0;
+  bool needcmpint = false;
 
-      tmp = gen_lowpart (QImode, operands[0]);
-      emit_insn (gen_cmpintqi (tmp));
-      emit_move_insn (operands[0], gen_rtx_SIGN_EXTEND (SImode, tmp));
-      DONE;
+  if (TARGET_64BIT)
+    while (todo >= 8)
+      {
+        rtx op1 = gen_rtx_MEM (DImode, XEXP (operands[1], 0));
+       MEM_COPY_ATTRIBUTES (op1, operands[1]);
+       if (done)
+          op1 = offset_address (op1, GEN_INT (done), 8);
+
+       rtx tmp = gen_reg_rtx (DImode);
+       emit_insn (gen_bswapdi2 (tmp, op1));
+       op1 = tmp;
+
+       rtx op2 = gen_rtx_MEM (DImode, XEXP (operands[2], 0));
+       MEM_COPY_ATTRIBUTES (op2, operands[2]);
+       if (done)
+          op2 = offset_address (op2, GEN_INT (done), 8);
+
+       tmp = gen_reg_rtx (DImode);
+       emit_insn (gen_bswapdi2 (tmp, op2));
+       op2 = tmp;
+
+       emit_insn (gen_cmp_1 (DImode, op1, op2));
+       needcmpint = true;
+
+       done += 8;
+       todo -= 8;
+       if (todo)
+          {
+           if (!labnz)
+             labnz = gen_label_rtx ();
+           LABEL_NUSES (labnz)++;
+           ix86_expand_branch (NE, gen_rtx_REG (CCmode, FLAGS_REG),
+                               const0_rtx, labnz);
+         }
+      }
 
-    case 4:
-      op1 = gen_rtx_MEM (SImode, XEXP (operands[1], 0));
+  while (todo >= 4)
+    {
+      rtx op1 = gen_rtx_MEM (SImode, XEXP (operands[1], 0));
       MEM_COPY_ATTRIBUTES (op1, operands[1]);
+      if (done)
+        op1 = offset_address (op1, GEN_INT (done), 4);
 
-      tmp = gen_reg_rtx (SImode);
+      rtx tmp = gen_reg_rtx (SImode);
       emit_insn (gen_bswapsi2 (tmp, op1));
       op1 = tmp;
 
-      op2 = gen_rtx_MEM (SImode, XEXP (operands[2], 0));
+      rtx op2 = gen_rtx_MEM (SImode, XEXP (operands[2], 0));
       MEM_COPY_ATTRIBUTES (op2, operands[2]);
+      if (done)
+        op2 = offset_address (op2, GEN_INT (done), 4);
 
       tmp = gen_reg_rtx (SImode);
       emit_insn (gen_bswapsi2 (tmp, op2));
       op2 = tmp;
 
       emit_insn (gen_cmp_1 (SImode, op1, op2));
+      needcmpint = true;
 
-      tmp = gen_lowpart (QImode, operands[0]);
-      emit_insn (gen_cmpintqi (tmp));
-      emit_move_insn (operands[0], gen_rtx_SIGN_EXTEND (SImode, tmp));
-      DONE;
+      done += 4;
+      todo -= 4;
+      if (todo)
+        {
+         if (!labnz)
+           labnz = gen_label_rtx ();
+         LABEL_NUSES (labnz)++;
+         ix86_expand_branch (NE, gen_rtx_REG (CCmode, FLAGS_REG),
+                             const0_rtx, labnz);
+        }
+    }
 
-    case 2:
-      op1 = gen_rtx_MEM (HImode, XEXP (operands[1], 0));
+  if (todo >= 2)
+    {
+      rtx op1 = gen_rtx_MEM (HImode, XEXP (operands[1], 0));
       MEM_COPY_ATTRIBUTES (op1, operands[1]);
+      if (done)
+        op1 = offset_address (op1, GEN_INT (done), 4);
 
-      tmp = gen_reg_rtx (SImode);
+      rtx tmp = gen_reg_rtx (SImode);
       emit_insn (gen_zero_extendhisi2 (tmp, op1));
       emit_insn (gen_bswaphi_lowpart (gen_lowpart (HImode, tmp)));
       op1 = tmp;
 
-      op2 = gen_rtx_MEM (HImode, XEXP (operands[2], 0));
+      rtx op2 = gen_rtx_MEM (HImode, XEXP (operands[2], 0));
       MEM_COPY_ATTRIBUTES (op2, operands[2]);
+      if (done)
+        op2 = offset_address (op2, GEN_INT (done), 4);
 
       tmp = gen_reg_rtx (SImode);
       emit_insn (gen_zero_extendhisi2 (tmp, op2));
       emit_insn (gen_bswaphi_lowpart (gen_lowpart (HImode, tmp)));
       op2 = tmp;
 
-      emit_insn (gen_sub3_insn (operands[0], op1, op2));
-      DONE;
+      if (needcmpint)
+        emit_insn (gen_cmp_1 (SImode, op1, op2));
+      else
+        emit_insn (gen_subcmpsi3 (tmpout, op1, op2));
 
-    case 1:
-      op1 = gen_rtx_MEM (QImode, XEXP (operands[1], 0));
+      done += 2;
+      todo -= 2;
+      if (todo)
+        {
+         rtx_code_label *lab = labnz;
+         if (!needcmpint)
+           lab = labfv = gen_label_rtx ();
+         LABEL_NUSES (lab)++;
+         ix86_expand_branch (NE, gen_rtx_REG (CCmode, FLAGS_REG),
+                             const0_rtx, lab);
+        }
+    }
+
+  if (todo >= 1)
+    {
+      rtx op1 = gen_rtx_MEM (QImode, XEXP (operands[1], 0));
       MEM_COPY_ATTRIBUTES (op1, operands[1]);
+      if (done)
+        op1 = offset_address (op1, GEN_INT (done), 2);
 
-      tmp = gen_reg_rtx (SImode);
+      rtx tmp = gen_reg_rtx (SImode);
       emit_insn (gen_zero_extendqisi2 (tmp, op1));
       op1 = tmp;
 
-      op2 = gen_rtx_MEM (QImode, XEXP (operands[2], 0));
+      rtx op2 = gen_rtx_MEM (QImode, XEXP (operands[2], 0));
       MEM_COPY_ATTRIBUTES (op2, operands[2]);
+      if (done)
+        op2 = offset_address (op2, GEN_INT (done), 2);
 
       tmp = gen_reg_rtx (SImode);
       emit_insn (gen_zero_extendqisi2 (tmp, op2));
       op2 = tmp;
 
-      emit_insn (gen_sub3_insn (operands[0], op1, op2));
-      DONE;
+      if (needcmpint)
+        emit_insn (gen_cmp_1 (SImode, op1, op2));
+      else
+        emit_insn (gen_subcmpsi3 (tmpout, op1, op2));
+
+      done += 1;
+      todo -= 1;
+    }
+  gcc_assert (!todo);
+
+  if (labnz)
+    emit_label (labnz);
+
+  if (needcmpint)
+    {
+      rtx tmp = gen_lowpart (QImode, tmpout);
+      emit_insn (gen_cmpintqi (tmp));
+      emit_move_insn (tmpout, gen_rtx_SIGN_EXTEND (SImode, tmp));
     }
 
-  FAIL;
+  if (labfv)
+    emit_label (labfv);
+
+  if (tmpout != operands[0])
+    emit_move_insn (operands[0], tmpout);
+
+  DONE;
 })
 
+;; Expand a "*sub<mode>_2" pattern with mode=SI.
+(define_expand "subcmpsi3"
+  [(parallel [(set (reg:CC FLAGS_REG)
+                  (compare:CC
+                    (match_operand:SI 1 "register_operand")
+                    (match_operand:SI 2 "register_operand")))
+             (set (match_operand:SI 0 "register_operand")
+                  (minus:SI (match_dup 1) (match_dup 2)))])]
+  "")
+
 (define_expand "cmpstrnsi"
   [(set (match_operand:SI 0 "register_operand")
        (compare:SI (match_operand:BLK 1 "general_operand")


DO NOT USE - FTR only - cmpsb-based cmpmemsi pattern for x86

I include this just for the record, in case someone wishes to compare
the performance of memcmp implemented as 'repz cmpsb' (the same
approach used for strncmp) with the implementations in glibc or in
the patchset proposed above.

---
 gcc/config/i386/i386.md |   56 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 2b7469991d837..cd7b974c7e33f 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16966,6 +16966,62 @@
          (const_string "*")))
    (set_attr "mode" "QI")])
 
+(define_expand "cmpmemsi"
+  [(set (match_operand:SI 0 "register_operand")
+       (compare:SI (match_operand:BLK 1 "general_operand")
+                   (match_operand:BLK 2 "general_operand")))
+   (use (match_operand 3 "general_operand"))
+   (use (match_operand 4 "immediate_operand"))]
+  ""
+{
+  rtx addr1, addr2, countreg, countout, align, out;
+
+  if (optimize_insn_for_size_p () && !TARGET_INLINE_ALL_STRINGOPS)
+    FAIL;
+
+  /* Can't use this if the user has appropriated ecx, esi or edi.  */
+  if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
+    FAIL;
+
+  addr1 = copy_addr_to_reg (XEXP (operands[1], 0));
+  addr2 = copy_addr_to_reg (XEXP (operands[2], 0));
+  if (addr1 != XEXP (operands[1], 0))
+    operands[1] = replace_equiv_address_nv (operands[1], addr1);
+  if (addr2 != XEXP (operands[2], 0))
+    operands[2] = replace_equiv_address_nv (operands[2], addr2);
+
+  countreg = ix86_zero_extend_to_Pmode (operands[3]);
+  countout = gen_reg_rtx (Pmode);
+
+  /* %%% Iff we are testing strict equality, we can use known alignment
+     to good advantage.  This may be possible with combine, particularly
+     once cc0 is dead.  */
+  align = operands[4];
+
+  if (CONST_INT_P (operands[3]))
+    {
+      if (operands[3] == const0_rtx)
+       {
+         emit_move_insn (operands[0], const0_rtx);
+         DONE;
+       }
+      emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, countout, align,
+                                    operands[1], operands[2], countreg));
+    }
+  else
+    {
+      emit_insn (gen_cmp_1 (Pmode, countreg, countreg));
+      emit_insn (gen_cmpstrnqi_1 (addr1, addr2, countout, align,
+                                 operands[1], operands[2], countreg));
+    }
+
+  out = gen_lowpart (QImode, operands[0]);
+  emit_insn (gen_cmpintqi (out));
+  emit_move_insn (operands[0], gen_rtx_SIGN_EXTEND (SImode, out));
+
+  DONE;
+})
+
 (define_expand "cmpstrnsi"
   [(set (match_operand:SI 0 "register_operand")
        (compare:SI (match_operand:BLK 1 "general_operand")


-- 
Alexandre Oliva, freedom fighter  he/him   https://FSFLA.org/blogs/lxo
Be the change, be Free!       FSF.org & FSF Latin America board member
GNU Toolchain Engineer                        Free Software Evangelist
Hay que enGNUrecerse, pero sin perder la terGNUra jamás - Che GNUevara
