This removes the hotspot in parse_file_seq's call to strpbrk, by using
SSE2 vector instructions. The resulting speedup on QEMU's noop build
is around 6% (15.4 seconds to 14.5).
The code is roughly based on GCC's similar optimizations in the lexer.
* read-opt.c: New.
* read.c (parse_file_seq): Use needs_glob instead of strpbrk.
* Makefile.am (make_SOURCES): Add read-opt.c.
* Makefile.in: Regenerate.
---
(I also had an SSE4.2 version that gave another 1-2% improvement,
but it fails some tests and I also don't feel like adding a lot
of code to detect the instruction set. All x86-64 machines have
SSE2, so this provides the biggest bang for the buck).
Makefile.am | 6 ++---
Makefile.in | 19 ++++++-------
makeint.h | 1 +
read-opt.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
read.c | 2 +-
5 files changed, 103 insertions(+), 13 deletions(-)
create mode 100644 read-opt.c
diff --git a/Makefile.am b/Makefile.am
index c88c465..ef5e1f9 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -44,9 +44,9 @@ endif
make_SOURCES = ar.c arscan.c commands.c default.c dir.c expand.c file.c \
function.c getopt.c getopt1.c guile.c implicit.c job.c load.c \
- loadapi.c main.c misc.c $(ossrc) output.c read.c remake.c \
- rule.c signame.c strcache.c variable.c version.c vpath.c \
- hash.c $(remote)
+ loadapi.c main.c misc.c $(ossrc) output.c read.c read-opt.c \
+ remake.c rule.c signame.c strcache.c variable.c version.c \
+ vpath.c hash.c $(remote)
EXTRA_make_SOURCES = vmsjobs.c remote-stub.c remote-cstms.c
diff --git a/Makefile.in b/Makefile.in
index 67b7616..52c854c 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -144,8 +144,8 @@ loadavg_DEPENDENCIES =
am__make_SOURCES_DIST = ar.c arscan.c commands.c default.c dir.c \
expand.c file.c function.c getopt.c getopt1.c guile.c \
implicit.c job.c load.c loadapi.c main.c misc.c posixos.c \
- output.c read.c remake.c rule.c signame.c strcache.c \
- variable.c version.c vpath.c hash.c remote-stub.c \
+ output.c read.c read-opt.c remake.c rule.c signame.c \
+ strcache.c variable.c version.c vpath.c hash.c remote-stub.c \
remote-cstms.c
@WINDOWSENV_FALSE@am__objects_1 = posixos.$(OBJEXT)
@USE_CUSTOMS_FALSE@am__objects_2 = remote-stub.$(OBJEXT)
@@ -156,10 +156,10 @@ am_make_OBJECTS = ar.$(OBJEXT) arscan.$(OBJEXT)
commands.$(OBJEXT) \
getopt1.$(OBJEXT) guile.$(OBJEXT) implicit.$(OBJEXT) \
job.$(OBJEXT) load.$(OBJEXT) loadapi.$(OBJEXT) main.$(OBJEXT) \
misc.$(OBJEXT) $(am__objects_1) output.$(OBJEXT) \
- read.$(OBJEXT) remake.$(OBJEXT) rule.$(OBJEXT) \
- signame.$(OBJEXT) strcache.$(OBJEXT) variable.$(OBJEXT) \
- version.$(OBJEXT) vpath.$(OBJEXT) hash.$(OBJEXT) \
- $(am__objects_2)
+ read.$(OBJEXT) read-opt.$(OBJEXT) remake.$(OBJEXT) \
+ rule.$(OBJEXT) signame.$(OBJEXT) strcache.$(OBJEXT) \
+ variable.$(OBJEXT) version.$(OBJEXT) vpath.$(OBJEXT) \
+ hash.$(OBJEXT) $(am__objects_2)
make_OBJECTS = $(am_make_OBJECTS)
am__DEPENDENCIES_1 =
@WINDOWSENV_TRUE@am__DEPENDENCIES_2 = $(am__DEPENDENCIES_1)
@@ -473,9 +473,9 @@ include_HEADERS = gnumake.h
@USE_CUSTOMS_TRUE@remote = remote-cstms.c
make_SOURCES = ar.c arscan.c commands.c default.c dir.c expand.c file.c \
function.c getopt.c getopt1.c guile.c implicit.c job.c load.c \
- loadapi.c main.c misc.c $(ossrc) output.c read.c remake.c \
- rule.c signame.c strcache.c variable.c version.c vpath.c \
- hash.c $(remote)
+ loadapi.c main.c misc.c $(ossrc) output.c read.c read-opt.c \
+ remake.c rule.c signame.c strcache.c variable.c version.c \
+ vpath.c hash.c $(remote)
EXTRA_make_SOURCES = vmsjobs.c remote-stub.c remote-cstms.c
noinst_HEADERS = commands.h dep.h filedef.h job.h makeint.h rule.h variable.h \
@@ -684,6 +684,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/misc.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/output.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/posixos.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/read-opt.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/read.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/remake.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/remote-cstms.Po@am__quote@
diff --git a/makeint.h b/makeint.h
index 8e0ae6c..ef66312 100644
--- a/makeint.h
+++ b/makeint.h
@@ -712,6 +712,7 @@ void unblock_remote_children (void);
int remote_kill (int id, int sig);
void print_variable_data_base (void);
void print_vpath_data_base (void);
+int needs_glob(const char *);
extern char *starting_directory;
extern unsigned int makelevel;
diff --git a/read-opt.c b/read-opt.c
new file mode 100644
index 0000000..6deb446
--- /dev/null
+++ b/read-opt.c
@@ -0,0 +1,88 @@
+/* Vectorized function for fast parsing of filenames for GNU Make.
+Copyright (C) 2016 Free Software Foundation, Inc.
+This file is part of GNU Make.
+
+GNU Make is free software; you can redistribute it and/or modify it under the
+terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+GNU Make is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include "makeint.h"
+
+#ifdef __SSE2__
+int needs_glob(const char *s)
+{
+  /* Return nonzero iff S contains '?', '*' or '[' before its NUL.  */
+  static const char repl_chars[4][16] __attribute__((aligned(16))) = {
+    { '?', '?', '?', '?', '?', '?', '?', '?',
+      '?', '?', '?', '?', '?', '?', '?', '?' },
+    { '*', '*', '*', '*', '*', '*', '*', '*',
+      '*', '*', '*', '*', '*', '*', '*', '*' },
+    { '[', '[', '[', '[', '[', '[', '[', '[',
+      '[', '[', '[', '[', '[', '[', '[', '[' },
+  };  /* Row 3 has no initializer, so it is all NUL bytes.  */
+  typedef char v16qi __attribute__ ((__vector_size__ (16)));
+
+  const v16qi repl_qm = *(const v16qi *)repl_chars[0];
+  const v16qi repl_st = *(const v16qi *)repl_chars[1];
+  const v16qi repl_br = *(const v16qi *)repl_chars[2];
+  const v16qi repl_nul = *(const v16qi *)repl_chars[3];
+
+  unsigned int misalign, found, mask, done;
+  const v16qi *p;
+  v16qi data, t, n;
+
+  /* Round S down to a 16-byte boundary and load that whole block.  */
+  misalign = (uintptr_t)s & 15;
+  p = (const v16qi *)((uintptr_t)s & -16);
+  data = *p;
+
+  /* Create a mask for the bytes that are valid within the first
+     16-byte block.  The idea here is that the AND with the mask
+     within the loop is "free", since we need some AND or TEST
+     insn in order to set the flags for the branch anyway.  */
+  mask = -1u << misalign;
+
+  /* Main loop processing 16 bytes at a time.  */
+  goto start;
+  do
+    {
+      data = *++p;
+      mask = -1;
+
+    start:
+      n = __builtin_ia32_pcmpeqb128(data, repl_nul);
+      t = __builtin_ia32_pcmpeqb128(data, repl_qm);
+      t |= __builtin_ia32_pcmpeqb128(data, repl_st);
+      t |= __builtin_ia32_pcmpeqb128(data, repl_br);
+      t |= n;
+      found = __builtin_ia32_pmovmskb128 (t);
+      found &= mask;
+    }
+  while (!found);
+
+  /* FOUND contains 1 in bits for which we matched a relevant
+     character or NUL.  DONE contains 1 in bits for which we matched
+     a NUL; mask it too, so a NUL in the bytes before S is ignored.  */
+  done = __builtin_ia32_pmovmskb128 (n) & mask;
+
+  /* Set to 1 the bit of the first NUL and every bit above it, so that
+     only matches located before the terminator survive in FOUND.  */
+  done |= -done;
+  found &= ~done;
+  return found > 0;
+}
+#else
+/* Portable fallback: return nonzero iff S contains '?', '*' or '['.  */
+int needs_glob(const char *s)
+{
+  return strpbrk (s, "?*[") != NULL;
+}
+#endif
diff --git a/read.c b/read.c
index b870aa8..0883100 100644
--- a/read.c
+++ b/read.c
@@ -3268,7 +3268,7 @@ parse_file_seq (char **stringp, unsigned int size, int
stopmap,
#endif /* !NO_ARCHIVES */
/* glob() is expensive: don't call it unless we need to. */
- if (NONE_SET (flags, PARSEFS_EXISTS) && strpbrk (name, "?*[") == NULL)
+ if (NONE_SET (flags, PARSEFS_EXISTS) && !needs_glob (name))
{
globme = 0;
i = 1;
--
2.7.4
_______________________________________________
Bug-make mailing list
[email protected]
https://lists.gnu.org/mailman/listinfo/bug-make