This removes the hotspot in parse_file_seq's call to strpbrk, by using SSE2 vector instructions. The resulting speedup on QEMU's noop build is around 6% (15.4 seconds to 14.5).
The code is roughly based on GCC's similar optimizations in the lexer. * read-opt.c: New. * read.c (parse_file_seq): Use needs_glob instead of strpbrk. * Makefile.am (make_SOURCES): Add read-opt.c. * Makefile.in: Regenerate. --- (I also had an SSE4.2 version that gave another 1-2% improvement, but it fails some tests and I also don't feel like adding a lot of code to detect the instruction set. All x86-64 machines have SSE2, so this provides the biggest bang for the buck). Makefile.am | 6 ++--- Makefile.in | 19 ++++++------- makeint.h | 1 + read-opt.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ read.c | 2 +- 5 files changed, 103 insertions(+), 13 deletions(-) create mode 100644 read-opt.c diff --git a/Makefile.am b/Makefile.am index c88c465..ef5e1f9 100644 --- a/Makefile.am +++ b/Makefile.am @@ -44,9 +44,9 @@ endif make_SOURCES = ar.c arscan.c commands.c default.c dir.c expand.c file.c \ function.c getopt.c getopt1.c guile.c implicit.c job.c load.c \ - loadapi.c main.c misc.c $(ossrc) output.c read.c remake.c \ - rule.c signame.c strcache.c variable.c version.c vpath.c \ - hash.c $(remote) + loadapi.c main.c misc.c $(ossrc) output.c read.c read-opt.c \ + remake.c rule.c signame.c strcache.c variable.c version.c \ + vpath.c hash.c $(remote) EXTRA_make_SOURCES = vmsjobs.c remote-stub.c remote-cstms.c diff --git a/Makefile.in b/Makefile.in index 67b7616..52c854c 100644 --- a/Makefile.in +++ b/Makefile.in @@ -144,8 +144,8 @@ loadavg_DEPENDENCIES = am__make_SOURCES_DIST = ar.c arscan.c commands.c default.c dir.c \ expand.c file.c function.c getopt.c getopt1.c guile.c \ implicit.c job.c load.c loadapi.c main.c misc.c posixos.c \ - output.c read.c remake.c rule.c signame.c strcache.c \ - variable.c version.c vpath.c hash.c remote-stub.c \ + output.c read.c read-opt.c remake.c rule.c signame.c \ + strcache.c variable.c version.c vpath.c hash.c remote-stub.c \ remote-cstms.c @WINDOWSENV_FALSE@am__objects_1 = posixos.$(OBJEXT) 
@USE_CUSTOMS_FALSE@am__objects_2 = remote-stub.$(OBJEXT) @@ -156,10 +156,10 @@ am_make_OBJECTS = ar.$(OBJEXT) arscan.$(OBJEXT) commands.$(OBJEXT) \ getopt1.$(OBJEXT) guile.$(OBJEXT) implicit.$(OBJEXT) \ job.$(OBJEXT) load.$(OBJEXT) loadapi.$(OBJEXT) main.$(OBJEXT) \ misc.$(OBJEXT) $(am__objects_1) output.$(OBJEXT) \ - read.$(OBJEXT) remake.$(OBJEXT) rule.$(OBJEXT) \ - signame.$(OBJEXT) strcache.$(OBJEXT) variable.$(OBJEXT) \ - version.$(OBJEXT) vpath.$(OBJEXT) hash.$(OBJEXT) \ - $(am__objects_2) + read.$(OBJEXT) read-opt.$(OBJEXT) remake.$(OBJEXT) \ + rule.$(OBJEXT) signame.$(OBJEXT) strcache.$(OBJEXT) \ + variable.$(OBJEXT) version.$(OBJEXT) vpath.$(OBJEXT) \ + hash.$(OBJEXT) $(am__objects_2) make_OBJECTS = $(am_make_OBJECTS) am__DEPENDENCIES_1 = @WINDOWSENV_TRUE@am__DEPENDENCIES_2 = $(am__DEPENDENCIES_1) @@ -473,9 +473,9 @@ include_HEADERS = gnumake.h @USE_CUSTOMS_TRUE@remote = remote-cstms.c make_SOURCES = ar.c arscan.c commands.c default.c dir.c expand.c file.c \ function.c getopt.c getopt1.c guile.c implicit.c job.c load.c \ - loadapi.c main.c misc.c $(ossrc) output.c read.c remake.c \ - rule.c signame.c strcache.c variable.c version.c vpath.c \ - hash.c $(remote) + loadapi.c main.c misc.c $(ossrc) output.c read.c read-opt.c \ + remake.c rule.c signame.c strcache.c variable.c version.c \ + vpath.c hash.c $(remote) EXTRA_make_SOURCES = vmsjobs.c remote-stub.c remote-cstms.c noinst_HEADERS = commands.h dep.h filedef.h job.h makeint.h rule.h variable.h \ @@ -684,6 +684,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/misc.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/output.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/posixos.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/read-opt.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/read.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/remake.Po@am__quote@ @AMDEP_TRUE@@am__include@ 
/* Vectorized function for fast parsing of filenames for GNU Make.
Copyright (C) 2016 Free Software Foundation, Inc.
This file is part of GNU Make.

GNU Make is free software; you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

GNU Make is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
A PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.  */

/* read-opt.c: new file introduced by this patch.

   Companion changes carried by the same patch:
     - makeint.h gains the prototype  int needs_glob(const char *);
     - read.c (parse_file_seq) replaces
           strpbrk (name, "?*[") == NULL
       with
           !needs_glob (name)
       so needs_glob must return nonzero exactly when the string contains
       one of the characters '?', '*' or '['.  */

/* Pull in the project header when it is reachable; the guard only exists so
   this translation unit can also be compiled on its own.  */
#ifdef __has_include
# if __has_include("makeint.h")
#  include "makeint.h"
# endif
#endif

#include <stdint.h>
#include <string.h>

#ifdef __SSE2__
# include <emmintrin.h>

/* Return nonzero if the NUL-terminated string S contains any of the
   characters '?', '*' or '[', and zero otherwise.

   The string is scanned 16 bytes at a time using aligned SSE2 loads.  The
   first load starts at the 16-byte boundary at or below S, so it may cover
   bytes that precede the string; likewise the last load may cover bytes
   that follow the terminating NUL.  Both kinds of bytes are excluded from
   the result.  Aligned 16-byte loads never straddle a page boundary.  */
int
needs_glob (const char *s)
{
  const __m128i qmark = _mm_set1_epi8 ('?');
  const __m128i star = _mm_set1_epi8 ('*');
  const __m128i lbracket = _mm_set1_epi8 ('[');
  const __m128i zero = _mm_setzero_si128 ();

  const uintptr_t addr = (uintptr_t) s;
  const __m128i *p = (const __m128i *) (addr & ~(uintptr_t) 15);

  /* Bit I of MASK is set when byte I of the current block belongs to the
     string.  Only the first block can contain bytes that precede S.  */
  unsigned int mask = ~0u << (addr & 15);
  unsigned int found, nul;
  __m128i data, is_nul, hit;

  for (;;)
    {
      data = _mm_load_si128 (p);
      is_nul = _mm_cmpeq_epi8 (data, zero);
      hit = _mm_or_si128 (_mm_cmpeq_epi8 (data, qmark),
                          _mm_cmpeq_epi8 (data, star));
      hit = _mm_or_si128 (hit, _mm_cmpeq_epi8 (data, lbracket));
      hit = _mm_or_si128 (hit, is_nul);

      /* One movemask per iteration: stop on a glob character or a NUL.  */
      found = (unsigned int) _mm_movemask_epi8 (hit) & mask;
      if (found != 0)
        break;

      ++p;
      mask = ~0u;
    }

  /* NUL bytes that lie before the start of the string must be ignored,
     otherwise a preceding terminator would hide every match in the first
     block.  */
  nul = (unsigned int) _mm_movemask_epi8 (is_nul) & mask;

  /* Keep only the matches located strictly before the first NUL of the
     string; this also discards the NUL bits themselves.  */
  if (nul != 0)
    found &= (nul & -nul) - 1;

  return found != 0;
}

#else

/* Return nonzero if the NUL-terminated string S contains any of the
   characters '?', '*' or '[', and zero otherwise.  Portable version.  */
int
needs_glob (const char *s)
{
  return strpbrk (s, "?*[") != NULL;
}

#endif