https://gcc.gnu.org/g:20a5b4824993ae1c99f3b965c5e07bbd2c64b2ce
commit r15-3037-g20a5b4824993ae1c99f3b965c5e07bbd2c64b2ce Author: Alexander Monakov <amona...@ispras.ru> Date: Tue Aug 6 09:47:23 2024 +0300 libcpp: replace SSE4.2 helper with an SSSE3 one Since the characters we are searching for (CR, LF, '\', '?') all have distinct ASCII codes mod 16, PSHUFB can help match them all at once. Directly use the new helper if __SSSE3__ is defined. It makes the other helpers unused, so mark them inline to prevent warnings. Rewrite and simplify init_vectorized_lexer. libcpp/ChangeLog: * config.in: Regenerate. * configure: Regenerate. * configure.ac: Check for SSSE3 instead of SSE4.2. * files.cc (read_file_guts): Bump padding to 64 if HAVE_SSSE3. * lex.cc (search_line_acc_char): Mark inline, not "unused". (search_line_sse2): Mark inline. (search_line_sse42): Replace with... (search_line_ssse3): ... this new function. Adjust the use... (init_vectorized_lexer): ... here. Simplify. Diff: --- libcpp/config.in | 4 +- libcpp/configure | 4 +- libcpp/configure.ac | 6 +-- libcpp/files.cc | 19 ++++--- libcpp/lex.cc | 150 +++++++++++++++++++--------------------------------- 5 files changed, 73 insertions(+), 110 deletions(-) diff --git a/libcpp/config.in b/libcpp/config.in index 253ef03a3de..b2e2f4e842c 100644 --- a/libcpp/config.in +++ b/libcpp/config.in @@ -210,8 +210,8 @@ /* Define to 1 if you have the `putc_unlocked' function. */ #undef HAVE_PUTC_UNLOCKED -/* Define to 1 if you can assemble SSE4 insns. */ -#undef HAVE_SSE4 +/* Define to 1 if you can assemble SSSE3 insns. */ +#undef HAVE_SSSE3 /* Define to 1 if you have the <stddef.h> header file. */ #undef HAVE_STDDEF_H diff --git a/libcpp/configure b/libcpp/configure index 32d6aaa3069..1391081ba09 100755 --- a/libcpp/configure +++ b/libcpp/configure @@ -9140,14 +9140,14 @@ case $target in int main () { -asm ("pcmpestri %0, %%xmm0, %%xmm1" : : "i"(0)) +asm ("pshufb %xmm0, %xmm1") ; return 0; } _ACEOF if ac_fn_c_try_compile "$LINENO"; then : -$as_echo "#define HAVE_SSE4 1" >>confdefs.h +$as_echo "#define HAVE_SSSE3 1" >>confdefs.h fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext diff --git a/libcpp/configure.ac b/libcpp/configure.ac index b883fec776f..981f97c4abd 100644 --- a/libcpp/configure.ac +++ b/libcpp/configure.ac @@ -197,9 +197,9 @@ fi case $target in i?86-* | x86_64-*) - AC_TRY_COMPILE([], [asm ("pcmpestri %0, %%xmm0, %%xmm1" : : "i"(0))], - [AC_DEFINE([HAVE_SSE4], [1], - [Define to 1 if you can assemble SSE4 insns.])]) + AC_TRY_COMPILE([], [asm ("pshufb %xmm0, %xmm1")], + [AC_DEFINE([HAVE_SSSE3], [1], + [Define to 1 if you can assemble SSSE3 insns.])]) esac # Enable --enable-host-shared. diff --git a/libcpp/files.cc b/libcpp/files.cc index 78f56e30bde..3775091d259 100644 --- a/libcpp/files.cc +++ b/libcpp/files.cc @@ -693,7 +693,7 @@ static bool read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, const char *input_charset) { - ssize_t size, total, count; + ssize_t size, pad, total, count; uchar *buf; bool regular; @@ -732,11 +732,14 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, the majority of C source files. */ size = 8 * 1024; - /* The + 16 here is space for the final '\n' and 15 bytes of padding, - used to quiet warnings from valgrind or Address Sanitizer, when the - optimized lexer accesses aligned 16-byte memory chunks, including - the bytes after the malloced, area, and stops lexing on '\n'. */ - buf = XNEWVEC (uchar, size + 16); +#ifdef HAVE_SSSE3 + pad = 64; +#else + pad = 16; +#endif + /* The '+ PAD' here is space for the final '\n' and PAD-1 bytes of padding, + allowing search_line_fast to use (possibly misaligned) vector loads. */ + buf = XNEWVEC (uchar, size + pad); total = 0; while ((count = read (file->fd, buf + total, size - total)) > 0) { @@ -747,7 +750,7 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, if (regular) break; size *= 2; - buf = XRESIZEVEC (uchar, buf, size + 16); + buf = XRESIZEVEC (uchar, buf, size + pad); } } @@ -765,7 +768,7 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, file->buffer = _cpp_convert_input (pfile, input_charset, - buf, size + 16, total, + buf, size + pad, total, &file->buffer_start, &file->st.st_size); file->buffer_valid = file->buffer; diff --git a/libcpp/lex.cc b/libcpp/lex.cc index 1591dcdf151..daf2c770bc3 100644 --- a/libcpp/lex.cc +++ b/libcpp/lex.cc @@ -225,10 +225,7 @@ acc_char_index (word_type cmp ATTRIBUTE_UNUSED, and branches without increasing the number of arithmetic operations. It's almost certainly going to be a win with 64-bit word size. */ -static const uchar * search_line_acc_char (const uchar *, const uchar *) - ATTRIBUTE_UNUSED; - -static const uchar * +static inline const uchar * search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) { const word_type repl_nl = acc_char_replicate ('\n'); @@ -293,7 +290,7 @@ static const char repl_chars[4][16] __attribute__((aligned(16))) = { /* A version of the fast scanner using SSE2 vectorized byte compare insns. */ -static const uchar * +static inline const uchar * #ifndef __SSE2__ __attribute__((__target__("sse2"))) #endif @@ -344,120 +341,83 @@ search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) return (const uchar *)p + found; } -#ifdef HAVE_SSE4 -/* A version of the fast scanner using SSE 4.2 vectorized string insns. */ +#ifdef HAVE_SSSE3 +/* A version of the fast scanner using SSSE3 shuffle (PSHUFB) insns. */ -static const uchar * -#ifndef __SSE4_2__ -__attribute__((__target__("sse4.2"))) +static inline const uchar * +#ifndef __SSSE3__ +__attribute__((__target__("ssse3"))) #endif -search_line_sse42 (const uchar *s, const uchar *end) +search_line_ssse3 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) { typedef char v16qi __attribute__ ((__vector_size__ (16))); - static const v16qi search = { '\n', '\r', '?', '\\' }; - - uintptr_t si = (uintptr_t)s; - uintptr_t index; - - /* Check for unaligned input. */ - if (si & 15) - { - v16qi sv; - - if (__builtin_expect (end - s < 16, 0) - && __builtin_expect ((si & 0xfff) > 0xff0, 0)) - { - /* There are less than 16 bytes left in the buffer, and less - than 16 bytes left on the page. Reading 16 bytes at this - point might generate a spurious page fault. Defer to the - SSE2 implementation, which already handles alignment. */ - return search_line_sse2 (s, end); - } - - /* ??? The builtin doesn't understand that the PCMPESTRI read from - memory need not be aligned. */ - sv = __builtin_ia32_loaddqu ((const char *) s); - index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0); - - if (__builtin_expect (index < 16, 0)) - goto found; - - /* Advance the pointer to an aligned address. We will re-scan a - few bytes, but we no longer need care for reading past the - end of a page, since we're guaranteed a match. */ - s = (const uchar *)((si + 15) & -16); - } - - /* Main loop, processing 16 bytes at a time. */ -#ifdef __GCC_ASM_FLAG_OUTPUTS__ - while (1) + typedef v16qi v16qi_u __attribute__ ((__aligned__ (1))); + /* Helper vector for pshufb-based matching: + each character C we're searching for is at position (C % 16). */ + v16qi lut = { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?' }; + static_assert ('\n' == 10 && '\r' == 13 && '\\' == 92 && '?' == 63); + + v16qi d1, d2, t1, t2; + /* Unaligned loads. Reading beyond the final newline is safe, + since files.cc:read_file_guts pads the allocation. */ + d1 = *(const v16qi_u *)s; + d2 = *(const v16qi_u *)(s + 16); + unsigned m1, m2, found; + /* Process two 16-byte chunks per iteration. */ + do { - char f; - - /* By using inline assembly instead of the builtin, - we can use the result, as well as the flags set. */ - __asm ("%vpcmpestri\t$0, %2, %3" - : "=c"(index), "=@ccc"(f) - : "m"(*s), "x"(search), "a"(4), "d"(16)); - if (f) - break; - - s += 16; + t1 = __builtin_ia32_pshufb128 (lut, d1); + t2 = __builtin_ia32_pshufb128 (lut, d2); + m1 = __builtin_ia32_pmovmskb128 (t1 == d1); + m2 = __builtin_ia32_pmovmskb128 (t2 == d2); + s += 32; + d1 = *(const v16qi_u *)s; + d2 = *(const v16qi_u *)(s + 16); + found = m1 + (m2 << 16); } -#else - s -= 16; - /* By doing the whole loop in inline assembly, - we can make proper use of the flags set. */ - __asm ( ".balign 16\n" - "0: add $16, %1\n" - " %vpcmpestri\t$0, (%1), %2\n" - " jnc 0b" - : "=&c"(index), "+r"(s) - : "x"(search), "a"(4), "d"(16)); -#endif - - found: - return s + index; + while (!found); + /* Prefer to compute 's - 32' here, not spend an extra instruction + to make a copy of the previous value of 's' in the loop. */ + __asm__ ("" : "+r"(s)); + return s - 32 + __builtin_ctz (found); } #else -/* Work around out-dated assemblers without sse4 support. */ -#define search_line_sse42 search_line_sse2 +/* Work around out-dated assemblers without SSSE3 support. */ +#define search_line_ssse3 search_line_sse2 #endif +#ifdef __SSSE3__ +/* No need for CPU probing, just use the best available variant. */ +#define search_line_fast search_line_ssse3 +#else /* Check the CPU capabilities. */ #include "../gcc/config/i386/cpuid.h" typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *); -static search_line_fast_type search_line_fast; +static search_line_fast_type search_line_fast +#if defined(__SSE2__) + = search_line_sse2; +#else + = search_line_acc_char; +#endif #define HAVE_init_vectorized_lexer 1 static inline void init_vectorized_lexer (void) { - unsigned dummy, ecx = 0, edx = 0; - search_line_fast_type impl = search_line_acc_char; - int minimum = 0; - -#if defined(__SSE4_2__) - minimum = 3; -#elif defined(__SSE2__) - minimum = 2; -#endif + unsigned ax, bx, cx, dx; - if (minimum == 3) - impl = search_line_sse42; - else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2) - { - if (minimum == 3 || (ecx & bit_SSE4_2)) - impl = search_line_sse42; - else if (minimum == 2 || (edx & bit_SSE2)) - impl = search_line_sse2; - } + if (!__get_cpuid (1, &ax, &bx, &cx, &dx)) + return; - search_line_fast = impl; + if (cx & bit_SSSE3) + search_line_fast = search_line_ssse3; + else if (dx & bit_SSE2) + search_line_fast = search_line_sse2; } +#endif #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)