[gcc r15-628] tree-into-ssa: speed up sorting in prune_unused_phi_nodes [PR114480]
https://gcc.gnu.org/g:4b9e68a6f3b22800a7f12b58ef6b25e3b339bb3c commit r15-628-g4b9e68a6f3b22800a7f12b58ef6b25e3b339bb3c Author: Alexander Monakov Date: Wed May 15 16:23:17 2024 +0300 tree-into-ssa: speed up sorting in prune_unused_phi_nodes [PR114480] In PR 114480 we are hitting a case where tree-into-ssa scales quadratically due to prune_unused_phi_nodes doing O(N log N) work for N basic blocks, for each variable individually. Sorting the 'defs' array is especially costly. It is possible to assist gcc_qsort by laying out dfs_out entries in the reverse order in the 'defs' array, starting from its tail. This is not always a win (in fact it flips most of 7-element qsorts in this testcase from 9 comparisons (best case) to 15 (worst case)), but overall it helps on the testcase and on libstdc++ build. On the testcase we go from 1.28e9 comparator invocations to 1.05e9, on libstdc++ from 2.91e6 to 2.84e6. gcc/ChangeLog: PR c++/114480 * tree-into-ssa.cc (prune_unused_phi_nodes): Add dfs_out entries to the 'defs' array in the reverse order. 
Diff: --- gcc/tree-into-ssa.cc | 17 + 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/gcc/tree-into-ssa.cc b/gcc/tree-into-ssa.cc index 3732c269ca3d..5b367c358125 100644 --- a/gcc/tree-into-ssa.cc +++ b/gcc/tree-into-ssa.cc @@ -805,21 +805,22 @@ prune_unused_phi_nodes (bitmap phis, bitmap kills, bitmap uses) locate the nearest dominating def in logarithmic time by binary search.*/ bitmap_ior (to_remove, kills, phis); n_defs = bitmap_count_bits (to_remove); - defs = XNEWVEC (struct dom_dfsnum, 2 * n_defs + 1); + adef = 2 * n_defs + 1; + defs = XNEWVEC (struct dom_dfsnum, adef); defs[0].bb_index = 1; defs[0].dfs_num = 0; - adef = 1; + struct dom_dfsnum *head = defs + 1, *tail = defs + adef; EXECUTE_IF_SET_IN_BITMAP (to_remove, 0, i, bi) { def_bb = BASIC_BLOCK_FOR_FN (cfun, i); - defs[adef].bb_index = i; - defs[adef].dfs_num = bb_dom_dfs_in (CDI_DOMINATORS, def_bb); - defs[adef + 1].bb_index = i; - defs[adef + 1].dfs_num = bb_dom_dfs_out (CDI_DOMINATORS, def_bb); - adef += 2; + head->bb_index = i; + head->dfs_num = bb_dom_dfs_in (CDI_DOMINATORS, def_bb); + head++, tail--; + tail->bb_index = i; + tail->dfs_num = bb_dom_dfs_out (CDI_DOMINATORS, def_bb); } + gcc_checking_assert (head == tail); BITMAP_FREE (to_remove); - gcc_assert (adef == 2 * n_defs + 1); qsort (defs, adef, sizeof (struct dom_dfsnum), cmp_dfsnum); gcc_assert (defs[0].bb_index == 1);
[gcc r15-3037] libcpp: replace SSE4.2 helper with an SSSE3 one
https://gcc.gnu.org/g:20a5b4824993ae1c99f3b965c5e07bbd2c64b2ce commit r15-3037-g20a5b4824993ae1c99f3b965c5e07bbd2c64b2ce Author: Alexander Monakov Date: Tue Aug 6 09:47:23 2024 +0300 libcpp: replace SSE4.2 helper with an SSSE3 one Since the characters we are searching for (CR, LF, '\', '?') all have distinct ASCII codes mod 16, PSHUFB can help match them all at once. Directly use the new helper if __SSSE3__ is defined. It makes the other helpers unused, so mark them inline to prevent warnings. Rewrite and simplify init_vectorized_lexer. libcpp/ChangeLog: * config.in: Regenerate. * configure: Regenerate. * configure.ac: Check for SSSE3 instead of SSE4.2. * files.cc (read_file_guts): Bump padding to 64 if HAVE_SSSE3. * lex.cc (search_line_acc_char): Mark inline, not "unused". (search_line_sse2): Mark inline. (search_line_sse42): Replace with... (search_line_ssse3): ... this new function. Adjust the use... (init_vectorized_lexer): ... here. Simplify. Diff: --- libcpp/config.in| 4 +- libcpp/configure| 4 +- libcpp/configure.ac | 6 +-- libcpp/files.cc | 19 --- libcpp/lex.cc | 150 +++- 5 files changed, 73 insertions(+), 110 deletions(-) diff --git a/libcpp/config.in b/libcpp/config.in index 253ef03a3de..b2e2f4e842c 100644 --- a/libcpp/config.in +++ b/libcpp/config.in @@ -210,8 +210,8 @@ /* Define to 1 if you have the `putc_unlocked' function. */ #undef HAVE_PUTC_UNLOCKED -/* Define to 1 if you can assemble SSE4 insns. */ -#undef HAVE_SSE4 +/* Define to 1 if you can assemble SSSE3 insns. */ +#undef HAVE_SSSE3 /* Define to 1 if you have the <stddef.h> header file. 
*/ #undef HAVE_STDDEF_H diff --git a/libcpp/configure b/libcpp/configure index 32d6aaa3069..1391081ba09 100755 --- a/libcpp/configure +++ b/libcpp/configure @@ -9140,14 +9140,14 @@ case $target in int main () { -asm ("pcmpestri %0, %%xmm0, %%xmm1" : : "i"(0)) +asm ("pshufb %xmm0, %xmm1") ; return 0; } _ACEOF if ac_fn_c_try_compile "$LINENO"; then : -$as_echo "#define HAVE_SSE4 1" >>confdefs.h +$as_echo "#define HAVE_SSSE3 1" >>confdefs.h fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext diff --git a/libcpp/configure.ac b/libcpp/configure.ac index b883fec776f..981f97c4abd 100644 --- a/libcpp/configure.ac +++ b/libcpp/configure.ac @@ -197,9 +197,9 @@ fi case $target in i?86-* | x86_64-*) -AC_TRY_COMPILE([], [asm ("pcmpestri %0, %%xmm0, %%xmm1" : : "i"(0))], - [AC_DEFINE([HAVE_SSE4], [1], -[Define to 1 if you can assemble SSE4 insns.])]) +AC_TRY_COMPILE([], [asm ("pshufb %xmm0, %xmm1")], + [AC_DEFINE([HAVE_SSSE3], [1], +[Define to 1 if you can assemble SSSE3 insns.])]) esac # Enable --enable-host-shared. diff --git a/libcpp/files.cc b/libcpp/files.cc index 78f56e30bde..3775091d259 100644 --- a/libcpp/files.cc +++ b/libcpp/files.cc @@ -693,7 +693,7 @@ static bool read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, const char *input_charset) { - ssize_t size, total, count; + ssize_t size, pad, total, count; uchar *buf; bool regular; @@ -732,11 +732,14 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, the majority of C source files. */ size = 8 * 1024; - /* The + 16 here is space for the final '\n' and 15 bytes of padding, - used to quiet warnings from valgrind or Address Sanitizer, when the - optimized lexer accesses aligned 16-byte memory chunks, including - the bytes after the malloced, area, and stops lexing on '\n'. 
*/ - buf = XNEWVEC (uchar, size + 16); +#ifdef HAVE_SSSE3 + pad = 64; +#else + pad = 16; +#endif + /* The '+ PAD' here is space for the final '\n' and PAD-1 bytes of padding, + allowing search_line_fast to use (possibly misaligned) vector loads. */ + buf = XNEWVEC (uchar, size + pad); total = 0; while ((count = read (file->fd, buf + total, size - total)) > 0) { @@ -747,7 +750,7 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, if (regular) break; size *= 2; - buf = XRESIZEVEC (uchar, buf, size + 16); + buf = XRESIZEVEC (uchar, buf, size + pad); } } @@ -765,7 +768,7 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, file->buffer = _cpp_convert_input (pfile, input_charset, -buf, size + 16, total, +buf, size + pad, total, &file->buffer_start, &file->st.st_size); file->buffer_valid = file->buffer; diff --git a/libcpp/lex.cc b/libcpp/lex.cc index 1591dcdf151..daf2c770bc3 100644 --- a/libcpp/lex.cc +++ b/lib
[gcc r15-3121] libcpp: bump padding size in _cpp_convert_input [PR116458]
https://gcc.gnu.org/g:b2c1d7c4573d3b938f44b3bda202adeb292b1cbc commit r15-3121-gb2c1d7c4573d3b938f44b3bda202adeb292b1cbc Author: Alexander Monakov Date: Thu Aug 22 21:09:47 2024 +0300 libcpp: bump padding size in _cpp_convert_input [PR116458] The recently introduced search_line_ssse3 raised the padding requirement from 16 to 64, which was adjusted in read_file_guts, but the corresponding ' + 16' in _cpp_convert_input was overlooked. libcpp/ChangeLog: PR preprocessor/116458 * charset.cc (_cpp_convert_input): Bump padding to 64 if HAVE_SSSE3. Diff: --- libcpp/charset.cc | 21 - 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/libcpp/charset.cc b/libcpp/charset.cc index d58319a500a1..79072877cbf7 100644 --- a/libcpp/charset.cc +++ b/libcpp/charset.cc @@ -3093,6 +3093,7 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset, struct cset_converter input_cset; struct _cpp_strbuf to; unsigned char *buffer; + size_t pad; input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset); if (input_cset.func == convert_no_conversion) @@ -3129,16 +3130,18 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset, } } +#ifdef HAVE_SSSE3 + pad = 64; +#else + pad = 16; +#endif /* Resize buffer if we allocated substantially too much, or if we - haven't enough space for the \n-terminator or following - 15 bytes of padding (used to quiet warnings from valgrind or - Address Sanitizer, when the optimized lexer accesses aligned - 16-byte memory chunks, including the bytes after the malloced, - area, and stops lexing on '\n'). */ - if (to.len + 4096 < to.asize || to.len + 16 > to.asize) -to.text = XRESIZEVEC (uchar, to.text, to.len + 16); - - memset (to.text + to.len, '\0', 16); + don't have enough space for the following padding, which allows + search_line_fast to use (possibly misaligned) vector loads. 
*/ + if (to.len + 4096 < to.asize || to.len + pad > to.asize) +to.text = XRESIZEVEC (uchar, to.text, to.len + pad); + + memset (to.text + to.len, '\0', pad); /* If the file is using old-school Mac line endings (\r only), terminate with another \r, not an \n, so that we do not mistake
[gcc r15-3192] libcpp: deduplicate definition of padding size
https://gcc.gnu.org/g:a8260ebeae0f817bc7adf99cf62b604b1e2d3895 commit r15-3192-ga8260ebeae0f817bc7adf99cf62b604b1e2d3895 Author: Alexander Monakov Date: Sat Aug 24 17:37:13 2024 +0300 libcpp: deduplicate definition of padding size Tie together the two functions that ensure tail padding with search_line_ssse3 via CPP_BUFFER_PADDING macro. libcpp/ChangeLog: * internal.h (CPP_BUFFER_PADDING): New macro; use it ... * charset.cc (_cpp_convert_input): ...here, and ... * files.cc (read_file_guts): ...here, and ... * lex.cc (search_line_ssse3): here. Diff: --- libcpp/charset.cc | 7 +-- libcpp/files.cc | 6 +- libcpp/internal.h | 7 +++ libcpp/lex.cc | 4 ++-- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/libcpp/charset.cc b/libcpp/charset.cc index 79072877cbf7..fd57f6139804 100644 --- a/libcpp/charset.cc +++ b/libcpp/charset.cc @@ -3093,7 +3093,7 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset, struct cset_converter input_cset; struct _cpp_strbuf to; unsigned char *buffer; - size_t pad; + size_t pad = CPP_BUFFER_PADDING; input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset); if (input_cset.func == convert_no_conversion) @@ -3130,11 +3130,6 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset, } } -#ifdef HAVE_SSSE3 - pad = 64; -#else - pad = 16; -#endif /* Resize buffer if we allocated substantially too much, or if we don't have enough space for the following padding, which allows search_line_fast to use (possibly misaligned) vector loads. */ diff --git a/libcpp/files.cc b/libcpp/files.cc index 3775091d259e..fc66b9c3d73a 100644 --- a/libcpp/files.cc +++ b/libcpp/files.cc @@ -732,11 +732,7 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, the majority of C source files. 
*/ size = 8 * 1024; -#ifdef HAVE_SSSE3 - pad = 64; -#else - pad = 16; -#endif + pad = CPP_BUFFER_PADDING; /* The '+ PAD' here is space for the final '\n' and PAD-1 bytes of padding, allowing search_line_fast to use (possibly misaligned) vector loads. */ buf = XNEWVEC (uchar, size + pad); diff --git a/libcpp/internal.h b/libcpp/internal.h index a20215c57095..ad0a5d5d4e34 100644 --- a/libcpp/internal.h +++ b/libcpp/internal.h @@ -322,6 +322,13 @@ struct _cpp_line_note unsigned int type; }; +/* Tail padding required by search_line_fast alternatives. */ +#ifdef HAVE_SSSE3 +#define CPP_BUFFER_PADDING 64 +#else +#define CPP_BUFFER_PADDING 16 +#endif + /* Represents the contents of a file cpplib has read in. */ struct cpp_buffer { diff --git a/libcpp/lex.cc b/libcpp/lex.cc index f2d47d112b92..7f0f8d07735b 100644 --- a/libcpp/lex.cc +++ b/libcpp/lex.cc @@ -359,8 +359,8 @@ search_line_ssse3 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) "host character encoding is ASCII"); v16qi d1, d2, t1, t2; - /* Unaligned loads. Reading beyond the final newline is safe, - since files.cc:read_file_guts pads the allocation. */ + /* Unaligned loads, potentially using padding after the final newline. */ + static_assert (CPP_BUFFER_PADDING >= 64, ""); d1 = *(const v16qi_u *)s; d2 = *(const v16qi_u *)(s + 16); unsigned m1, m2, found;