https://github.com/dzbarsky updated https://github.com/llvm/llvm-project/pull/202663

>From f57f93003659102e558b4c84d3ed94b0cb958c50 Mon Sep 17 00:00:00 2001
From: David Zbarsky <[email protected]>
Date: Tue, 9 Jun 2026 04:54:39 -0400
Subject: [PATCH] [clang][Tooling] Reuse standard-library symbol descriptors

StandardLibrary.cpp expands the generated C and C++ symbol mappings once to
initialize SymbolHeaderMapping and a second time to count unique symbols. The
generated SymbolMapping arrays also store two pointers and an unsigned integer
for every symbol-to-header mapping.

Store qualified names and header names in character tables, and store
namespace lengths as bytes. Split StdSymbolMap.inc into character tables of at
most 48 KiB so the generated string literals remain below MSVC's 64 KiB limit.
Construct local StringTable views and traverse each table once for counting
and initialization. Qualified-name groups do not cross table boundaries, so
the existing grouped-name count remains valid.

In an arm64 Release build, StandardLibrary.cpp.o decreases from 522,224 to
181,552 bytes (-340,672), and its loadable contents decrease from 266,416 to
172,400 bytes (-94,016). __DATA,__const decreases by 184,224 bytes,
__TEXT,__cstring decreases by 70,199 bytes, and __TEXT,__const increases by
160,787 bytes. Object relocations decrease from 17,539 to 287 (-17,252).
Linked clangd decreases from 72,128,200 to 72,028,872 bytes (-99,328), and
stripped clangd decreases from 55,789,312 to 55,690,208 bytes (-99,104).
Linked fixups decrease from 175,232 to 157,955 (-17,277).

An exhaustive dump of all public C and C++ symbol-to-header mappings is
byte-identical before and after the change (4,463 lines). A 100-pair
initialization benchmark measured -0.08% wall time with a 95% bootstrap
interval of -1.30% to +1.13% and -0.22% CPU time with an interval of -1.47% to
+1.02%. clangd --check also completes with zero errors.
--- .../Inclusions/Stdlib/StandardLibrary.cpp | 233 +++++++++++++----- .../Inclusions/Stdlib/StdSymbolMap.inc | 4 + clang/tools/include-mapping/gen_std.py | 61 ++++- 3 files changed, 242 insertions(+), 56 deletions(-) diff --git a/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp b/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp index 807a8d8a34ad7..a2257075b1e4f 100644 --- a/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp +++ b/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp @@ -10,10 +10,13 @@ #include "clang/AST/Decl.h" #include "clang/Basic/LangOptions.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringTable.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include <cstdint> +#include <limits> #include <optional> namespace clang { @@ -24,6 +27,123 @@ namespace { // Symbol name -> Symbol::ID, within a namespace. using NSSymbolMap = llvm::DenseMap<llvm::StringRef, unsigned>; +using SymbolNamespaceLength = uint8_t; +static_assert(sizeof(SymbolNamespaceLength) == 1, + "symbol namespace lengths must remain one byte"); +static constexpr size_t MaxSymbolMappingStringTableSize = 48 * 1024; + +template <size_t Size> +constexpr SymbolNamespaceLength getSymbolNamespaceLength(const char (&)[Size]) { + static_assert(Size - 1 <= std::numeric_limits<SymbolNamespaceLength>::max(), + "symbol namespace length does not fit in one byte"); + return static_cast<SymbolNamespaceLength>(Size - 1); +} + +template <size_t Size> +constexpr bool +hasValidSymbolMappingStringTableLayout(const char (&Strings)[Size]) { + static_assert(Size >= 2, "symbol mapping string table is missing sentinels"); + static_assert(Size <= MaxSymbolMappingStringTableSize, + "symbol mapping string table exceeds the MSVC-safe limit"); + return Strings[0] == '\0' && Strings[Size - 2] == '\0' && + Strings[Size - 1] == '\0'; +} + +static constexpr char 
CSymbolStrings[] = {"\0" +#define SYMBOL(Name, NS, Header) #NS #Name "\0" #Header "\0" +#include "CSpecialSymbolMap.inc" +#include "CSymbolMap.inc" +#undef SYMBOL +}; +static constexpr SymbolNamespaceLength CNamespaceLengths[] = { +#define SYMBOL(Name, NS, Header) getSymbolNamespaceLength(#NS), +#include "CSpecialSymbolMap.inc" +#include "CSymbolMap.inc" +#undef SYMBOL +}; +static_assert(hasValidSymbolMappingStringTableLayout(CSymbolStrings)); + +static constexpr char CXXSpecialSymbolStrings[] = {"\0" +#define SYMBOL(Name, NS, Header) #NS #Name "\0" #Header "\0" +#include "StdSpecialSymbolMap.inc" +#undef SYMBOL +}; +static constexpr SymbolNamespaceLength CXXSpecialNamespaceLengths[] = { +#define SYMBOL(Name, NS, Header) getSymbolNamespaceLength(#NS), +#include "StdSpecialSymbolMap.inc" +#undef SYMBOL +}; +static_assert(hasValidSymbolMappingStringTableLayout(CXXSpecialSymbolStrings)); + +#define SYMBOL_MAP_BEGIN(First) \ + static constexpr char CXXSymbolStrings##First[] = {"\0" +#define SYMBOL(Name, NS, Header) #NS #Name "\0" #Header "\0" +#define SYMBOL_MAP_ARRAY_END \ + } \ + ; +#define SYMBOL_MAP_PARTITION(Previous, Next) \ + SYMBOL_MAP_ARRAY_END \ + static_assert( \ + hasValidSymbolMappingStringTableLayout(CXXSymbolStrings##Previous)); \ + static constexpr char CXXSymbolStrings##Next[] = {"\0" +#define SYMBOL_MAP_END(Last, Count) \ + SYMBOL_MAP_ARRAY_END \ + static_assert(hasValidSymbolMappingStringTableLayout(CXXSymbolStrings##Last)); +#include "StdSymbolMap.inc" +#undef SYMBOL_MAP_END +#undef SYMBOL_MAP_PARTITION +#undef SYMBOL_MAP_ARRAY_END +#undef SYMBOL +#undef SYMBOL_MAP_BEGIN + +#define SYMBOL_MAP_BEGIN(First) \ + static constexpr SymbolNamespaceLength CXXNamespaceLengths##First[] = { +#define SYMBOL(Name, NS, Header) getSymbolNamespaceLength(#NS), +#define SYMBOL_MAP_ARRAY_END \ + } \ + ; +#define SYMBOL_MAP_PARTITION(Previous, Next) \ + SYMBOL_MAP_ARRAY_END \ + static constexpr SymbolNamespaceLength CXXNamespaceLengths##Next[] = { +#define 
SYMBOL_MAP_END(Last, Count) SYMBOL_MAP_ARRAY_END +#include "StdSymbolMap.inc" +#undef SYMBOL_MAP_END +#undef SYMBOL_MAP_PARTITION +#undef SYMBOL_MAP_ARRAY_END +#undef SYMBOL +#undef SYMBOL_MAP_BEGIN + +static constexpr char CXXTsSymbolStrings[] = {"\0" +#define SYMBOL(Name, NS, Header) #NS #Name "\0" #Header "\0" +#include "StdTsSymbolMap.inc" +#undef SYMBOL +}; +static constexpr SymbolNamespaceLength CXXTsNamespaceLengths[] = { +#define SYMBOL(Name, NS, Header) getSymbolNamespaceLength(#NS), +#include "StdTsSymbolMap.inc" +#undef SYMBOL +}; +static_assert(hasValidSymbolMappingStringTableLayout(CXXTsSymbolStrings)); + +struct SymbolMappingTable { + // Strings contains the initial empty string followed by alternating + // qualified symbol names and header names. NamespaceLengths has one entry + // for each qualified symbol name and header name pair. + llvm::StringTable Strings; + ArrayRef<SymbolNamespaceLength> NamespaceLengths; +}; + +static StringRef +readSymbolMappingString(const SymbolMappingTable &Table, + llvm::StringTable::Offset &StringOffset) { + assert(StringOffset.value() < Table.Strings.size() - 1 && + "missing symbol mapping string"); + StringRef Result = Table.Strings[StringOffset]; + StringOffset = + llvm::StringTable::Offset(StringOffset.value() + Result.size() + 1); + return Result; +} + // A Mapping per language. 
struct SymbolHeaderMapping { llvm::StringRef *HeaderNames = nullptr; @@ -54,37 +174,31 @@ static const SymbolHeaderMapping *getMappingPerLang(Lang L) { return LanguageMappings[static_cast<unsigned>(L)]; } -static int countSymbols(Lang Language) { - ArrayRef<const char *> Symbols; -#define SYMBOL(Name, NS, Header) #NS #Name, - switch (Language) { - case Lang::C: { - static constexpr const char *CSymbols[] = { -#include "CSpecialSymbolMap.inc" -#include "CSymbolMap.inc" - }; - Symbols = CSymbols; - break; - } - case Lang::CXX: { - static constexpr const char *CXXSymbols[] = { -#include "StdSpecialSymbolMap.inc" -#include "StdSymbolMap.inc" -#include "StdTsSymbolMap.inc" - }; - Symbols = CXXSymbols; - break; - } +static unsigned countSymbols(ArrayRef<SymbolMappingTable> SymbolMappingTables) { + unsigned Count = 0; + StringRef Previous; + for (const SymbolMappingTable &Table : SymbolMappingTables) { + llvm::StringTable::Offset StringOffset(1); + for (size_t I = 0; I != Table.NamespaceLengths.size(); ++I) { + StringRef QName = readSymbolMappingString(Table, StringOffset); + readSymbolMappingString(Table, StringOffset); + if (Previous != QName) { + ++Count; + Previous = QName; + } + } + assert(StringOffset.value() == Table.Strings.size() - 1 && + "unexpected symbol mapping string"); } -#undef SYMBOL - return llvm::DenseSet<StringRef>(llvm::from_range, Symbols).size(); + return Count; } -static int initialize(Lang Language) { +static int initialize(Lang Language, + ArrayRef<SymbolMappingTable> SymbolMappingTables) { SymbolHeaderMapping *Mapping = new SymbolHeaderMapping(); LanguageMappings[static_cast<unsigned>(Language)] = Mapping; - unsigned SymCount = countSymbols(Language); + unsigned SymCount = countSymbols(SymbolMappingTables); Mapping->SymbolCount = SymCount; Mapping->SymbolNames = new std::remove_reference_t<decltype(*Mapping->SymbolNames)>[SymCount]; @@ -137,36 +251,16 @@ static int initialize(Lang Language) { NSSymbols.try_emplace(QName.drop_front(NSLen), 
SymIndex); }; - struct Symbol { - const char *QName; - unsigned NSLen; - const char *HeaderName; - }; -#define SYMBOL(Name, NS, Header) \ - {#NS #Name, static_cast<decltype(Symbol::NSLen)>(StringRef(#NS).size()), \ - #Header}, - switch (Language) { - case Lang::C: { - static constexpr Symbol CSymbols[] = { -#include "CSpecialSymbolMap.inc" -#include "CSymbolMap.inc" - }; - for (const Symbol &S : CSymbols) - Add(S.QName, S.NSLen, S.HeaderName); - break; - } - case Lang::CXX: { - static constexpr Symbol CXXSymbols[] = { -#include "StdSpecialSymbolMap.inc" -#include "StdSymbolMap.inc" -#include "StdTsSymbolMap.inc" - }; - for (const Symbol &S : CXXSymbols) - Add(S.QName, S.NSLen, S.HeaderName); - break; - } + for (const SymbolMappingTable &Table : SymbolMappingTables) { + llvm::StringTable::Offset StringOffset(1); + for (SymbolNamespaceLength NSLen : Table.NamespaceLengths) { + StringRef QName = readSymbolMappingString(Table, StringOffset); + StringRef HeaderName = readSymbolMappingString(Table, StringOffset); + Add(QName, NSLen, HeaderName); + } + assert(StringOffset.value() == Table.Strings.size() - 1 && + "unexpected symbol mapping string"); } -#undef SYMBOL Mapping->HeaderNames = new llvm::StringRef[Mapping->HeaderIDs->size()]; for (const auto &E : *Mapping->HeaderIDs) @@ -175,6 +269,35 @@ static int initialize(Lang Language) { return 0; } +static int initialize(Lang Language) { + switch (Language) { + case Lang::C: { + const SymbolMappingTable SymbolMappingTables[] = { + {llvm::StringTable(CSymbolStrings), CNamespaceLengths}}; + return initialize(Language, SymbolMappingTables); + } + case Lang::CXX: { + const SymbolMappingTable SymbolMappingTables[] = { + {llvm::StringTable(CXXSpecialSymbolStrings), + CXXSpecialNamespaceLengths}, +#define SYMBOL(Name, NS, Header) +#define SYMBOL_MAP_BEGIN(First) \ + {llvm::StringTable(CXXSymbolStrings##First), CXXNamespaceLengths##First}, +#define SYMBOL_MAP_PARTITION(Previous, Next) \ + 
{llvm::StringTable(CXXSymbolStrings##Next), CXXNamespaceLengths##Next}, +#define SYMBOL_MAP_END(Last, Count) +#include "StdSymbolMap.inc" +#undef SYMBOL_MAP_END +#undef SYMBOL_MAP_PARTITION +#undef SYMBOL_MAP_BEGIN +#undef SYMBOL + {llvm::StringTable(CXXTsSymbolStrings), CXXTsNamespaceLengths}}; + return initialize(Language, SymbolMappingTables); + } + } + llvm_unreachable("unknown language"); +} + static void ensureInitialized() { static int Dummy = []() { for (unsigned L = 0; L <= static_cast<unsigned>(Lang::LastValue); ++L) diff --git a/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc b/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc index c1927180d3397..6861cc8d3023a 100644 --- a/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc +++ b/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc @@ -9,6 +9,7 @@ // Generated from cppreference offline HTML book (modified on 2024-11-10). //===----------------------------------------------------------------------===// +SYMBOL_MAP_BEGIN(0) SYMBOL(ATOMIC_BOOL_LOCK_FREE, None, <atomic>) SYMBOL(ATOMIC_CHAR16_T_LOCK_FREE, None, <atomic>) SYMBOL(ATOMIC_CHAR32_T_LOCK_FREE, None, <atomic>) @@ -1883,6 +1884,7 @@ SYMBOL(istringstream, std::, <iosfwd>) SYMBOL(istrstream, std::, <strstream>) SYMBOL(istrstream, std::, <strstream>) SYMBOL(isunordered, std::, <cmath>) +SYMBOL_MAP_PARTITION(0, 1) SYMBOL(isunordered, None, <cmath>) SYMBOL(isunordered, None, <math.h>) SYMBOL(isupper, std::, <cctype>) @@ -3731,6 +3733,7 @@ SYMBOL(join_with_view, std::ranges::, <ranges>) SYMBOL(keys_view, std::ranges::, <ranges>) SYMBOL(lazy_split_view, std::ranges::, <ranges>) SYMBOL(less, std::ranges::, <functional>) +SYMBOL_MAP_PARTITION(1, 2) SYMBOL(less_equal, std::ranges::, <functional>) SYMBOL(lexicographical_compare, std::ranges::, <algorithm>) SYMBOL(make_heap, std::ranges::, <algorithm>) @@ -3985,3 +3988,4 @@ SYMBOL(transform, std::views::, <ranges>) SYMBOL(values, std::views::, <ranges>) SYMBOL(zip, std::views::, <ranges>) 
SYMBOL(zip_transform, std::views::, <ranges>) +SYMBOL_MAP_END(2, 3) diff --git a/clang/tools/include-mapping/gen_std.py b/clang/tools/include-mapping/gen_std.py index f362227bc6aab..90cd58ecf2e90 100755 --- a/clang/tools/include-mapping/gen_std.py +++ b/clang/tools/include-mapping/gen_std.py @@ -55,6 +55,63 @@ //===----------------------------------------------------------------------===// """ +# Keep generated character blobs below StringToOffsetTable's 64 KiB threshold. +# This avoids long string literals that MSVC can silently miscompile. +MAX_SYMBOL_TABLE_STRING_BYTES = 48 * 1024 + + +def SymbolTableRowSize(symbol_row): + name, namespace, header = symbol_row + return len(("%s%s\0%s\0" % (namespace, name, header)).encode("utf-8")) + + +def EmitSymbolRows(symbol_rows, partition_string_tables): + if not partition_string_tables: + for name, namespace, header in symbol_rows: + print("SYMBOL(%s, %s, %s)" % (name, namespace, header)) + return + + symbol_groups = [] + for symbol_row in symbol_rows: + qualified_name = symbol_row[1], symbol_row[0] + if not symbol_groups or symbol_groups[-1][0] != qualified_name: + symbol_groups.append((qualified_name, [])) + symbol_groups[-1][1].append(symbol_row) + + string_tables = [] + current_string_table = [] + # Each StringTable chunk has a leading empty string and a final sentinel. 
+ current_string_table_size = 2 + for _, symbol_group in symbol_groups: + group_size = sum(SymbolTableRowSize(row) for row in symbol_group) + if group_size + 2 > MAX_SYMBOL_TABLE_STRING_BYTES: + raise ValueError("symbol mapping group exceeds string table limit") + if ( + current_string_table + and current_string_table_size + group_size + > MAX_SYMBOL_TABLE_STRING_BYTES + ): + string_tables.append(current_string_table) + current_string_table = [] + current_string_table_size = 2 + current_string_table.extend(symbol_group) + current_string_table_size += group_size + if current_string_table: + string_tables.append(current_string_table) + if not string_tables: + string_tables.append([]) + + print("SYMBOL_MAP_BEGIN(0)") + for table_index, string_table in enumerate(string_tables): + if table_index: + print("SYMBOL_MAP_PARTITION(%d, %d)" % (table_index - 1, table_index)) + for name, namespace, header in string_table: + print("SYMBOL(%s, %s, %s)" % (name, namespace, header)) + print( + "SYMBOL_MAP_END(%d, %d)" + % (len(string_tables) - 1, len(string_tables)) + ) + def ParseArg(): parser = argparse.ArgumentParser(description="Generate StdGen file") @@ -263,6 +320,7 @@ def main(): exit("Path %s doesn't exist!" % symbol_index_root) symbols = cppreference_parser.GetSymbols(parse_pages) + symbol_rows = [] # We don't have version information from the unzipped offline HTML files. # so we use the modified time of the symbol_index.html as the version. 
@@ -279,7 +337,7 @@ def main(): s.headers.extend(AdditionalHeadersForIOSymbols(s)) for header in s.headers: # SYMBOL(unqualified_name, namespace, header) - print("SYMBOL(%s, %s, %s)" % (s.name, s.namespace, header)) + symbol_rows.append((s.name, s.namespace, header)) elif len(symbol.headers) == 0: sys.stderr.write("No header found for symbol %s\n" % symbol.name) else: @@ -288,6 +346,7 @@ def main(): "Ambiguous header for symbol %s: %s\n" % (symbol.name, ", ".join(symbol.headers)) ) + EmitSymbolRows(symbol_rows, args.symbols == "cpp") if __name__ == "__main__": _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
