https://github.com/dzbarsky updated 
https://github.com/llvm/llvm-project/pull/202663

>From 6fbf746b2d530c3924608dac56bdc84da278a954 Mon Sep 17 00:00:00 2001
From: David Zbarsky <[email protected]>
Date: Tue, 9 Jun 2026 04:54:39 -0400
Subject: [PATCH] [clang][Tooling] Reuse standard-library symbol descriptors

StandardLibrary.cpp expands the generated C and C++ symbol mappings once to 
initialize SymbolHeaderMapping and a second time to count unique symbols. The 
generated SymbolMapping arrays also store two pointers and an unsigned integer 
for every symbol-to-header mapping.

Store qualified names and header names in character tables, and store namespace 
lengths as bytes. Split StdSymbolMap.inc into character tables of at most 48 
KiB so the generated string literals remain below MSVC's 64 KiB limit. 
Construct local StringTable views and traverse each table twice: once to count
symbols and once to initialize the mapping. Qualified-name groups do not cross
table boundaries, so the existing grouped-name count remains valid.

In an arm64 Release build, StandardLibrary.cpp.o decreases from 522,224 to 
181,552 bytes (-340,672), and its loadable contents decrease from 266,416 to 
172,400 bytes (-94,016). __DATA,__const decreases by 184,224 bytes, 
__TEXT,__cstring decreases by 70,199 bytes, and __TEXT,__const increases by 
160,787 bytes. Object relocations decrease from 17,539 to 287 (-17,252).

Linked clangd decreases from 72,128,200 to 72,028,872 bytes (-99,328), and 
stripped clangd decreases from 55,789,312 to 55,690,208 bytes (-99,104). Linked 
fixups decrease from 175,232 to 157,955 (-17,277).

An exhaustive dump of all public C and C++ symbol-to-header mappings is
byte-identical before and after the change (4,463 lines). A 100-pair
initialization benchmark measured a wall-time change of -0.08% (95% bootstrap
interval -1.30% to +1.13%) and a CPU-time change of -0.22% (interval -1.47% to
+1.02%); both intervals include zero, so initialization time is effectively
unchanged. clangd --check also completes with zero errors.
---
 .../Inclusions/Stdlib/StandardLibrary.cpp     | 233 +++++++++++++-----
 .../Inclusions/Stdlib/StdSymbolMap.inc        |   4 +
 clang/tools/include-mapping/gen_std.py        |  57 ++++-
 3 files changed, 238 insertions(+), 56 deletions(-)

diff --git a/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp 
b/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp
index 807a8d8a34ad7..a2257075b1e4f 100644
--- a/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp
+++ b/clang/lib/Tooling/Inclusions/Stdlib/StandardLibrary.cpp
@@ -10,10 +10,13 @@
 #include "clang/AST/Decl.h"
 #include "clang/Basic/LangOptions.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringTable.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstdint>
+#include <limits>
 #include <optional>
 
 namespace clang {
@@ -24,6 +27,123 @@ namespace {
 // Symbol name -> Symbol::ID, within a namespace.
 using NSSymbolMap = llvm::DenseMap<llvm::StringRef, unsigned>;
 
+using SymbolNamespaceLength = uint8_t;
+static_assert(sizeof(SymbolNamespaceLength) == 1,
+              "symbol namespace lengths must remain one byte");
+static constexpr size_t MaxSymbolMappingStringTableSize = 48 * 1024;
+
+template <size_t Size>
+constexpr SymbolNamespaceLength getSymbolNamespaceLength(const char (&)[Size]) 
{
+  static_assert(Size - 1 <= std::numeric_limits<SymbolNamespaceLength>::max(),
+                "symbol namespace length does not fit in one byte");
+  return static_cast<SymbolNamespaceLength>(Size - 1);
+}
+
+template <size_t Size>
+constexpr bool
+hasValidSymbolMappingStringTableLayout(const char (&Strings)[Size]) {
+  static_assert(Size >= 2, "symbol mapping string table is missing sentinels");
+  static_assert(Size <= MaxSymbolMappingStringTableSize,
+                "symbol mapping string table exceeds the MSVC-safe limit");
+  return Strings[0] == '\0' && Strings[Size - 2] == '\0' &&
+         Strings[Size - 1] == '\0';
+}
+
+static constexpr char CSymbolStrings[] = {"\0"
+#define SYMBOL(Name, NS, Header) #NS #Name "\0" #Header "\0"
+#include "CSpecialSymbolMap.inc"
+#include "CSymbolMap.inc"
+#undef SYMBOL
+};
+static constexpr SymbolNamespaceLength CNamespaceLengths[] = {
+#define SYMBOL(Name, NS, Header) getSymbolNamespaceLength(#NS),
+#include "CSpecialSymbolMap.inc"
+#include "CSymbolMap.inc"
+#undef SYMBOL
+};
+static_assert(hasValidSymbolMappingStringTableLayout(CSymbolStrings));
+
+static constexpr char CXXSpecialSymbolStrings[] = {"\0"
+#define SYMBOL(Name, NS, Header) #NS #Name "\0" #Header "\0"
+#include "StdSpecialSymbolMap.inc"
+#undef SYMBOL
+};
+static constexpr SymbolNamespaceLength CXXSpecialNamespaceLengths[] = {
+#define SYMBOL(Name, NS, Header) getSymbolNamespaceLength(#NS),
+#include "StdSpecialSymbolMap.inc"
+#undef SYMBOL
+};
+static_assert(hasValidSymbolMappingStringTableLayout(CXXSpecialSymbolStrings));
+
+#define SYMBOL_MAP_BEGIN(First)                                                
\
+  static constexpr char CXXSymbolStrings##First[] = {"\0"
+#define SYMBOL(Name, NS, Header) #NS #Name "\0" #Header "\0"
+#define SYMBOL_MAP_ARRAY_END                                                   
\
+  }                                                                            
\
+  ;
+#define SYMBOL_MAP_PARTITION(Previous, Next)                                   
\
+  SYMBOL_MAP_ARRAY_END                                                         
\
+  static_assert(                                                               
\
+      hasValidSymbolMappingStringTableLayout(CXXSymbolStrings##Previous));     
\
+  static constexpr char CXXSymbolStrings##Next[] = {"\0"
+#define SYMBOL_MAP_END(Last, Count)                                            
\
+  SYMBOL_MAP_ARRAY_END                                                         
\
+  
static_assert(hasValidSymbolMappingStringTableLayout(CXXSymbolStrings##Last));
+#include "StdSymbolMap.inc"
+#undef SYMBOL_MAP_END
+#undef SYMBOL_MAP_PARTITION
+#undef SYMBOL_MAP_ARRAY_END
+#undef SYMBOL
+#undef SYMBOL_MAP_BEGIN
+
+#define SYMBOL_MAP_BEGIN(First)                                                
\
+  static constexpr SymbolNamespaceLength CXXNamespaceLengths##First[] = {
+#define SYMBOL(Name, NS, Header) getSymbolNamespaceLength(#NS),
+#define SYMBOL_MAP_ARRAY_END                                                   
\
+  }                                                                            
\
+  ;
+#define SYMBOL_MAP_PARTITION(Previous, Next)                                   
\
+  SYMBOL_MAP_ARRAY_END                                                         
\
+  static constexpr SymbolNamespaceLength CXXNamespaceLengths##Next[] = {
+#define SYMBOL_MAP_END(Last, Count) SYMBOL_MAP_ARRAY_END
+#include "StdSymbolMap.inc"
+#undef SYMBOL_MAP_END
+#undef SYMBOL_MAP_PARTITION
+#undef SYMBOL_MAP_ARRAY_END
+#undef SYMBOL
+#undef SYMBOL_MAP_BEGIN
+
+static constexpr char CXXTsSymbolStrings[] = {"\0"
+#define SYMBOL(Name, NS, Header) #NS #Name "\0" #Header "\0"
+#include "StdTsSymbolMap.inc"
+#undef SYMBOL
+};
+static constexpr SymbolNamespaceLength CXXTsNamespaceLengths[] = {
+#define SYMBOL(Name, NS, Header) getSymbolNamespaceLength(#NS),
+#include "StdTsSymbolMap.inc"
+#undef SYMBOL
+};
+static_assert(hasValidSymbolMappingStringTableLayout(CXXTsSymbolStrings));
+
+struct SymbolMappingTable {
+  // Strings contains the initial empty string followed by alternating
+  // qualified symbol names and header names. NamespaceLengths has one entry
+  // for each qualified symbol name and header name pair.
+  llvm::StringTable Strings;
+  ArrayRef<SymbolNamespaceLength> NamespaceLengths;
+};
+
+static StringRef
+readSymbolMappingString(const SymbolMappingTable &Table,
+                        llvm::StringTable::Offset &StringOffset) {
+  assert(StringOffset.value() < Table.Strings.size() - 1 &&
+         "missing symbol mapping string");
+  StringRef Result = Table.Strings[StringOffset];
+  StringOffset =
+      llvm::StringTable::Offset(StringOffset.value() + Result.size() + 1);
+  return Result;
+}
+
 // A Mapping per language.
 struct SymbolHeaderMapping {
   llvm::StringRef *HeaderNames = nullptr;
@@ -54,37 +174,31 @@ static const SymbolHeaderMapping *getMappingPerLang(Lang 
L) {
   return LanguageMappings[static_cast<unsigned>(L)];
 }
 
-static int countSymbols(Lang Language) {
-  ArrayRef<const char *> Symbols;
-#define SYMBOL(Name, NS, Header) #NS #Name,
-  switch (Language) {
-  case Lang::C: {
-    static constexpr const char *CSymbols[] = {
-#include "CSpecialSymbolMap.inc"
-#include "CSymbolMap.inc"
-    };
-    Symbols = CSymbols;
-    break;
-  }
-  case Lang::CXX: {
-    static constexpr const char *CXXSymbols[] = {
-#include "StdSpecialSymbolMap.inc"
-#include "StdSymbolMap.inc"
-#include "StdTsSymbolMap.inc"
-    };
-    Symbols = CXXSymbols;
-    break;
-  }
+static unsigned countSymbols(ArrayRef<SymbolMappingTable> SymbolMappingTables) 
{
+  unsigned Count = 0;
+  StringRef Previous;
+  for (const SymbolMappingTable &Table : SymbolMappingTables) {
+    llvm::StringTable::Offset StringOffset(1);
+    for (size_t I = 0; I != Table.NamespaceLengths.size(); ++I) {
+      StringRef QName = readSymbolMappingString(Table, StringOffset);
+      readSymbolMappingString(Table, StringOffset);
+      if (Previous != QName) {
+        ++Count;
+        Previous = QName;
+      }
+    }
+    assert(StringOffset.value() == Table.Strings.size() - 1 &&
+           "unexpected symbol mapping string");
   }
-#undef SYMBOL
-  return llvm::DenseSet<StringRef>(llvm::from_range, Symbols).size();
+  return Count;
 }
 
-static int initialize(Lang Language) {
+static int initialize(Lang Language,
+                      ArrayRef<SymbolMappingTable> SymbolMappingTables) {
   SymbolHeaderMapping *Mapping = new SymbolHeaderMapping();
   LanguageMappings[static_cast<unsigned>(Language)] = Mapping;
 
-  unsigned SymCount = countSymbols(Language);
+  unsigned SymCount = countSymbols(SymbolMappingTables);
   Mapping->SymbolCount = SymCount;
   Mapping->SymbolNames =
       new std::remove_reference_t<decltype(*Mapping->SymbolNames)>[SymCount];
@@ -137,36 +251,16 @@ static int initialize(Lang Language) {
     NSSymbols.try_emplace(QName.drop_front(NSLen), SymIndex);
   };
 
-  struct Symbol {
-    const char *QName;
-    unsigned NSLen;
-    const char *HeaderName;
-  };
-#define SYMBOL(Name, NS, Header)                                               
\
-  {#NS #Name, static_cast<decltype(Symbol::NSLen)>(StringRef(#NS).size()),     
\
-   #Header},
-  switch (Language) {
-  case Lang::C: {
-    static constexpr Symbol CSymbols[] = {
-#include "CSpecialSymbolMap.inc"
-#include "CSymbolMap.inc"
-    };
-    for (const Symbol &S : CSymbols)
-      Add(S.QName, S.NSLen, S.HeaderName);
-    break;
-  }
-  case Lang::CXX: {
-    static constexpr Symbol CXXSymbols[] = {
-#include "StdSpecialSymbolMap.inc"
-#include "StdSymbolMap.inc"
-#include "StdTsSymbolMap.inc"
-    };
-    for (const Symbol &S : CXXSymbols)
-      Add(S.QName, S.NSLen, S.HeaderName);
-    break;
-  }
+  for (const SymbolMappingTable &Table : SymbolMappingTables) {
+    llvm::StringTable::Offset StringOffset(1);
+    for (SymbolNamespaceLength NSLen : Table.NamespaceLengths) {
+      StringRef QName = readSymbolMappingString(Table, StringOffset);
+      StringRef HeaderName = readSymbolMappingString(Table, StringOffset);
+      Add(QName, NSLen, HeaderName);
+    }
+    assert(StringOffset.value() == Table.Strings.size() - 1 &&
+           "unexpected symbol mapping string");
   }
-#undef SYMBOL
 
   Mapping->HeaderNames = new llvm::StringRef[Mapping->HeaderIDs->size()];
   for (const auto &E : *Mapping->HeaderIDs)
@@ -175,6 +269,35 @@ static int initialize(Lang Language) {
   return 0;
 }
 
+static int initialize(Lang Language) {
+  switch (Language) {
+  case Lang::C: {
+    const SymbolMappingTable SymbolMappingTables[] = {
+        {llvm::StringTable(CSymbolStrings), CNamespaceLengths}};
+    return initialize(Language, SymbolMappingTables);
+  }
+  case Lang::CXX: {
+    const SymbolMappingTable SymbolMappingTables[] = {
+        {llvm::StringTable(CXXSpecialSymbolStrings),
+         CXXSpecialNamespaceLengths},
+#define SYMBOL(Name, NS, Header)
+#define SYMBOL_MAP_BEGIN(First)                                                
\
+  {llvm::StringTable(CXXSymbolStrings##First), CXXNamespaceLengths##First},
+#define SYMBOL_MAP_PARTITION(Previous, Next)                                   
\
+  {llvm::StringTable(CXXSymbolStrings##Next), CXXNamespaceLengths##Next},
+#define SYMBOL_MAP_END(Last, Count)
+#include "StdSymbolMap.inc"
+#undef SYMBOL_MAP_END
+#undef SYMBOL_MAP_PARTITION
+#undef SYMBOL_MAP_BEGIN
+#undef SYMBOL
+        {llvm::StringTable(CXXTsSymbolStrings), CXXTsNamespaceLengths}};
+    return initialize(Language, SymbolMappingTables);
+  }
+  }
+  llvm_unreachable("unknown language");
+}
+
 static void ensureInitialized() {
   static int Dummy = []() {
     for (unsigned L = 0; L <= static_cast<unsigned>(Lang::LastValue); ++L)
diff --git a/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc 
b/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc
index c1927180d3397..6861cc8d3023a 100644
--- a/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc
+++ b/clang/lib/Tooling/Inclusions/Stdlib/StdSymbolMap.inc
@@ -9,6 +9,7 @@
 // Generated from cppreference offline HTML book (modified on 2024-11-10).
 
//===----------------------------------------------------------------------===//
 
+SYMBOL_MAP_BEGIN(0)
 SYMBOL(ATOMIC_BOOL_LOCK_FREE, None, <atomic>)
 SYMBOL(ATOMIC_CHAR16_T_LOCK_FREE, None, <atomic>)
 SYMBOL(ATOMIC_CHAR32_T_LOCK_FREE, None, <atomic>)
@@ -1883,6 +1884,7 @@ SYMBOL(istringstream, std::, <iosfwd>)
 SYMBOL(istrstream, std::, <strstream>)
 SYMBOL(istrstream, std::, <strstream>)
 SYMBOL(isunordered, std::, <cmath>)
+SYMBOL_MAP_PARTITION(0, 1)
 SYMBOL(isunordered, None, <cmath>)
 SYMBOL(isunordered, None, <math.h>)
 SYMBOL(isupper, std::, <cctype>)
@@ -3731,6 +3733,7 @@ SYMBOL(join_with_view, std::ranges::, <ranges>)
 SYMBOL(keys_view, std::ranges::, <ranges>)
 SYMBOL(lazy_split_view, std::ranges::, <ranges>)
 SYMBOL(less, std::ranges::, <functional>)
+SYMBOL_MAP_PARTITION(1, 2)
 SYMBOL(less_equal, std::ranges::, <functional>)
 SYMBOL(lexicographical_compare, std::ranges::, <algorithm>)
 SYMBOL(make_heap, std::ranges::, <algorithm>)
@@ -3985,3 +3988,4 @@ SYMBOL(transform, std::views::, <ranges>)
 SYMBOL(values, std::views::, <ranges>)
 SYMBOL(zip, std::views::, <ranges>)
 SYMBOL(zip_transform, std::views::, <ranges>)
+SYMBOL_MAP_END(2, 3)
diff --git a/clang/tools/include-mapping/gen_std.py 
b/clang/tools/include-mapping/gen_std.py
index f362227bc6aab..2ac1b301a89cf 100755
--- a/clang/tools/include-mapping/gen_std.py
+++ b/clang/tools/include-mapping/gen_std.py
@@ -55,6 +55,59 @@
 
//===----------------------------------------------------------------------===//
 """
 
+# Keep generated character blobs below StringToOffsetTable's 64 KiB threshold.
+# This avoids long string literals that MSVC can silently miscompile.
+MAX_SYMBOL_TABLE_STRING_BYTES = 48 * 1024
+
+
+def SymbolTableRowSize(symbol_row):
+    name, namespace, header = symbol_row
+    return len(("%s%s\0%s\0" % (namespace, name, header)).encode("utf-8"))
+
+
+def EmitSymbolRows(symbol_rows, partition_string_tables):
+    if not partition_string_tables:
+        for name, namespace, header in symbol_rows:
+            print("SYMBOL(%s, %s, %s)" % (name, namespace, header))
+        return
+
+    symbol_groups = []
+    for symbol_row in symbol_rows:
+        qualified_name = symbol_row[1], symbol_row[0]
+        if not symbol_groups or symbol_groups[-1][0] != qualified_name:
+            symbol_groups.append((qualified_name, []))
+        symbol_groups[-1][1].append(symbol_row)
+
+    string_tables = []
+    current_string_table = []
+    # Each StringTable chunk has a leading empty string and a final sentinel.
+    current_string_table_size = 2
+    for _, symbol_group in symbol_groups:
+        group_size = sum(SymbolTableRowSize(row) for row in symbol_group)
+        if group_size + 2 > MAX_SYMBOL_TABLE_STRING_BYTES:
+            raise ValueError("symbol mapping group exceeds string table limit")
+        if (
+            current_string_table
+            and current_string_table_size + group_size > 
MAX_SYMBOL_TABLE_STRING_BYTES
+        ):
+            string_tables.append(current_string_table)
+            current_string_table = []
+            current_string_table_size = 2
+        current_string_table.extend(symbol_group)
+        current_string_table_size += group_size
+    if current_string_table:
+        string_tables.append(current_string_table)
+    if not string_tables:
+        string_tables.append([])
+
+    print("SYMBOL_MAP_BEGIN(0)")
+    for table_index, string_table in enumerate(string_tables):
+        if table_index:
+            print("SYMBOL_MAP_PARTITION(%d, %d)" % (table_index - 1, 
table_index))
+        for name, namespace, header in string_table:
+            print("SYMBOL(%s, %s, %s)" % (name, namespace, header))
+    print("SYMBOL_MAP_END(%d, %d)" % (len(string_tables) - 1, 
len(string_tables)))
+
 
 def ParseArg():
     parser = argparse.ArgumentParser(description="Generate StdGen file")
@@ -263,6 +316,7 @@ def main():
         exit("Path %s doesn't exist!" % symbol_index_root)
 
     symbols = cppreference_parser.GetSymbols(parse_pages)
+    symbol_rows = []
 
     # We don't have version information from the unzipped offline HTML files.
     # so we use the modified time of the symbol_index.html as the version.
@@ -279,7 +333,7 @@ def main():
                 s.headers.extend(AdditionalHeadersForIOSymbols(s))
                 for header in s.headers:
                     # SYMBOL(unqualified_name, namespace, header)
-                    print("SYMBOL(%s, %s, %s)" % (s.name, s.namespace, header))
+                    symbol_rows.append((s.name, s.namespace, header))
         elif len(symbol.headers) == 0:
             sys.stderr.write("No header found for symbol %s\n" % symbol.name)
         else:
@@ -288,6 +342,7 @@ def main():
                 "Ambiguous header for symbol %s: %s\n"
                 % (symbol.name, ", ".join(symbol.headers))
             )
+    EmitSymbolRows(symbol_rows, args.symbols == "cpp")
 
 
 if __name__ == "__main__":

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to