jansvoboda11 created this revision.
jansvoboda11 added a reviewer: benlangmuir.
Herald added subscribers: ributzka, arphaman.
Herald added a project: All.
jansvoboda11 requested review of this revision.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

When loading (transitively) imported AST file, `ModuleManager::addModule()` 
first checks it has the expected signature via `readASTFileSignature()`. The 
signature is part of `UNHASHED_CONTROL_BLOCK`, which is placed at the end of 
the AST file. This means that just to verify signature of an AST file, we need 
to skip over all top-level blocks, paging in the whole AST file from disk. This 
is pretty slow.

This patch moves `UNHASHED_CONTROL_BLOCK` to the start of the AST file, so that 
it can be read more efficiently. To achieve this, we use dummy signature when 
first emitting the unhashed control block, and then backpatch the real 
signature at the end of the serialization process.

This speeds up dependency scanning by over 9% and significantly reduces 
run-to-run variability of my benchmarks.

Depends on D158572 <https://reviews.llvm.org/D158572>.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D158573

Files:
  clang/include/clang/Basic/Module.h
  clang/include/clang/Serialization/ASTBitCodes.h
  clang/include/clang/Serialization/ASTReader.h
  clang/include/clang/Serialization/ASTWriter.h
  clang/lib/Serialization/ASTReader.cpp
  clang/lib/Serialization/ASTWriter.cpp
  clang/lib/Serialization/GlobalModuleIndex.cpp
  clang/test/Modules/ASTSignature.c

Index: clang/test/Modules/ASTSignature.c
===================================================================
--- clang/test/Modules/ASTSignature.c
+++ clang/test/Modules/ASTSignature.c
@@ -1,6 +1,6 @@
 // RUN: rm -rf %t
-// RUN: %clang_cc1 -iquote %S/Inputs/ASTHash/ -fsyntax-only -fmodules \
-// RUN:   -fimplicit-module-maps -fmodules-strict-context-hash \
+// RUN: %clang_cc1             -iquote %S/Inputs/ASTHash/ -fsyntax-only \
+// RUN:   -fmodules -fimplicit-module-maps -fmodules-strict-context-hash \
 // RUN:   -fmodules-cache-path=%t -fdisable-module-hash %s
 // RUN: cp %t/MyHeader2.pcm %t1.pcm
 // RUN: rm -rf %t
@@ -8,17 +8,18 @@
 // RUN:   -fmodules -fimplicit-module-maps -fmodules-strict-context-hash \
 // RUN:   -fmodules-cache-path=%t -fdisable-module-hash %s
 // RUN: cp %t/MyHeader2.pcm %t2.pcm
-// RUN: llvm-bcanalyzer --dump --disable-histogram %t1.pcm > %t1.dump
-// RUN: llvm-bcanalyzer --dump --disable-histogram %t2.pcm > %t2.dump
+// RUN: llvm-bcanalyzer --dump --disable-histogram --show-binary-blobs %t1.pcm > %t1.dump
+// RUN: llvm-bcanalyzer --dump --disable-histogram --show-binary-blobs %t2.pcm > %t2.dump
 // RUN: cat %t1.dump %t2.dump | FileCheck %s
 
 #include "my_header_2.h"
 
 my_int var = 42;
 
-// CHECK: [[AST_BLOCK_HASH:<AST_BLOCK_HASH .*>]]
-// CHECK: [[SIGNATURE:<SIGNATURE .*>]]
-// CHECK: [[AST_BLOCK_HASH]]
-// CHECK-NOT: [[SIGNATURE]]
-// The modules built by this test are designed to yield the same AST. If this
-// test fails, it means that the AST block is has become non-relocatable.
+// CHECK:     <AST_BLOCK_HASH abbrevid={{[0-9]+}}/> blob data = '[[AST_BLOCK_HASH:.*]]'
+// CHECK:     <SIGNATURE abbrevid={{[0-9]+}}/> blob data = '[[SIGNATURE:.*]]'
+// CHECK:     <AST_BLOCK_HASH abbrevid={{[0-9]+}}/> blob data = '[[AST_BLOCK_HASH]]'
+// CHECK-NOT: <SIGNATURE abbrevid={{[0-9]+}}/> blob data = '[[SIGNATURE]]'
+// The modules built by this test are designed to yield the same AST but distinct AST files.
+// If this test fails, it means that either the AST block has become non-relocatable,
+// or the file signature stopped hashing some parts of the AST file.
Index: clang/lib/Serialization/GlobalModuleIndex.cpp
===================================================================
--- clang/lib/Serialization/GlobalModuleIndex.cpp
+++ clang/lib/Serialization/GlobalModuleIndex.cpp
@@ -15,6 +15,7 @@
 #include "clang/Basic/FileManager.h"
 #include "clang/Lex/HeaderSearch.h"
 #include "clang/Serialization/ASTBitCodes.h"
+#include "clang/Serialization/ASTReader.h"
 #include "clang/Serialization/ModuleFile.h"
 #include "clang/Serialization/PCHContainerOperations.h"
 #include "llvm/ADT/DenseMap.h"
@@ -698,8 +699,7 @@
 
     // Get Signature.
     if (State == DiagnosticOptionsBlock && Code == SIGNATURE)
-      getModuleFileInfo(File).Signature = ASTFileSignature::create(
-          Record.begin(), Record.begin() + ASTFileSignature::size);
+      getModuleFileInfo(File).Signature = ASTReader::readSignature(Blob.data());
 
     // We don't care about this record.
   }
Index: clang/lib/Serialization/ASTWriter.cpp
===================================================================
--- clang/lib/Serialization/ASTWriter.cpp
+++ clang/lib/Serialization/ASTWriter.cpp
@@ -1120,50 +1120,117 @@
 }
 
 std::pair<ASTFileSignature, ASTFileSignature>
-ASTWriter::createSignature(StringRef AllBytes, StringRef ASTBlockBytes) {
+ASTWriter::createSignature() const {
+  StringRef AllBytes(Buffer.data(), Buffer.size());
+
   llvm::SHA1 Hasher;
-  Hasher.update(ASTBlockBytes);
+  Hasher.update(AllBytes.slice(ASTBlockRange.first, ASTBlockRange.second));
   ASTFileSignature ASTBlockHash = ASTFileSignature::create(Hasher.result());
 
-  // Add the remaining bytes (i.e. bytes before the unhashed control block that
-  // are not part of the AST block).
-  Hasher.update(
-      AllBytes.take_front(ASTBlockBytes.bytes_end() - AllBytes.bytes_begin()));
+  // Add the remaining bytes:
+  //  1. Before the unhashed control block.
+  Hasher.update(AllBytes.slice(0, UnhashedControlBlockRange.first));
+  //  2. Between the unhashed control block and the AST block.
   Hasher.update(
-      AllBytes.take_back(AllBytes.bytes_end() - ASTBlockBytes.bytes_end()));
+      AllBytes.slice(UnhashedControlBlockRange.second, ASTBlockRange.first));
+  //  3. After the AST block.
+  Hasher.update(AllBytes.slice(ASTBlockRange.second, StringRef::npos));
   ASTFileSignature Signature = ASTFileSignature::create(Hasher.result());
 
   return std::make_pair(ASTBlockHash, Signature);
 }
 
-ASTFileSignature ASTWriter::writeUnhashedControlBlock(Preprocessor &PP,
-                                                      ASTContext &Context) {
+static constexpr ASTFileSignature DummySignature{
+    {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}};
+static constexpr ASTFileSignature DummyASTBlockHash{
+    {2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2}};
+
+static void writeSignature(const ASTFileSignature &Sig,
+                           SmallVectorImpl<char> &Vec) {
+  llvm::raw_svector_ostream Out(Vec);
+  using namespace llvm::support;
+  endian::Writer LE(Out, little);
+  for (uint8_t Byte : Sig)
+    LE.write(Byte);
+}
+
+ASTFileSignature ASTWriter::backpatchSignature() {
+  ASTFileSignature Signature;
+  // For implicit modules, write the hash of the PCM as its signature.
+  if (WritingModule &&
+      PP->getHeaderSearchInfo().getHeaderSearchOpts().ModulesHashContent) {
+
+    auto ReadSignatureAt = [&](uint64_t Offset) {
+      return ASTReader::readSignature(Buffer.data() + Offset);
+    };
+    auto WriteSignatureAt = [&](const ASTFileSignature &Sig, uint64_t Offset) {
+      SmallString<128> Out;
+      writeSignature(Sig, Out);
+      std::copy_n(Out.begin(), Out.size(), Buffer.begin() + Offset);
+    };
+
+    assert(ReadSignatureAt(ASTBlockHashOffset) == DummyASTBlockHash);
+    assert(ReadSignatureAt(SignatureOffset) == DummySignature);
+
+    ASTFileSignature ASTBlockHash;
+    std::tie(ASTBlockHash, Signature) = createSignature();
+
+    WriteSignatureAt(ASTBlockHash, ASTBlockHashOffset);
+    WriteSignatureAt(Signature, SignatureOffset);
+
+    assert(ReadSignatureAt(ASTBlockHashOffset) == ASTBlockHash);
+    assert(ReadSignatureAt(SignatureOffset) == Signature);
+  }
+
+  return Signature;
+}
+
+void ASTWriter::writeUnhashedControlBlock(Preprocessor &PP,
+                                          ASTContext &Context) {
   using namespace llvm;
 
   // Flush first to prepare the PCM hash (signature).
   Stream.FlushToWord();
-  auto StartOfUnhashedControl = Stream.GetCurrentBitNo() >> 3;
+  UnhashedControlBlockRange.first = Stream.GetCurrentBitNo() >> 3;
 
   // Enter the block and prepare to write records.
   RecordData Record;
   Stream.EnterSubblock(UNHASHED_CONTROL_BLOCK_ID, 5);
 
   // For implicit modules, write the hash of the PCM as its signature.
-  ASTFileSignature Signature;
   if (WritingModule &&
       PP.getHeaderSearchInfo().getHeaderSearchOpts().ModulesHashContent) {
-    ASTFileSignature ASTBlockHash;
-    auto ASTBlockStartByte = ASTBlockRange.first >> 3;
-    auto ASTBlockByteLength = (ASTBlockRange.second >> 3) - ASTBlockStartByte;
-    std::tie(ASTBlockHash, Signature) = createSignature(
-        StringRef(Buffer.begin(), StartOfUnhashedControl),
-        StringRef(Buffer.begin() + ASTBlockStartByte, ASTBlockByteLength));
-
-    Record.append(ASTBlockHash.begin(), ASTBlockHash.end());
-    Stream.EmitRecord(AST_BLOCK_HASH, Record);
+    // At this point, we don't know the actual signature of the file or the AST
+    // block - we're only able to compute those at the end of the serialization
+    // process. Let's store dummy signatures for now, and replace them with the
+    // real ones later on.
+    // The bitstream VBR-encodes record elements, which makes backpatching them
+    // really difficult. Let's store the signatures as blobs instead - they are
+    // guaranteed to be word-aligned, and we control their format/encoding.
+    SmallString<128> Blob;
+
+    auto Abbrev = std::make_shared<BitCodeAbbrev>();
+    Abbrev->Add(BitCodeAbbrevOp(AST_BLOCK_HASH));
+    Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
+    unsigned ASTBlockHashAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
+
+    Abbrev = std::make_shared<BitCodeAbbrev>();
+    Abbrev->Add(BitCodeAbbrevOp(SIGNATURE));
+    Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
+    unsigned SignatureAbbrev = Stream.EmitAbbrev(std::move(Abbrev));
+
+    Record.push_back(AST_BLOCK_HASH);
+    writeSignature(DummyASTBlockHash, Blob);
+    Stream.EmitRecordWithBlob(ASTBlockHashAbbrev, Record, Blob);
+    ASTBlockHashOffset = Buffer.size() - Blob.size();
+    Blob.clear();
     Record.clear();
-    Record.append(Signature.begin(), Signature.end());
-    Stream.EmitRecord(SIGNATURE, Record);
+
+    Record.push_back(SIGNATURE);
+    writeSignature(DummySignature, Blob);
+    Stream.EmitRecordWithBlob(SignatureAbbrev, Record, Blob);
+    SignatureOffset = Buffer.size() - Blob.size();
+    Blob.clear();
     Record.clear();
   }
 
@@ -1232,7 +1299,7 @@
 
   // Leave the options block.
   Stream.ExitBlock();
-  return Signature;
+  UnhashedControlBlockRange.second = Stream.GetCurrentBitNo() >> 3;
 }
 
 /// Write the control block.
@@ -4690,6 +4757,7 @@
 
   ASTContext &Context = SemaRef.Context;
   Preprocessor &PP = SemaRef.PP;
+  writeUnhashedControlBlock(PP, Context);
 
   collectNonAffectingInputFiles();
 
@@ -4838,7 +4906,7 @@
 
   // Write the remaining AST contents.
   Stream.FlushToWord();
-  ASTBlockRange.first = Stream.GetCurrentBitNo();
+  ASTBlockRange.first = Stream.GetCurrentBitNo() >> 3;
   Stream.EnterSubblock(AST_BLOCK_ID, 5);
   ASTBlockStartOffset = Stream.GetCurrentBitNo();
 
@@ -5191,13 +5259,13 @@
   Stream.EmitRecord(STATISTICS, Record);
   Stream.ExitBlock();
   Stream.FlushToWord();
-  ASTBlockRange.second = Stream.GetCurrentBitNo();
+  ASTBlockRange.second = Stream.GetCurrentBitNo() >> 3;
 
   // Write the module file extension blocks.
   for (const auto &ExtWriter : ModuleFileExtensionWriters)
     WriteModuleFileExtension(SemaRef, *ExtWriter);
 
-  return writeUnhashedControlBlock(PP, Context);
+  return backpatchSignature();
 }
 
 void ASTWriter::WriteDeclUpdatesBlocks(RecordDataImpl &OffsetsRecord) {
Index: clang/lib/Serialization/ASTReader.cpp
===================================================================
--- clang/lib/Serialization/ASTReader.cpp
+++ clang/lib/Serialization/ASTReader.cpp
@@ -2678,6 +2678,14 @@
   }
 }
 
+ASTFileSignature ASTReader::readSignature(const char *Blob) {
+  ASTFileSignature Sig;
+  using namespace llvm::support;
+  for (unsigned I = 0; I < ASTFileSignature::size; ++I)
+    Sig[I] = endian::readNext<uint8_t, little, aligned>(Blob);
+  return Sig;
+}
+
 ASTReader::ASTReadResult
 ASTReader::ReadControlBlock(ModuleFile &F,
                             SmallVectorImpl<ImportedModule> &Loaded,
@@ -4734,12 +4742,6 @@
       ShouldFinalizePCM = true;
       return Success;
 
-    case UNHASHED_CONTROL_BLOCK_ID:
-      // This block is handled using look-ahead during ReadControlBlock.  We
-      // shouldn't get here!
-      Error("malformed block record in AST file");
-      return Failure;
-
     default:
       if (llvm::Error Err = Stream.SkipBlock()) {
         Error(std::move(Err));
@@ -4860,12 +4862,11 @@
     switch ((UnhashedControlBlockRecordTypes)MaybeRecordType.get()) {
     case SIGNATURE:
       if (F)
-        F->Signature = ASTFileSignature::create(Record.begin(), Record.end());
+        F->Signature = ASTReader::readSignature(Blob.data());
       break;
     case AST_BLOCK_HASH:
       if (F)
-        F->ASTBlockHash =
-            ASTFileSignature::create(Record.begin(), Record.end());
+        F->ASTBlockHash = ASTReader::readSignature(Blob.data());
       break;
     case DIAGNOSTIC_OPTIONS: {
       bool Complain = (ClientLoadCapabilities & ARR_OutOfDate) == 0;
@@ -5168,8 +5169,7 @@
       return ASTFileSignature();
     }
     if (SIGNATURE == MaybeRecord.get())
-      return ASTFileSignature::create(Record.begin(),
-                                      Record.begin() + ASTFileSignature::size);
+      return ASTReader::readSignature(Blob.data());
   }
 }
 
Index: clang/include/clang/Serialization/ASTWriter.h
===================================================================
--- clang/include/clang/Serialization/ASTWriter.h
+++ clang/include/clang/Serialization/ASTWriter.h
@@ -111,7 +111,7 @@
   llvm::BitstreamWriter &Stream;
 
   /// The buffer associated with the bitstream.
-  const SmallVectorImpl<char> &Buffer;
+  SmallVectorImpl<char> &Buffer;
 
   /// The PCM manager which manages memory buffers for pcm files.
   InMemoryModuleCache &ModuleCache;
@@ -128,10 +128,17 @@
   /// The module we're currently writing, if any.
   Module *WritingModule = nullptr;
 
+  /// The byte range representing all the UNHASHED_CONTROL_BLOCK.
+  std::pair<uint64_t, uint64_t> UnhashedControlBlockRange;
+  /// The byte offset of the AST block hash blob.
+  uint64_t ASTBlockHashOffset = 0;
+  /// The byte offset of the signature blob.
+  uint64_t SignatureOffset = 0;
+
   /// The offset of the first bit inside the AST_BLOCK.
   uint64_t ASTBlockStartOffset = 0;
 
-  /// The range representing all the AST_BLOCK.
+  /// The byte range representing all the AST_BLOCK.
   std::pair<uint64_t, uint64_t> ASTBlockRange;
 
   /// The base directory for any relative paths we emit.
@@ -495,12 +502,11 @@
                          StringRef isysroot);
 
   /// Write out the signature and diagnostic options, and return the signature.
-  ASTFileSignature writeUnhashedControlBlock(Preprocessor &PP,
-                                             ASTContext &Context);
+  void writeUnhashedControlBlock(Preprocessor &PP, ASTContext &Context);
+  ASTFileSignature backpatchSignature();
 
   /// Calculate hash of the pcm content.
-  static std::pair<ASTFileSignature, ASTFileSignature>
-  createSignature(StringRef AllBytes, StringRef ASTBlockBytes);
+  std::pair<ASTFileSignature, ASTFileSignature> createSignature() const;
 
   void WriteInputFiles(SourceManager &SourceMgr, HeaderSearchOptions &HSOpts);
   void WriteSourceManagerBlock(SourceManager &SourceMgr,
Index: clang/include/clang/Serialization/ASTReader.h
===================================================================
--- clang/include/clang/Serialization/ASTReader.h
+++ clang/include/clang/Serialization/ASTReader.h
@@ -2394,6 +2394,9 @@
                                llvm::function_ref<void(FileEntryRef)> Visitor);
 
   bool isProcessingUpdateRecords() { return ProcessingUpdateRecords; }
+
+  /// Read the AST file signature out of a binary blob.
+  static ASTFileSignature readSignature(const char *Blob);
 };
 
 } // namespace clang
Index: clang/include/clang/Serialization/ASTBitCodes.h
===================================================================
--- clang/include/clang/Serialization/ASTBitCodes.h
+++ clang/include/clang/Serialization/ASTBitCodes.h
@@ -41,7 +41,7 @@
 /// Version 4 of AST files also requires that the version control branch and
 /// revision match exactly, since there is no backward compatibility of
 /// AST files at this time.
-const unsigned VERSION_MAJOR = 28;
+const unsigned VERSION_MAJOR = 29;
 
 /// AST file minor version number supported by this version of
 /// Clang.
Index: clang/include/clang/Basic/Module.h
===================================================================
--- clang/include/clang/Basic/Module.h
+++ clang/include/clang/Basic/Module.h
@@ -58,7 +58,7 @@
 
   static constexpr size_t size = std::tuple_size<BaseT>::value;
 
-  ASTFileSignature(BaseT S = {{0}}) : BaseT(std::move(S)) {}
+  constexpr ASTFileSignature(BaseT S = {{0}}) : BaseT(std::move(S)) {}
 
   explicit operator bool() const { return *this != BaseT({{0}}); }
 
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to