kbobyrev updated this revision to Diff 166453.
kbobyrev marked 9 inline comments as done.
kbobyrev added a comment.

I addressed the easiest issues. I'll try to implement a separate storage 
structure for `Head`s and `Payload`s, which would potentially make the 
implementation cleaner and easier to understand (and also more maintainable, 
since it would make it way easier to adopt SIMD instruction speedups and other 
encoding schemes if we do that).

Also, I'll refine https://reviews.llvm.org/D52047 a little bit and I believe 
that it should be way easier to understand performance + memory consumption 
once we have these benchmarks in. Both @ioeric and @ilya-biryukov expressed 
their concern with regard to the memory consumption "benchmark" and suggested a 
separate binary. While this seems fine to me, I think it's important to keep 
the performance + memory tracking infrastructure easy to use (in this sense 
scattering different metrics across multiple binaries makes it less accessible 
and probably introduces some code duplication) and therefore using this "trick" 
is OK to me, but I don't have a strong opinion about this. What do you think, 
@sammccall?


https://reviews.llvm.org/D52300

Files:
  clang-tools-extra/clangd/CMakeLists.txt
  clang-tools-extra/clangd/index/dex/Dex.cpp
  clang-tools-extra/clangd/index/dex/PostingList.cpp
  clang-tools-extra/clangd/index/dex/PostingList.h
  clang-tools-extra/clangd/index/dex/fuzzer/CMakeLists.txt
  clang-tools-extra/clangd/index/dex/fuzzer/VByteFuzzer.cpp
  clang-tools-extra/unittests/clangd/DexTests.cpp

Index: clang-tools-extra/unittests/clangd/DexTests.cpp
===================================================================
--- clang-tools-extra/unittests/clangd/DexTests.cpp
+++ clang-tools-extra/unittests/clangd/DexTests.cpp
@@ -260,15 +260,15 @@
   const PostingList L2({1, 5, 7, 9});
   const PostingList L3({0, 5});
   const PostingList L4({0, 1, 5});
-  const PostingList L5({});
-
-  EXPECT_EQ(llvm::to_string(*(L0.iterator())), "[4]");
-
-  auto Nested =
-      createAnd(createAnd(L1.iterator(), L2.iterator()),
-                createOr(L3.iterator(), L4.iterator(), L5.iterator()));
-
-  EXPECT_EQ(llvm::to_string(*Nested), "(& (| [5] [1] [END]) (& [1] [1]))");
+  const PostingList Empty({});
+
+  EXPECT_EQ(llvm::to_string(*(L0.iterator())), "[4 ...]");
+  EXPECT_EQ(llvm::to_string(*Empty.iterator()), "[END]");
+  auto It = L0.iterator();
+  It->advanceTo(19);
+  EXPECT_EQ(llvm::to_string(*It), "[... 20 ...]");
+  It->advanceTo(9000);
+  EXPECT_EQ(llvm::to_string(*It), "[... END]");
 }
 
 TEST(DexIterators, Limit) {
Index: clang-tools-extra/clangd/index/dex/fuzzer/VByteFuzzer.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/fuzzer/VByteFuzzer.cpp
@@ -0,0 +1,64 @@
+//===-- VByteFuzzer.cpp - Fuzz VByte Posting List encoding ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a function that runs clangd on a single input.
+/// This function is then linked into the Fuzzer library.
+///
+//===----------------------------------------------------------------------===//
+
+#include "../Iterator.h"
+#include "../PostingList.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <vector>
+
+using DocID = clang::clangd::dex::DocID;
+
+/// Transform raw byte sequence into list of DocIDs.
+std::vector<DocID> generateDocuments(uint8_t *Data, size_t Size) {
+  std::vector<DocID> Result;
+  DocID ID = 0;
+  for (size_t I = 0; I < Size; ++I) {
+    size_t Offset = I % 4;
+    if (Offset == 0 && I != 0) {
+      ID = 0;
+      Result.push_back(ID);
+    }
+    ID |= (Data[I] << Offset);
+  }
+  if (Size > 4 && Size % 4 != 0)
+    Result.push_back(ID);
+  return Result;
+}
+
+/// This fuzzer checks that compressed PostingList contents can be successfully
+/// decoded into the original sequence.
+extern "C" int LLVMFuzzerTestOneInput(uint8_t *Data, size_t Size) {
+  if (Size == 0)
+    return 0;
+  const auto OriginalDocuments = generateDocuments(Data, Size);
+  if (OriginalDocuments.empty())
+    return 0;
+  // Ensure that given sequence of DocIDs is sorted.
+  for (size_t I = 1; I < OriginalDocuments.size(); ++I)
+    if (OriginalDocuments[I] <= OriginalDocuments[I - 1])
+      return 0;
+  const clang::clangd::dex::PostingList List(OriginalDocuments);
+  const auto DecodedDocuments = clang::clangd::dex::consume(*List.iterator());
+  // Compare decoded sequence against the original PostingList contents.
+  if (DecodedDocuments.size() != OriginalDocuments.size())
+    LLVM_BUILTIN_TRAP;
+  for (size_t I = 0; I < DecodedDocuments.size(); ++I)
+    if (DecodedDocuments[I].first != OriginalDocuments[I])
+      LLVM_BUILTIN_TRAP;
+  return 0;
+}
Index: clang-tools-extra/clangd/index/dex/fuzzer/CMakeLists.txt
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/fuzzer/CMakeLists.txt
@@ -0,0 +1,19 @@
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..)
+
+set(LLVM_LINK_COMPONENTS Support)
+
+if(LLVM_USE_SANITIZE_COVERAGE)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=fuzzer")
+endif()
+
+add_clang_executable(clangd-vbyte-fuzzer
+  EXCLUDE_FROM_ALL
+  VByteFuzzer.cpp
+  )
+
+target_link_libraries(clangd-vbyte-fuzzer
+  PRIVATE
+  clangBasic
+  clangDaemon
+  ${LLVM_LIB_FUZZING_ENGINE}
+  )
Index: clang-tools-extra/clangd/index/dex/PostingList.h
===================================================================
--- clang-tools-extra/clangd/index/dex/PostingList.h
+++ clang-tools-extra/clangd/index/dex/PostingList.h
@@ -6,13 +6,19 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This defines posting list interface: a storage for identifiers of symbols
-// which can be characterized by a specific feature (such as fuzzy-find trigram,
-// scope, type or any other Search Token). Posting lists can be traversed in
-// order using an iterator and are values for inverted index, which maps search
-// tokens to corresponding posting lists.
-//
+///
+/// \file
+/// This defines posting list interface: a storage for identifiers of symbols
+/// which can be characterized by a specific feature (such as fuzzy-find
+/// trigram, scope, type or any other Search Token). Posting lists can be
+/// traversed in order using an iterator and are values for inverted index,
+/// which maps search tokens to corresponding posting lists.
+///
+/// In order to decrease size of Index in-memory representation, Variable Byte
+/// Encoding (VByte) is used for PostingLists compression. An overview of VByte
+/// algorithm can be found in "Introduction to Information Retrieval" book:
+/// https://nlp.stanford.edu/IR-book/html/htmledition/variable-byte-codes-1.html
+///
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H
@@ -29,20 +35,39 @@
 
 class Iterator;
 
+/// Chunk is a fixed-width piece of PostingList which contains the first DocID
+/// in uncompressed format (Head) and delta-encoded Payload. It can be
+/// decompressed upon request.
+struct Chunk {
+  // Keep sizeof(Chunk) == 32.
+  static constexpr size_t PayloadSize = 32 - sizeof(DocID);
+
+  std::vector<DocID> decompress() const;
+
+  /// The first DocID of the chunk, stored in uncompressed form.
+  DocID Head;
+  /// VByte-encoded deltas.
+  std::array<uint8_t, PayloadSize> Payload = std::array<uint8_t, PayloadSize>();
+};
+static_assert(sizeof(Chunk) == 32, "Chunk should take 32 bytes of memory.");
+
 /// PostingList is the storage of DocIDs which can be inserted to the Query
-/// Tree as a leaf by constructing Iterator over the PostingList object.
-// FIXME(kbobyrev): Use VByte algorithm to compress underlying data.
+/// Tree as a leaf by constructing Iterator over the PostingList object. DocIDs
+/// are stored in underlying chunks. Compression saves memory at a small cost
+/// in access time, which is still fast enough in practice.
 class PostingList {
 public:
-  explicit PostingList(const std::vector<DocID> &&Documents)
-      : Documents(std::move(Documents)) {}
+  explicit PostingList(const std::vector<DocID> &Documents);
 
+  /// Constructs DocumentIterator over given posting list. DocumentIterator will
+  /// go through the chunks and decompress them on-the-fly when necessary.
   std::unique_ptr<Iterator> iterator() const;
 
-  size_t bytes() const { return Documents.size() * sizeof(DocID); }
+  /// Returns in-memory size.
+  size_t bytes() const { return Chunks.capacity() * sizeof(Chunk); }
 
 private:
-  const std::vector<DocID> Documents;
+  const std::vector<Chunk> Chunks;
 };
 
 } // namespace dex
Index: clang-tools-extra/clangd/index/dex/PostingList.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/PostingList.cpp
+++ clang-tools-extra/clangd/index/dex/PostingList.cpp
@@ -9,74 +9,265 @@
 
 #include "PostingList.h"
 #include "Iterator.h"
+#include <queue>
 
 namespace clang {
 namespace clangd {
 namespace dex {
 
 namespace {
 
-/// Implements Iterator over std::vector<DocID>. This is the most basic
-/// iterator and is simply a wrapper around
-/// std::vector<DocID>::const_iterator.
-class PlainIterator : public Iterator {
+/// Implements iterator of PostingList chunks. This requires iterating over two
+/// levels: the first level iterator iterates over the chunks and decompresses
+/// them on-the-fly when the contents of chunk are to be seen.
+class ChunkIterator : public Iterator {
 public:
-  explicit PlainIterator(llvm::ArrayRef<DocID> Documents)
-      : Documents(Documents), Index(std::begin(Documents)) {}
+  explicit ChunkIterator(const std::vector<Chunk> &Chunks)
+      : Chunks(Chunks), CurrentChunk(begin(Chunks)) {
+    if (!Chunks.empty()) {
+      DecompressedChunk = CurrentChunk->decompress();
+      CurrentID = begin(DecompressedChunk);
+    }
+  }
 
-  bool reachedEnd() const override { return Index == std::end(Documents); }
+  bool reachedEnd() const override { return CurrentChunk == end(Chunks); }
 
   /// Advances cursor to the next item.
   void advance() override {
     assert(!reachedEnd() &&
            "Posting List iterator can't advance() at the end.");
-    ++Index;
+    if (++CurrentID != end(DecompressedChunk))
+      return; // Return if current chunk is not exhausted.
+    ++CurrentChunk;
+    if (reachedEnd())
+      return; // Can't advance to the next chunk at the end.
+    // Decompress next chunk and reset inner iterator.
+    DecompressedChunk = CurrentChunk->decompress();
+    CurrentID = begin(DecompressedChunk);
   }
 
   /// Applies binary search to advance cursor to the next item with DocID
   /// equal or higher than the given one.
   void advanceTo(DocID ID) override {
     assert(!reachedEnd() &&
            "Posting List iterator can't advance() at the end.");
-    // If current ID is beyond requested one, iterator is already in the right
-    // state.
-    if (peek() < ID)
-      Index = std::lower_bound(Index, std::end(Documents), ID);
+    if (ID <= peek())
+      return;
+    // If current chunk doesn't contain needed element, find the chunk which
+    // does.
+    if ((CurrentChunk != end(Chunks) - 1) && ((CurrentChunk + 1)->Head <= ID)) {
+      // Find the next chunk with Head >= ID.
+      CurrentChunk = std::lower_bound(
+          CurrentChunk, end(Chunks), ID,
+          [](const Chunk &C, const DocID ID) { return C.Head < ID; });
+      if (reachedEnd())
+        return;
+      // Look for ID in the previous chunk if the current Head > ID and
+      // therefore needed position is either in previous Chunk or in the
+      // beginning of the current chunk.
+      if (CurrentChunk != begin(Chunks) && ID < CurrentChunk->Head)
+        --CurrentChunk;
+      DecompressedChunk = CurrentChunk->decompress();
+      CurrentID = begin(DecompressedChunk);
+    }
+    // Try to find ID within current chunk.
+    CurrentID = std::lower_bound(CurrentID, std::end(DecompressedChunk), ID);
+    // Return if the position was found in current chunk.
+    if (CurrentID != std::end(DecompressedChunk))
+      return;
+    // Otherwise, the iterator should point to the first element of the next
+    // chunk (if there is any).
+    ++CurrentChunk;
+    if (CurrentChunk != end(Chunks))
+      DecompressedChunk = CurrentChunk->decompress();
+    CurrentID = begin(DecompressedChunk);
   }
 
   DocID peek() const override {
-    assert(!reachedEnd() &&
-           "Posting List iterator can't peek() at the end.");
-    return *Index;
+    assert(!reachedEnd() && "Posting List iterator can't peek() at the end.");
+    return *CurrentID;
   }
 
   float consume() override {
     assert(!reachedEnd() &&
            "Posting List iterator can't consume() at the end.");
     return DEFAULT_BOOST_SCORE;
   }
 
-  size_t estimateSize() const override { return Documents.size(); }
+  size_t estimateSize() const override {
+    return Chunks.size() * ApproxEntriesPerChunk;
+  }
 
 private:
   llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
     OS << '[';
-    if (Index != std::end(Documents))
-      OS << *Index;
-    else
-      OS << "END";
+    if (CurrentChunk != begin(Chunks) || CurrentID != begin(DecompressedChunk))
+      OS << "... ";
+    OS << (reachedEnd() ? "END" : std::to_string(*CurrentID));
+    if (!reachedEnd() && CurrentID < std::end(DecompressedChunk) - 1)
+      OS << " ...";
     OS << ']';
     return OS;
   }
 
-  llvm::ArrayRef<DocID> Documents;
-  llvm::ArrayRef<DocID>::const_iterator Index;
+  const std::vector<Chunk> &Chunks;
+  // Iterator over chunks.
+  std::vector<Chunk>::const_iterator CurrentChunk;
+  std::vector<DocID> DecompressedChunk;
+  // Iterator over DecompressedChunk.
+  std::vector<DocID>::iterator CurrentID;
+
+  static constexpr size_t ApproxEntriesPerChunk = 15;
 };
 
+/// Single-byte masks used for VByte compression bit manipulations.
+constexpr uint8_t SevenBytesMask = 0x7f;  // 0b01111111
+constexpr uint8_t FourBytesMask = 0xf;    // 0b00001111
+constexpr uint8_t ContinuationBit = 0x80; // 0b10000000
+
+/// Fills chunk with the maximum number of bits available.
+Chunk createChunk(DocID Head, std::queue<uint8_t> &Payload,
+                  size_t DocumentsCount, size_t MeaningfulBytes) {
+  assert(DocumentsCount > 0 && "Can't create chunk without Head.");
+  Chunk Result;
+  Result.Head = Head;
+  for (size_t I = 0; I < MeaningfulBytes; ++I) {
+    Result.Payload[I] = Payload.front();
+    Payload.pop();
+  }
+  if (MeaningfulBytes < Chunk::PayloadSize)
+    Result.Payload[MeaningfulBytes] = ContinuationBit;
+  return Result;
+}
+
+/// Bit offsets into DocID of each 7-bit payload piece.
+const size_t Offsets[] = {0, 7, 7 * 2, 7 * 3, 7 * 4};
+
+/// Use Variable-length Byte (VByte) delta encoding to compress sorted list of
+/// DocIDs. The compression stores deltas (differences) between subsequent
+/// DocIDs and encodes these deltas utilizing the least possible number of
+/// bytes.
+///
+/// Each encoding byte consists of two parts: the first bit (continuation bit)
+/// indicates whether this is the last byte of the current encoding, and the
+/// remaining seven bits carry a piece of DocID (payload). DocID contains 32
+/// bits and therefore it takes up to 5 bytes to encode it (4 full 7-bit
+/// payloads and one 4-bit payload),
+/// but in practice it is expected that gaps (deltas) between subsequent DocIDs
+/// are not large enough to require 5 bytes. In very dense posting lists (with
+/// average gaps less than 128) this representation would be 4 times more
+/// efficient than raw DocID array.
+///
+/// PostingList encoding example:
+///
+/// DocIDs    42            47        7000
+/// gaps                    5         6958
+/// Encoding  (raw number)  10000101  10110110 00101110
+std::vector<Chunk> encodeStream(llvm::ArrayRef<DocID> Documents) {
+  // Masks are used to perform bit manipulations over DocID. Each mask
+  // extracts a 7-bit slice of the ID (4 bits for the highest one).
+  static const std::vector<DocID> Masks = {
+      SevenBytesMask,                              // Low 7 bits: 0b1111111
+      SevenBytesMask << 7U,                        // Next 7 bits
+      SevenBytesMask << 7U * 2,                    // ...
+      SevenBytesMask << 7U * 3,                    // ...
+      static_cast<DocID>(FourBytesMask << 7U * 4), // Highest 4 bits
+  };
+  };
+
+  // These limits are used to calculate the width of DocID encoding: when
+  // ID >= Limits[I], the encoding takes at least I + 2 bytes.
+  static const DocID Limits[] = {
+      1U << 7U,
+      1U << 7U * 2,
+      1U << 7U * 3,
+      1U << 7U * 4,
+  };
+
+  std::vector<Chunk> Result;
+  std::queue<uint8_t> Payload;
+  size_t HeadIndex = 0;
+  // Keep track of the last Payload size which doesn't exceed the limit.
+  size_t LastEncodingEnd = 0;
+  for (size_t I = 0; I < Documents.size(); ++I) {
+    // Don't encode Heads.
+    if (HeadIndex == I)
+      continue;
+    const DocID Delta = Documents[I] - Documents[I - 1];
+    // Encode Delta.
+    for (size_t I = 0; I < Masks.size(); ++I) {
+      uint8_t Encoding = (Delta & Masks[I]) >> Offsets[I];
+      bool HasNextByte = I < Masks.size() - 1 ? Delta >= Limits[I] : false;
+      // If !HasNextByte, mark the end of encoding stream.
+      Payload.push(!HasNextByte ? Encoding | ContinuationBit : Encoding);
+      if (!HasNextByte)
+        break;
+    }
+    if (Payload.size() <= Chunk::PayloadSize)
+      LastEncodingEnd = Payload.size();
+    // Read stream until Payload overflows.
+    if (Payload.size() < Chunk::PayloadSize)
+      continue;
+    // If Payload contains exactly Chunk::PayloadSize bytes, use all of them to
+    // fill the next Chunk. Otherwise, use the last valid size.
+    Result.push_back(createChunk(Documents[HeadIndex], Payload,
+                                 Payload.size() == Chunk::PayloadSize
+                                     ? I - HeadIndex + 1
+                                     : I - HeadIndex,
+                                 LastEncodingEnd));
+    // Next head is the next item if Payload contains exactly Chunk::PayloadSize
+    // bytes, otherwise it is the current item.
+    HeadIndex = Payload.empty() ? I + 1 : I;
+    // If overflow happened, Payload contains encoding of the next Head: discard
+    // it.
+    while (!Payload.empty())
+      Payload.pop();
+    LastEncodingEnd = 0;
+  }
+  // Add another chunk if there are some bytes left in Payload or if there's a
+  // trailing Head.
+  if (!Payload.empty() || HeadIndex + 1 == Documents.size())
+    Result.push_back(createChunk(Documents[HeadIndex], Payload,
+                                 Documents.size() - HeadIndex,
+                                 LastEncodingEnd));
+  return std::vector<Chunk>(Result); // no move, shrink-to-fit
+}
+
 } // namespace
 
+std::vector<DocID> Chunk::decompress() const {
+  std::vector<DocID> Result{Head};
+  // Store sum of Head and all deltas to keep track of the last ID.
+  DocID Current = Head;
+  DocID Delta = 0;
+  uint8_t Length = 0;
+  // Decode bytes from Payload into Delta.
+  for (const auto &Byte : Payload) {
+    assert(Length <= 5 && "Can't decode sequences longer than 5 bytes");
+    // Write meaningful bits to the correct place in the document decoding.
+    Delta |= (Byte & (Length < 4 ? SevenBytesMask : FourBytesMask))
+             << Offsets[Length];
+    ++Length;
+    // Add document decoding to the result as soon as END marker is seen.
+    if ((Byte & ContinuationBit) != 0) {
+      Current += Delta;
+      Length = 0;
+      // Termination byte.
+      if (Delta == 0)
+        break;
+      Delta = 0;
+      Result.push_back(Current);
+    }
+  }
+  assert(Result.size() == 1 ||
+         Length == 0 &&
+             "Unterminated byte sequence at the end of input stream.");
+  return std::vector<DocID>(Result); // no move, shrink-to-fit
+}
+
+PostingList::PostingList(const std::vector<DocID> &Documents)
+    : Chunks(encodeStream(Documents)) {}
+
 std::unique_ptr<Iterator> PostingList::iterator() const {
-  return llvm::make_unique<PlainIterator>(Documents);
+  return llvm::make_unique<ChunkIterator>(Chunks);
 }
 
 } // namespace dex
Index: clang-tools-extra/clangd/index/dex/Dex.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/Dex.cpp
+++ clang-tools-extra/clangd/index/dex/Dex.cpp
@@ -122,8 +122,8 @@
 
   // Convert lists of items to posting lists.
   for (const auto &TokenToPostingList : TempInvertedIndex)
-    InvertedIndex.insert({TokenToPostingList.first,
-                          PostingList(move(TokenToPostingList.second))});
+    InvertedIndex.insert(
+        {TokenToPostingList.first, PostingList(TokenToPostingList.second)});
 
   vlog("Built Dex with estimated memory usage {0} bytes.",
        estimateMemoryUsage());
Index: clang-tools-extra/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/clangd/CMakeLists.txt
+++ clang-tools-extra/clangd/CMakeLists.txt
@@ -76,6 +76,7 @@
 add_subdirectory(tool)
 add_subdirectory(indexer)
 add_subdirectory(index/dex/dexp)
+add_subdirectory(index/dex/fuzzer)
 
 if (LLVM_INCLUDE_BENCHMARKS)
   add_subdirectory(benchmarks)
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to