This revision was automatically updated to reflect the committed changes.
Closed by commit rCTE340409: [clangd] Implement BOOST iterator (authored by 
omtcyfz, committed by ).

Changed prior to commit:
  https://reviews.llvm.org/D50970?vs=161936&id=161944#toc

Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D50970

Files:
  clangd/index/dex/DexIndex.cpp
  clangd/index/dex/Iterator.cpp
  clangd/index/dex/Iterator.h
  unittests/clangd/DexIndexTests.cpp

Index: unittests/clangd/DexIndexTests.cpp
===================================================================
--- unittests/clangd/DexIndexTests.cpp
+++ unittests/clangd/DexIndexTests.cpp
@@ -29,6 +29,15 @@
 namespace dex {
 namespace {
 
+std::vector<DocID>
+consumeIDs(Iterator &It, size_t Limit = std::numeric_limits<size_t>::max()) {
+  auto IDAndScore = consume(It, Limit);
+  std::vector<DocID> IDs(IDAndScore.size());
+  for (size_t I = 0; I < IDAndScore.size(); ++I)
+    IDs[I] = IDAndScore[I].first;
+  return IDs;
+}
+
 TEST(DexIndexIterators, DocumentIterator) {
   const PostingList L = {4, 7, 8, 20, 42, 100};
   auto DocIterator = create(L);
@@ -62,7 +71,7 @@
   auto AndWithEmpty = createAnd(create(L0), create(L1));
   EXPECT_TRUE(AndWithEmpty->reachedEnd());
 
-  EXPECT_THAT(consume(*AndWithEmpty), ElementsAre());
+  EXPECT_THAT(consumeIDs(*AndWithEmpty), ElementsAre());
 }
 
 TEST(DexIndexIterators, AndTwoLists) {
@@ -72,7 +81,7 @@
   auto And = createAnd(create(L1), create(L0));
 
   EXPECT_FALSE(And->reachedEnd());
-  EXPECT_THAT(consume(*And), ElementsAre(0U, 7U, 10U, 320U, 9000U));
+  EXPECT_THAT(consumeIDs(*And), ElementsAre(0U, 7U, 10U, 320U, 9000U));
 
   And = createAnd(create(L0), create(L1));
 
@@ -113,7 +122,7 @@
   auto OrWithEmpty = createOr(create(L0), create(L1));
   EXPECT_FALSE(OrWithEmpty->reachedEnd());
 
-  EXPECT_THAT(consume(*OrWithEmpty),
+  EXPECT_THAT(consumeIDs(*OrWithEmpty),
               ElementsAre(0U, 5U, 7U, 10U, 42U, 320U, 9000U));
 }
 
@@ -146,7 +155,7 @@
 
   Or = createOr(create(L0), create(L1));
 
-  EXPECT_THAT(consume(*Or),
+  EXPECT_THAT(consumeIDs(*Or),
               ElementsAre(0U, 4U, 5U, 7U, 10U, 30U, 42U, 60U, 320U, 9000U));
 }
 
@@ -178,53 +187,61 @@
 // FIXME(kbobyrev): The testcase below is similar to what is expected in real
 // queries. It should be updated once new iterators (such as boosting, limiting,
 // etc iterators) appear. However, it is not exhaustive and it would be
-// beneficial to implement automatic generation of query trees for more
-// comprehensive testing.
+// beneficial to implement automatic generation (e.g. fuzzing) of query trees
+// for more comprehensive testing.
 TEST(DexIndexIterators, QueryTree) {
-  // An example of more complicated query
   //
   //                      +-----------------+
   //                      |And Iterator:1, 5|
   //                      +--------+--------+
   //                               |
   //                               |
-  //                 +------------------------------------+
+  //                 +-------------+----------------------+
   //                 |                                    |
   //                 |                                    |
-  //      +----------v----------+              +----------v---------+
-  //      |And Iterator: 1, 5, 9|              |Or Iterator: 0, 1, 5|
-  //      +----------+----------+              +----------+---------+
+  //      +----------v----------+              +----------v------------+
+  //      |And Iterator: 1, 5, 9|              |Or Iterator: 0, 1, 3, 5|
+  //      +----------+----------+              +----------+------------+
   //                 |                                    |
-  //          +------+-----+                    +---------+-----------+
+  //          +------+-----+                    +---------------------+
   //          |            |                    |         |           |
-  //  +-------v-----+ +----v-----+           +--v--+    +-V--+    +---v---+
-  //  |1, 3, 5, 8, 9| |1, 5, 7, 9|           |Empty|    |0, 5|    |0, 1, 5|
-  //  +-------------+ +----------+           +-----+    +----+    +-------+
-
+  //  +-------v-----+ +----+---+             +--v--+  +---v----+ +----v---+
+  //  |1, 3, 5, 8, 9| |Boost: 2|             |Empty|  |Boost: 3| |Boost: 4|
+  //  +-------------+ +----+---+             +-----+  +---+----+ +----+---+
+  //                       |                              |           |
+  //                  +----v-----+                      +-v--+    +---v---+
+  //                  |1, 5, 7, 9|                      |1, 5|    |0, 3, 5|
+  //                  +----------+                      +----+    +-------+
+  //
   const PostingList L0 = {1, 3, 5, 8, 9};
   const PostingList L1 = {1, 5, 7, 9};
-  const PostingList L2 = {0, 5};
-  const PostingList L3 = {0, 1, 5};
-  const PostingList L4;
+  const PostingList L3;
+  const PostingList L4 = {1, 5};
+  const PostingList L5 = {0, 3, 5};
 
   // Root of the query tree: [1, 5]
   auto Root = createAnd(
       // Lower And Iterator: [1, 5, 9]
-      createAnd(create(L0), create(L1)),
+      createAnd(create(L0), createBoost(create(L1), 2U)),
       // Lower Or Iterator: [0, 1, 5]
-      createOr(create(L2), create(L3), create(L4)));
+      createOr(create(L3), createBoost(create(L4), 3U),
+               createBoost(create(L5), 4U)));
 
   EXPECT_FALSE(Root->reachedEnd());
   EXPECT_EQ(Root->peek(), 1U);
   Root->advanceTo(0);
   // Advance multiple times. Shouldn't do anything.
   Root->advanceTo(1);
   Root->advanceTo(0);
   EXPECT_EQ(Root->peek(), 1U);
+  auto ElementBoost = Root->consume(Root->peek());
+  EXPECT_THAT(ElementBoost, 6);
   Root->advance();
   EXPECT_EQ(Root->peek(), 5U);
   Root->advanceTo(5);
   EXPECT_EQ(Root->peek(), 5U);
+  ElementBoost = Root->consume(Root->peek());
+  EXPECT_THAT(ElementBoost, 8);
   Root->advanceTo(9000);
   EXPECT_TRUE(Root->reachedEnd());
 }
@@ -256,29 +273,57 @@
   const PostingList L5;
 
   auto DocIterator = create(L0);
-  EXPECT_THAT(consume(*DocIterator, 42), ElementsAre(4, 7, 8, 20, 42, 100));
+  EXPECT_THAT(consumeIDs(*DocIterator, 42), ElementsAre(4, 7, 8, 20, 42, 100));
 
   DocIterator = create(L0);
-  EXPECT_THAT(consume(*DocIterator), ElementsAre(4, 7, 8, 20, 42, 100));
+  EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre(4, 7, 8, 20, 42, 100));
 
   DocIterator = create(L0);
-  EXPECT_THAT(consume(*DocIterator, 3), ElementsAre(4, 7, 8));
+  EXPECT_THAT(consumeIDs(*DocIterator, 3), ElementsAre(4, 7, 8));
 
   DocIterator = create(L0);
-  EXPECT_THAT(consume(*DocIterator, 0), ElementsAre());
+  EXPECT_THAT(consumeIDs(*DocIterator, 0), ElementsAre());
 }
 
 TEST(DexIndexIterators, True) {
   auto TrueIterator = createTrue(0U);
   EXPECT_TRUE(TrueIterator->reachedEnd());
-  EXPECT_THAT(consume(*TrueIterator), ElementsAre());
+  EXPECT_THAT(consumeIDs(*TrueIterator), ElementsAre());
 
   PostingList L0 = {1, 2, 5, 7};
   TrueIterator = createTrue(7U);
   EXPECT_THAT(TrueIterator->peek(), 0);
   auto AndIterator = createAnd(create(L0), move(TrueIterator));
   EXPECT_FALSE(AndIterator->reachedEnd());
-  EXPECT_THAT(consume(*AndIterator), ElementsAre(1, 2, 5));
+  EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(1, 2, 5));
+}
+
+TEST(DexIndexIterators, Boost) {
+  auto BoostIterator = createBoost(createTrue(5U), 42U);
+  EXPECT_FALSE(BoostIterator->reachedEnd());
+  auto ElementBoost = BoostIterator->consume(BoostIterator->peek());
+  EXPECT_THAT(ElementBoost, 42U);
+
+  const PostingList L0 = {2, 4};
+  const PostingList L1 = {1, 4};
+  auto Root = createOr(createTrue(5U), createBoost(create(L0), 2U),
+                       createBoost(create(L1), 3U));
+
+  ElementBoost = Root->consume(Root->peek());
+  EXPECT_THAT(ElementBoost, Iterator::DEFAULT_BOOST_SCORE);
+  Root->advance();
+  EXPECT_THAT(Root->peek(), 1U);
+  ElementBoost = Root->consume(Root->peek());
+  EXPECT_THAT(ElementBoost, 3);
+
+  Root->advance();
+  EXPECT_THAT(Root->peek(), 2U);
+  ElementBoost = Root->consume(Root->peek());
+  EXPECT_THAT(ElementBoost, 2);
+
+  Root->advanceTo(4);
+  ElementBoost = Root->consume(Root->peek());
+  EXPECT_THAT(ElementBoost, 3);
 }
 
 testing::Matcher<std::vector<Token>>
Index: clangd/index/dex/Iterator.h
===================================================================
--- clangd/index/dex/Iterator.h
+++ clangd/index/dex/Iterator.h
@@ -66,11 +66,9 @@
 /// (their children) to form a multi-level Query Tree. The interface is designed
 /// to be extensible in order to support multiple types of iterators.
 class Iterator {
-  // FIXME(kbobyrev): Provide callback for matched documents.
-  // FIXME(kbobyrev): Implement new types of iterators: Label, Boost (with
-  // scoring), Limit.
   // FIXME(kbobyrev): Implement iterator cost, an estimate of advance() calls
   // before iterator exhaustion.
+  // FIXME(kbobyrev): Implement Limit iterator.
 public:
   /// Returns true if all valid DocIDs were processed and hence the iterator is
   /// exhausted.
@@ -89,6 +87,13 @@
   ///
   /// Note: reachedEnd() must be false.
   virtual DocID peek() const = 0;
+  /// Retrieves boosting score. Query tree root should pass Root->peek() to this
+  /// function, the parameter is needed to propagate through the tree. Given ID
+  /// should be compared against BOOST iterator peek()s: some of the iterators
+  /// would not point to the item which was propagated to the top of the query
+  /// tree (e.g. if these iterators are branches of OR iterator) and hence
+  /// shouldn't apply any boosting to the consumed item.
+  virtual float consume(DocID ID) = 0;
 
   virtual ~Iterator() {}
 
@@ -107,32 +112,59 @@
     return Iterator.dump(OS);
   }
 
+  constexpr static float DEFAULT_BOOST_SCORE = 1;
+
 private:
   virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0;
 };
 
 /// Advances the iterator until it is either exhausted or the number of
-/// requested items is reached. The result contains sorted DocumentIDs.
-std::vector<DocID> consume(Iterator &It,
-                           size_t Limit = std::numeric_limits<size_t>::max());
+/// requested items is reached. Returns pairs of document IDs with the
+/// corresponding boosting score.
+///
+/// Boosting can be seen as a compromise between retrieving too many items and
+/// calculating finals score for each of them (which might be very expensive)
+/// and not retrieving enough items so that items with very high final score
+/// would not be processed. Boosting score is a computationally efficient way
+/// to acquire preliminary scores of requested items.
+std::vector<std::pair<DocID, float>>
+consume(Iterator &It, size_t Limit = std::numeric_limits<size_t>::max());
 
 /// Returns a document iterator over given PostingList.
+///
+/// DocumentIterator returns DEFAULT_BOOST_SCORE for each processed item.
 std::unique_ptr<Iterator> create(PostingListRef Documents);
 
 /// Returns AND Iterator which performs the intersection of the PostingLists of
 /// its children.
+///
+/// consume(): AND Iterator returns the product of Childrens' boosting scores
+/// when not exhausted and DEFAULT_BOOST_SCORE otherwise.
 std::unique_ptr<Iterator>
 createAnd(std::vector<std::unique_ptr<Iterator>> Children);
 
 /// Returns OR Iterator which performs the union of the PostingLists of its
 /// children.
+///
+/// consume(): OR Iterator returns the highest boost value among children
+/// pointing to requested item when not exhausted and DEFAULT_BOOST_SCORE
+/// otherwise.
 std::unique_ptr<Iterator>
 createOr(std::vector<std::unique_ptr<Iterator>> Children);
 
 /// Returns TRUE Iterator which iterates over "virtual" PostingList containing
 /// all items in range [0, Size) in an efficient manner.
+///
+/// TRUE returns DEFAULT_BOOST_SCORE for each processed item.
 std::unique_ptr<Iterator> createTrue(DocID Size);
 
+/// Returns BOOST iterator which multiplies the score of each item by given
+/// factor. Boosting can be used as a computationally inexpensive filtering.
+/// Users can return significantly more items using consumeAndBoost() and then
+/// trim Top K using retrieval score.
+std::unique_ptr<Iterator> createBoost(std::unique_ptr<Iterator> Child,
+                                      float Factor);
+
 /// This allows createAnd(create(...), create(...)) syntax.
 template <typename... Args> std::unique_ptr<Iterator> createAnd(Args... args) {
   std::vector<std::unique_ptr<Iterator>> Children;
Index: clangd/index/dex/Iterator.cpp
===================================================================
--- clangd/index/dex/Iterator.cpp
+++ clangd/index/dex/Iterator.cpp
@@ -46,6 +46,8 @@
     return *Index;
   }
 
+  float consume(DocID ID) override { return DEFAULT_BOOST_SCORE; }
+
 private:
   llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
     OS << '[';
@@ -103,6 +105,19 @@
 
   DocID peek() const override { return Children.front()->peek(); }
 
+  // If not exhausted and points to the given item, consume() returns the
+  // product of Children->consume(ID). Otherwise, DEFAULT_BOOST_SCORE is
+  // returned.
+  float consume(DocID ID) override {
+    if (reachedEnd() || peek() != ID)
+      return DEFAULT_BOOST_SCORE;
+    return std::accumulate(
+        begin(Children), end(Children), DEFAULT_BOOST_SCORE,
+        [&](float Current, const std::unique_ptr<Iterator> &Child) {
+          return Current * Child->consume(ID);
+        });
+  }
+
 private:
   llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
     OS << "(& ";
@@ -209,6 +224,20 @@
     return Result;
   }
 
+  // Returns the maximum boosting score among all Children when iterator is not
+  // exhausted and points to the given ID, DEFAULT_BOOST_SCORE otherwise.
+  float consume(DocID ID) override {
+    if (reachedEnd() || peek() != ID)
+      return DEFAULT_BOOST_SCORE;
+    return std::accumulate(
+        begin(Children), end(Children), DEFAULT_BOOST_SCORE,
+        [&](float Current, const std::unique_ptr<Iterator> &Child) {
+          return (!Child->reachedEnd() && Child->peek() == ID)
+                     ? std::max(Current, Child->consume(ID))
+                     : Current;
+        });
+  }
+
 private:
   llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
     OS << "(| ";
@@ -249,6 +278,8 @@
     return Index;
   }
 
+  float consume(DocID) override { return DEFAULT_BOOST_SCORE; }
+
 private:
   llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
     OS << "(TRUE {" << Index << "} out of " << Size << ")";
@@ -260,13 +291,42 @@
   DocID Size;
 };
 
+/// Boost iterator is a wrapper around its child which multiplies scores of
+/// each retrieved item by a given factor.
+class BoostIterator : public Iterator {
+public:
+  BoostIterator(std::unique_ptr<Iterator> Child, float Factor)
+      : Child(move(Child)), Factor(Factor) {}
+
+  bool reachedEnd() const override { return Child->reachedEnd(); }
+
+  void advance() override { Child->advance(); }
+
+  void advanceTo(DocID ID) override { Child->advanceTo(ID); }
+
+  DocID peek() const override { return Child->peek(); }
+
+  float consume(DocID ID) override { return Child->consume(ID) * Factor; }
+
+private:
+  llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
+    OS << "(BOOST " << Factor << ' ' << *Child << ')';
+    return OS;
+  }
+
+  std::unique_ptr<Iterator> Child;
+  float Factor;
+};
+
 } // end namespace
 
-std::vector<DocID> consume(Iterator &It, size_t Limit) {
-  std::vector<DocID> Result;
+std::vector<std::pair<DocID, float>> consume(Iterator &It, size_t Limit) {
+  std::vector<std::pair<DocID, float>> Result;
   for (size_t Retrieved = 0; !It.reachedEnd() && Retrieved < Limit;
-       It.advance(), ++Retrieved)
-    Result.push_back(It.peek());
+       It.advance(), ++Retrieved) {
+    DocID Document = It.peek();
+    Result.push_back(std::make_pair(Document, It.consume(Document)));
+  }
   return Result;
 }
 
@@ -288,6 +348,11 @@
   return llvm::make_unique<TrueIterator>(Size);
 }
 
+std::unique_ptr<Iterator> createBoost(std::unique_ptr<Iterator> Child,
+                                      float Factor) {
+  return llvm::make_unique<BoostIterator>(move(Child), Factor);
+}
+
 } // namespace dex
 } // namespace clangd
 } // namespace clang
Index: clangd/index/dex/DexIndex.cpp
===================================================================
--- clangd/index/dex/DexIndex.cpp
+++ clangd/index/dex/DexIndex.cpp
@@ -125,11 +125,15 @@
     // using 100x of the requested number might not be good in practice, e.g.
     // when the requested number of items is small.
     const unsigned ItemsToRetrieve = 100 * Req.MaxCandidateCount;
-    std::vector<DocID> SymbolDocIDs = consume(*QueryIterator, ItemsToRetrieve);
+    // FIXME(kbobyrev): Add boosting to the query and utilize retrieved
+    // boosting scores.
+    std::vector<std::pair<DocID, float>> SymbolDocIDs =
+        consume(*QueryIterator, ItemsToRetrieve);
 
     // Retrieve top Req.MaxCandidateCount items.
     std::priority_queue<std::pair<float, const Symbol *>> Top;
-    for (const auto &SymbolDocID : SymbolDocIDs) {
+    for (const auto &P : SymbolDocIDs) {
+      const DocID SymbolDocID = P.first;
       const auto *Sym = (*Symbols)[SymbolDocID];
       const llvm::Optional<float> Score = Filter.match(Sym->Name);
       if (!Score)
@@ -161,7 +165,6 @@
   }
 }
 
-
 void DexIndex::findOccurrences(
     const OccurrencesRequest &Req,
     llvm::function_ref<void(const SymbolOccurrence &)> Callback) const {
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to