kbobyrev updated this revision to Diff 161477. kbobyrev retitled this revision from "[clangd] Introduce BOOST iterators" to "[clangd] Implement BOOST iterator". kbobyrev edited the summary of this revision. kbobyrev added a comment.
Add documentation, cleanup tests. https://reviews.llvm.org/D50970 Files: clang-tools-extra/clangd/index/dex/Iterator.cpp clang-tools-extra/clangd/index/dex/Iterator.h clang-tools-extra/unittests/clangd/DexIndexTests.cpp
Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp =================================================================== --- clang-tools-extra/unittests/clangd/DexIndexTests.cpp +++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp @@ -172,53 +172,61 @@ // FIXME(kbobyrev): The testcase below is similar to what is expected in real // queries. It should be updated once new iterators (such as boosting, limiting, // etc iterators) appear. However, it is not exhaustive and it would be -// beneficial to implement automatic generation of query trees for more -// comprehensive testing. +// beneficial to implement automatic generation (e.g. fuzzing) of query trees +// for more comprehensive testing. TEST(DexIndexIterators, QueryTree) { - // An example of more complicated query // // +-----------------+ // |And Iterator:1, 5| // +--------+--------+ // | // | - // +------------------------------------+ + // +-------------+----------------------+ // | | // | | - // +----------v----------+ +----------v---------+ - // |And Iterator: 1, 5, 9| |Or Iterator: 0, 1, 5| - // +----------+----------+ +----------+---------+ + // +----------v----------+ +----------v------------+ + // |And Iterator: 1, 5, 9| |Or Iterator: 0, 1, 3, 5| + // +----------+----------+ +----------+------------+ // | | - // +------+-----+ +---------+-----------+ + // +------+-----+ +---------------------+ // | | | | | - // +-------v-----+ +----v-----+ +--v--+ +-V--+ +---v---+ - // |1, 3, 5, 8, 9| |1, 5, 7, 9| |Empty| |0, 5| |0, 1, 5| - // +-------------+ +----------+ +-----+ +----+ +-------+ - + // +-------v-----+ +----+---+ +--v--+ +---v----+ +----v---+ + // |1, 3, 5, 8, 9| |Boost: 2| |Empty| |Boost: 3| |Boost: 4| + // +-------------+ +----+---+ +-----+ +---+----+ +----+---+ + // | | | + // +----v-----+ +-v--+ +---v---+ + // |1, 5, 7, 9| |1, 5| |0, 3, 5| + // +----------+ +----+ +-------+ + // const PostingList L0 = {1, 3, 5, 8, 9}; const PostingList L1 = {1, 5, 7, 9}; - const PostingList L2 = {0, 5}; - const PostingList L3 = {0, 1, 5}; - const PostingList L4; + const PostingList L3; + const PostingList L4 = {1, 5}; + const PostingList L5 = {0, 3, 5}; // Root of the query tree: [1, 5] auto Root = createAnd( // Lower And Iterator: [1, 5, 9] - createAnd(create(L0), create(L1)), + createAnd(create(L0), createBoost(create(L1), 2U)), // Lower Or Iterator: [0, 1, 5] - createOr(create(L2), create(L3), create(L4))); + createOr(create(L3), createBoost(create(L4), 3U), + createBoost(create(L5), 4U))); EXPECT_FALSE(Root->reachedEnd()); EXPECT_EQ(Root->peek(), 1U); Root->advanceTo(0); // Advance multiple times. Shouldn't do anything. Root->advanceTo(1); Root->advanceTo(0); EXPECT_EQ(Root->peek(), 1U); + auto ElementBoost = Root->boost(Root->peek()); + EXPECT_THAT(ElementBoost, 6); Root->advance(); EXPECT_EQ(Root->peek(), 5U); Root->advanceTo(5); EXPECT_EQ(Root->peek(), 5U); + ElementBoost = Root->boost(Root->peek()); + EXPECT_THAT(ElementBoost, 8); Root->advanceTo(9000); EXPECT_TRUE(Root->reachedEnd()); } @@ -275,6 +283,34 @@ EXPECT_THAT(consume(*AndIterator), ElementsAre(1, 2, 5)); } +TEST(DexIndexIterators, Boost) { + auto BoostIterator = createBoost(createTrue(5U), 42U); + EXPECT_FALSE(BoostIterator->reachedEnd()); + auto ElementBoost = BoostIterator->boost(BoostIterator->peek()); + EXPECT_THAT(ElementBoost, 42U); + + const PostingList L0 = {2, 4}; + const PostingList L1 = {1, 4}; + auto Root = createOr(createTrue(5U), createBoost(create(L0), 2U), + createBoost(create(L1), 3U)); + + ElementBoost = Root->boost(Root->peek()); + EXPECT_THAT(ElementBoost, Iterator::DEFAULT_BOOST_SCORE); + Root->advance(); + EXPECT_THAT(Root->peek(), 1U); + ElementBoost = Root->boost(Root->peek()); + EXPECT_THAT(ElementBoost, 3); + + Root->advance(); + EXPECT_THAT(Root->peek(), 2U); + ElementBoost = Root->boost(Root->peek()); + EXPECT_THAT(ElementBoost, 2); + + Root->advanceTo(4); + ElementBoost = Root->boost(Root->peek()); + EXPECT_THAT(ElementBoost, 3); +} + testing::Matcher<std::vector<Token>> trigramsAre(std::initializer_list<std::string> Trigrams) { std::vector<Token> Tokens; Index: clang-tools-extra/clangd/index/dex/Iterator.h =================================================================== --- clang-tools-extra/clangd/index/dex/Iterator.h +++ clang-tools-extra/clangd/index/dex/Iterator.h @@ -66,11 +66,9 @@ /// (their children) to form a multi-level Query Tree. The interface is designed /// to be extensible in order to support multiple types of iterators. class Iterator { - // FIXME(kbobyrev): Provide callback for matched documents. - // FIXME(kbobyrev): Implement new types of iterators: Label, Boost (with - // scoring), Limit. // FIXME(kbobyrev): Implement iterator cost, an estimate of advance() calls // before iterator exhaustion. + // FIXME(kbobyrev): Implement Limit iterator. public: /// Returns true if all valid DocIDs were processed and hence the iterator is /// exhausted. @@ -89,6 +87,9 @@ /// /// Note: reachedEnd() must be false. virtual DocID peek() const = 0; + /// Retrieves boosting score. Query tree root should pass Root->peek() to this + /// function, the parameter is needed to propagate through the tree. + virtual unsigned boost(DocID ID) const = 0; virtual ~Iterator() {} @@ -107,6 +108,8 @@ return Iterator.dump(OS); } + const static unsigned DEFAULT_BOOST_SCORE; + private: virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0; }; @@ -116,23 +119,50 @@ std::vector<DocID> consume(Iterator &It, size_t Limit = std::numeric_limits<size_t>::max()); +/// Same as consume(), but also returns boosting scores of each retrieved item. +/// Boosting can be seen as a compromise between retrieving too many items and +/// calculating finals score for each of them (which might be very expensive) +/// and not retrieving enough items so that items with very high final score +/// would not be processed. Boosting score is a computationally efficient way +/// to acquire preliminary scores of requested items. +std::vector<std::pair<DocID, unsigned>> +consumeAndBoost(Iterator &It, + size_t Limit = std::numeric_limits<size_t>::max()); + /// Returns a document iterator over given PostingList. +/// +/// DocumentIterator returns DEFAULT_BOOST_SCORE for each processed item. std::unique_ptr<Iterator> create(PostingListRef Documents); /// Returns AND Iterator which performs the intersection of the PostingLists of /// its children. +/// +/// boost(): AND Iterator returns the product of Childrens' boosting scores when +/// not exhausted and DEFAULT_BOOST_SCORE otherwise. std::unique_ptr<Iterator> createAnd(std::vector<std::unique_ptr<Iterator>> Children); /// Returns OR Iterator which performs the union of the PostingLists of its /// children. +/// +/// boost(): OR Iterator returns the highest boost value among children pointing +/// to requested item when not exhausted and DEFAULT_BOOST_SCORE otherwise. std::unique_ptr<Iterator> createOr(std::vector<std::unique_ptr<Iterator>> Children); /// Returns TRUE Iterator which iterates over "virtual" PostingList containing /// all items in range [0, Size) in an efficient manner. +/// +/// TRUE returns DEFAULT_BOOST_SCORE for each processed item. std::unique_ptr<Iterator> createTrue(DocID Size); +/// Returns BOOST iterator which multiplies the score of each item by given +/// factor. Boosting can be used as a computationally inexpensive filtering. +/// Users can return significantly more items using consumeAndBoost() and then +/// trim Top K using retrieval score. +std::unique_ptr<Iterator> createBoost(std::unique_ptr<Iterator> Child, + unsigned Factor); + /// This allows createAnd(create(...), create(...)) syntax. template <typename... Args> std::unique_ptr<Iterator> createAnd(Args... args) { std::vector<std::unique_ptr<Iterator>> Children; Index: clang-tools-extra/clangd/index/dex/Iterator.cpp =================================================================== --- clang-tools-extra/clangd/index/dex/Iterator.cpp +++ clang-tools-extra/clangd/index/dex/Iterator.cpp @@ -16,6 +16,8 @@ namespace clangd { namespace dex { +unsigned const Iterator::DEFAULT_BOOST_SCORE = 1U; + namespace { /// Implements Iterator over a PostingList. DocumentIterator is the most basic @@ -46,6 +48,8 @@ return *Index; } + unsigned boost(DocID ID) const override { return DEFAULT_BOOST_SCORE; } + private: llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { OS << '['; @@ -103,6 +107,18 @@ DocID peek() const override { return Children.front()->peek(); } + // If not exhausted and points to the given item, boost() returns the product + // of Children->boost(ID). Otherwise, DEFAULT_BOOST_SCORE is returned. + unsigned boost(DocID ID) const override { + if (reachedEnd() || peek() != ID) + return DEFAULT_BOOST_SCORE; + return std::accumulate( + begin(Children), end(Children), DEFAULT_BOOST_SCORE, + [&](unsigned Current, const std::unique_ptr<Iterator> &Child) { + return Current * Child->boost(ID); + }); + } + private: llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { OS << "(& "; @@ -209,6 +225,20 @@ return Result; } + // Returns the maximum boosting score among all Children when iterator is not + // exhausted and points to the given ID, DEFAULT_BOOST_SCORE otherwise. + unsigned boost(DocID ID) const override { + if (reachedEnd() || peek() != ID) + return DEFAULT_BOOST_SCORE; + return std::accumulate( + begin(Children), end(Children), DEFAULT_BOOST_SCORE, + [&](unsigned Current, const std::unique_ptr<Iterator> &Child) { + return (!Child->reachedEnd() && Child->peek() == ID) + ? std::max(Current, Child->boost(ID)) + : Current; + }); + } + private: llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { OS << "(| "; @@ -249,6 +279,8 @@ return Index; } + unsigned boost(DocID) const override { return DEFAULT_BOOST_SCORE; } + private: llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { OS << "(TRUE {" << Index << "} out of " << Size << ")"; @@ -260,16 +292,58 @@ DocID Size; }; + +/// Boost iterator is a wrapper around its child which multiplies scores of +/// each retrieved item by a given factor. +class BoostIterator : public Iterator { +public: + BoostIterator(std::unique_ptr<Iterator> Child, unsigned Factor) + : Child(move(Child)), Factor(Factor) {} + + bool reachedEnd() const override { return Child->reachedEnd(); } + + void advance() override { Child->advance(); } + + void advanceTo(DocID ID) override { Child->advanceTo(ID); } + + DocID peek() const override { return Child->peek(); } + + unsigned boost(DocID ID) const override { + return Child->boost(ID) * Factor; + } + +private: + llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { + OS << "(BOOST " << Factor << ' ' << *Child << ')'; + return OS; + } + + std::unique_ptr<Iterator> Child; + unsigned Factor; +}; + } // end namespace +// FIXME(kbobyrev): Update DexIndex and include two-stage retrieval. std::vector<DocID> consume(Iterator &It, size_t Limit) { std::vector<DocID> Result; for (size_t Retrieved = 0; !It.reachedEnd() && Retrieved < Limit; It.advance(), ++Retrieved) Result.push_back(It.peek()); return Result; } +std::vector<std::pair<DocID, unsigned>> consumeAndBoost(Iterator &It, + size_t Limit) { + std::vector<std::pair<DocID, unsigned>> Result; + for (size_t Retrieved = 0; !It.reachedEnd() && Retrieved < Limit; + It.advance(), ++Retrieved) { + DocID Document = It.peek(); + Result.push_back(std::make_pair(Document, It.boost(Document))); + } + return Result; +} + std::unique_ptr<Iterator> create(PostingListRef Documents) { return llvm::make_unique<DocumentIterator>(Documents); } @@ -288,6 +362,11 @@ return llvm::make_unique<TrueIterator>(Size); } +std::unique_ptr<Iterator> createBoost(std::unique_ptr<Iterator> Child, + unsigned Factor) { + return llvm::make_unique<BoostIterator>(move(Child), Factor); +} + } // namespace dex } // namespace clangd } // namespace clang
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits