kbobyrev updated this revision to Diff 160071. kbobyrev added a comment. Complete the tests, finish the implementation.
One thought about prefix match suggestion: we should either make it more explicit for the index (e.g. introduce `prefixMatch` and dispatch `fuzzyMatch` to prefix matching in case query only contains one "true" symbol) or document this properly. While, as I wrote earlier, I totally support the idea of prefix matching queries of length 1 it might not align with some user expectations and it's also very implicit if we just generate tokens this way and don't mention it anywhere in the `DexIndex` implementation. @ioeric, @ilya-biryukov any thoughts? https://reviews.llvm.org/D50517 Files: clang-tools-extra/clangd/index/dex/Trigram.cpp clang-tools-extra/clangd/index/dex/Trigram.h clang-tools-extra/unittests/clangd/DexIndexTests.cpp
Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp =================================================================== --- clang-tools-extra/unittests/clangd/DexIndexTests.cpp +++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp @@ -250,45 +250,60 @@ } TEST(DexIndexTrigrams, IdentifierTrigrams) { - EXPECT_THAT(generateIdentifierTrigrams("X86"), trigramsAre({"x86"})); + EXPECT_THAT(generateIdentifierTrigrams("X86"), + trigramsAre({"x86", "x$$", "x8$", "86$"})); - EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({})); + EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({"nl$", "n$$"})); + + EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n$$"})); EXPECT_THAT(generateIdentifierTrigrams("clangd"), - trigramsAre({"cla", "lan", "ang", "ngd"})); + trigramsAre({"cla", "lan", "ang", "ngd", "an$", "c$$", "cl$", + "ng$", "gd$", "la$"})); - EXPECT_THAT(generateIdentifierTrigrams("abc_def"), - trigramsAre({"abc", "abd", "ade", "bcd", "bde", "cde", "def"})); + EXPECT_THAT( + generateIdentifierTrigrams("abc_def"), + trigramsAre({"abc", "abd", "ade", "bcd", "bde", "cde", "def", "a$$", + "ab$", "ad$", "bc$", "bd$", "cd$", "de$", "ef$"})); EXPECT_THAT( generateIdentifierTrigrams("a_b_c_d_e_"), - trigramsAre({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde"})); + trigramsAre({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde", + "a$$", "ab$", "ac$", "bc$", "bd$", "cd$", "ce$", "de$"})); - EXPECT_THAT( - generateIdentifierTrigrams("unique_ptr"), - trigramsAre({"uni", "unp", "upt", "niq", "nip", "npt", "iqu", "iqp", - "ipt", "que", "qup", "qpt", "uep", "ept", "ptr"})); + EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"), + trigramsAre({"uni", "unp", "upt", "niq", "nip", "npt", "iqu", + "iqp", "ipt", "que", "qup", "qpt", "uep", "ept", + "ptr", "u$$", "un$", "up$", "ni$", "np$", "iq$", + "ip$", "qu$", "qp$", "ue$", "ep$", "pt$", "tr$"})); EXPECT_THAT(generateIdentifierTrigrams("TUDecl"), - trigramsAre({"tud", "tde", "ude", "dec", "ecl"})); - - EXPECT_THAT(generateIdentifierTrigrams("IsOK"), - trigramsAre({"iso", "iok", "sok"})); - - EXPECT_THAT(generateIdentifierTrigrams("abc_defGhij__klm"), - trigramsAre({ - "abc", "abd", "abg", "ade", "adg", "adk", "agh", "agk", "bcd", - "bcg", "bde", "bdg", "bdk", "bgh", "bgk", "cde", "cdg", "cdk", - "cgh", "cgk", "def", "deg", "dek", "dgh", "dgk", "dkl", "efg", - "efk", "egh", "egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk", - "gkl", "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm", - })); + trigramsAre({"tud", "tde", "ude", "dec", "ecl", "t$$", "tu$", + "td$", "ud$", "de$", "ec$", "cl$"})); + + EXPECT_THAT( + generateIdentifierTrigrams("IsOK"), + trigramsAre({"iso", "iok", "sok", "i$$", "is$", "io$", "so$", "ok$"})); + + EXPECT_THAT( + generateIdentifierTrigrams("abc_defGhij__klm"), + trigramsAre({"a$$", "abc", "abd", "abg", "ade", "adg", "adk", "agh", + "agk", "bcd", "bcg", "bde", "bdg", "bdk", "bgh", "bgk", + "cde", "cdg", "cdk", "cgh", "cgk", "def", "deg", "dek", + "dgh", "dgk", "dkl", "efg", "efk", "egh", "egk", "ekl", + "fgh", "fgk", "fkl", "ghi", "ghk", "gkl", "hij", "hik", + "hkl", "ijk", "ikl", "jkl", "klm", "ab$", "ad$", "ag$", + "bc$", "bd$", "bg$", "cd$", "cg$", "de$", "dg$", "dk$", + "ef$", "eg$", "ek$", "fg$", "fk$", "gh$", "gk$", "hi$", + "hk$", "ij$", "ik$", "jk$", "kl$", "lm$"})); } TEST(DexIndexTrigrams, QueryTrigrams) { - EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"})); + EXPECT_THAT(generateQueryTrigrams("c"), trigramsAre({"c$$"})); + EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl$"})); + EXPECT_THAT(generateQueryTrigrams("cla"), trigramsAre({"cla"})); - EXPECT_THAT(generateQueryTrigrams("nl"), trigramsAre({})); + EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"})); EXPECT_THAT(generateQueryTrigrams("clangd"), trigramsAre({"cla", "lan", "ang", "ngd"})); Index: clang-tools-extra/clangd/index/dex/Trigram.h =================================================================== --- clang-tools-extra/clangd/index/dex/Trigram.h +++ clang-tools-extra/clangd/index/dex/Trigram.h @@ -31,6 +31,13 @@ namespace clangd { namespace dex { +/// This is used to mark unigrams and bigrams and distinct them from complete +/// trigrams. Since '$' is not present in valid identifier names, it is safe to +/// use it as the special symbol. +// FIXME(kbobyrev): Global variables are not good, should this be a static +// member of Token even though it's Trigram-specific? +const auto END_MARKER = '$'; + /// Returns list of unique fuzzy-search trigrams from unqualified symbol. /// /// First, given Identifier (unqualified symbol name) is segmented using @@ -44,15 +51,27 @@ /// to the next character, move to the start of the next segment, or skip over a /// segment. /// +/// Special kind of trigrams - incomplete trigrams is also present in the +/// result. Incomplete trigrams contain END_MARKER ('$') at the end. Result +/// contains one unigram which contains the first lowercase letter or digit +/// of given identifier. The result also contains all bigrams which are +/// generated in the same way sequences of length 3 are created. +/// /// Note: the returned list of trigrams does not have duplicates, if any trigram /// belongs to more than one class it is only inserted once. +// FIXME(kbobyrev): Document somewhere in DexIndex that for queries of size 1 +// it will do prefix matching instead of fuzzy matching on the identifier names, +// because it might be not obvious for users. std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier); /// Returns list of unique fuzzy-search trigrams given a query. /// /// Query is segmented using FuzzyMatch API and downcasted to lowercase. Then, /// the simplest trigrams - sequences of three consecutive letters and digits /// are extracted and returned after deduplication. +/// +/// For short queries (Query contains less than 3 letters and digits) this +/// returns a single trigram with all valid symbols. std::vector<Token> generateQueryTrigrams(llvm::StringRef Query); } // namespace dex Index: clang-tools-extra/clangd/index/dex/Trigram.cpp =================================================================== --- clang-tools-extra/clangd/index/dex/Trigram.cpp +++ clang-tools-extra/clangd/index/dex/Trigram.cpp @@ -10,11 +10,9 @@ #include "Trigram.h" #include "../../FuzzyMatch.h" #include "Token.h" - #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringExtras.h" - #include <cctype> #include <queue> #include <string> @@ -25,12 +23,14 @@ namespace clangd { namespace dex { -// FIXME(kbobyrev): Deal with short symbol symbol names. A viable approach would -// be generating unigrams and bigrams here, too. This would prevent symbol index -// from applying fuzzy matching on a tremendous number of symbols and allow -// supplementary retrieval for short queries. -// -// Short names (total segment length <3 characters) are currently ignored. +// FIXME(kbobyrev): Posting lists for incomplete trigrams (one/two symbols) are +// likely to be very dense and hence require special attention so that the index +// doesn't use too much memory. Possible solution would be to construct +// compressed posting lists which consist of ranges of DocIDs instead of +// distinct DocIDs. A special case would be the empty query: for that case +// TrueIterator should be implemented - an iterator which doesn't actually store +// any PostingList within itself, but "contains" all DocIDs in range +// [0, IndexSize). std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) { // Apply fuzzy matching text segmentation. std::vector<CharRole> Roles(Identifier.size()); @@ -59,15 +59,37 @@ } } + // Iterate through valid seqneces of three characters Fuzzy Matcher can + // process. DenseSet<Token> UniqueTrigrams; std::array<char, 4> Chars; + bool FoundFirstSymbol = false; for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) { // Skip delimiters. if (Roles[I] != Head && Roles[I] != Tail) continue; + + // Generate incomplete trigram with one symbol only from the first valid + // symbol of the identifier. + if (!FoundFirstSymbol) { + FoundFirstSymbol = true; + Chars = {{LowercaseIdentifier[I], END_MARKER, END_MARKER, 0}}; + const auto Unigram = Token(Token::Kind::Trigram, Chars.data()); + if (!UniqueTrigrams.count(Unigram)) { + UniqueTrigrams.insert(Unigram); + } + } + for (const unsigned J : Next[I]) { if (!J) continue; + + Chars = {{LowercaseIdentifier[I], LowercaseIdentifier[J], END_MARKER, 0}}; + const auto Bigram = Token(Token::Kind::Trigram, Chars.data()); + if (!UniqueTrigrams.count(Bigram)) { + UniqueTrigrams.insert(Bigram); + } + for (const unsigned K : Next[J]) { if (!K) continue; @@ -89,13 +111,19 @@ return Result; } -// FIXME(kbobyrev): Similarly, to generateIdentifierTrigrams, this ignores short -// inputs (total segment length <3 characters). +// FIXME(kbobyrev): Correctly handle empty trigrams "$$$". std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) { // Apply fuzzy matching text segmentation. std::vector<CharRole> Roles(Query.size()); calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size())); + // Additional pass is necessary to count valid identifier characters. + // Depending on that, this function might return incomplete trigram. + unsigned ValidSymbolsCount = 0; + for (size_t I = 0; I < Roles.size(); ++I) + if (Roles[I] == Head || Roles[I] == Tail) + ++ValidSymbolsCount; + std::string LowercaseQuery = Query.lower(); DenseSet<Token> UniqueTrigrams; @@ -110,9 +138,27 @@ if (Chars.size() > 3) Chars.pop_front(); + + if (ValidSymbolsCount < 3 && Chars.size() == ValidSymbolsCount) { + while (Chars.size() < 3) { + Chars.push_back(END_MARKER); + } + auto Trigram = + Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars))); + UniqueTrigrams.insert(Trigram); + // The symbol does not have sufficient number of valid symbols for + // complete trigrams, all valid symbols were seen at this point. + break; + } + if (Chars.size() == 3) { auto Trigram = Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars))); + + if (Chars.front() == END_MARKER) + Trigram = Token(Token::Kind::Trigram, + std::string(Chars.rbegin(), Chars.rend())); + // Push unique trigrams to the result. if (!UniqueTrigrams.count(Trigram)) { UniqueTrigrams.insert(Trigram);
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits