Author: Kirill Bobyrev Date: 2021-12-07T15:46:13+01:00 New Revision: 976a74d7d2dbd19670614f603caf490cca892fdc
URL: https://github.com/llvm/llvm-project/commit/976a74d7d2dbd19670614f603caf490cca892fdc DIFF: https://github.com/llvm/llvm-project/commit/976a74d7d2dbd19670614f603caf490cca892fdc.diff LOG: [clangd] Dex Trigrams: Improve query trigram generation These are the trigrams for queries right now: - "va" -> {Trigram("va")} - "va_" -> {} (empty) This is suboptimal since the resulting query will discard the query information and return all symbols, some of which will be later be scored expensively (fuzzy matching score). This is related to https://github.com/clangd/clangd/issues/39 but does not fix it. Accidentally, because of that incorrect behavior, when user types "tok::va" there are no results (the issue is that `tok::kw___builtin_va_arg` does not have "va" token) but when "tok::va_" is typed, expected result (`tok::kw___builtin_va_arg`) shows up by accident. This is because the dex query transformer will only lookup symbols within the `tok::` namespace. There won't be many, so the returned results will contain symbol we need; this symbol will be filtered out by the expensive checks and that will be displayed in the editor. Reviewed By: sammccall Differential Revision: https://reviews.llvm.org/D113995 Added: Modified: clang-tools-extra/clangd/index/dex/Trigram.cpp clang-tools-extra/clangd/unittests/DexTests.cpp Removed: ################################################################################ diff --git a/clang-tools-extra/clangd/index/dex/Trigram.cpp b/clang-tools-extra/clangd/index/dex/Trigram.cpp index fddca8a953209..e6f42f67c50e7 100644 --- a/clang-tools-extra/clangd/index/dex/Trigram.cpp +++ b/clang-tools-extra/clangd/index/dex/Trigram.cpp @@ -12,10 +12,14 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include <cctype> +#include <limits> #include <queue> #include <string> +#include <utility> namespace clang { namespace clangd { @@ -25,21 +29,22 @@ namespace dex { template <typename Func> static void identifierTrigrams(llvm::StringRef Identifier, Func Out) { // Apply fuzzy matching text segmentation. - std::vector<CharRole> Roles(Identifier.size()); + llvm::SmallVector<CharRole> Roles(Identifier.size()); calculateRoles(Identifier, llvm::makeMutableArrayRef(Roles.data(), Identifier.size())); std::string LowercaseIdentifier = Identifier.lower(); // For each character, store indices of the characters to which fuzzy matching - // algorithm can jump. There are 3 possible variants: + // algorithm can jump. There are 2 possible variants: // // * Next Tail - next character from the same segment // * Next Head - front character of the next segment // // Next stores tuples of three indices in the presented order, if a variant is // not available then 0 is stored. - std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size()); + llvm::SmallVector<std::array<unsigned, 2>, 12> Next( + LowercaseIdentifier.size()); unsigned NextTail = 0, NextHead = 0; for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) { Next[I] = {{NextTail, NextHead}}; @@ -51,14 +56,14 @@ static void identifierTrigrams(llvm::StringRef Identifier, Func Out) { // Iterate through valid sequences of three characters Fuzzy Matcher can // process. - for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) { + for (unsigned I = 0; I < LowercaseIdentifier.size(); ++I) { // Skip delimiters. if (Roles[I] != Head && Roles[I] != Tail) continue; - for (const unsigned J : Next[I]) { + for (unsigned J : Next[I]) { if (J == 0) continue; - for (const unsigned K : Next[J]) { + for (unsigned K : Next[J]) { if (K == 0) continue; Out(Trigram(LowercaseIdentifier[I], LowercaseIdentifier[J], @@ -66,16 +71,29 @@ static void identifierTrigrams(llvm::StringRef Identifier, Func Out) { } } } - // Emit short-query trigrams: FooBar -> f, fo, fb. - if (!LowercaseIdentifier.empty()) - Out(Trigram(LowercaseIdentifier[0])); - if (LowercaseIdentifier.size() >= 2) - Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[1])); - for (size_t I = 1; I < LowercaseIdentifier.size(); ++I) - if (Roles[I] == Head) { - Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[I])); + // Short queries semantics are diff erent. When the user dosn't type enough + // symbols to form trigrams, we still want to serve meaningful results. To + // achieve that, we form incomplete trigrams (bi- and unigrams) for the + // identifiers and also generate short trigrams on the query side from what + // is available. We allow a small number of short trigram types in order to + // prevent excessive memory usage and increase the quality of the search. + // Only the first few symbols are allowed to be used in incomplete trigrams. + // + // Example - for "_abc_def_ghi_jkl" we'll get following incomplete trigrams: + // "_", "_a", "a", "ab", "ad", "d", "de", "dg" + for (unsigned Position = 0, HeadsSeen = 0; HeadsSeen < 2;) { + // The first symbol might be a separator, so the loop condition should be + // stopping as soon as there is no next head or we have seen two heads. + if (Roles[Position] == Head) + ++HeadsSeen; + Out(Trigram(LowercaseIdentifier[Position])); + for (unsigned I : Next[Position]) + if (I != 0) + Out(Trigram(LowercaseIdentifier[Position], LowercaseIdentifier[I])); + Position = Next[Position][1]; + if (Position == 0) break; - } + } } void generateIdentifierTrigrams(llvm::StringRef Identifier, @@ -101,17 +119,16 @@ void generateIdentifierTrigrams(llvm::StringRef Identifier, std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) { if (Query.empty()) return {}; - std::string LowercaseQuery = Query.lower(); - if (Query.size() < 3) // short-query trigrams only - return {Token(Token::Kind::Trigram, LowercaseQuery)}; // Apply fuzzy matching text segmentation. - std::vector<CharRole> Roles(Query.size()); + llvm::SmallVector<CharRole> Roles(Query.size()); calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size())); + std::string LowercaseQuery = Query.lower(); + llvm::DenseSet<Token> UniqueTrigrams; std::string Chars; - for (unsigned I = 0; I < Query.size(); ++I) { + for (unsigned I = 0; I < LowercaseQuery.size(); ++I) { if (Roles[I] != Head && Roles[I] != Tail) continue; // Skip delimiters. Chars.push_back(LowercaseQuery[I]); @@ -121,6 +138,18 @@ std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) { UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars)); } + // For queries with very few letters, generateIdentifierTrigrams emulates + // outputs of this function to match the semantics. + if (UniqueTrigrams.empty()) { + // If bigram can't be formed out of letters/numbers, we prepend separator. + std::string Result(1, LowercaseQuery.front()); + for (unsigned I = 1; I < LowercaseQuery.size(); ++I) + if (Roles[I] == Head || Roles[I] == Tail) + Result += LowercaseQuery[I]; + UniqueTrigrams.insert( + Token(Token::Kind::Trigram, llvm::StringRef(Result).take_back(2))); + } + return {UniqueTrigrams.begin(), UniqueTrigrams.end()}; } diff --git a/clang-tools-extra/clangd/unittests/DexTests.cpp b/clang-tools-extra/clangd/unittests/DexTests.cpp index 60d0db081dbb4..249cc101a74cd 100644 --- a/clang-tools-extra/clangd/unittests/DexTests.cpp +++ b/clang-tools-extra/clangd/unittests/DexTests.cpp @@ -386,30 +386,35 @@ TEST(DexTrigrams, IdentifierTrigrams) { trigramsAre({"c", "cl", "cla", "lan", "ang", "ngd"})); EXPECT_THAT(identifierTrigramTokens("abc_def"), - trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "bcd", "bde", - "cde", "def"})); + trigramsAre({"a", "d", "ab", "ad", "de", "abc", "abd", "ade", + "bcd", "bde", "cde", "def"})); EXPECT_THAT(identifierTrigramTokens("a_b_c_d_e_"), - trigramsAre({"a", "a_", "ab", "abc", "bcd", "cde"})); + trigramsAre({"a", "b", "ab", "bc", "abc", "bcd", "cde"})); EXPECT_THAT(identifierTrigramTokens("unique_ptr"), - trigramsAre({"u", "un", "up", "uni", "unp", "upt", "niq", "nip", - "npt", "iqu", "iqp", "ipt", "que", "qup", "qpt", - "uep", "ept", "ptr"})); + trigramsAre({"u", "p", "un", "up", "pt", "uni", "unp", + "upt", "niq", "nip", "npt", "iqu", "iqp", "ipt", + "que", "qup", "qpt", "uep", "ept", "ptr"})); - EXPECT_THAT( - identifierTrigramTokens("TUDecl"), - trigramsAre({"t", "tu", "td", "tud", "tde", "ude", "dec", "ecl"})); + EXPECT_THAT(identifierTrigramTokens("TUDecl"), + trigramsAre({"t", "d", "tu", "td", "de", "tud", "tde", "ude", + "dec", "ecl"})); EXPECT_THAT(identifierTrigramTokens("IsOK"), - trigramsAre({"i", "is", "io", "iso", "iok", "sok"})); + trigramsAre({"i", "o", "is", "ok", "io", "iso", "iok", "sok"})); - EXPECT_THAT( - identifierTrigramTokens("abc_defGhij__klm"), - trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "adg", "bcd", - "bde", "bdg", "cde", "cdg", "def", "deg", "dgh", "dgk", - "efg", "egh", "egk", "fgh", "fgk", "ghi", "ghk", "gkl", - "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm"})); + EXPECT_THAT(identifierTrigramTokens("_pb"), + trigramsAre({"_", "_p", "p", "pb"})); + EXPECT_THAT(identifierTrigramTokens("__pb"), + trigramsAre({"_", "_p", "p", "pb"})); + + EXPECT_THAT(identifierTrigramTokens("abc_defGhij__klm"), + trigramsAre({"a", "d", "ab", "ad", "dg", "de", "abc", + "abd", "ade", "adg", "bcd", "bde", "bdg", "cde", + "cdg", "def", "deg", "dgh", "dgk", "efg", "egh", + "egk", "fgh", "fgk", "ghi", "ghk", "gkl", "hij", + "hik", "hkl", "ijk", "ikl", "jkl", "klm"})); } TEST(DexTrigrams, QueryTrigrams) { @@ -419,8 +424,16 @@ TEST(DexTrigrams, QueryTrigrams) { EXPECT_THAT(generateQueryTrigrams(""), trigramsAre({})); EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_"})); - EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__"})); - EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({})); + EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"_"})); + EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"_"})); + + EXPECT_THAT(generateQueryTrigrams("m_"), trigramsAre({"m"})); + + EXPECT_THAT(generateQueryTrigrams("p_b"), trigramsAre({"pb"})); + EXPECT_THAT(generateQueryTrigrams("pb_"), trigramsAre({"pb"})); + EXPECT_THAT(generateQueryTrigrams("_p"), trigramsAre({"_p"})); + EXPECT_THAT(generateQueryTrigrams("_pb_"), trigramsAre({"pb"})); + EXPECT_THAT(generateQueryTrigrams("__pb"), trigramsAre({"pb"})); EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"})); @@ -525,25 +538,45 @@ TEST(DexTest, FuzzyMatch) { } TEST(DexTest, ShortQuery) { - auto I = Dex::build(generateSymbols({"OneTwoThreeFour"}), RefSlab(), + auto I = Dex::build(generateSymbols({"_OneTwoFourSix"}), RefSlab(), RelationSlab()); FuzzyFindRequest Req; Req.AnyScope = true; bool Incomplete; - EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour")); + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); EXPECT_FALSE(Incomplete) << "Empty string is not a short query"; - Req.Query = "t"; - EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre()); - EXPECT_TRUE(Incomplete) << "Short queries have diff erent semantics"; + Req.Query = "o"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using first head as unigram"; + + Req.Query = "_o"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using delimiter and first head as bigram"; + + Req.Query = "on"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using first head and tail as bigram"; + + Req.Query = "ot"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using first two heads as bigram"; + + Req.Query = "tw"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using second head and tail as bigram"; + + Req.Query = "tf"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using second and third heads as bigram"; - Req.Query = "tt"; + Req.Query = "fo"; EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre()); EXPECT_TRUE(Incomplete) << "Short queries have diff erent semantics"; - Req.Query = "ttf"; - EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour")); + Req.Query = "tfs"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); EXPECT_FALSE(Incomplete) << "3-char string is not a short query"; } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits