kbobyrev updated this revision to Diff 392380.
kbobyrev marked an inline comment as done.
kbobyrev added a comment.
No problem, thank you!
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D113995/new/
https://reviews.llvm.org/D113995
Files:
clang-tools-extra/clangd/index/dex/Trigram.cpp
clang-tools-extra/clangd/unittests/DexTests.cpp
Index: clang-tools-extra/clangd/unittests/DexTests.cpp
===================================================================
--- clang-tools-extra/clangd/unittests/DexTests.cpp
+++ clang-tools-extra/clangd/unittests/DexTests.cpp
@@ -386,30 +386,35 @@
trigramsAre({"c", "cl", "cla", "lan", "ang", "ngd"}));
EXPECT_THAT(identifierTrigramTokens("abc_def"),
- trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "bcd", "bde",
- "cde", "def"}));
+ trigramsAre({"a", "d", "ab", "ad", "de", "abc", "abd", "ade",
+ "bcd", "bde", "cde", "def"}));
EXPECT_THAT(identifierTrigramTokens("a_b_c_d_e_"),
- trigramsAre({"a", "a_", "ab", "abc", "bcd", "cde"}));
+ trigramsAre({"a", "b", "ab", "bc", "abc", "bcd", "cde"}));
EXPECT_THAT(identifierTrigramTokens("unique_ptr"),
- trigramsAre({"u", "un", "up", "uni", "unp", "upt", "niq", "nip",
- "npt", "iqu", "iqp", "ipt", "que", "qup", "qpt",
- "uep", "ept", "ptr"}));
+ trigramsAre({"u", "p", "un", "up", "pt", "uni", "unp",
+ "upt", "niq", "nip", "npt", "iqu", "iqp", "ipt",
+ "que", "qup", "qpt", "uep", "ept", "ptr"}));
- EXPECT_THAT(
- identifierTrigramTokens("TUDecl"),
- trigramsAre({"t", "tu", "td", "tud", "tde", "ude", "dec", "ecl"}));
+ EXPECT_THAT(identifierTrigramTokens("TUDecl"),
+ trigramsAre({"t", "d", "tu", "td", "de", "tud", "tde", "ude",
+ "dec", "ecl"}));
EXPECT_THAT(identifierTrigramTokens("IsOK"),
- trigramsAre({"i", "is", "io", "iso", "iok", "sok"}));
+ trigramsAre({"i", "o", "is", "ok", "io", "iso", "iok", "sok"}));
- EXPECT_THAT(
- identifierTrigramTokens("abc_defGhij__klm"),
- trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "adg", "bcd",
- "bde", "bdg", "cde", "cdg", "def", "deg", "dgh", "dgk",
- "efg", "egh", "egk", "fgh", "fgk", "ghi", "ghk", "gkl",
- "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm"}));
+ EXPECT_THAT(identifierTrigramTokens("_pb"),
+ trigramsAre({"_", "_p", "p", "pb"}));
+ EXPECT_THAT(identifierTrigramTokens("__pb"),
+ trigramsAre({"_", "_p", "p", "pb"}));
+
+ EXPECT_THAT(identifierTrigramTokens("abc_defGhij__klm"),
+ trigramsAre({"a", "d", "ab", "ad", "dg", "de", "abc",
+ "abd", "ade", "adg", "bcd", "bde", "bdg", "cde",
+ "cdg", "def", "deg", "dgh", "dgk", "efg", "egh",
+ "egk", "fgh", "fgk", "ghi", "ghk", "gkl", "hij",
+ "hik", "hkl", "ijk", "ikl", "jkl", "klm"}));
}
TEST(DexTrigrams, QueryTrigrams) {
@@ -419,8 +424,16 @@
EXPECT_THAT(generateQueryTrigrams(""), trigramsAre({}));
EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_"}));
- EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__"}));
- EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({}));
+ EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"_"}));
+ EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"_"}));
+
+ EXPECT_THAT(generateQueryTrigrams("m_"), trigramsAre({"m"}));
+
+ EXPECT_THAT(generateQueryTrigrams("p_b"), trigramsAre({"pb"}));
+ EXPECT_THAT(generateQueryTrigrams("pb_"), trigramsAre({"pb"}));
+ EXPECT_THAT(generateQueryTrigrams("_p"), trigramsAre({"_p"}));
+ EXPECT_THAT(generateQueryTrigrams("_pb_"), trigramsAre({"pb"}));
+ EXPECT_THAT(generateQueryTrigrams("__pb"), trigramsAre({"pb"}));
EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
@@ -525,25 +538,45 @@
}
TEST(DexTest, ShortQuery) {
- auto I = Dex::build(generateSymbols({"OneTwoThreeFour"}), RefSlab(),
+ auto I = Dex::build(generateSymbols({"_OneTwoFourSix"}), RefSlab(),
RelationSlab());
FuzzyFindRequest Req;
Req.AnyScope = true;
bool Incomplete;
- EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour"));
+ EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
EXPECT_FALSE(Incomplete) << "Empty string is not a short query";
- Req.Query = "t";
- EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre());
- EXPECT_TRUE(Incomplete) << "Short queries have different semantics";
+ Req.Query = "o";
+ EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
+ EXPECT_TRUE(Incomplete) << "Using first head as unigram";
+
+ Req.Query = "_o";
+ EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
+ EXPECT_TRUE(Incomplete) << "Using delimiter and first head as bigram";
+
+ Req.Query = "on";
+ EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
+ EXPECT_TRUE(Incomplete) << "Using first head and tail as bigram";
+
+ Req.Query = "ot";
+ EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
+ EXPECT_TRUE(Incomplete) << "Using first two heads as bigram";
+
+ Req.Query = "tw";
+ EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
+ EXPECT_TRUE(Incomplete) << "Using second head and tail as bigram";
+
+ Req.Query = "tf";
+ EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
+ EXPECT_TRUE(Incomplete) << "Using second and third heads as bigram";
- Req.Query = "tt";
+ Req.Query = "fo";
EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre());
EXPECT_TRUE(Incomplete) << "Short queries have different semantics";
- Req.Query = "ttf";
- EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour"));
+ Req.Query = "tfs";
+ EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix"));
EXPECT_FALSE(Incomplete) << "3-char string is not a short query";
}
Index: clang-tools-extra/clangd/index/dex/Trigram.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/Trigram.cpp
+++ clang-tools-extra/clangd/index/dex/Trigram.cpp
@@ -12,10 +12,14 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include <cctype>
+#include <limits>
#include <queue>
#include <string>
+#include <utility>
namespace clang {
namespace clangd {
@@ -25,21 +29,22 @@
template <typename Func>
static void identifierTrigrams(llvm::StringRef Identifier, Func Out) {
// Apply fuzzy matching text segmentation.
- std::vector<CharRole> Roles(Identifier.size());
+ llvm::SmallVector<CharRole> Roles(Identifier.size());
calculateRoles(Identifier,
llvm::makeMutableArrayRef(Roles.data(), Identifier.size()));
std::string LowercaseIdentifier = Identifier.lower();
// For each character, store indices of the characters to which fuzzy matching
- // algorithm can jump. There are 3 possible variants:
+ // algorithm can jump. There are 2 possible variants:
//
// * Next Tail - next character from the same segment
// * Next Head - front character of the next segment
//
// Next stores tuples of three indices in the presented order, if a variant is
// not available then 0 is stored.
- std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size());
+ llvm::SmallVector<std::array<unsigned, 2>, 12> Next(
+ LowercaseIdentifier.size());
unsigned NextTail = 0, NextHead = 0;
for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
Next[I] = {{NextTail, NextHead}};
@@ -51,14 +56,14 @@
// Iterate through valid sequences of three characters Fuzzy Matcher can
// process.
- for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
+ for (unsigned I = 0; I < LowercaseIdentifier.size(); ++I) {
// Skip delimiters.
if (Roles[I] != Head && Roles[I] != Tail)
continue;
- for (const unsigned J : Next[I]) {
+ for (unsigned J : Next[I]) {
if (J == 0)
continue;
- for (const unsigned K : Next[J]) {
+ for (unsigned K : Next[J]) {
if (K == 0)
continue;
Out(Trigram(LowercaseIdentifier[I], LowercaseIdentifier[J],
@@ -66,16 +71,29 @@
}
}
}
- // Emit short-query trigrams: FooBar -> f, fo, fb.
- if (!LowercaseIdentifier.empty())
- Out(Trigram(LowercaseIdentifier[0]));
- if (LowercaseIdentifier.size() >= 2)
- Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[1]));
- for (size_t I = 1; I < LowercaseIdentifier.size(); ++I)
- if (Roles[I] == Head) {
- Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[I]));
+ // Short queries semantics are different. When the user dosn't type enough
+ // symbols to form trigrams, we still want to serve meaningful results. To
+ // achieve that, we form incomplete trigrams (bi- and unigrams) for the
+ // identifiers and also generate short trigrams on the query side from what
+ // is available. We allow a small number of short trigram types in order to
+ // prevent excessive memory usage and increase the quality of the search.
+ // Only the first few symbols are allowed to be used in incomplete trigrams.
+ //
+ // Example - for "_abc_def_ghi_jkl" we'll get following incomplete trigrams:
+ // "_", "_a", "a", "ab", "ad", "d", "de", "dg"
+ for (unsigned Position = 0, HeadsSeen = 0; HeadsSeen < 2;) {
+ // The first symbol might be a separator, so the loop condition should be
+ // stopping as soon as there is no next head or we have seen two heads.
+ if (Roles[Position] == Head)
+ ++HeadsSeen;
+ Out(Trigram(LowercaseIdentifier[Position]));
+ for (unsigned I : Next[Position])
+ if (I != 0)
+ Out(Trigram(LowercaseIdentifier[Position], LowercaseIdentifier[I]));
+ Position = Next[Position][1];
+ if (Position == 0)
break;
- }
+ }
}
void generateIdentifierTrigrams(llvm::StringRef Identifier,
@@ -101,17 +119,16 @@
std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
if (Query.empty())
return {};
- std::string LowercaseQuery = Query.lower();
- if (Query.size() < 3) // short-query trigrams only
- return {Token(Token::Kind::Trigram, LowercaseQuery)};
// Apply fuzzy matching text segmentation.
- std::vector<CharRole> Roles(Query.size());
+ llvm::SmallVector<CharRole> Roles(Query.size());
calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
+ std::string LowercaseQuery = Query.lower();
+
llvm::DenseSet<Token> UniqueTrigrams;
std::string Chars;
- for (unsigned I = 0; I < Query.size(); ++I) {
+ for (unsigned I = 0; I < LowercaseQuery.size(); ++I) {
if (Roles[I] != Head && Roles[I] != Tail)
continue; // Skip delimiters.
Chars.push_back(LowercaseQuery[I]);
@@ -121,6 +138,18 @@
UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
}
+ // For queries with very few letters, generateIdentifierTrigrams emulates
+ // outputs of this function to match the semantics.
+ if (UniqueTrigrams.empty()) {
+ // If bigram can't be formed out of letters/numbers, we prepend separator.
+ std::string Result(1, LowercaseQuery.front());
+ for (unsigned I = 1; I < LowercaseQuery.size(); ++I)
+ if (Roles[I] == Head || Roles[I] == Tail)
+ Result += LowercaseQuery[I];
+ UniqueTrigrams.insert(
+ Token(Token::Kind::Trigram, llvm::StringRef(Result).take_back(2)));
+ }
+
return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
}
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits