[PATCH] D50517: [clangd] Generate incomplete trigrams for the Dex index

Kirill Bobyrev via Phabricator via cfe-commits Fri, 10 Aug 2018 01:49:13 -0700

kbobyrev updated this revision to Diff 160071.
kbobyrev added a comment.

Complete the tests, finish the implementation.


One thought about prefix match suggestion: we should either make it more 
explicit for the index (e.g. introduce `prefixMatch` and dispatch `fuzzyMatch` 
to prefix matching in case query only contains one "true" symbol) or document 
this properly. While, as I wrote earlier, I totally support the idea of prefix 
matching queries of length 1 it might not align with some user expectations and 
it's also very implicit if we just generate tokens this way and don't mention 
it anywhere in the `DexIndex` implementation.

@ioeric, @ilya-biryukov any thoughts?


https://reviews.llvm.org/D50517

Files:
  clang-tools-extra/clangd/index/dex/Trigram.cpp
  clang-tools-extra/clangd/index/dex/Trigram.h
  clang-tools-extra/unittests/clangd/DexIndexTests.cpp

Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp
===================================================================
--- clang-tools-extra/unittests/clangd/DexIndexTests.cpp
+++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp
@@ -250,45 +250,60 @@
 }
 
 TEST(DexIndexTrigrams, IdentifierTrigrams) {
-  EXPECT_THAT(generateIdentifierTrigrams("X86"), trigramsAre({"x86"}));
+  EXPECT_THAT(generateIdentifierTrigrams("X86"),
+              trigramsAre({"x86", "x$$", "x8$", "86$"}));
 
-  EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({}));
+  EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({"nl$", "n$$"}));
+
+  EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n$$"}));
 
   EXPECT_THAT(generateIdentifierTrigrams("clangd"),
-              trigramsAre({"cla", "lan", "ang", "ngd"}));
+              trigramsAre({"cla", "lan", "ang", "ngd", "an$", "c$$", "cl$",
+                           "ng$", "gd$", "la$"}));
 
-  EXPECT_THAT(generateIdentifierTrigrams("abc_def"),
-              trigramsAre({"abc", "abd", "ade", "bcd", "bde", "cde", "def"}));
+  EXPECT_THAT(
+      generateIdentifierTrigrams("abc_def"),
+      trigramsAre({"abc", "abd", "ade", "bcd", "bde", "cde", "def", "a$$",
+                   "ab$", "ad$", "bc$", "bd$", "cd$", "de$", "ef$"}));
 
   EXPECT_THAT(
       generateIdentifierTrigrams("a_b_c_d_e_"),
-      trigramsAre({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde"}));
+      trigramsAre({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde",
+                   "a$$", "ab$", "ac$", "bc$", "bd$", "cd$", "ce$", "de$"}));
 
-  EXPECT_THAT(
-      generateIdentifierTrigrams("unique_ptr"),
-      trigramsAre({"uni", "unp", "upt", "niq", "nip", "npt", "iqu", "iqp",
-                   "ipt", "que", "qup", "qpt", "uep", "ept", "ptr"}));
+  EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"),
+              trigramsAre({"uni", "unp", "upt", "niq", "nip", "npt", "iqu",
+                           "iqp", "ipt", "que", "qup", "qpt", "uep", "ept",
+                           "ptr", "u$$", "un$", "up$", "ni$", "np$", "iq$",
+                           "ip$", "qu$", "qp$", "ue$", "ep$", "pt$", "tr$"}));
 
   EXPECT_THAT(generateIdentifierTrigrams("TUDecl"),
-              trigramsAre({"tud", "tde", "ude", "dec", "ecl"}));
-
-  EXPECT_THAT(generateIdentifierTrigrams("IsOK"),
-              trigramsAre({"iso", "iok", "sok"}));
-
-  EXPECT_THAT(generateIdentifierTrigrams("abc_defGhij__klm"),
-              trigramsAre({
-                  "abc", "abd", "abg", "ade", "adg", "adk", "agh", "agk", "bcd",
-                  "bcg", "bde", "bdg", "bdk", "bgh", "bgk", "cde", "cdg", "cdk",
-                  "cgh", "cgk", "def", "deg", "dek", "dgh", "dgk", "dkl", "efg",
-                  "efk", "egh", "egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk",
-                  "gkl", "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm",
-              }));
+              trigramsAre({"tud", "tde", "ude", "dec", "ecl", "t$$", "tu$",
+                           "td$", "ud$", "de$", "ec$", "cl$"}));
+
+  EXPECT_THAT(
+      generateIdentifierTrigrams("IsOK"),
+      trigramsAre({"iso", "iok", "sok", "i$$", "is$", "io$", "so$", "ok$"}));
+
+  EXPECT_THAT(
+      generateIdentifierTrigrams("abc_defGhij__klm"),
+      trigramsAre({"a$$", "abc", "abd", "abg", "ade", "adg", "adk", "agh",
+                   "agk", "bcd", "bcg", "bde", "bdg", "bdk", "bgh", "bgk",
+                   "cde", "cdg", "cdk", "cgh", "cgk", "def", "deg", "dek",
+                   "dgh", "dgk", "dkl", "efg", "efk", "egh", "egk", "ekl",
+                   "fgh", "fgk", "fkl", "ghi", "ghk", "gkl", "hij", "hik",
+                   "hkl", "ijk", "ikl", "jkl", "klm", "ab$", "ad$", "ag$",
+                   "bc$", "bd$", "bg$", "cd$", "cg$", "de$", "dg$", "dk$",
+                   "ef$", "eg$", "ek$", "fg$", "fk$", "gh$", "gk$", "hi$",
+                   "hk$", "ij$", "ik$", "jk$", "kl$", "lm$"}));
 }
 
 TEST(DexIndexTrigrams, QueryTrigrams) {
-  EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
+  EXPECT_THAT(generateQueryTrigrams("c"), trigramsAre({"c$$"}));
+  EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl$"}));
+  EXPECT_THAT(generateQueryTrigrams("cla"), trigramsAre({"cla"}));
 
-  EXPECT_THAT(generateQueryTrigrams("nl"), trigramsAre({}));
+  EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
 
   EXPECT_THAT(generateQueryTrigrams("clangd"),
               trigramsAre({"cla", "lan", "ang", "ngd"}));
Index: clang-tools-extra/clangd/index/dex/Trigram.h
===================================================================
--- clang-tools-extra/clangd/index/dex/Trigram.h
+++ clang-tools-extra/clangd/index/dex/Trigram.h
@@ -31,6 +31,13 @@
 namespace clangd {
 namespace dex {
 
+/// This is used to mark unigrams and bigrams and distinct them from complete
+/// trigrams. Since '$' is not present in valid identifier names, it is safe to
+/// use it as the special symbol.
+// FIXME(kbobyrev): Global variables are not good, should this be a static
+// member of Token even though it's Trigram-specific?
+const auto END_MARKER = '$';
+
 /// Returns list of unique fuzzy-search trigrams from unqualified symbol.
 ///
 /// First, given Identifier (unqualified symbol name) is segmented using
@@ -44,15 +51,27 @@
 /// to the next character, move to the start of the next segment, or skip over a
 /// segment.
 ///
+/// Special kind of trigrams - incomplete trigrams is also present in the
+/// result. Incomplete trigrams contain END_MARKER ('$') at the end. Result
+/// contains one unigram which contains the first lowercase letter or digit
+/// of given identifier. The result also contains all bigrams which are
+/// generated in the same way sequences of length 3 are created.
+///
 /// Note: the returned list of trigrams does not have duplicates, if any trigram
 /// belongs to more than one class it is only inserted once.
+// FIXME(kbobyrev): Document somewhere in DexIndex that for queries of size 1
+// it will do prefix matching instead of fuzzy matching on the identifier names,
+// because it might be not obvious for users.
 std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier);
 
 /// Returns list of unique fuzzy-search trigrams given a query.
 ///
 /// Query is segmented using FuzzyMatch API and downcasted to lowercase. Then,
 /// the simplest trigrams - sequences of three consecutive letters and digits
 /// are extracted and returned after deduplication.
+///
+/// For short queries (Query contains less than 3 letters and digits) this
+/// returns a single trigram with all valid symbols.
 std::vector<Token> generateQueryTrigrams(llvm::StringRef Query);
 
 } // namespace dex
Index: clang-tools-extra/clangd/index/dex/Trigram.cpp
===================================================================
--- clang-tools-extra/clangd/index/dex/Trigram.cpp
+++ clang-tools-extra/clangd/index/dex/Trigram.cpp
@@ -10,11 +10,9 @@
 #include "Trigram.h"
 #include "../../FuzzyMatch.h"
 #include "Token.h"
-
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringExtras.h"
-
 #include <cctype>
 #include <queue>
 #include <string>
@@ -25,12 +23,14 @@
 namespace clangd {
 namespace dex {
 
-// FIXME(kbobyrev): Deal with short symbol symbol names. A viable approach would
-// be generating unigrams and bigrams here, too. This would prevent symbol index
-// from applying fuzzy matching on a tremendous number of symbols and allow
-// supplementary retrieval for short queries.
-//
-// Short names (total segment length <3 characters) are currently ignored.
+// FIXME(kbobyrev): Posting lists for incomplete trigrams (one/two symbols) are
+// likely to be very dense and hence require special attention so that the index
+// doesn't use too much memory. Possible solution would be to construct
+// compressed posting lists which consist of ranges of DocIDs instead of
+// distinct DocIDs. A special case would be the empty query: for that case
+// TrueIterator should be implemented - an iterator which doesn't actually store
+// any PostingList within itself, but "contains" all DocIDs in range
+// [0, IndexSize).
 std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
   // Apply fuzzy matching text segmentation.
   std::vector<CharRole> Roles(Identifier.size());
@@ -59,15 +59,37 @@
     }
   }
 
+  // Iterate through valid seqneces of three characters Fuzzy Matcher can
+  // process.
   DenseSet<Token> UniqueTrigrams;
   std::array<char, 4> Chars;
+  bool FoundFirstSymbol = false;
   for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
     // Skip delimiters.
     if (Roles[I] != Head && Roles[I] != Tail)
       continue;
+
+    // Generate incomplete trigram with one symbol only from the first valid
+    // symbol of the identifier.
+    if (!FoundFirstSymbol) {
+      FoundFirstSymbol = true;
+      Chars = {{LowercaseIdentifier[I], END_MARKER, END_MARKER, 0}};
+      const auto Unigram = Token(Token::Kind::Trigram, Chars.data());
+      if (!UniqueTrigrams.count(Unigram)) {
+        UniqueTrigrams.insert(Unigram);
+      }
+    }
+
     for (const unsigned J : Next[I]) {
       if (!J)
         continue;
+
+      Chars = {{LowercaseIdentifier[I], LowercaseIdentifier[J], END_MARKER, 0}};
+      const auto Bigram = Token(Token::Kind::Trigram, Chars.data());
+      if (!UniqueTrigrams.count(Bigram)) {
+        UniqueTrigrams.insert(Bigram);
+      }
+
       for (const unsigned K : Next[J]) {
         if (!K)
           continue;
@@ -89,13 +111,19 @@
   return Result;
 }
 
-// FIXME(kbobyrev): Similarly, to generateIdentifierTrigrams, this ignores short
-// inputs (total segment length <3 characters).
+// FIXME(kbobyrev): Correctly handle empty trigrams "$$$".
 std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
   // Apply fuzzy matching text segmentation.
   std::vector<CharRole> Roles(Query.size());
   calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
 
+  // Additional pass is necessary to count valid identifier characters.
+  // Depending on that, this function might return incomplete trigram.
+  unsigned ValidSymbolsCount = 0;
+  for (size_t I = 0; I < Roles.size(); ++I)
+    if (Roles[I] == Head || Roles[I] == Tail)
+      ++ValidSymbolsCount;
+
   std::string LowercaseQuery = Query.lower();
 
   DenseSet<Token> UniqueTrigrams;
@@ -110,9 +138,27 @@
 
     if (Chars.size() > 3)
       Chars.pop_front();
+
+    if (ValidSymbolsCount < 3 && Chars.size() == ValidSymbolsCount) {
+      while (Chars.size() < 3) {
+        Chars.push_back(END_MARKER);
+      }
+      auto Trigram =
+          Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars)));
+      UniqueTrigrams.insert(Trigram);
+      // The symbol does not have sufficient number of valid symbols for
+      // complete trigrams, all valid symbols were seen at this point.
+      break;
+    }
+
     if (Chars.size() == 3) {
       auto Trigram =
           Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars)));
+
+      if (Chars.front() == END_MARKER)
+        Trigram = Token(Token::Kind::Trigram,
+                        std::string(Chars.rbegin(), Chars.rend()));
+
       // Push unique trigrams to the result.
       if (!UniqueTrigrams.count(Trigram)) {
         UniqueTrigrams.insert(Trigram);

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D50517: [clangd] Generate incomplete trigrams for the Dex index

Reply via email to