[PATCH] D50517: [clangd] Generate incomplete trigrams for the Dex index

Kirill Bobyrev via Phabricator via cfe-commits Mon, 13 Aug 2018 01:58:29 -0700

This revision was automatically updated to reflect the committed changes.
Closed by commit rL339548: [clangd] Generate incomplete trigrams for the Dex 
index (authored by omtcyfz, committed by ).
Herald added a subscriber: llvm-commits.


Changed prior to commit:
  https://reviews.llvm.org/D50517?vs=160302&id=160310#toc

Repository:
  rL LLVM

https://reviews.llvm.org/D50517

Files:
  clang-tools-extra/trunk/clangd/index/dex/Iterator.h
  clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp
  clang-tools-extra/trunk/clangd/index/dex/Trigram.h
  clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp

Index: clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp
===================================================================
--- clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp
+++ clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp
@@ -271,45 +271,57 @@
 }
 
 TEST(DexIndexTrigrams, IdentifierTrigrams) {
-  EXPECT_THAT(generateIdentifierTrigrams("X86"), trigramsAre({"x86"}));
+  EXPECT_THAT(generateIdentifierTrigrams("X86"),
+              trigramsAre({"x86", "x$$", "x8$", "$$$"}));
 
-  EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({}));
+  EXPECT_THAT(generateIdentifierTrigrams("nl"),
+              trigramsAre({"nl$", "n$$", "$$$"}));
+
+  EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n$$", "$$$"}));
 
   EXPECT_THAT(generateIdentifierTrigrams("clangd"),
-              trigramsAre({"cla", "lan", "ang", "ngd"}));
+              trigramsAre({"c$$", "cl$", "cla", "lan", "ang", "ngd", "$$$"}));
 
   EXPECT_THAT(generateIdentifierTrigrams("abc_def"),
-              trigramsAre({"abc", "abd", "ade", "bcd", "bde", "cde", "def"}));
-
-  EXPECT_THAT(
-      generateIdentifierTrigrams("a_b_c_d_e_"),
-      trigramsAre({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde"}));
+              trigramsAre({"a$$", "abc", "abd", "ade", "bcd", "bde", "cde",
+                           "def", "ab$", "ad$", "$$$"}));
 
-  EXPECT_THAT(
-      generateIdentifierTrigrams("unique_ptr"),
-      trigramsAre({"uni", "unp", "upt", "niq", "nip", "npt", "iqu", "iqp",
-                   "ipt", "que", "qup", "qpt", "uep", "ept", "ptr"}));
+  EXPECT_THAT(generateIdentifierTrigrams("a_b_c_d_e_"),
+              trigramsAre({"a$$", "a_$", "a_b", "abc", "abd", "acd", "ace",
+                           "bcd", "bce", "bde", "cde", "ab$", "$$$"}));
+
+  EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"),
+              trigramsAre({"u$$", "uni", "unp", "upt", "niq", "nip", "npt",
+                           "iqu", "iqp", "ipt", "que", "qup", "qpt", "uep",
+                           "ept", "ptr", "un$", "up$", "$$$"}));
 
   EXPECT_THAT(generateIdentifierTrigrams("TUDecl"),
-              trigramsAre({"tud", "tde", "ude", "dec", "ecl"}));
+              trigramsAre({"t$$", "tud", "tde", "ude", "dec", "ecl", "tu$",
+                           "td$", "$$$"}));
 
   EXPECT_THAT(generateIdentifierTrigrams("IsOK"),
-              trigramsAre({"iso", "iok", "sok"}));
+              trigramsAre({"i$$", "iso", "iok", "sok", "is$", "io$", "$$$"}));
 
-  EXPECT_THAT(generateIdentifierTrigrams("abc_defGhij__klm"),
-              trigramsAre({
-                  "abc", "abd", "abg", "ade", "adg", "adk", "agh", "agk", "bcd",
-                  "bcg", "bde", "bdg", "bdk", "bgh", "bgk", "cde", "cdg", "cdk",
-                  "cgh", "cgk", "def", "deg", "dek", "dgh", "dgk", "dkl", "efg",
-                  "efk", "egh", "egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk",
-                  "gkl", "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm",
-              }));
+  EXPECT_THAT(
+      generateIdentifierTrigrams("abc_defGhij__klm"),
+      trigramsAre({"a$$", "abc", "abd", "abg", "ade", "adg", "adk", "agh",
+                   "agk", "bcd", "bcg", "bde", "bdg", "bdk", "bgh", "bgk",
+                   "cde", "cdg", "cdk", "cgh", "cgk", "def", "deg", "dek",
+                   "dgh", "dgk", "dkl", "efg", "efk", "egh", "egk", "ekl",
+                   "fgh", "fgk", "fkl", "ghi", "ghk", "gkl", "hij", "hik",
+                   "hkl", "ijk", "ikl", "jkl", "klm", "ab$", "ad$", "$$$"}));
 }
 
 TEST(DexIndexTrigrams, QueryTrigrams) {
-  EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
+  EXPECT_THAT(generateQueryTrigrams("c"), trigramsAre({"c$$"}));
+  EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl$"}));
+  EXPECT_THAT(generateQueryTrigrams("cla"), trigramsAre({"cla"}));
+
+  EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_$$"}));
+  EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__$"}));
+  EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"___"}));
 
-  EXPECT_THAT(generateQueryTrigrams("nl"), trigramsAre({}));
+  EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
 
   EXPECT_THAT(generateQueryTrigrams("clangd"),
               trigramsAre({"cla", "lan", "ang", "ngd"}));
Index: clang-tools-extra/trunk/clangd/index/dex/Iterator.h
===================================================================
--- clang-tools-extra/trunk/clangd/index/dex/Iterator.h
+++ clang-tools-extra/trunk/clangd/index/dex/Iterator.h
@@ -47,6 +47,14 @@
 /// Contains sorted sequence of DocIDs all of which belong to symbols matching
 /// certain criteria, i.e. containing a Search Token. PostingLists are values
 /// for the inverted index.
+// FIXME(kbobyrev): Posting lists for incomplete trigrams (one/two symbols) are
+// likely to be very dense and hence require special attention so that the index
+// doesn't use too much memory. Possible solution would be to construct
+// compressed posting lists which consist of ranges of DocIDs instead of
+// distinct DocIDs. A special case would be the empty query: for that case
+// TrueIterator should be implemented - an iterator which doesn't actually store
+// any PostingList within itself, but "contains" all DocIDs in range
+// [0, IndexSize).
 using PostingList = std::vector<DocID>;
 /// Immutable reference to PostingList object.
 using PostingListRef = llvm::ArrayRef<DocID>;
Index: clang-tools-extra/trunk/clangd/index/dex/Trigram.h
===================================================================
--- clang-tools-extra/trunk/clangd/index/dex/Trigram.h
+++ clang-tools-extra/trunk/clangd/index/dex/Trigram.h
@@ -36,14 +36,20 @@
 /// First, given Identifier (unqualified symbol name) is segmented using
 /// FuzzyMatch API and lowercased. After segmentation, the following technique
 /// is applied for generating trigrams: for each letter or digit in the input
-/// string the algorithms looks for the possible next and skip-1-next symbols
+/// string the algorithms looks for the possible next and skip-1-next characters
 /// which can be jumped to during fuzzy matching. Each combination of such three
-/// symbols is inserted into the result.
+/// characters is inserted into the result.
 ///
 /// Trigrams can start at any character in the input. Then we can choose to move
 /// to the next character, move to the start of the next segment, or skip over a
 /// segment.
 ///
+/// This also generates incomplete trigrams for short query scenarios:
+///  * Empty trigram: "$$$".
+///  * Unigram: the first character of the identifier.
+///  * Bigrams: a 2-char prefix of the identifier and a bigram of the first two
+///    HEAD characters (if they exist).
+//
 /// Note: the returned list of trigrams does not have duplicates, if any trigram
 /// belongs to more than one class it is only inserted once.
 std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier);
@@ -53,6 +59,10 @@
 /// Query is segmented using FuzzyMatch API and downcasted to lowercase. Then,
 /// the simplest trigrams - sequences of three consecutive letters and digits
 /// are extracted and returned after deduplication.
+///
+/// For short queries (less than 3 characters with Head or Tail roles in Fuzzy
+/// Matching segmentation) this returns a single trigram with the first
+/// characters (up to 3) to perfrom prefix match.
 std::vector<Token> generateQueryTrigrams(llvm::StringRef Query);
 
 } // namespace dex
Index: clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp
===================================================================
--- clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp
+++ clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp
@@ -10,11 +10,9 @@
 #include "Trigram.h"
 #include "../../FuzzyMatch.h"
 #include "Token.h"
-
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringExtras.h"
-
 #include <cctype>
 #include <queue>
 #include <string>
@@ -25,12 +23,11 @@
 namespace clangd {
 namespace dex {
 
-// FIXME(kbobyrev): Deal with short symbol symbol names. A viable approach would
-// be generating unigrams and bigrams here, too. This would prevent symbol index
-// from applying fuzzy matching on a tremendous number of symbols and allow
-// supplementary retrieval for short queries.
-//
-// Short names (total segment length <3 characters) are currently ignored.
+/// This is used to mark unigrams and bigrams and distinct them from complete
+/// trigrams. Since '$' is not present in valid identifier names, it is safe to
+/// use it as the special symbol.
+static const char END_MARKER = '$';
+
 std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
   // Apply fuzzy matching text segmentation.
   std::vector<CharRole> Roles(Identifier.size());
@@ -50,17 +47,45 @@
   // not available then 0 is stored.
   std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size());
   unsigned NextTail = 0, NextHead = 0, NextNextHead = 0;
+  // Store two first HEAD characters in the identifier (if present).
+  std::deque<char> TwoHeads;
   for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
     Next[I] = {{NextTail, NextHead, NextNextHead}};
     NextTail = Roles[I] == Tail ? I : 0;
     if (Roles[I] == Head) {
       NextNextHead = NextHead;
       NextHead = I;
+      TwoHeads.push_front(LowercaseIdentifier[I]);
+      if (TwoHeads.size() > 2)
+        TwoHeads.pop_back();
     }
   }
 
   DenseSet<Token> UniqueTrigrams;
-  std::array<char, 4> Chars;
+
+  auto add = [&](std::string Chars) {
+    UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
+  };
+
+  // FIXME(kbobyrev): Instead of producing empty trigram for each identifier,
+  // just use True Iterator on the query side when the query string is empty.
+  add({{END_MARKER, END_MARKER, END_MARKER}});
+
+  if (TwoHeads.size() == 2)
+    add({{TwoHeads.front(), TwoHeads.back(), END_MARKER}});
+
+  if (!LowercaseIdentifier.empty())
+    add({{LowercaseIdentifier.front(), END_MARKER, END_MARKER}});
+
+  if (LowercaseIdentifier.size() >= 2)
+    add({{LowercaseIdentifier[0], LowercaseIdentifier[1], END_MARKER}});
+
+  if (LowercaseIdentifier.size() >= 3)
+    add({{LowercaseIdentifier[0], LowercaseIdentifier[1],
+          LowercaseIdentifier[2]}});
+
+  // Iterate through valid seqneces of three characters Fuzzy Matcher can
+  // process.
   for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
     // Skip delimiters.
     if (Roles[I] != Head && Roles[I] != Tail)
@@ -71,13 +96,8 @@
       for (const unsigned K : Next[J]) {
         if (!K)
           continue;
-        Chars = {{LowercaseIdentifier[I], LowercaseIdentifier[J],
-                  LowercaseIdentifier[K], 0}};
-        auto Trigram = Token(Token::Kind::Trigram, Chars.data());
-        // Push unique trigrams to the result.
-        if (!UniqueTrigrams.count(Trigram)) {
-          UniqueTrigrams.insert(Trigram);
-        }
+        add({{LowercaseIdentifier[I], LowercaseIdentifier[J],
+              LowercaseIdentifier[K]}});
       }
     }
   }
@@ -89,33 +109,43 @@
   return Result;
 }
 
-// FIXME(kbobyrev): Similarly, to generateIdentifierTrigrams, this ignores short
-// inputs (total segment length <3 characters).
 std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
   // Apply fuzzy matching text segmentation.
   std::vector<CharRole> Roles(Query.size());
   calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
 
+  // Additional pass is necessary to count valid identifier characters.
+  // Depending on that, this function might return incomplete trigram.
+  unsigned ValidSymbolsCount = 0;
+  for (size_t I = 0; I < Roles.size(); ++I)
+    if (Roles[I] == Head || Roles[I] == Tail)
+      ++ValidSymbolsCount;
+
   std::string LowercaseQuery = Query.lower();
 
   DenseSet<Token> UniqueTrigrams;
-  std::deque<char> Chars;
 
-  for (size_t I = 0; I < LowercaseQuery.size(); ++I) {
-    // If current symbol is delimiter, just skip it.
-    if (Roles[I] != Head && Roles[I] != Tail)
-      continue;
+  // If the number of symbols which can form fuzzy matching trigram is not
+  // sufficient, generate a single incomplete trigram for query.
+  if (ValidSymbolsCount < 3) {
+    std::string Chars = LowercaseQuery.substr(0, std::min(3UL, Query.size()));
+    Chars.append(3 - Chars.size(), END_MARKER);
+    UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
+  } else {
+    std::deque<char> Chars;
+    for (size_t I = 0; I < LowercaseQuery.size(); ++I) {
+      // If current symbol is delimiter, just skip it.
+      if (Roles[I] != Head && Roles[I] != Tail)
+        continue;
+
+      Chars.push_back(LowercaseQuery[I]);
 
-    Chars.push_back(LowercaseQuery[I]);
+      if (Chars.size() > 3)
+        Chars.pop_front();
 
-    if (Chars.size() > 3)
-      Chars.pop_front();
-    if (Chars.size() == 3) {
-      auto Trigram =
-          Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars)));
-      // Push unique trigrams to the result.
-      if (!UniqueTrigrams.count(Trigram)) {
-        UniqueTrigrams.insert(Trigram);
+      if (Chars.size() == 3) {
+        UniqueTrigrams.insert(
+            Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars))));
       }
     }
   }

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D50517: [clangd] Generate incomplete trigrams for the Dex index

Reply via email to