[PATCH] D49417: [clangd] Implement trigram generation algorithm for new symbol index

Kirill Bobyrev via Phabricator via cfe-commits Wed, 18 Jul 2018 08:03:02 -0700

omtcyfz updated this revision to Diff 156073.
omtcyfz marked 11 inline comments as done.
omtcyfz added a comment.


Addressed all comments by Eric.

As discussed internally, I should also exercise my naming skills and come up 
with a better for the symbol index to substitute "Noctem" which doesn't point 
to any project's feature.


https://reviews.llvm.org/D49417

Files:
  clang-tools-extra/clangd/CMakeLists.txt
  clang-tools-extra/clangd/index/noctem/SearchToken.cpp
  clang-tools-extra/clangd/index/noctem/SearchToken.h
  clang-tools-extra/unittests/clangd/CMakeLists.txt
  clang-tools-extra/unittests/clangd/NoctemIndexTests.cpp

Index: clang-tools-extra/unittests/clangd/NoctemIndexTests.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/unittests/clangd/NoctemIndexTests.cpp
@@ -0,0 +1,95 @@
+//===-- NoctemIndexTests.cpp  -------------------------*- C++ -*-----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "index/noctem/SearchToken.h"
+#include "llvm/ADT/SmallString.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace clangd {
+namespace noctem {
+
+std::vector<llvm::SmallString<10>>
+toSmallStrings(const std::vector<std::string> Strings) {
+  std::vector<llvm::SmallString<10>> Result(Strings.size());
+  for (size_t Index = 0; Index < Strings.size(); ++Index) {
+    Result[Index] = Strings[Index];
+  }
+  return Result;
+}
+
+std::vector<SearchToken>
+getTrigrams(std::initializer_list<std::string> Trigrams) {
+  std::vector<SearchToken> Result;
+  for (const auto &Symbols : Trigrams) {
+    Result.push_back(SearchToken(Symbols, SearchToken::Kind::Trigram));
+  }
+  return Result;
+}
+
+TEST(NoctemIndexTokens, TrigramSymbolNameTokenization) {
+  EXPECT_EQ(segmentIdentifier("unique_ptr"), toSmallStrings({"unique", "ptr"}));
+
+  EXPECT_EQ(segmentIdentifier("TUDecl"), toSmallStrings({"TU", "Decl"}));
+
+  EXPECT_EQ(segmentIdentifier("table_name_"),
+            toSmallStrings({"table", "name"}));
+
+  EXPECT_EQ(segmentIdentifier("kDaysInAWeek"),
+            toSmallStrings({"k", "Days", "In", "A", "Week"}));
+
+  EXPECT_EQ(segmentIdentifier("AlternateUrlTableErrors"),
+            toSmallStrings({"Alternate", "Url", "Table", "Errors"}));
+
+  EXPECT_EQ(segmentIdentifier("IsOK"), toSmallStrings({"Is", "OK"}));
+
+  EXPECT_EQ(segmentIdentifier("ABSL_FALLTHROUGH_INTENDED"),
+            toSmallStrings({"ABSL", "FALLTHROUGH", "INTENDED"}));
+
+  EXPECT_EQ(segmentIdentifier("SystemZ"), toSmallStrings({"System", "Z"}));
+
+  EXPECT_EQ(segmentIdentifier("X86"), toSmallStrings({"X86"}));
+
+  EXPECT_EQ(segmentIdentifier("ASTNodeKind"),
+            toSmallStrings({"AST", "Node", "Kind"}));
+
+  EXPECT_EQ(segmentIdentifier("ObjCDictionaryElement"),
+            toSmallStrings({"Obj", "C", "Dictionary", "Element"}));
+
+  EXPECT_EQ(segmentIdentifier("multiple__underscores___everywhere____"),
+            toSmallStrings({"multiple", "underscores", "everywhere"}));
+
+  EXPECT_EQ(segmentIdentifier("__cuda_builtin_threadIdx_t"),
+            toSmallStrings({"cuda", "builtin", "thread", "Idx", "t"}));
+
+  EXPECT_EQ(segmentIdentifier("longUPPERCASESequence"),
+            toSmallStrings({"long", "UPPERCASE", "Sequence"}));
+}
+
+TEST(NoctemIndexTrigrams, TrigramGeneration) {
+  EXPECT_EQ(
+      generateTrigrams("a_b_c_d_e_"),
+      getTrigrams({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde"}));
+
+  EXPECT_EQ(generateTrigrams("clangd"),
+            getTrigrams({"cla", "lan", "ang", "ngd"}));
+
+  EXPECT_EQ(generateTrigrams("abc_def"),
+            getTrigrams({"abc", "def", "abd", "ade", "bcd", "bde", "cde"}));
+
+  EXPECT_EQ(generateTrigrams("unique_ptr"),
+            getTrigrams({"uni", "niq", "iqu", "que", "ptr", "unp", "upt", "nip",
+                         "npt", "iqp", "ipt", "qup", "qpt", "uep", "ept"}));
+
+  EXPECT_EQ(generateTrigrams("nl"), getTrigrams({}));
+}
+
+} // namespace noctem
+} // namespace clangd
+} // namespace clang
Index: clang-tools-extra/unittests/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/unittests/clangd/CMakeLists.txt
+++ clang-tools-extra/unittests/clangd/CMakeLists.txt
@@ -23,6 +23,7 @@
   GlobalCompilationDatabaseTests.cpp
   HeadersTests.cpp
   IndexTests.cpp
+  NoctemIndexTests.cpp
   QualityTests.cpp
   SourceCodeTests.cpp
   SymbolCollectorTests.cpp
Index: clang-tools-extra/clangd/index/noctem/SearchToken.h
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/noctem/SearchToken.h
@@ -0,0 +1,194 @@
+//===--- SearchToken.h- Symbol Search primitive ------------------*- C++
+//-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// SearchTokens are keys for inverted index which are mapped to the
+// corresponding posting lists. SearchToken objects represent a characteristic
+// of a symbol, which can be used to perform efficient search.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_NOCTEM_TRIGRAM_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_NOCTEM_TRIGRAM_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallString.h"
+#include <vector>
+
+namespace clang {
+namespace clangd {
+namespace noctem {
+
+/// \brief Hashable SearchToken, which represents a search token primitive.
+///
+/// The following items are examples of search atoms:
+///
+/// * Trigram for fuzzy search on unqualified symbol names.
+/// * Proximity path primitives, e.g. "symbol is defined in directory
+///   $HOME/dev/llvm or its prefix".
+/// * Scope primitives, e.g. "symbol belongs to namespace foo::bar or its
+///   prefix".
+/// * If the symbol represents a variable, token can be its type such as int,
+///   clang::Decl, …
+/// * For a symbol representing a function, this can be the
+///   return type.
+///
+/// Tokens can be used to perform more sophisticated search queries by
+/// constructing complex iterator trees.
+class SearchToken {
+public:
+  enum class Kind : short {
+    Trigram,
+    Scope,
+    Path,
+    Type,
+
+    Custom,
+  };
+
+  SearchToken(llvm::StringRef Data, Kind TokenKind);
+
+  // Returns precomputed hash.
+  size_t operator()(const SearchToken &T) const { return Hash; }
+
+  bool operator==(const SearchToken &Other) const {
+    return TokenKind == Other.TokenKind && Data == Other.Data;
+  }
+
+  const llvm::StringRef data() const { return Data; }
+
+  const Kind &kind() const { return TokenKind; }
+
+private:
+  friend llvm::hash_code hash_value(const SearchToken &Token) {
+    return Token.Hash;
+  }
+
+  /// \brief Representation which is unique among SearchToken with the same
+  /// TokenKind.
+  ///
+  /// These are the examples of Data for different TokenKinds (Token
+  /// Namespaces):
+  ///
+  /// * Trigram: 3 bytes containing trigram characters
+  /// * Scope: full scope name, e.g. "foo::bar::baz"
+  /// * Path: full or relative path to the directory
+  /// * Type: full type name or the USR associated with this type
+  llvm::SmallString<3> Data;
+  /// \brief Precomputed hash which is used as a key for inverted index.
+  size_t Hash;
+  Kind TokenKind;
+};
+
+/// \brief Splits unqualified symbol name into segments and casts letters to
+/// lowercase for trigram generation.
+///
+/// First stage of trigram generation algorithm. Given an unqualified symbol
+/// name, this outputs a sequence of string segments using the following rules:
+///
+/// * '_' is a separator. Multiple consecutive underscores are treated as a
+///   single separator. Underscores at the beginning and the end of the symbol
+///   name are skipped.
+///
+///   Examples: "unique_ptr" -> ["unique", "ptr"],
+///             "__builtin_popcount" -> ["builtin", "popcount"]
+///             "snake____case___" -> ["snake", "case"]
+///
+/// * Lowercase letter followed by an uppercase letter is a separator.
+///
+///   Examples: "kItemsCount" -> ["k", "Items", "Count"]
+///
+/// * Sequences of consecutive uppercase letters followed by a lowercase letter:
+///   the last uppercase letter is treated as the beginning of a next segment.
+///
+///   Examples: "TUDecl" -> ["TU", "Decl"]
+///             "kDaysInAWeek" -> ["k", "Days", "In", "A", "Week"]
+///
+/// Note: digits are treated as lowercase letters. Example: "X86" -> ["X86"]
+/// FIXME(kbobyrev): Change name of this and everywhere else.
+std::vector<llvm::SmallString<10>>
+segmentIdentifier(llvm::StringRef SymbolName);
+
+/// \brief Returns list of unique fuzzy-search trigrams from unqualified symbol.
+///
+/// Runs trigram generation for fuzzy-search index on segments produced by
+/// segmentIdentifier(SymbolName);
+///
+///
+/// The motivation for trigram generation algorithm is that extracted trigrams
+/// are 3-char suffixes of paths through the fuzzy matching automaton. There are
+/// four classes of extracted trigrams:
+///
+/// * The simplest one consists of consecutive 3-char sequences of each segment.
+///
+///   Example: "trigram" -> ["tri", "rig", "igr", "gra", "ram"
+///
+/// * Next class consists of front character of subsequent segments.
+///
+///   Example: ["translation", "unit", "decl"] -> ["tud"]
+///
+///   Note: skipping segments is allowed, but not more than one. For example,
+///   given ["a", "b", "c", "d", "e"] -> "ace" is allowed, but "ade" is not.
+///
+/// * Another class of trigrams consists of those with 2 charactersin one
+///   segment and the front character of subsequent segment (just as before,
+///   skipping up to one segment is allowed).
+///
+///   Example: ["ab", "c", "d", "e"] -> ["abc", "abd", "abe"]
+///   Note: similarly to the previous case, "abe" would not be allowed.
+///
+/// * The last class of trigrams is similar to the previous one: it takes one
+///   character from one segment and two front characters from the next or
+///   skip-1-next segments.
+///
+///   Example: ["a", "bc", "de", "fg"] -> ["abc", "ade"]
+///   But not "afg".
+///
+///  Note: the returned list of trigrams does not have duplicates, if any
+///  trigram belongs to more than one class it is only inserted once.
+std::vector<SearchToken>
+generateTrigrams(const std::vector<llvm::SmallString<10>> &Segments);
+
+/// \brief Combines segmentIdentifier() and generateTrigrams(Segments).
+std::vector<SearchToken> generateTrigrams(llvm::StringRef SymbolName);
+
+} // namespace noctem
+} // namespace clangd
+} // namespace clang
+
+namespace llvm {
+
+// Support SearchTokens as DenseMap keys.
+template <> struct DenseMapInfo<clang::clangd::noctem::SearchToken> {
+
+  static inline clang::clangd::noctem::SearchToken getEmptyKey() {
+    static clang::clangd::noctem::SearchToken EmptyKey(
+        "EMPTYKEY", clang::clangd::noctem::SearchToken::Kind::Custom);
+    return EmptyKey;
+  }
+
+  static inline clang::clangd::noctem::SearchToken getTombstoneKey() {
+    static clang::clangd::noctem::SearchToken TombstoneKey(
+        "TOMBSTONE_KEY", clang::clangd::noctem::SearchToken::Kind::Custom);
+    return TombstoneKey;
+  }
+
+  static unsigned getHashValue(const clang::clangd::noctem::SearchToken &Tag) {
+    return hash_value(Tag);
+  }
+
+  static bool isEqual(const clang::clangd::noctem::SearchToken &LHS,
+                      const clang::clangd::noctem::SearchToken &RHS) {
+    return LHS == RHS;
+  }
+};
+
+} // namespace llvm
+
+#endif
Index: clang-tools-extra/clangd/index/noctem/SearchToken.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/noctem/SearchToken.cpp
@@ -0,0 +1,192 @@
+//===--- SearchToken.cpp- Symbol Search primitive ----------------*- C++
+//-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SearchToken.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Twine.h"
+
+#include <cctype>
+#include <string>
+
+using namespace llvm;
+
+namespace clang {
+namespace clangd {
+namespace noctem {
+
+SearchToken::SearchToken(llvm::StringRef Data, Kind TokenKind)
+    : Data(Data), TokenKind(TokenKind) {
+  assert(TokenKind != Kind::Trigram ||
+         Data.size() == 3 && "Trigram should contain three characters.");
+  switch (TokenKind) {
+  case Kind::Trigram:
+    Hash = ((Data[0] << 16) & (Data[1] << 8) & Data[2]);
+    break;
+  default:
+    Hash = std::hash<std::string>{}(Data);
+    break;
+  }
+}
+
+// FIXME(kbobyrev): Deal with short symbol symbol names. A viable approach would
+// be generating unigrams and bigrams here, too. This would prevent symbol index
+// from applying fuzzy matching on a tremendous number of symbols and allow
+// supplementary retrieval for short queries.
+std::vector<SearchToken>
+generateTrigrams(const std::vector<llvm::SmallString<10>> &Segments) {
+  llvm::DenseSet<SearchToken> UniqueTrigrams;
+  std::vector<SearchToken> Trigrams;
+
+  // Extract trigrams consisting of first characters of tokens sorted by of
+  // token positions. Trigram generator is allowed to skip 1 word between each
+  // token.
+  //
+  // Example: ["a", "b", "c", "d", "e"]
+  //
+  // would produce -> ["abc", "acd", "ace", ...] (among the others)
+  //
+  // but not -> ["ade"] because two tokens ("b" and "c") would be skipped in
+  // this case.
+  for (auto FirstSegment = Segments.begin(); FirstSegment != Segments.end();
+       ++FirstSegment) {
+    for (auto SecondSegment = FirstSegment + 1;
+         (SecondSegment <= FirstSegment + 2) &&
+         (SecondSegment != Segments.end());
+         ++SecondSegment) {
+      for (auto ThirdSegment = SecondSegment + 1;
+           (ThirdSegment <= SecondSegment + 2) &&
+           (ThirdSegment != Segments.end());
+           ++ThirdSegment) {
+        SearchToken Trigram(
+            (*FirstSegment + *SecondSegment + *ThirdSegment).str(),
+            SearchToken::Kind::Trigram);
+        if (!UniqueTrigrams.count(Trigram)) {
+          UniqueTrigrams.insert(Trigram);
+          Trigrams.push_back(Trigram);
+        }
+      }
+    }
+  }
+
+  // Iterate through each token with a sliding window and extract trigrams
+  // consisting of 3 consecutive characters.
+  //
+  // Example: "delete" -> ["del", "ele", "let", "ete"]
+  for (const auto &Segment : Segments) {
+    // Token should have at least three characters to have trigram substrings.
+    if (Segment.size() < 3)
+      continue;
+
+    for (size_t Position = 0; Position + 2 < Segment.size(); ++Position)
+      Trigrams.push_back(
+          SearchToken(Segment.substr(Position, 3), SearchToken::Kind::Trigram));
+  }
+
+  for (auto FirstSegment = Segments.begin(); FirstSegment != Segments.end();
+       ++FirstSegment) {
+    for (auto SecondSegment = FirstSegment + 1;
+         (SecondSegment <= FirstSegment + 2) &&
+         (SecondSegment != Segments.end());
+         ++SecondSegment) {
+      for (size_t FirstSegmentIndex = 0;
+           FirstSegmentIndex < FirstSegment->size(); ++FirstSegmentIndex) {
+        // Extract trigrams of the third class: one character of the first token
+        // and two characters from the next or skip-1-next token.
+        if (FirstSegmentIndex + 1 < FirstSegment->size()) {
+          SearchToken Trigram((FirstSegment->substr(FirstSegmentIndex, 2) +
+                               SecondSegment->substr(0, 1))
+                                  .str(),
+                              SearchToken::Kind::Trigram);
+          if (!UniqueTrigrams.count(Trigram)) {
+            UniqueTrigrams.insert(Trigram);
+            Trigrams.push_back(Trigram);
+          }
+        }
+        // Extract trigrams of the last class: two character from the first
+        // token and front character from the next or skip-1-next token.
+        if (SecondSegment->size() > 1) {
+          SearchToken Trigram((FirstSegment->substr(FirstSegmentIndex, 1) +
+                               SecondSegment->substr(0, 2))
+                                  .str(),
+                              SearchToken::Kind::Trigram);
+          if (!UniqueTrigrams.count(Trigram)) {
+            UniqueTrigrams.insert(Trigram);
+            Trigrams.push_back(Trigram);
+          }
+        }
+      }
+    }
+  }
+
+  return Trigrams;
+}
+
+std::vector<SmallString<10>> segmentIdentifier(StringRef SymbolName) {
+  std::vector<SmallString<10>> Segments;
+  size_t SegmentStart = 0;
+  // Skip underscores at the beginning, e.g. "__builtin_popcount".
+  while (SymbolName[SegmentStart] == '_')
+    ++SegmentStart;
+
+  for (size_t Index = SegmentStart; Index + 1 < SymbolName.size(); ++Index) {
+    const char CurrentSymbol = SymbolName[Index];
+    const char NextSymbol = SymbolName[Index + 1];
+    // Skip sequences of underscores, e.g. "my__type".
+    if (CurrentSymbol == '_' && NextSymbol == '_') {
+      ++SegmentStart;
+      continue;
+    }
+
+    // Splits if the next symbol is underscore or if processed characters are
+    // [lowercase, Uppercase] which indicates beginning of next token. Digits
+    // are equivalent to lowercase symbols.
+    if ((NextSymbol == '_') ||
+        ((islower(CurrentSymbol) || isdigit(CurrentSymbol)) &&
+         isupper(NextSymbol))) {
+      Segments.push_back(
+          SymbolName.substr(SegmentStart, Index - SegmentStart + 1));
+      SegmentStart = Index + 1;
+      if (NextSymbol == '_')
+        ++SegmentStart;
+    }
+
+    // If there were N (> 1) consecutive uppercase letter the split should
+    // generate two tokens, one of which would consist of N - 1 first uppercase
+    // letters, the next token begins with the last uppercase letter.
+    //
+    // Example: "TUDecl" -> ["TU", "Decl"]
+    if (isupper(CurrentSymbol) &&
+        (islower(NextSymbol) || (isdigit(NextSymbol)))) {
+      // Don't perform split if Index points to the beginning of new token,
+      // otherwise "NamedDecl" would be split into ["N", "amed", "D", "ecl"]
+      if (Index == SegmentStart)
+        continue;
+      Segments.push_back(SymbolName.substr(SegmentStart, Index - SegmentStart));
+      SegmentStart = Index;
+    }
+  }
+
+  if (SegmentStart < SymbolName.size())
+    Segments.push_back(SymbolName.substr(SegmentStart));
+
+  // Apply lowercase text normalization.
+  for (auto &Segment : Segments)
+    std::for_each(Segment.begin(), Segment.end(), ::tolower);
+
+  return Segments;
+}
+
+std::vector<SearchToken> generateTrigrams(llvm::StringRef SymbolName) {
+  return generateTrigrams(segmentIdentifier(SymbolName));
+}
+
+} // namespace noctem
+} // namespace clangd
+} // namespace clang
Index: clang-tools-extra/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/clangd/CMakeLists.txt
+++ clang-tools-extra/clangd/CMakeLists.txt
@@ -34,14 +34,17 @@
   TUScheduler.cpp
   URI.cpp
   XRefs.cpp
+
   index/CanonicalIncludes.cpp
   index/FileIndex.cpp
   index/Index.cpp
   index/MemIndex.cpp
   index/Merge.cpp
   index/SymbolCollector.cpp
   index/SymbolYAML.cpp
 
+  index/noctem/SearchToken.cpp
+
   LINK_LIBS
   clangAST
   clangASTMatchers

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D49417: [clangd] Implement trigram generation algorithm for new symbol index

Reply via email to