omtcyfz updated this revision to Diff 156443.
omtcyfz marked 35 inline comments as done.
omtcyfz added a comment.
Addressed most comments (aside from reusing fuzzy matching segmentation routine
and making data + hash a separate structure).
Since I already submitted my next revision (https://reviews.llvm.org/D49546)
through another account with more verbose and less confusing username I will
remove everyone from this revision and create this revision under the latter
account.
https://reviews.llvm.org/D49417
Files:
clang-tools-extra/clangd/CMakeLists.txt
clang-tools-extra/clangd/index/dex/Token.cpp
clang-tools-extra/clangd/index/dex/Token.h
clang-tools-extra/clangd/index/dex/Trigram.cpp
clang-tools-extra/clangd/index/dex/Trigram.h
clang-tools-extra/unittests/clangd/CMakeLists.txt
clang-tools-extra/unittests/clangd/DexIndexTests.cpp
Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp
@@ -0,0 +1,92 @@
+//===-- DexIndexTests.cpp ----------------------------*- C++ -*-----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "index/dex/Token.h"
+#include "index/dex/Trigram.h"
+#include "gtest/gtest.h"
+
+#include <string>
+#include <vector>
+
+using std::string;
+using std::vector;
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+vector<Token> getTrigrams(std::initializer_list<string> Trigrams) {
+ vector<Token> Result;
+ for (const auto &Symbols : Trigrams) {
+ Result.push_back(Token(Symbols, Token::Kind::Trigram));
+ }
+ return Result;
+}
+
+TEST(DexIndexTokens, TrigramSymbolNameTokenization) {
+ EXPECT_EQ(segmentIdentifier("unique_ptr"), vector<string>({"unique", "ptr"}));
+
+ EXPECT_EQ(segmentIdentifier("TUDecl"), vector<string>({"TU", "Decl"}));
+
+ EXPECT_EQ(segmentIdentifier("table_name_"),
+ vector<string>({"table", "name"}));
+
+ EXPECT_EQ(segmentIdentifier("kDaysInAWeek"),
+ vector<string>({"k", "Days", "In", "A", "Week"}));
+
+ EXPECT_EQ(segmentIdentifier("AlternateUrlTableErrors"),
+ vector<string>({"Alternate", "Url", "Table", "Errors"}));
+
+ EXPECT_EQ(segmentIdentifier("IsOK"), vector<string>({"Is", "OK"}));
+
+ EXPECT_EQ(segmentIdentifier("ABSL_FALLTHROUGH_INTENDED"),
+ vector<string>({"ABSL", "FALLTHROUGH", "INTENDED"}));
+
+ EXPECT_EQ(segmentIdentifier("SystemZ"), vector<string>({"System", "Z"}));
+
+ EXPECT_EQ(segmentIdentifier("X86"), vector<string>({"X86"}));
+
+ EXPECT_EQ(segmentIdentifier("ASTNodeKind"),
+ vector<string>({"AST", "Node", "Kind"}));
+
+ EXPECT_EQ(segmentIdentifier("ObjCDictionaryElement"),
+ vector<string>({"Obj", "C", "Dictionary", "Element"}));
+
+ EXPECT_EQ(segmentIdentifier("multiple__underscores___everywhere____"),
+ vector<string>({"multiple", "underscores", "everywhere"}));
+
+ EXPECT_EQ(segmentIdentifier("__cuda_builtin_threadIdx_t"),
+ vector<string>({"cuda", "builtin", "thread", "Idx", "t"}));
+
+ EXPECT_EQ(segmentIdentifier("longUPPERCASESequence"),
+ vector<string>({"long", "UPPERCASE", "Sequence"}));
+}
+
+// FIXME(kbobyrev): Add a test for "ab_cd_ef_gh".
+TEST(DexIndexTrigrams, TrigramGeneration) {
+ EXPECT_EQ(
+ generateTrigrams(segmentIdentifier("a_b_c_d_e_")),
+ getTrigrams({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde"}));
+
+ EXPECT_EQ(generateTrigrams(segmentIdentifier("clangd")),
+ getTrigrams({"cla", "lan", "ang", "ngd"}));
+
+ EXPECT_EQ(generateTrigrams(segmentIdentifier("abc_def")),
+ getTrigrams({"abc", "def", "abd", "ade", "bcd", "bde", "cde"}));
+
+ EXPECT_EQ(generateTrigrams(segmentIdentifier("unique_ptr")),
+ getTrigrams({"uni", "niq", "iqu", "que", "ptr", "unp", "upt", "nip",
+ "npt", "iqp", "ipt", "qup", "qpt", "uep", "ept"}));
+
+ EXPECT_EQ(generateTrigrams(segmentIdentifier("nl")), getTrigrams({}));
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
Index: clang-tools-extra/unittests/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/unittests/clangd/CMakeLists.txt
+++ clang-tools-extra/unittests/clangd/CMakeLists.txt
@@ -15,9 +15,10 @@
CodeCompleteTests.cpp
CodeCompletionStringsTests.cpp
ContextTests.cpp
+ DexIndexTests.cpp
DraftStoreTests.cpp
- FileIndexTests.cpp
FileDistanceTests.cpp
+ FileIndexTests.cpp
FindSymbolsTests.cpp
FuzzyMatchTests.cpp
GlobalCompilationDatabaseTests.cpp
@@ -27,11 +28,11 @@
SourceCodeTests.cpp
SymbolCollectorTests.cpp
SyncAPI.cpp
+ TUSchedulerTests.cpp
TestFS.cpp
TestTU.cpp
ThreadingTests.cpp
TraceTests.cpp
- TUSchedulerTests.cpp
URITests.cpp
XRefsTests.cpp
)
Index: clang-tools-extra/clangd/index/dex/Trigram.h
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/Trigram.h
@@ -0,0 +1,101 @@
+//===--- Trigram.h - Trigram generation for Fuzzy Matching ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// T
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H
+
+#include "Token.h"
+
+#include <string>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+/// Splits unqualified symbol name into segments and casts letters to lowercase
+/// for trigram generation.
+///
+/// First stage of trigram generation algorithm. Given an unqualified symbol
+/// name, this outputs a sequence of string segments using the following rules:
+///
+/// * '_' is a separator. Multiple consecutive underscores are treated as a
+/// single separator. Underscores at the beginning and the end of the symbol
+/// name are skipped.
+///
+/// Examples: "unique_ptr" -> ["unique", "ptr"],
+/// "__builtin_popcount" -> ["builtin", "popcount"]
+/// "snake____case___" -> ["snake", "case"]
+///
+/// * Lowercase letter followed by an uppercase letter is a separator.
+///
+/// Examples: "kItemsCount" -> ["k", "Items", "Count"]
+///
+/// * Sequences of consecutive uppercase letters followed by a lowercase letter:
+/// the last uppercase letter is treated as the beginning of a next segment.
+///
+/// Examples: "TUDecl" -> ["TU", "Decl"]
+/// "kDaysInAWeek" -> ["k", "Days", "In", "A", "Week"]
+///
+/// Note: digits are treated as lowercase letters. Example: "X86" -> ["X86"]
+///
+/// FIXME(kbobyrev): Use the same segmentation algorithm as in fuzzy matching.
+/// FIXME(kbobyrev): Return StringRefs or Offsets.
+std::vector<std::string>
+segmentIdentifier(llvm::StringRef SymbolName);
+
+/// Returns list of unique fuzzy-search trigrams from unqualified symbol.
+///
+/// Runs trigram generation for fuzzy-search index on segments produced by
+/// segmentIdentifier(SymbolName);
+///
+///
+/// The motivation for trigram generation algorithm is that extracted trigrams
+/// are 3-char suffixes of paths through the fuzzy matching automaton. There are
+/// four classes of extracted trigrams:
+///
+/// * The simplest one consists of consecutive 3-char sequences of each segment.
+///
+/// Example: "trigram" -> ["tri", "rig", "igr", "gra", "ram"
+///
+/// * Next class consists of front character of subsequent segments.
+///
+/// Example: ["translation", "unit", "decl"] -> ["tud"]
+///
+/// Note: skipping segments is allowed, but not more than one. For example,
+/// given ["a", "b", "c", "d", "e"] -> "ace" is allowed, but "ade" is not.
+///
+/// * Another class of trigrams consists of those with 2 charactersin one
+/// segment and the front character of subsequent segment (just as before,
+/// skipping up to one segment is allowed).
+///
+/// Example: ["ab", "c", "d", "e"] -> ["abc", "abd", "abe"]
+/// Note: similarly to the previous case, "abe" would not be allowed.
+///
+/// * The last class of trigrams is similar to the previous one: it takes one
+/// character from one segment and two front characters from the next or
+/// skip-1-next segments.
+///
+/// Example: ["a", "bc", "de", "fg"] -> ["abc", "ade"]
+/// But not "afg".
+///
+/// Note: the returned list of trigrams does not have duplicates, if any
+/// trigram belongs to more than one class it is only inserted once.
+std::vector<Token>
+generateTrigrams(const std::vector<std::string> &Segments);
+
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
+
+#endif
Index: clang-tools-extra/clangd/index/dex/Trigram.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/Trigram.cpp
@@ -0,0 +1,178 @@
+//===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "Trigram.h"
+#include "Token.h"
+
+#include "llvm/ADT/DenseSet.h"
+
+#include <cctype>
+#include <string>
+
+using namespace llvm;
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+// FIXME(kbobyrev): Deal with short symbol symbol names. A viable approach would
+// be generating unigrams and bigrams here, too. This would prevent symbol index
+// from applying fuzzy matching on a tremendous number of symbols and allow
+// supplementary retrieval for short queries.
+// Short names (< 3 characters) are currently ignored.
+std::vector<Token> generateTrigrams(const std::vector<std::string> &Segments) {
+ llvm::DenseSet<Token> UniqueTrigrams;
+ std::vector<Token> Trigrams;
+
+ // Extract trigrams consisting of first characters of tokens sorted bytoken
+ // positions. Trigram generator is allowed to skip 1 word between each token.
+ //
+ // Example: ["a", "b", "c", "d", "e"]
+ //
+ // would produce -> ["abc", "acd", "ace", ...] (among the others)
+ //
+ // but not -> ["ade"] because two tokens ("b" and "c") would be skipped in
+ // this case.
+ for (auto FirstSegment = Segments.begin(); FirstSegment != Segments.end();
+ ++FirstSegment) {
+ for (auto SecondSegment = FirstSegment + 1;
+ (SecondSegment <= FirstSegment + 2) &&
+ (SecondSegment != Segments.end());
+ ++SecondSegment) {
+ for (auto ThirdSegment = SecondSegment + 1;
+ (ThirdSegment <= SecondSegment + 2) &&
+ (ThirdSegment != Segments.end());
+ ++ThirdSegment) {
+ // FIXME(kbobryev): This is wrong. Should be *FirstSegment[0] + ...
+ Token Trigram((*FirstSegment + *SecondSegment + *ThirdSegment),
+ Token::Kind::Trigram);
+ if (!UniqueTrigrams.count(Trigram)) {
+ UniqueTrigrams.insert(Trigram);
+ Trigrams.push_back(Trigram);
+ }
+ }
+ }
+ }
+
+ // Iterate through each token with a sliding window and extract trigrams
+ // consisting of 3 consecutive characters.
+ //
+ // Example: "delete" -> ["del", "ele", "let", "ete"]
+ for (const auto &Segment : Segments) {
+ // Token should have at least three characters to have trigram substrings.
+ if (Segment.size() < 3)
+ continue;
+
+ for (size_t Position = 0; Position + 2 < Segment.size(); ++Position)
+ Trigrams.push_back(
+ Token(Segment.substr(Position, 3), Token::Kind::Trigram));
+ }
+
+ // This loop generates both trigrams of the third and fourth classes. It
+ // iterates through each two "subsequent" (consecutive or skip-1-next) tokens
+ // and extracts trigrams out of each pair.
+ for (auto FirstSegment = Segments.begin(); FirstSegment != Segments.end();
+ ++FirstSegment) {
+ for (auto SecondSegment = FirstSegment + 1;
+ (SecondSegment <= FirstSegment + 2) &&
+ (SecondSegment != Segments.end());
+ ++SecondSegment) {
+ for (size_t FirstSegmentIndex = 0;
+ FirstSegmentIndex < FirstSegment->size(); ++FirstSegmentIndex) {
+ // Extract trigrams of the third class: one character of the first token
+ // and two characters from the next or skip-1-next token.
+ if (FirstSegmentIndex + 1 < FirstSegment->size()) {
+ Token Trigram((FirstSegment->substr(FirstSegmentIndex, 2) +
+ SecondSegment->substr(0, 1)),
+ Token::Kind::Trigram);
+ if (!UniqueTrigrams.count(Trigram)) {
+ UniqueTrigrams.insert(Trigram);
+ Trigrams.push_back(Trigram);
+ }
+ }
+ // Extract trigrams of the last class: two character from the first
+ // token and front character from the next or skip-1-next token.
+ if (SecondSegment->size() > 1) {
+ Token Trigram((FirstSegment->substr(FirstSegmentIndex, 1) +
+ SecondSegment->substr(0, 2)),
+ Token::Kind::Trigram);
+ if (!UniqueTrigrams.count(Trigram)) {
+ UniqueTrigrams.insert(Trigram);
+ Trigrams.push_back(Trigram);
+ }
+ }
+ }
+ }
+ }
+
+ return Trigrams;
+}
+
+std::vector<std::string> segmentIdentifier(StringRef SymbolName) {
+ std::vector<std::string> Segments;
+ size_t SegmentStart = 0;
+ // Skip underscores at the beginning, e.g. "__builtin_popcount".
+ while (SymbolName[SegmentStart] == '_')
+ ++SegmentStart;
+
+ for (size_t Index = SegmentStart; Index + 1 < SymbolName.size(); ++Index) {
+ const char CurrentSymbol = SymbolName[Index];
+ const char NextSymbol = SymbolName[Index + 1];
+ // Skip sequences of underscores, e.g. "my__type".
+ if (CurrentSymbol == '_' && NextSymbol == '_') {
+ ++SegmentStart;
+ continue;
+ }
+
+ // Splits if the next symbol is underscore or if processed characters are
+ // [lowercase, Uppercase] which indicates beginning of next token. Digits
+ // are equivalent to lowercase symbols.
+ if ((NextSymbol == '_') ||
+ ((islower(CurrentSymbol) || isdigit(CurrentSymbol)) &&
+ isupper(NextSymbol))) {
+ Segments.push_back(
+ SymbolName.substr(SegmentStart, Index - SegmentStart + 1));
+ SegmentStart = Index + 1;
+ if (NextSymbol == '_')
+ ++SegmentStart;
+ }
+
+ // If there were N (> 1) consecutive uppercase letter the split should
+ // generate two tokens, one of which would consist of N - 1 first uppercase
+ // letters, the next token begins with the last uppercase letter.
+ //
+ // Example: "TUDecl" -> ["TU", "Decl"]
+ if (isupper(CurrentSymbol) &&
+ (islower(NextSymbol) || (isdigit(NextSymbol)))) {
+ // Don't perform split if Index points to the beginning of new token,
+ // otherwise "NamedDecl" would be split into ["N", "amed", "D", "ecl"]
+ if (Index == SegmentStart)
+ continue;
+ Segments.push_back(SymbolName.substr(SegmentStart, Index - SegmentStart));
+ SegmentStart = Index;
+ }
+ }
+
+ if (SegmentStart < SymbolName.size())
+ Segments.push_back(SymbolName.substr(SegmentStart));
+
+ // Apply lowercase text normalization.
+ for (auto &Segment : Segments)
+ std::for_each(Segment.begin(), Segment.end(), ::tolower);
+
+ return Segments;
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
Index: clang-tools-extra/clangd/index/dex/Token.h
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/Token.h
@@ -0,0 +1,115 @@
+//===--- Token.h - Symbol Search primitive ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Tokens are keys for inverted index which are mapped to the
+// corresponding posting lists. Token objects represent a characteristic
+// of a symbol, which can be used to perform efficient search.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H
+
+#include "llvm/ADT/DenseMap.h"
+
+#include <string>
+#include <vector>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+/// Hashable Token, which represents a search token primitive, such as
+/// trigram for fuzzy search on unqualified symbol names.
+///
+/// Tokens can be used to perform more sophisticated search queries by
+/// constructing complex iterator trees.
+class Token {
+public:
+ /// Kind specifies Token type which defines semantics for the internal
+ /// representation (Data field), examples of such types are:
+ ///
+ /// * Trigram for fuzzy search on unqualified symbol names.
+ /// * Scope primitives, e.g. "symbol belongs to namespace foo::bar".
+ /// * If the symbol represents a variable, token can be its type such as int,
+ /// clang::Decl, …
+ /// * For a symbol representing a function, this can be the
+ /// return type.
+ ///
+ /// Each Kind has different representation (i.e. Data field contents):
+ ///
+ /// * Trigram: 3 bytes containing trigram characters
+ /// * Scope: full scope name, such as "foo::bar::baz::" or "" (global scope)
+ /// * Path: full or relative path to the directory
+ /// * Type: full type name or the USR associated with this type
+ ///
+ /// More Kinds can be added in the future.
+ enum class Kind {
+ Trigram,
+ Scope,
+ };
+
+ Token(llvm::StringRef Data, Kind TokenKind);
+
+ // Returns precomputed hash.
+ size_t hash(const Token &T) const { return Hash; }
+
+ bool operator==(const Token &Other) const {
+ return Hash == Other.Hash && TokenKind == Other.TokenKind &&
+ Data == Other.Data;
+ }
+
+ llvm::StringRef data() const { return Data; }
+
+ const Kind &kind() const { return TokenKind; }
+
+private:
+ friend llvm::hash_code hash_value(const Token &Token) { return Token.Hash; }
+
+ /// Representation which is unique among Token with the same Kind.
+ // FIXME(kbobyrev): Put this into another structure
+ std::string Data;
+ /// Precomputed hash which is used as a key for inverted index.
+ size_t Hash;
+ Kind TokenKind;
+};
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
+
+namespace llvm {
+
+// Support Tokens as DenseMap keys.
+template <> struct DenseMapInfo<clang::clangd::dex::Token> {
+ static inline clang::clangd::dex::Token getEmptyKey() {
+ static clang::clangd::dex::Token EmptyKey(
+ "EMPTYKEY", clang::clangd::dex::Token::Kind::Scope);
+ return EmptyKey;
+ }
+
+ static inline clang::clangd::dex::Token getTombstoneKey() {
+ static clang::clangd::dex::Token TombstoneKey(
+ "TOMBSTONE_KEY", clang::clangd::dex::Token::Kind::Scope);
+ return TombstoneKey;
+ }
+
+ static unsigned getHashValue(const clang::clangd::dex::Token &Tag) {
+ return hash_value(Tag);
+ }
+
+ static bool isEqual(const clang::clangd::dex::Token &LHS,
+ const clang::clangd::dex::Token &RHS) {
+ return LHS == RHS;
+ }
+};
+
+} // namespace llvm
+
+#endif
Index: clang-tools-extra/clangd/index/dex/Token.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/Token.cpp
@@ -0,0 +1,37 @@
+//===--- Token.cpp - Symbol Search primitive --------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Token.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Twine.h"
+
+#include <cctype>
+#include <string>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+Token::Token(llvm::StringRef Data, Kind TokenKind)
+ : Data(Data), TokenKind(TokenKind) {
+ assert(TokenKind != Kind::Trigram ||
+ Data.size() == 3 && "Trigram should contain three characters.");
+ switch (TokenKind) {
+ case Kind::Trigram:
+ Hash = ((Data[0] << 16) & (Data[1] << 8) & Data[2]);
+ break;
+ default:
+ Hash = std::hash<std::string>{}(Data);
+ break;
+ }
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
Index: clang-tools-extra/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/clangd/CMakeLists.txt
+++ clang-tools-extra/clangd/CMakeLists.txt
@@ -34,14 +34,18 @@
TUScheduler.cpp
URI.cpp
XRefs.cpp
+
index/CanonicalIncludes.cpp
index/FileIndex.cpp
index/Index.cpp
index/MemIndex.cpp
index/Merge.cpp
index/SymbolCollector.cpp
index/SymbolYAML.cpp
+ index/dex/Token.cpp
+ index/dex/Trigram.cpp
+
LINK_LIBS
clangAST
clangASTMatchers
_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits