[PATCH] D40060: [clangd] Fuzzy match scorer

Sam McCall via Phabricator via cfe-commits Tue, 14 Nov 2017 17:39:57 -0800

sammccall created this revision.
Herald added a subscriber: mgorny.

This will be used for rescoring code completion results based on partial
identifiers.
Short-term use:


- we want to limit the number of code completion results returned to improve 
performance of global completion. The scorer will be used to rerank the results 
to return when the user has applied a filter.

Long-term use case:

- ranking of completion results from in-memory index
- merging of completion results from multiple sources (merging usually works 
best when done at the component-score level, rescoring the fuzzy-match quality 
avoids different backends needing to have comparable scores)


https://reviews.llvm.org/D40060

Files:
  clangd/CMakeLists.txt
  clangd/FuzzyMatch.cpp
  clangd/FuzzyMatch.h
  unittests/clangd/CMakeLists.txt
  unittests/clangd/FuzzyMatchTests.cpp

Index: unittests/clangd/FuzzyMatchTests.cpp
===================================================================
--- /dev/null
+++ unittests/clangd/FuzzyMatchTests.cpp
@@ -0,0 +1,122 @@
+//===-- FuzzyMatchTests.cpp - String fuzzy matcher tests --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "FuzzyMatch.h"
+
+#include "llvm/ADT/StringExtras.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang {
+namespace clangd {
+namespace {
+using namespace llvm;
+using testing::Not;
+
+struct MatchesMatcher : public testing::MatcherInterface<StringRef> {
+  StringRef Candidate;
+  MatchesMatcher(StringRef Candidate) : Candidate(Candidate) {}
+
+  void DescribeTo(::std::ostream *OS) const override {
+    *OS << "Matches '" << Candidate.str() << "'";
+  }
+
+  bool MatchAndExplain(StringRef Pattern, testing::MatchResultListener *L) const override{
+    std::unique_ptr<raw_ostream> OS(L->stream() ? (raw_ostream*)(new raw_os_ostream(*L->stream())) : new raw_null_ostream());
+    FuzzyMatcher Matcher(Pattern);
+    auto Result = Matcher.match(Candidate);
+    Matcher.dumpLast(*OS << "\n");
+    return !!Result;
+  }
+};
+
+// Accepts patterns that match a given word.
+// Dumps the debug tables on match failure.
+testing::Matcher<StringRef> matches(StringRef Word) {
+    return testing::MakeMatcher<StringRef>(new MatchesMatcher(Word));
+}
+
+TEST(FuzzyMatch, Matches) {
+  EXPECT_THAT("u_p", matches("unique_ptr"));
+  EXPECT_THAT("up", matches("unique_ptr"));
+  EXPECT_THAT("uq", matches("unique_ptr"));
+  EXPECT_THAT("qp", Not(matches("unique_ptr")));
+  EXPECT_THAT("log", Not(matches("SVGFEMorphologyElement")));
+}
+
+struct RankMatcher : public testing::MatcherInterface<StringRef> {
+  std::vector<StringRef> RankedStrings;
+  RankMatcher(std::initializer_list<StringRef> RankedStrings)
+      : RankedStrings(RankedStrings) {}
+
+  void DescribeTo(::std::ostream *OS) const override {
+    *OS << "Ranks strings in order: ["
+        << join(RankedStrings.begin(), RankedStrings.end(), ", ") << "]";
+  }
+
+  bool MatchAndExplain(StringRef Pattern,
+                       testing::MatchResultListener *L) const override {
+    std::unique_ptr<raw_ostream> OS(
+        L->stream() ? (raw_ostream *)(new raw_os_ostream(*L->stream()))
+                    : new raw_null_ostream());
+    FuzzyMatcher Matcher(Pattern);
+    StringRef LastString;
+    Optional<float> LastScore;
+    bool Ok = true;
+    for (StringRef Str : RankedStrings) {
+      auto Score = Matcher.match(Str);
+      if (!Score) {
+        *OS << "\nDoesn't match '" << Str.str() << "'";
+        Matcher.dumpLast(*OS << "\n");
+        Ok = false;
+      } else if (LastScore && *LastScore < *Score) {
+        *OS << "\nRanks '" << Str.str() << "'=" << *Score << " above '"
+            << LastString.str() << "'=" << *LastScore;
+        Matcher.dumpLast(*OS << "\n");
+        Matcher.match(LastString);
+        Matcher.dumpLast(*OS << "\n");
+        Ok = false;
+      }
+      LastString = Str;
+      LastScore = Score;
+    }
+    return Ok;
+  }
+};
+
+// Accepts patterns that match all the strings and rank them in the given order.
+// Dumps the debug tables on match failure.
+template <typename... T>
+testing::Matcher<StringRef> ranks(T... RankedStrings) {
+  return testing::MakeMatcher<StringRef>(new RankMatcher{RankedStrings...});
+}
+
+TEST(FuzzyMatch, Ranking) {
+  EXPECT_THAT("eb", ranks("emplace_back", "embed"));
+  EXPECT_THAT("cons", ranks("console", "Console", "ArrayBufferConstructor"));
+  EXPECT_THAT("foo", ranks("foo", "Foo"));
+  EXPECT_THAT("onMess", ranks("onMessage", "onmessage", "onThisMegaEscapes"));
+  EXPECT_THAT("CC", ranks("CamelCase", "camelCase"));
+  EXPECT_THAT("cC", ranks("camelCase", "CamelCase"));
+  EXPECT_THAT("p", ranks("parse", "posix", "pafdsa", "path", "p"));
+  EXPECT_THAT("pa", ranks("parse", "path", "pafdsa"));
+  EXPECT_THAT("log", ranks("log", "ScrollLogicalPosition"));
+  EXPECT_THAT("e", ranks("else", "AbstractElement"));
+  EXPECT_THAT("workbench.sideb", ranks("workbench.sideBar.location", "workbench.editor.defaultSideBySideLayout"));
+  EXPECT_THAT("editor.r", ranks("editor.renderControlCharacter", "editor.overviewRulerlanes", "diffEditor.renderSideBySide"));
+  EXPECT_THAT("-mo", ranks("-moz-columns", "-ms-ime-mode"));
+  EXPECT_THAT("convertModelPosition", ranks("convertModelPositionToViewPosition", "convertViewToModelPosition"));
+  EXPECT_THAT("is", ranks("isValidViewletId", "import statement"));
+  EXPECT_THAT("title", ranks("window.title", "files.trimTrailingWhitespace"));
+  EXPECT_THAT("strcpy", ranks("strcpy", "strcpy_s", "strncpy"));
+}
+
+} // namespace
+} // namespace clangd
+} // namespace clang
Index: unittests/clangd/CMakeLists.txt
===================================================================
--- unittests/clangd/CMakeLists.txt
+++ unittests/clangd/CMakeLists.txt
@@ -10,6 +10,7 @@
 
 add_extra_unittest(ClangdTests
   ClangdTests.cpp
+  FuzzyMatchTests.cpp
   JSONExprTests.cpp
   TraceTests.cpp
   )
Index: clangd/FuzzyMatch.h
===================================================================
--- /dev/null
+++ clangd/FuzzyMatch.h
@@ -0,0 +1,64 @@
+//===--- FuzzyMatch.h - Approximate identifier matching  ---------*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements fuzzy-matching of strings against identifiers.
+// It indicates both the existence and quality of a match:
+// 'eb' matches both 'emplace_back' and 'embed', the former has a better score.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace clang {
+namespace clangd {
+
+class FuzzyMatcher {
+public:
+  FuzzyMatcher(llvm::StringRef Pattern);
+
+  // If Word matches the pattern, return a score in [0,1] (higher is better).
+  llvm::Optional<float> match(llvm::StringRef Word);
+
+  // Dump internal state from the last match() to the stream, for debugging.
+  void dumpLast(llvm::raw_ostream &) const;
+
+private:
+  constexpr static int MaxPat = 63;
+  constexpr static int MaxWord = 127;
+  enum CharRole { Unknown, Tail, Head, Separator };
+
+  bool init(llvm::StringRef Word);
+  void buildGraph();
+  void calculateRoles(const char *Text, CharRole *Out, int N);
+  int skipPenalty(int W);
+  int matchBonus(int P, int W);
+
+  char Pat[MaxPat];
+  char LPat[MaxPat];
+  int NPat;
+  char Word[MaxWord];
+  char LWord[MaxWord];
+  int NWord;
+  CharRole PatRole[MaxPat];
+  CharRole WordRole[MaxWord];
+  int Score[MaxPat+1][MaxWord+1]; // Oversized for alignment.
+  bool MatchGraph[MaxPat+1][MaxWord+1];
+  bool IsSubstring;
+  float ScoreScale;
+};
+
+} // namespace clangd
+} // namespace clang
+
+#endif
Index: clangd/FuzzyMatch.cpp
===================================================================
--- /dev/null
+++ clangd/FuzzyMatch.cpp
@@ -0,0 +1,288 @@
+//===--- FuzzyMatch.h - Approximate identifier matching  ---------*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'),
+// we consider the possible partial match states:
+//
+//     u n i q u e _ p t r
+//   +---------------------
+//   |A . . . . . . . . . .
+//  u|
+//   |. . . . . . . . . . .
+//  _|
+//   |. . . . . . . O . . .
+//  p|
+//   |. . . . . . . . . . B
+//
+// Each dot represents some prefix of the pattern being matched against some
+// prefix of the word.
+//   - A is the initial state: '' matched against ''
+//   - O is an intermediate state: 'u_' matched against 'unique_'
+//   - B is the target state: 'u_p' matched against 'unique_ptr'
+//
+// We aim to find the best path from A->B.
+//  - Moving right (consuming a word character)
+//    Always legal: not all word characters must match.
+//  - Moving diagonally (consuming both a word and pattern character)
+//    Legal if the characters match.
+//  - Moving down (consuming a pattern character) is never legal.
+//    Never legal: all pattern characters must match something.
+//
+// The scoring is based on heuristics:
+//  - when matching a character, apply a bonus or penalty depending on the
+//    match quality (does case match, do word segments align, etc)
+//  - when skipping a character, apply a penalty if it starts a word segment
+//  - if the first pattern character matches the middle of a word segment, then
+//    apply such a huge penalty that this match is never used.
+//
+// We treat strings as byte-sequences, so only ASCII has first-class support.
+//
+// This algorithm is inspired by VS code's client-side filtering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "FuzzyMatch.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/Support/Format.h"
+
+using namespace clang::clangd;
+using namespace llvm;
+
+const int FuzzyMatcher::MaxPat;
+const int FuzzyMatcher::MaxWord;
+
+static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; }
+// A "negative infinity" score that won't overflow.
+// We use this to mark unreachable states and forbidden solutions.
+static constexpr int AwfulScore = std::numeric_limits<int>::min() / 2;
+static bool isAwful(int S) { return S < AwfulScore/2; }
+
+FuzzyMatcher::FuzzyMatcher(StringRef Pattern)
+    : NPat(std::min<int>(MaxPat, Pattern.size())), NWord(0),
+      ScoreScale(0.5f / NPat) {
+  memcpy(Pat, Pattern.data(), NPat);
+  for (int I = 0; I < NPat; ++I)
+    LPat[I] = lower(Pat[I]);
+  Score[0][0] = 0;
+  for (int P = 0; P <= NPat; ++P)
+    for (int W = 0; W < P; ++W)
+      Score[P][W] = AwfulScore;
+  calculateRoles(Pat, PatRole, NPat);
+}
+
+Optional<float> FuzzyMatcher::match(StringRef Word) {
+  if (!NPat)
+    return 1;
+  if (!(IsSubstring = init(Word)))
+    return None;
+  buildGraph();
+  if (isAwful(Score[NPat][NWord]))
+    return None;
+  return ScoreScale * std::min(2 * NPat, std::max(0, Score[NPat][NWord]));
+}
+
+// Segmentation of words and patterns
+// We mark each character as the Head or Tail of a segment, or a Separator.
+// e.g. XMLHttpRequest_Async
+//      +--+---+------ +----
+//
+// This happens in two stages:
+//   1. We classify chars into Types:
+//        Upper is uppercase letters
+//        Lower is lowercase letters, and also numbers
+//        Punctuation is most other things
+//   2. A characters segment role can be determined by the Types of itself and
+//      its neighbors. e.g. the middle entry in (Upper, Upper, Lower) is a Head.
+//      The Empty type is used to represent the start/end of string.
+//
+// Both of these stages are performed using lookup tables.
+// Type and Role can each be represented into 2 bits, so we pack 4 into a byte.
+namespace {
+enum CharType { Empty, Lower, Upper, Punctuation };
+// 4 packed CharTypes per entry, indexed by char.
+constexpr uint8_t CharTypes[] = {
+  0x00, 0x00, 0x00, 0x00, // Control characters
+  0x00, 0x00, 0x00, 0x00, // Control characters
+  0xff, 0xff, 0xff, 0xff, // Punctuation
+  0x55, 0x55, 0xf5, 0xff, // Numbers->Lower, more Punctuation.
+  0xab, 0xaa, 0xaa, 0xaa, // @ and A-O
+  0xaa, 0xaa, 0xea, 0xff, // P-Z, more Punctuation.
+  0x57, 0x55, 0x55, 0x55, // ` and a-o
+  0x55, 0x55, 0xd5, 0x3f, // p-z, Punctuation, DEL.
+  0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // Bytes over 127 -> Lower.
+  0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // This includes UTF-8.
+  0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+  0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+};
+// 4 packed CharRoles per entry, indexed by a packed triple of CharTypes:
+// (previous, current, next).
+constexpr uint8_t CharRoles[] = {
+  0x00, 0xaa, 0xaa, 0xff, // At start of word, Lower|Upper -> Head
+  0x00, 0x55, 0xaa, 0xff, // After Lower, Upper->Head, Lower->Tail
+  0x00, 0x55, 0x59, 0xff, // After Upper, Upper->Tail, Lower->Tail
+                          // Exception: [Upper] Upper [Lower]->Head
+  0x00, 0xaa, 0xaa, 0xff, // After Separator, Lower|Upper -> Head
+};
+template <typename T>
+T PackedLookup(const uint8_t *Data, int I) {
+  return static_cast<T>((Data[I >> 2] >> ((I&3) * 2)) & 3);
+}
+}
+void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int N) {
+  int Types = PackedLookup<CharType>(CharTypes, Text[0]);
+  auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; };
+  for (int I = 0; I < N - 1; ++I) {
+    Rotate(PackedLookup<CharType>(CharTypes, Text[I+1]));
+    *Out++ = PackedLookup<CharRole>(CharRoles, Types);
+  }
+  Rotate(Empty);
+  *Out++ = PackedLookup<CharRole>(CharRoles, Types);
+}
+
+// Sets up the data structures matching Word.
+// Returns false if we can cheaply determine that no match is possible.
+bool FuzzyMatcher::init(StringRef NewWord) {
+  NWord = std::min<int>(MaxWord, NewWord.size());
+  if (NPat > NWord)
+    return false;
+  memcpy(Word, NewWord.data(), NWord);
+  for (int I = 0; I < NWord; ++I)
+    LWord[I] = lower(Word[I]);
+
+  // Cheap subsequence check.
+  for (int W = 0, P = 0; P != NPat; ++W) {
+    if (W == NWord)
+      return false;
+    if (LWord[W] == LPat[P])
+      ++P;
+  }
+
+  calculateRoles(Word, WordRole, NWord);
+  return true;
+}
+
+// The forwards pass finds the mappings of Pattern onto Word.
+// Score = best score achieved matching Word[..W] against Pat[..P].
+// Unlike other tables, indices range from 0 to N *inclusive*
+// Matched = whether we chose to match Word[W] with Pat[P] or not.
+void FuzzyMatcher::buildGraph() {
+  for (int W = 0; W < NWord; ++W)
+    Score[0][W+1] = Score[0][W] - skipPenalty(W);
+  for (int P = 0; P < NPat; ++P) {
+    for (int W = P; W < NWord; ++W) {
+      int Left = Score[P+1][W];
+      if (P < NPat - 1)
+        Left -= skipPenalty(W);
+      if (LPat[P] == LWord[W]) {
+        int Diag = Score[P][W] + matchBonus(P, W);
+        if (Diag >= Left) {
+          Left = Score[P+1][W+1] = Diag;
+          Matched[P][W] = true;
+          continue;
+        }
+      }
+      Score[P+1][W+1] = Left;
+      Matched[P][W] = false;
+    }
+  }
+}
+
+int FuzzyMatcher::skipPenalty(int W) {
+  // Penalty: skipped a segment.
+  if (WordRole[W] == Head)
+    return 2;
+  return 0;
+}
+
+int FuzzyMatcher::matchBonus(int P, int W) {
+  // Forbidden: matching the first pattern character in the middle of a segment.
+  if (!P && WordRole[W] == Tail)
+    return AwfulScore;
+  int S = 0;
+  // Bonus: pattern is part of a word prefix.
+  if (P == W)
+    ++S;
+  // Bonus: case matches, or an asserted word break matches an actual.
+  if (Pat[P] == Word[W] || (PatRole[P] == Head && WordRole[W] == Head))
+    ++S;
+  // Penalty: matching inside a word where the previous didn't match.
+  if (WordRole[W] == Tail && P && !Matched[P-1][W-1])
+    S -= 3;
+  // Penalty: asserted word break but no actual.
+  if (PatRole[P] == Head && WordRole[W] == Tail)
+    --S;
+  return S;
+}
+
+void FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const {
+  OS << "=== Match \"" << StringRef(Word, NWord) << "\" against ["
+     << StringRef(Pat, NPat) << "] ===\n";
+  if (!IsSubstring) {
+    OS << "Substring check failed.\n";
+    return;
+  }
+
+  // Traverse Matched table backwards to reconstruct the Pattern/Word mapping.
+  // The Score table has cumulative scores, subtracting along this path gives
+  // us the per-letter scores.
+  int S[MaxWord];
+  bool M[MaxWord] = {};
+  for (int W = NWord - 1, P = NPat - 1; W >= 0; --W) {
+    if (P >= 0 && Matched[P][W]) {
+      S[W] = Score[P+1][W+1] - Score[P][W];
+      --P;
+      M[W] = true;
+    } else
+      S[W] = Score[P+1][W+1] - Score[P+1][W];
+  }
+  for (char C : StringRef(Word, NWord))
+    OS << " " << C << " ";
+  OS << "\n";
+  for (int I = 0, J = 0; I < NWord; I++)
+    OS << " " << (M[I] ? Pat[J++] : ' ') << " ";
+  OS << "\n";
+  for (int I = 0; I < NWord; I++)
+    OS << format("%2d ", S[I]);
+  OS << "\n";
+
+  OS << "\nSegmentation:";
+  OS << "\n'" << StringRef(Word, NWord) << "'\n ";
+  for (int I = 0; I < NWord; ++I)
+    OS << "?-+ "[static_cast<int>(WordRole[I])];
+  OS << "\n[" << StringRef(Pat, NPat) << "]\n ";
+  for (int I = 0; I < NPat; ++I)
+    OS << "?-+ "[static_cast<int>(PatRole[I])];
+  OS << "\n";
+
+  OS << "\nScoring table:\n";
+  OS << " |    ";
+  for (char C : StringRef(Word, NWord))
+    OS << "  " << C << " ";
+  OS << "\n";
+  OS << "-+----" << std::string(NWord * 4, '-') << "\n";
+  for (int I = 0; I <= NPat; ++I) {
+    OS << (I ? Pat[I-1] : ' ') << "|";
+    for (int J = 0; J <= NWord; ++J)
+      if (isAwful(Score[I][J]))
+        OS << format("%3d ", Score[I][J]);
+      else
+        OS << "    ";
+    OS << "\n";
+  }
+
+  OS << "\nMatch graph:\n";
+  OS << " |" << StringRef(Word, NWord) << "\n";
+  OS << "-+" << std::string(NWord, '-') << "\n";
+  for (int I = 0; I < NPat; ++I) {
+    OS << Pat[I] << "|";
+    for (int J = 0; J < NWord; ++J)
+      OS << " 1"[static_cast<int>(Matched[I][J])];
+    OS << "\n";
+  }
+}
Index: clangd/CMakeLists.txt
===================================================================
--- clangd/CMakeLists.txt
+++ clangd/CMakeLists.txt
@@ -8,6 +8,7 @@
   ClangdUnit.cpp
   ClangdUnitStore.cpp
   DraftStore.cpp
+  FuzzyMatch.cpp
   GlobalCompilationDatabase.cpp
   JSONRPCDispatcher.cpp
   JSONExpr.cpp

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D40060: [clangd] Fuzzy match scorer

Reply via email to