sammccall updated this revision to Diff 192650.
sammccall added a comment.
Herald added a subscriber: jdoerfert.
Add tests, fix setting the flag too late.


Repository:
  rCTE Clang Tools Extra

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D59935/new/

https://reviews.llvm.org/D59935

Files:
  clang-tidy/tool/clang-tidy-diff.py
  clang-tidy/tool/run-clang-tidy.py
  clangd/ClangdLSPServer.cpp
  clangd/ClangdLSPServer.h
  clangd/Protocol.cpp
  clangd/Protocol.h
  clangd/SourceCode.cpp
  clangd/SourceCode.h
  clangd/index/IndexAction.cpp
  clangd/index/SymbolLocation.h
  clangd/tool/ClangdMain.cpp
  test/clang-tidy/bugprone-parent-virtual-call.cpp
  test/clang-tidy/run-clang-tidy.cpp
  test/clangd/utf8.test
  unittests/clangd/IndexActionTests.cpp
  unittests/clangd/SourceCodeTests.cpp

Index: unittests/clangd/SourceCodeTests.cpp
===================================================================
--- unittests/clangd/SourceCodeTests.cpp
+++ unittests/clangd/SourceCodeTests.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 #include "Annotations.h"
+#include "Context.h"
+#include "Protocol.h"
 #include "SourceCode.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/raw_os_ostream.h"
@@ -21,14 +23,9 @@
 using llvm::HasValue;
 
 MATCHER_P2(Pos, Line, Col, "") {
-  return arg.line == Line && arg.character == Col;
+  return arg.line == int(Line) && arg.character == int(Col);
 }
 
-// The = → 👆 below are ASCII (1 byte), BMP (3 bytes), and astral (4 bytes).
-const char File[] = R"(0:0 = 0
-1:0 → 8
-2:0 👆 18)";
-
 /// A helper to make tests easier to read.
 Position position(int line, int character) {
   Position Pos;
@@ -52,8 +49,37 @@
   EXPECT_EQ(lspLength("¥"), 1UL);
   // astral
   EXPECT_EQ(lspLength("😂"), 2UL);
+
+  WithContextValue UTF8(kCurrentOffsetEncoding, OffsetEncoding::UTF8);
+  EXPECT_EQ(lspLength(""), 0UL);
+  EXPECT_EQ(lspLength("ascii"), 5UL);
+  // BMP
+  EXPECT_EQ(lspLength("↓"), 3UL);
+  EXPECT_EQ(lspLength("¥"), 2UL);
+  // astral
+  EXPECT_EQ(lspLength("😂"), 4UL);
+
+  WithContextValue UTF32(kCurrentOffsetEncoding, OffsetEncoding::UTF32);
+  EXPECT_EQ(lspLength(""), 0UL);
+  EXPECT_EQ(lspLength("ascii"), 5UL);
+  // BMP
+  EXPECT_EQ(lspLength("↓"), 1UL);
+  EXPECT_EQ(lspLength("¥"), 1UL);
+  // astral
+  EXPECT_EQ(lspLength("😂"), 1UL);
 }
 
+// The = → 👆 below are ASCII (1 byte), BMP (3 bytes), and astral (4 bytes).
+const char File[] = R"(0:0 = 0
+1:0 → 8
+2:0 👆 18)";
+struct Line {
+  unsigned Number;
+  unsigned Offset;
+  unsigned Length;
+};
+Line FileLines[] = {Line{0, 0, 7}, Line{1, 8, 9}, Line{2, 18, 11}};
+
 TEST(SourceCodeTests, PositionToOffset) {
   // line out of bounds
   EXPECT_THAT_EXPECTED(positionToOffset(File, position(-1, 2)), llvm::Failed());
@@ -113,6 +139,80 @@
   // line out of bounds
   EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 0)), llvm::Failed());
   EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 1)), llvm::Failed());
+
+  // Codepoints are similar, except near astral characters.
+  WithContextValue UTF32(kCurrentOffsetEncoding, OffsetEncoding::UTF32);
+  // line out of bounds
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(-1, 2)), llvm::Failed());
+  // first line
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, -1)),
+                       llvm::Failed()); // out of range
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 0)),
+                       llvm::HasValue(0)); // first character
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 3)),
+                       llvm::HasValue(3)); // middle character
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 6)),
+                       llvm::HasValue(6)); // last character
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 7)),
+                       llvm::HasValue(7)); // the newline itself
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 7), false),
+                       llvm::HasValue(7));
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 8)),
+                       llvm::HasValue(7)); // out of range
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(0, 8), false),
+                       llvm::Failed()); // out of range
+  // middle line
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, -1)),
+                       llvm::Failed()); // out of range
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 0)),
+                       llvm::HasValue(8)); // first character
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 3)),
+                       llvm::HasValue(11)); // middle character
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 3), false),
+                       llvm::HasValue(11));
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 6)),
+                       llvm::HasValue(16)); // last character
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 7)),
+                       llvm::HasValue(17)); // the newline itself
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 8)),
+                       llvm::HasValue(17)); // out of range
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(1, 8), false),
+                       llvm::Failed()); // out of range
+  // last line
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, -1)),
+                       llvm::Failed()); // out of range
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 0)),
+                       llvm::HasValue(18)); // first character
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 4)),
+                       llvm::HasValue(22)); // Before astral character.
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 5), false),
+                       llvm::HasValue(26)); // after astral character
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 7)),
+                       llvm::HasValue(28)); // last character
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 8)),
+                       llvm::HasValue(29)); // EOF
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(2, 9), false),
+                       llvm::Failed()); // out of range
+  // line out of bounds
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 0)), llvm::Failed());
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 1)), llvm::Failed());
+
+  // Test UTF-8, where transformations are trivial.
+  WithContextValue UTF8(kCurrentOffsetEncoding, OffsetEncoding::UTF8);
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(-1, 2)), llvm::Failed());
+  EXPECT_THAT_EXPECTED(positionToOffset(File, position(3, 0)), llvm::Failed());
+  for (Line L : FileLines) {
+    EXPECT_THAT_EXPECTED(positionToOffset(File, position(L.Number, -1)),
+                         llvm::Failed()); // out of range
+    for (unsigned I = 0; I <= L.Length; ++I)
+      EXPECT_THAT_EXPECTED(positionToOffset(File, position(L.Number, I)),
+                           llvm::HasValue(L.Offset + I));
+    EXPECT_THAT_EXPECTED(positionToOffset(File, position(L.Number, L.Length+1)),
+                         llvm::HasValue(L.Offset + L.Length));
+    EXPECT_THAT_EXPECTED(
+        positionToOffset(File, position(L.Number, L.Length + 1), false),
+        llvm::Failed()); // out of range
+  }
 }
 
 TEST(SourceCodeTests, OffsetToPosition) {
@@ -134,6 +234,34 @@
   EXPECT_THAT(offsetToPosition(File, 28), Pos(2, 8)) << "end of last line";
   EXPECT_THAT(offsetToPosition(File, 29), Pos(2, 9)) << "EOF";
   EXPECT_THAT(offsetToPosition(File, 30), Pos(2, 9)) << "out of bounds";
+
+  // Codepoints are similar, except near astral characters.
+  WithContextValue UTF32(kCurrentOffsetEncoding, OffsetEncoding::UTF32);
+  EXPECT_THAT(offsetToPosition(File, 0), Pos(0, 0)) << "start of file";
+  EXPECT_THAT(offsetToPosition(File, 3), Pos(0, 3)) << "in first line";
+  EXPECT_THAT(offsetToPosition(File, 6), Pos(0, 6)) << "end of first line";
+  EXPECT_THAT(offsetToPosition(File, 7), Pos(0, 7)) << "first newline";
+  EXPECT_THAT(offsetToPosition(File, 8), Pos(1, 0)) << "start of second line";
+  EXPECT_THAT(offsetToPosition(File, 12), Pos(1, 4)) << "before BMP char";
+  EXPECT_THAT(offsetToPosition(File, 13), Pos(1, 5)) << "in BMP char";
+  EXPECT_THAT(offsetToPosition(File, 15), Pos(1, 5)) << "after BMP char";
+  EXPECT_THAT(offsetToPosition(File, 16), Pos(1, 6)) << "end of second line";
+  EXPECT_THAT(offsetToPosition(File, 17), Pos(1, 7)) << "second newline";
+  EXPECT_THAT(offsetToPosition(File, 18), Pos(2, 0)) << "start of last line";
+  EXPECT_THAT(offsetToPosition(File, 21), Pos(2, 3)) << "in last line";
+  EXPECT_THAT(offsetToPosition(File, 22), Pos(2, 4)) << "before astral char";
+  EXPECT_THAT(offsetToPosition(File, 24), Pos(2, 5)) << "in astral char";
+  EXPECT_THAT(offsetToPosition(File, 26), Pos(2, 5)) << "after astral char";
+  EXPECT_THAT(offsetToPosition(File, 28), Pos(2, 7)) << "end of last line";
+  EXPECT_THAT(offsetToPosition(File, 29), Pos(2, 8)) << "EOF";
+  EXPECT_THAT(offsetToPosition(File, 30), Pos(2, 8)) << "out of bounds";
+
+  WithContextValue UTF8(kCurrentOffsetEncoding, OffsetEncoding::UTF8);
+  for (Line L : FileLines) {
+    for (unsigned I = 0; I <= L.Length; ++I)
+      EXPECT_THAT(offsetToPosition(File, L.Offset + I), Pos(L.Number, I));
+  }
+  EXPECT_THAT(offsetToPosition(File, 30), Pos(2, 11)) << "out of bounds";
 }
 
 TEST(SourceCodeTests, IsRangeConsecutive) {
Index: unittests/clangd/IndexActionTests.cpp
===================================================================
--- unittests/clangd/IndexActionTests.cpp
+++ unittests/clangd/IndexActionTests.cpp
@@ -29,6 +29,8 @@
 
 MATCHER_P(HasDigest, Digest, "") { return arg.Digest == Digest; }
 
+MATCHER_P(HasName, Name, "") { return arg.Name == Name; }
+
 MATCHER(HasSameURI, "") {
   llvm::StringRef URI = testing::get<0>(arg);
   const std::string &Path = testing::get<1>(arg);
@@ -43,6 +45,7 @@
 
 void checkNodesAreInitialized(const IndexFileIn &IndexFile,
                               const std::vector<std::string> &Paths) {
+  ASSERT_TRUE(IndexFile.Sources);
   EXPECT_THAT(Paths.size(), IndexFile.Sources->size());
   for (llvm::StringRef Path : Paths) {
     auto URI = toUri(Path);
@@ -224,6 +227,27 @@
                                         HasDigest(digest(HeaderCode))))));
 }
 
+TEST_F(IndexActionTest, NoWarnings) {
+  std::string MainFilePath = testPath("main.cpp");
+  std::string MainCode = R"cpp(
+      void foo(int x) {
+        if (x = 1) // -Wparentheses
+          return;
+        if (x = 1) // -Wparentheses
+          return;
+      }
+      void bar() {}
+  )cpp";
+  addFile(MainFilePath, MainCode);
+  // We set -ferror-limit so the warning-promoted-to-error would be fatal.
+  // This would cause indexing to stop (if warnings weren't disabled).
+  IndexFileIn IndexFile = runIndexingAction(
+      MainFilePath, {"-ferror-limit=1", "-Wparentheses", "-Werror"});
+  ASSERT_TRUE(IndexFile.Sources);
+  ASSERT_NE(0u, IndexFile.Sources->size());
+  EXPECT_THAT(*IndexFile.Symbols, ElementsAre(HasName("foo"), HasName("bar")));
+}
+
 } // namespace
 } // namespace clangd
 } // namespace clang
Index: test/clangd/utf8.test
===================================================================
--- /dev/null
+++ test/clangd/utf8.test
@@ -0,0 +1,32 @@
+# RUN: clangd -lit-test < %s | FileCheck -strict-whitespace %s
+# This test verifies that we can negotiate UTF-8 offsets via protocol extension.
+{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{"offsetEncoding":["utf-8","utf-16"]},"trace":"off"}}
+# CHECK: "offsetEncoding": "utf-8"
+---
+{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"/*ö*/int x;\nint y=x;"}}}
+---
+{"jsonrpc":"2.0","id":1,"method":"textDocument/definition","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":1,"character":6}}}
+# /*ö*/int x;
+# 01234567890
+# x is character (and utf-16) range [9,10) but byte range [10,11).
+#      CHECK:  "id": 1,
+# CHECK-NEXT:  "jsonrpc": "2.0",
+# CHECK-NEXT:  "result": [
+# CHECK-NEXT:    {
+# CHECK-NEXT:      "range": {
+# CHECK-NEXT:        "end": {
+# CHECK-NEXT:          "character": 11,
+# CHECK-NEXT:          "line": 0
+# CHECK-NEXT:        },
+# CHECK-NEXT:        "start": {
+# CHECK-NEXT:          "character": 10,
+# CHECK-NEXT:          "line": 0
+# CHECK-NEXT:        }
+# CHECK-NEXT:      },
+# CHECK-NEXT:      "uri": "file://{{.*}}/main.cpp"
+# CHECK-NEXT:    }
+# CHECK-NEXT:  ]
+---
+{"jsonrpc":"2.0","id":10000,"method":"shutdown"}
+---
+{"jsonrpc":"2.0","method":"exit"}
Index: test/clang-tidy/run-clang-tidy.cpp
===================================================================
--- test/clang-tidy/run-clang-tidy.cpp
+++ test/clang-tidy/run-clang-tidy.cpp
@@ -1,3 +1,4 @@
+// RUN: %run_clang_tidy --help
 // RUN: rm -rf %t
 // RUN: mkdir %t
 // RUN: echo "[{\"directory\":\".\",\"command\":\"clang++ -c %/t/test.cpp\",\"file\":\"%/t/test.cpp\"}]" | sed -e 's/\\/\\\\/g' > %t/compile_commands.json
Index: clangd/tool/ClangdMain.cpp
===================================================================
--- clangd/tool/ClangdMain.cpp
+++ clangd/tool/ClangdMain.cpp
@@ -9,10 +9,12 @@
 #include "Features.inc"
 #include "ClangdLSPServer.h"
 #include "Path.h"
+#include "Protocol.h"
 #include "Trace.h"
 #include "Transport.h"
 #include "index/Serialization.h"
 #include "clang/Basic/Version.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
@@ -219,6 +221,16 @@
                    "includes using index."),
     llvm::cl::init(true));
 
+static llvm::cl::opt<OffsetEncoding> ForceOffsetEncoding(
+    "offset-encoding",
+    llvm::cl::desc("Force the offsetEncoding used for character positions. "
+                   "This bypasses negotiation via client capabilities."),
+    llvm::cl::values(clEnumValN(OffsetEncoding::UTF8, "utf-8",
+                                "Offsets are in UTF-8 bytes"),
+                     clEnumValN(OffsetEncoding::UTF16, "utf-16",
+                                "Offsets are in UTF-16 code units")),
+    llvm::cl::init(OffsetEncoding::UnsupportedEncoding));
+
 namespace {
 
 /// \brief Supports a test URI scheme with relaxed constraints for lit tests.
@@ -458,9 +470,13 @@
   }
   Opts.ClangTidyOptProvider = ClangTidyOptProvider.get();
   Opts.SuggestMissingIncludes = SuggestMissingIncludes;
+  llvm::Optional<OffsetEncoding> OffsetEncodingFromFlag;
+  if (ForceOffsetEncoding != OffsetEncoding::UnsupportedEncoding)
+    OffsetEncodingFromFlag = ForceOffsetEncoding;
   ClangdLSPServer LSPServer(
       *TransportLayer, FSProvider, CCOpts, CompileCommandsDirPath,
-      /*UseDirBasedCDB=*/CompileArgsFrom == FilesystemCompileArgs, Opts);
+      /*UseDirBasedCDB=*/CompileArgsFrom == FilesystemCompileArgs,
+      OffsetEncodingFromFlag, Opts);
   llvm::set_thread_name("clangd.main");
   return LSPServer.run() ? 0
                          : static_cast<int>(ErrorResultCode::NoShutdownRequest);
Index: clangd/index/SymbolLocation.h
===================================================================
--- clangd/index/SymbolLocation.h
+++ clangd/index/SymbolLocation.h
@@ -20,6 +20,13 @@
   // Specify a position (Line, Column) of symbol. Using Line/Column allows us to
   // build LSP responses without reading the file content.
   //
+  // clangd uses the following definitions, which differ slightly from LSP:
+  //  - Line is the number of newline characters (\n) before the point.
+  //  - Column is (by default) the number of UTF-16 code units between the
+  //    last \n (or start of file) and the point.
+  //    If the `offsetEncoding` protocol extension is used to negotiate UTF-8,
+  //    then it is instead the number of *bytes* since the last \n.
+  //
   // Position is encoded into 32 bits to save space.
   // If Line/Column overflow, the value will be their maximum value.
   struct Position {
@@ -37,8 +44,7 @@
     static constexpr uint32_t MaxColumn = (1 << 12) - 1;
 
   private:
-    uint32_t Line : 20; // 0-based
-    // Using UTF-16 code units.
+    uint32_t Line : 20;   // 0-based
     uint32_t Column : 12; // 0-based
   };
 
Index: clangd/index/IndexAction.cpp
===================================================================
--- clangd/index/IndexAction.cpp
+++ clangd/index/IndexAction.cpp
@@ -9,7 +9,6 @@
 #include "IndexAction.h"
 #include "index/SymbolOrigin.h"
 #include "clang/Frontend/CompilerInstance.h"
-#include "clang/Index/IndexDataConsumer.h"
 #include "clang/Index/IndexingAction.h"
 #include "clang/Tooling/Tooling.h"
 
@@ -136,6 +135,11 @@
   bool BeginInvocation(CompilerInstance &CI) override {
     // We want all comments, not just the doxygen ones.
     CI.getLangOpts().CommentOpts.ParseAllComments = true;
+    // Index the whole file even if there are warnings and -Werror is set.
+    // Avoids some analyses too. Set in two places as we're late to the party.
+    CI.getDiagnosticOpts().IgnoreWarnings = true;
+    CI.getDiagnostics().setIgnoreAllWarnings(true);
+
     return WrapperFrontendAction::BeginInvocation(CI);
   }
 
Index: clangd/SourceCode.h
===================================================================
--- clangd/SourceCode.h
+++ clangd/SourceCode.h
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_SOURCECODE_H
 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_SOURCECODE_H
+#include "Context.h"
 #include "Protocol.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/LangOptions.h"
@@ -34,8 +35,14 @@
 FileDigest digest(StringRef Content);
 Optional<FileDigest> digestFile(const SourceManager &SM, FileID FID);
 
+// This context variable controls the behavior of functions in this file
+// that convert between LSP offsets and native clang byte offsets.
+// If not set, defaults to UTF-16 for backwards-compatibility.
+extern Key<OffsetEncoding> kCurrentOffsetEncoding;
+
 // Counts the number of UTF-16 code units needed to represent a string (LSP
 // specifies string lengths in UTF-16 code units).
+// Use of UTF-16 may be overridden by kCurrentOffsetEncoding.
 size_t lspLength(StringRef Code);
 
 /// Turn a [line, column] pair into an offset in Code.
Index: clangd/SourceCode.cpp
===================================================================
--- clangd/SourceCode.cpp
+++ clangd/SourceCode.cpp
@@ -7,7 +7,9 @@
 //===----------------------------------------------------------------------===//
 #include "SourceCode.h"
 
+#include "Context.h"
 #include "Logger.h"
+#include "Protocol.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Lex/Lexer.h"
@@ -15,6 +17,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Path.h"
 
 namespace clang {
@@ -28,6 +31,8 @@
 // Returns true if CB returned true, false if we hit the end of string.
 template <typename Callback>
 static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) {
+  // A codepoint takes two UTF-16 code units if it's astral (outside BMP).
+  // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
   for (size_t I = 0; I < U8.size();) {
     unsigned char C = static_cast<unsigned char>(U8[I]);
     if (LLVM_LIKELY(!(C & 0x80))) { // ASCII character.
@@ -51,31 +56,75 @@
   return false;
 }
 
-// Returns the offset into the string that matches \p Units UTF-16 code units.
-// Conceptually, this converts to UTF-16, truncates to CodeUnits, converts back
-// to UTF-8, and returns the length in bytes.
-static size_t measureUTF16(llvm::StringRef U8, int U16Units, bool &Valid) {
+// Returns the byte offset into the string that is an offset of \p Units in
+// the specified encoding.
+// Conceptually, this converts to the encoding, truncates to \p Units,
+// converts back to UTF-8, and returns the length in bytes.
+static size_t measureUnits(llvm::StringRef U8, int Units, OffsetEncoding Enc,
+                           bool &Valid) {
+  Valid = Units >= 0;
+  if (Units <= 0)
+    return 0;
   size_t Result = 0;
-  Valid = U16Units == 0 || iterateCodepoints(U8, [&](int U8Len, int U16Len) {
-            Result += U8Len;
-            U16Units -= U16Len;
-            return U16Units <= 0;
-          });
-  if (U16Units < 0) // Offset was into the middle of a surrogate pair.
-    Valid = false;
+  switch (Enc) {
+  case OffsetEncoding::UTF8:
+    Result = Units;
+    break;
+  case OffsetEncoding::UTF16:
+    Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) {
+      Result += U8Len;
+      Units -= U16Len;
+      return Units <= 0;
+    });
+    if (Units < 0) // Offset in the middle of a surrogate pair.
+      Valid = false;
+    break;
+  case OffsetEncoding::UTF32:
+    Valid = iterateCodepoints(U8, [&](int U8Len, int U16Len) {
+      Result += U8Len;
+      Units--;
+      return Units <= 0;
+    });
+    break;
+  case OffsetEncoding::UnsupportedEncoding:
+    llvm_unreachable("unsupported encoding");
+  }
   // Don't return an out-of-range index if we overran.
-  return std::min(Result, U8.size());
+  if (Result > U8.size()) {
+    Valid = false;
+    return U8.size();
+  }
+  return Result;
+}
+
+Key<OffsetEncoding> kCurrentOffsetEncoding;
+static OffsetEncoding lspEncoding() {
+  auto *Enc = Context::current().get(kCurrentOffsetEncoding);
+  return Enc ? *Enc : OffsetEncoding::UTF16;
 }
 
 // Like most strings in clangd, the input is UTF-8 encoded.
 size_t lspLength(llvm::StringRef Code) {
-  // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
-  // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
   size_t Count = 0;
-  iterateCodepoints(Code, [&](int U8Len, int U16Len) {
-    Count += U16Len;
-    return false;
-  });
+  switch (lspEncoding()) {
+  case OffsetEncoding::UTF8:
+    Count = Code.size();
+    break;
+  case OffsetEncoding::UTF16:
+    iterateCodepoints(Code, [&](int U8Len, int U16Len) {
+      Count += U16Len;
+      return false;
+    });
+    break;
+  case OffsetEncoding::UTF32:
+    iterateCodepoints(Code, [&](int U8Len, int U16Len) {
+      ++Count;
+      return false;
+    });
+    break;
+  case OffsetEncoding::UnsupportedEncoding:
+    llvm_unreachable("unsupported encoding");
+  }
   return Count;
 }
 
@@ -98,20 +147,18 @@
           llvm::errc::invalid_argument);
     StartOfLine = NextNL + 1;
   }
+  StringRef Line =
+      Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; });
 
-  size_t NextNL = Code.find('\n', StartOfLine);
-  if (NextNL == llvm::StringRef::npos)
-    NextNL = Code.size();
-
+  // P.character may be in UTF-16, transcode if necessary.
   bool Valid;
-  size_t ByteOffsetInLine = measureUTF16(
-      Code.substr(StartOfLine, NextNL - StartOfLine), P.character, Valid);
+  size_t ByteInLine = measureUnits(Line, P.character, lspEncoding(), Valid);
   if (!Valid && !AllowColumnsBeyondLineLength)
     return llvm::make_error<llvm::StringError>(
-        llvm::formatv("UTF-16 offset {0} is invalid for line {1}", P.character,
-                      P.line),
+        llvm::formatv("{0} offset {1} is invalid for line {2}", lspEncoding(),
+                      P.character, P.line),
         llvm::errc::invalid_argument);
-  return StartOfLine + ByteOffsetInLine;
+  return StartOfLine + ByteInLine;
 }
 
 Position offsetToPosition(llvm::StringRef Code, size_t Offset) {
Index: clangd/Protocol.h
===================================================================
--- clangd/Protocol.h
+++ clangd/Protocol.h
@@ -28,6 +28,7 @@
 #include "clang/Index/IndexSymbol.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/Support/JSON.h"
+#include "llvm/Support/raw_ostream.h"
 #include <bitset>
 #include <string>
 #include <vector>
@@ -338,6 +339,21 @@
 // https://github.com/Microsoft/language-server-protocol/issues/344
 SymbolKind indexSymbolKindToSymbolKind(index::SymbolKind Kind);
 
+// Determines the encoding used to measure offsets and lengths of source in LSP.
+enum class OffsetEncoding {
+  // Any string is legal on the wire. Unrecognized encodings parse as this.
+  UnsupportedEncoding,
+  // Length counts code units of UTF-16 encoded text. (Standard LSP behavior).
+  UTF16,
+  // Length counts bytes of UTF-8 encoded text. (Clangd extension).
+  UTF8,
+  // Length counts codepoints in Unicode text. (Clangd extension).
+  UTF32,
+};
+llvm::json::Value toJSON(const OffsetEncoding &);
+bool fromJSON(const llvm::json::Value &, OffsetEncoding &);
+llvm::raw_ostream &operator<<(llvm::raw_ostream &, OffsetEncoding OS);
+
 // This struct doesn't mirror LSP!
 // The protocol defines deeply nested structures for client capabilities.
 // Instead of mapping them all, this just parses out the bits we care about.
@@ -369,6 +385,9 @@
   /// Client supports CodeAction return value for textDocument/codeAction.
   /// textDocument.codeAction.codeActionLiteralSupport.
   bool CodeActionStructure = false;
+
+  /// Supported encodings for LSP character offsets. (clangd extension).
+  llvm::Optional<std::vector<OffsetEncoding>> offsetEncoding;
 };
 bool fromJSON(const llvm::json::Value &, ClientCapabilities &);
 
Index: clangd/Protocol.cpp
===================================================================
--- clangd/Protocol.cpp
+++ clangd/Protocol.cpp
@@ -16,6 +16,7 @@
 #include "clang/Basic/LLVM.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/JSON.h"
@@ -311,6 +312,11 @@
       }
     }
   }
+  if (auto *OffsetEncoding = O->get("offsetEncoding")) {
+    R.offsetEncoding.emplace();
+    if (!fromJSON(*OffsetEncoding, *R.offsetEncoding))
+      return false;
+  }
   return true;
 }
 
@@ -932,5 +938,33 @@
   return fromJSON(Params, Base);
 }
 
+static const char *toString(OffsetEncoding OE) {
+  switch (OE) {
+  case OffsetEncoding::UTF8:
+    return "utf-8";
+  case OffsetEncoding::UTF16:
+    return "utf-16";
+  case OffsetEncoding::UTF32:
+    return "utf-32";
+  case OffsetEncoding::UnsupportedEncoding:
+    return "unknown";
+  }
+}
+llvm::json::Value toJSON(const OffsetEncoding &OE) { return toString(OE); }
+bool fromJSON(const llvm::json::Value &V, OffsetEncoding &OE) {
+  auto Str = V.getAsString();
+  if (!Str)
+    return false;
+  OE = llvm::StringSwitch<OffsetEncoding>(*Str)
+           .Case("utf-8", OffsetEncoding::UTF8)
+           .Case("utf-16", OffsetEncoding::UTF16)
+           .Case("utf-32", OffsetEncoding::UTF32)
+           .Default(OffsetEncoding::UnsupportedEncoding);
+  return true;
+}
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, OffsetEncoding Enc) {
+  return OS << toString(Enc);
+}
+
 } // namespace clangd
 } // namespace clang
Index: clangd/ClangdLSPServer.h
===================================================================
--- clangd/ClangdLSPServer.h
+++ clangd/ClangdLSPServer.h
@@ -40,6 +40,7 @@
   ClangdLSPServer(Transport &Transp, const FileSystemProvider &FSProvider,
                   const clangd::CodeCompleteOptions &CCOpts,
                   llvm::Optional<Path> CompileCommandsDir, bool UseDirBasedCDB,
+                  llvm::Optional<OffsetEncoding> ForcedOffsetEncoding,
                   const ClangdServer::Options &Opts);
   ~ClangdLSPServer();
 
@@ -165,6 +166,7 @@
   // It is destroyed before run() returns, to ensure worker threads exit.
   ClangdServer::Options ClangdServerOpts;
   llvm::Optional<ClangdServer> Server;
+  llvm::Optional<OffsetEncoding> NegotiatedOffsetEncoding;
 };
 } // namespace clangd
 } // namespace clang
Index: clangd/ClangdLSPServer.cpp
===================================================================
--- clangd/ClangdLSPServer.cpp
+++ clangd/ClangdLSPServer.cpp
@@ -13,6 +13,7 @@
 #include "Trace.h"
 #include "URI.h"
 #include "clang/Tooling/Core/Replacement.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
@@ -93,6 +94,7 @@
   MessageHandler(ClangdLSPServer &Server) : Server(Server) {}
 
   bool onNotify(llvm::StringRef Method, llvm::json::Value Params) override {
+    WithContext HandlerContext(handlerContext());
     log("<-- {0}", Method);
     if (Method == "exit")
       return false;
@@ -109,6 +111,7 @@
 
   bool onCall(llvm::StringRef Method, llvm::json::Value Params,
               llvm::json::Value ID) override {
+    WithContext HandlerContext(handlerContext());
     // Calls can be canceled by the client. Add cancellation context.
     WithContext WithCancel(cancelableRequestContext(ID));
     trace::Span Tracer(Method);
@@ -129,6 +132,7 @@
 
   bool onReply(llvm::json::Value ID,
                llvm::Expected<llvm::json::Value> Result) override {
+    WithContext HandlerContext(handlerContext());
     // We ignore replies, just log them.
     if (Result)
       log("<-- reply({0})", ID);
@@ -259,6 +263,13 @@
     if (It != RequestCancelers.end())
       It->second.first(); // Invoke the canceler.
   }
+
+  Context handlerContext() const {
+    return Context::current().derive(
+        kCurrentOffsetEncoding,
+        Server.NegotiatedOffsetEncoding.getValueOr(OffsetEncoding::UTF16));
+  }
+
   // We run cancelable requests in a context that does two things:
   //  - allows cancellation using RequestCancelers[ID]
   //  - cleans up the entry in RequestCancelers when it's no longer needed
@@ -302,6 +313,20 @@
 
 void ClangdLSPServer::onInitialize(const InitializeParams &Params,
                                    Callback<llvm::json::Value> Reply) {
+  // Determine character encoding first as it affects constructed ClangdServer.
+  if (Params.capabilities.offsetEncoding && !NegotiatedOffsetEncoding) {
+    NegotiatedOffsetEncoding = OffsetEncoding::UTF16; // fallback
+    for (OffsetEncoding Supported : *Params.capabilities.offsetEncoding)
+      if (Supported != OffsetEncoding::UnsupportedEncoding) {
+        NegotiatedOffsetEncoding = Supported;
+        break;
+      }
+  }
+  llvm::Optional<WithContextValue> WithOffsetEncoding;
+  if (NegotiatedOffsetEncoding)
+    WithOffsetEncoding.emplace(kCurrentOffsetEncoding,
+                               *NegotiatedOffsetEncoding);
+
   if (Params.rootUri && *Params.rootUri)
     ClangdServerOpts.WorkspaceRoot = Params.rootUri->file();
   else if (Params.rootPath && !Params.rootPath->empty())
@@ -331,7 +356,7 @@
   SupportsHierarchicalDocumentSymbol =
       Params.capabilities.HierarchicalDocumentSymbol;
   SupportFileStatus = Params.initializationOptions.FileStatus;
-  Reply(llvm::json::Object{
+  llvm::json::Object Result{
       {{"capabilities",
         llvm::json::Object{
             {"textDocumentSync", (int)TextDocumentSyncKind::Incremental},
@@ -369,7 +394,10 @@
                    ExecuteCommandParams::CLANGD_APPLY_TWEAK}},
              }},
             {"typeHierarchyProvider", true},
-        }}}});
+        }}}};
+  if (NegotiatedOffsetEncoding)
+    Result["offsetEncoding"] = *NegotiatedOffsetEncoding;
+  Reply(std::move(Result));
 }
 
 void ClangdLSPServer::onShutdown(const ShutdownParams &Params,
@@ -875,19 +903,19 @@
                      std::move(Reply));
 }
 
-ClangdLSPServer::ClangdLSPServer(class Transport &Transp,
-                                 const FileSystemProvider &FSProvider,
-                                 const clangd::CodeCompleteOptions &CCOpts,
-                                 llvm::Optional<Path> CompileCommandsDir,
-                                 bool UseDirBasedCDB,
-                                 const ClangdServer::Options &Opts)
+ClangdLSPServer::ClangdLSPServer(
+    class Transport &Transp, const FileSystemProvider &FSProvider,
+    const clangd::CodeCompleteOptions &CCOpts,
+    llvm::Optional<Path> CompileCommandsDir, bool UseDirBasedCDB,
+    llvm::Optional<OffsetEncoding> ForcedOffsetEncoding,
+    const ClangdServer::Options &Opts)
     : Transp(Transp), MsgHandler(new MessageHandler(*this)),
       FSProvider(FSProvider), CCOpts(CCOpts),
       SupportedSymbolKinds(defaultSymbolKinds()),
       SupportedCompletionItemKinds(defaultCompletionItemKinds()),
       UseDirBasedCDB(UseDirBasedCDB),
-      CompileCommandsDir(std::move(CompileCommandsDir)),
-      ClangdServerOpts(Opts) {
+      CompileCommandsDir(std::move(CompileCommandsDir)), ClangdServerOpts(Opts),
+      NegotiatedOffsetEncoding(ForcedOffsetEncoding) {
   // clang-format off
   MsgHandler->bind("initialize", &ClangdLSPServer::onInitialize);
   MsgHandler->bind("shutdown", &ClangdLSPServer::onShutdown);
Index: clang-tidy/tool/run-clang-tidy.py
===================================================================
--- clang-tidy/tool/run-clang-tidy.py
+++ clang-tidy/tool/run-clang-tidy.py
@@ -47,7 +47,11 @@
 import tempfile
 import threading
 import traceback
-import yaml
+
+try:
+  import yaml
+except ImportError:
+  yaml = None
 
 is_py2 = sys.version[0] == '2'
 
@@ -199,9 +203,10 @@
                       'headers to output diagnostics from. Diagnostics from '
                       'the main file of each translation unit are always '
                       'displayed.')
-  parser.add_argument('-export-fixes', metavar='filename', dest='export_fixes',
-                      help='Create a yaml file to store suggested fixes in, '
-                      'which can be applied with clang-apply-replacements.')
+  if yaml:
+    parser.add_argument('-export-fixes', metavar='filename', dest='export_fixes',
+                        help='Create a yaml file to store suggested fixes in, '
+                        'which can be applied with clang-apply-replacements.')
   parser.add_argument('-j', type=int, default=0,
                       help='number of tidy instances to be run in parallel.')
   parser.add_argument('files', nargs='*', default=['.*'],
@@ -254,7 +259,7 @@
     max_task = multiprocessing.cpu_count()
 
   tmpdir = None
-  if args.fix or args.export_fixes:
+  if args.fix or (yaml and args.export_fixes):
     check_clang_apply_replacements_binary(args)
     tmpdir = tempfile.mkdtemp()
 
@@ -292,7 +297,7 @@
       shutil.rmtree(tmpdir)
     os.kill(0, 9)
 
-  if args.export_fixes:
+  if yaml and args.export_fixes:
     print('Writing fixes to ' + args.export_fixes + ' ...')
     try:
       merge_replacement_files(tmpdir, args.export_fixes)
Index: clang-tidy/tool/clang-tidy-diff.py
===================================================================
--- clang-tidy/tool/clang-tidy-diff.py
+++ clang-tidy/tool/clang-tidy-diff.py
@@ -36,11 +36,10 @@
 import threading
 import traceback
 
-yaml_imported = True
 try:
   import yaml
 except ImportError:
-  yaml_imported = False
+  yaml = None
 
 is_py2 = sys.version[0] == '2'
 
@@ -144,7 +143,7 @@
                       default='')
   parser.add_argument('-path', dest='build_path',
                       help='Path used to read a compile command database.')
-  if yaml_imported:
+  if yaml:
     parser.add_argument('-export-fixes', metavar='FILE', dest='export_fixes',
                         help='Create a yaml file to store suggested fixes in, '
                         'which can be applied with clang-apply-replacements.')
@@ -204,7 +203,7 @@
   max_task_count = min(len(lines_by_file), max_task_count)
 
   tmpdir = None
-  if yaml_imported and args.export_fixes:
+  if yaml and args.export_fixes:
     tmpdir = tempfile.mkdtemp()
 
   # Tasks for clang-tidy.
@@ -238,7 +237,7 @@
     # Run clang-tidy on files containing changes.
     command = [args.clang_tidy_binary]
     command.append('-line-filter=' + line_filter_json)
-    if yaml_imported and args.export_fixes:
+    if yaml and args.export_fixes:
       # Get a temporary file. We immediately close the handle so clang-tidy can
       # overwrite it.
       (handle, tmp_name) = tempfile.mkstemp(suffix='.yaml', dir=tmpdir)
@@ -253,7 +252,7 @@
   # Wait for all threads to be done.
   task_queue.join()
 
-  if yaml_imported and args.export_fixes:
+  if yaml and args.export_fixes:
     print('Writing fixes to ' + args.export_fixes + ' ...')
     try:
       merge_replacement_files(tmpdir, args.export_fixes)
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to