sammccall created this revision.
sammccall added a reviewer: hokein.
Herald added subscribers: cfe-commits, jdoerfert, kadircet, arphaman, jkorous, 
MaskRay, ioeric, ilya-biryukov.
Herald added a project: clang.

Still some pieces to go here: unit tests for new SourceCode functionality and
a command-line flag to force utf-8 mode. But wanted to get early feedback.


Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D58275

Files:
  clangd/ClangdLSPServer.cpp
  clangd/ClangdLSPServer.h
  clangd/Protocol.cpp
  clangd/Protocol.h
  clangd/SourceCode.cpp
  clangd/SourceCode.h
  test/clangd/utf8.test

Index: test/clangd/utf8.test
===================================================================
--- /dev/null
+++ test/clangd/utf8.test
@@ -0,0 +1,32 @@
+# RUN: clangd -lit-test < %s | FileCheck -strict-whitespace %s
+# This test verifies that we can negotiate UTF-8 offsets via protocol extension.
+{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{"offsetEncoding":["utf-8","utf-16"]},"trace":"off"}}
+# CHECK: "offsetEncoding": "utf-8"
+---
+{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"/*ö*/int x;\nint y=x;"}}}
+---
+{"jsonrpc":"2.0","id":1,"method":"textDocument/definition","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":1,"character":6}}}
+# /*ö*/int x;
+# 01234567890
+# x is character (and utf-16) range [9,10) but byte range [10,11).
+#      CHECK:  "id": 1,
+# CHECK-NEXT:  "jsonrpc": "2.0",
+# CHECK-NEXT:  "result": [
+# CHECK-NEXT:    {
+# CHECK-NEXT:      "range": {
+# CHECK-NEXT:        "end": {
+# CHECK-NEXT:          "character": 11,
+# CHECK-NEXT:          "line": 0
+# CHECK-NEXT:        },
+# CHECK-NEXT:        "start": {
+# CHECK-NEXT:          "character": 10,
+# CHECK-NEXT:          "line": 0
+# CHECK-NEXT:        }
+# CHECK-NEXT:      },
+# CHECK-NEXT:      "uri": "file://{{.*}}/main.cpp"
+# CHECK-NEXT:    }
+# CHECK-NEXT:  ]
+---
+{"jsonrpc":"2.0","id":10000,"method":"shutdown"}
+---
+{"jsonrpc":"2.0","method":"exit"}
Index: clangd/SourceCode.h
===================================================================
--- clangd/SourceCode.h
+++ clangd/SourceCode.h
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_SOURCECODE_H
 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_SOURCECODE_H
+#include "Context.h"
 #include "Protocol.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/LangOptions.h"
@@ -34,8 +35,14 @@
 FileDigest digest(StringRef Content);
 Optional<FileDigest> digestFile(const SourceManager &SM, FileID FID);
 
+// This context variable controls the behavior of functions in this file
+// that convert between LSP offsets and native clang byte offsets.
+// If not set, defaults to UTF-16 for backwards-compatibility.
+extern Key<OffsetEncoding> kCurrentOffsetEncoding;
+
 // Counts the number of UTF-16 code units needed to represent a string (LSP
 // specifies string lengths in UTF-16 code units).
+// Use of UTF-16 may be overridden by kCurrentOffsetEncoding.
 size_t lspLength(StringRef Code);
 
 /// Turn a [line, column] pair into an offset in Code.
Index: clangd/SourceCode.cpp
===================================================================
--- clangd/SourceCode.cpp
+++ clangd/SourceCode.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 #include "SourceCode.h"
 
+#include "Context.h"
 #include "Logger.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/Basic/SourceManager.h"
@@ -20,6 +21,8 @@
 namespace clang {
 namespace clangd {
 
+Key<OffsetEncoding> kCurrentOffsetEncoding;
+
 // Here be dragons. LSP positions use columns measured in *UTF-16 code units*!
 // Clangd uses UTF-8 and byte-offsets internally, so conversion is nontrivial.
 
@@ -67,8 +70,17 @@
   return std::min(Result, U8.size());
 }
 
+static bool useUTF8ForLSP() {
+  if (auto *Enc = Context::current().get(kCurrentOffsetEncoding))
+    if (*Enc == OffsetEncoding::UTF8)
+      return true;
+  return false;
+}
+
 // Like most strings in clangd, the input is UTF-8 encoded.
 size_t lspLength(llvm::StringRef Code) {
+  if (useUTF8ForLSP())
+    return Code.size();
   // A codepoint takes two UTF-16 code unit if it's astral (outside BMP).
   // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx.
   size_t Count = 0;
@@ -98,14 +110,25 @@
           llvm::errc::invalid_argument);
     StartOfLine = NextNL + 1;
   }
-
-  size_t NextNL = Code.find('\n', StartOfLine);
-  if (NextNL == llvm::StringRef::npos)
-    NextNL = Code.size();
-
+  StringRef Line =
+      Code.substr(StartOfLine).take_until([](char C) { return C == '\n'; });
+
+  // If the encoding is UTF-8, we do bounds-checking only.
+  if (useUTF8ForLSP()) {
+    if (P.character > int(Line.size())) {
+      if (AllowColumnsBeyondLineLength)
+        return StartOfLine + Line.size();
+      else
+        return llvm::make_error<llvm::StringError>(
+            llvm::formatv("UTF-8 offset {0} overruns line {1}", P.character,
+                          P.line),
+            llvm::errc::invalid_argument);
+    }
+    return StartOfLine + P.character;
+  }
+  // Otherwise P.character is in UTF-16 code units, so we have to transcode.
   bool Valid;
-  size_t ByteOffsetInLine = measureUTF16(
-      Code.substr(StartOfLine, NextNL - StartOfLine), P.character, Valid);
+  size_t ByteOffsetInLine = measureUTF16(Line, P.character, Valid);
   if (!Valid && !AllowColumnsBeyondLineLength)
     return llvm::make_error<llvm::StringError>(
         llvm::formatv("UTF-16 offset {0} is invalid for line {1}", P.character,
Index: clangd/Protocol.h
===================================================================
--- clangd/Protocol.h
+++ clangd/Protocol.h
@@ -331,6 +331,17 @@
 SymbolKind adjustKindToCapability(SymbolKind Kind,
                                   SymbolKindBitset &supportedSymbolKinds);
 
+// Determines the encoding used to measure offsets and lengths of source in LSP.
+enum class OffsetEncoding {
+  UnsupportedEncoding,
+  // Length counts code units of UTF-16 encoded text. (Standard LSP behavior).
+  UTF16,
+  // Length counts bytes of UTF-8 encoded text. (Clangd extension).
+  UTF8,
+};
+llvm::json::Value toJSON(const OffsetEncoding &);
+bool fromJSON(const llvm::json::Value &, OffsetEncoding &);
+
 // This struct doesn't mirror LSP!
 // The protocol defines deeply nested structures for client capabilities.
 // Instead of mapping them all, this just parses out the bits we care about.
@@ -362,6 +373,9 @@
   /// Client supports CodeAction return value for textDocument/codeAction.
   /// textDocument.codeAction.codeActionLiteralSupport.
   bool CodeActionStructure = false;
+
+  /// Supported encodings for LSP character offsets. (clangd extension).
+  llvm::Optional<std::vector<OffsetEncoding>> offsetEncoding;
 };
 bool fromJSON(const llvm::json::Value &, ClientCapabilities &);
 
Index: clangd/Protocol.cpp
===================================================================
--- clangd/Protocol.cpp
+++ clangd/Protocol.cpp
@@ -12,11 +12,13 @@
 
 #include "Protocol.h"
 #include "Logger.h"
+#include "SourceCode.h"
 #include "URI.h"
 #include "index/Index.h"
 #include "clang/Basic/LLVM.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/JSON.h"
@@ -257,6 +259,11 @@
       }
     }
   }
+  if (auto *OffsetEncoding = O->get("offsetEncoding")) {
+    R.offsetEncoding.emplace();
+    if (!fromJSON(*OffsetEncoding, *R.offsetEncoding))
+      return false;
+  }
   return true;
 }
 
@@ -818,5 +825,26 @@
   return fromJSON(Params, Base);
 }
 
+llvm::json::Value toJSON(const OffsetEncoding &OE) {
+  switch (OE) {
+    case OffsetEncoding::UTF8:
+      return "utf-8";
+    case OffsetEncoding::UTF16:
+      return "utf-16";
+    case OffsetEncoding::UnsupportedEncoding:
+      return "unknown";
+  }
+}
+bool fromJSON(const llvm::json::Value &V, OffsetEncoding &OE) {
+  auto Str = V.getAsString();
+  if (!Str)
+    return false;
+  OE = llvm::StringSwitch<OffsetEncoding>(*Str)
+           .Case("utf-8", OffsetEncoding::UTF8)
+           .Case("utf-16", OffsetEncoding::UTF16)
+           .Default(OffsetEncoding::UnsupportedEncoding);
+  return true;
+}
+
 } // namespace clangd
 } // namespace clang
Index: clangd/ClangdLSPServer.h
===================================================================
--- clangd/ClangdLSPServer.h
+++ clangd/ClangdLSPServer.h
@@ -15,6 +15,7 @@
 #include "GlobalCompilationDatabase.h"
 #include "Path.h"
 #include "Protocol.h"
+#include "SourceCode.h"
 #include "Transport.h"
 #include "clang/Tooling/Core/Replacement.h"
 #include "llvm/ADT/Optional.h"
@@ -159,6 +160,7 @@
   // It is destroyed before run() returns, to ensure worker threads exit.
   ClangdServer::Options ClangdServerOpts;
   llvm::Optional<ClangdServer> Server;
+  llvm::Optional<OffsetEncoding> NegotiatedOffsetEncoding;
 };
 } // namespace clangd
 } // namespace clang
Index: clangd/ClangdLSPServer.cpp
===================================================================
--- clangd/ClangdLSPServer.cpp
+++ clangd/ClangdLSPServer.cpp
@@ -13,6 +13,7 @@
 #include "Trace.h"
 #include "URI.h"
 #include "clang/Tooling/Core/Replacement.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
@@ -93,6 +94,7 @@
   MessageHandler(ClangdLSPServer &Server) : Server(Server) {}
 
   bool onNotify(llvm::StringRef Method, llvm::json::Value Params) override {
+    WithContext HandlerContext(handlerContext());
     log("<-- {0}", Method);
     if (Method == "exit")
       return false;
@@ -109,6 +111,7 @@
 
   bool onCall(llvm::StringRef Method, llvm::json::Value Params,
               llvm::json::Value ID) override {
+    WithContext HandlerContext(handlerContext());
     // Calls can be canceled by the client. Add cancellation context.
     WithContext WithCancel(cancelableRequestContext(ID));
     trace::Span Tracer(Method);
@@ -129,6 +132,7 @@
 
   bool onReply(llvm::json::Value ID,
                llvm::Expected<llvm::json::Value> Result) override {
+    WithContext HandlerContext(handlerContext());
     // We ignore replies, just log them.
     if (Result)
       log("<-- reply({0})", ID);
@@ -259,6 +263,13 @@
     if (It != RequestCancelers.end())
       It->second.first(); // Invoke the canceler.
   }
+
+  Context handlerContext() const {
+    return Context::current().derive(
+        kCurrentOffsetEncoding,
+        Server.NegotiatedOffsetEncoding.getValueOr(OffsetEncoding::UTF16));
+  }
+
   // We run cancelable requests in a context that does two things:
   //  - allows cancellation using RequestCancelers[ID]
   //  - cleans up the entry in RequestCancelers when it's no longer needed
@@ -302,6 +313,20 @@
 
 void ClangdLSPServer::onInitialize(const InitializeParams &Params,
                                    Callback<llvm::json::Value> Reply) {
+  // Determine character encoding first as it affects constructed ClangdServer.
+  if (Params.capabilities.offsetEncoding && !NegotiatedOffsetEncoding) {
+    NegotiatedOffsetEncoding = OffsetEncoding::UTF16; // fallback
+    for (OffsetEncoding Supported : *Params.capabilities.offsetEncoding)
+      if (Supported != OffsetEncoding::UnsupportedEncoding) {
+        NegotiatedOffsetEncoding = Supported;
+        break;
+      }
+  }
+  llvm::Optional<WithContextValue> WithOffsetEncoding;
+  if (NegotiatedOffsetEncoding)
+    WithOffsetEncoding.emplace(kCurrentOffsetEncoding,
+                               *NegotiatedOffsetEncoding);
+
   if (Params.rootUri && *Params.rootUri)
     ClangdServerOpts.WorkspaceRoot = Params.rootUri->file();
   else if (Params.rootPath && !Params.rootPath->empty())
@@ -331,7 +356,7 @@
   SupportsHierarchicalDocumentSymbol =
       Params.capabilities.HierarchicalDocumentSymbol;
   SupportFileStatus = Params.initializationOptions.FileStatus;
-  Reply(llvm::json::Object{
+  llvm::json::Object Result{
       {{"capabilities",
         llvm::json::Object{
             {"textDocumentSync", (int)TextDocumentSyncKind::Incremental},
@@ -368,7 +393,10 @@
                   {ExecuteCommandParams::CLANGD_APPLY_FIX_COMMAND,
                    ExecuteCommandParams::CLANGD_APPLY_TWEAK}},
              }},
-        }}}});
+        }}}};
+  if (NegotiatedOffsetEncoding)
+    Result["offsetEncoding"] = *NegotiatedOffsetEncoding;
+  Reply(std::move(Result));
 }
 
 void ClangdLSPServer::onShutdown(const ShutdownParams &Params,
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to