[PATCH] D46000: [AST] Added a helper to extract a user-friendly text of a comment.

Ilya Biryukov via Phabricator via cfe-commits Wed, 25 Apr 2018 07:23:59 -0700

ilya-biryukov updated this revision to Diff 143928.
ilya-biryukov marked an inline comment as done.
ilya-biryukov added a comment.


- Attempt to reuse lexing code with/without command parsing.
- Get rid of SkipWs.


Repository:
  rC Clang

https://reviews.llvm.org/D46000

Files:
  include/clang/AST/CommentLexer.h
  include/clang/AST/RawCommentList.h
  lib/AST/CommentLexer.cpp
  lib/AST/RawCommentList.cpp

Index: lib/AST/RawCommentList.cpp
===================================================================
--- lib/AST/RawCommentList.cpp
+++ lib/AST/RawCommentList.cpp
@@ -335,3 +335,88 @@
              BeforeThanCompare<RawComment>(SourceMgr));
   std::swap(Comments, MergedComments);
 }
+
+std::string RawComment::getFormattedText(const ASTContext &Ctx) const {
+  auto &SourceMgr = Ctx.getSourceManager();
+  llvm::StringRef CommentText = getRawText(SourceMgr);
+  if (CommentText.empty())
+    return ""; // we couldn't retrieve the comment.
+
+  llvm::BumpPtrAllocator Allocator;
+  comments::Lexer L(Allocator, Ctx.getDiagnostics(),
+                    Ctx.getCommentCommandTraits(), getSourceRange().getBegin(),
+                    CommentText.begin(), CommentText.end(),
+                    /*ParseCommentText=*/false);
+
+  std::string Result;
+  unsigned IndentColumn = 0;
+
+  // Processes one line of the comment and adds it to the result.
+  // Handles skipping the indent at the start of the line.
+  // Returns false when eof is reached and true otherwise.
+  auto LexLine = [&](bool IsFirstLine) -> bool {
+    comments::Token Tok;
+    // Lex the first token on the line. We handle it separately, because we
+    // need to fix up its indentation.
+    L.lex(Tok);
+    if (Tok.is(comments::tok::eof))
+      return false;
+    if (Tok.is(comments::tok::newline)) {
+      Result += "\n";
+      return true;
+    }
+    llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr);
+    bool LocInvalid = false;
+    unsigned TokColumn =
+        SourceMgr.getSpellingColumnNumber(Tok.getLocation(), &LocInvalid);
+    if (LocInvalid)
+      TokColumn = 0;
+    // Compute the length of whitespace we're allowed to skip.
+    size_t MaxSkip;
+    if (IsFirstLine) {
+      // For the first line we skip all leading ws.
+      MaxSkip = std::numeric_limits<size_t>::max();
+    } else {
+      // For the rest, we skip up to the column of first non-ws symbol on the
+      // first line.
+      MaxSkip = std::max((int)IndentColumn - (int)TokColumn, 0);
+    }
+    // Amount of leading whitespace in TokText.
+    size_t WhitespaceLen = TokText.find_first_not_of(" \t");
+    if (WhitespaceLen == StringRef::npos)
+      WhitespaceLen = TokText.size();
+    // Remember the amount of whitespace we skipped in the first line to remove
+    // indent up to that column in the following lines.
+    if (IsFirstLine)
+      IndentColumn = TokColumn + WhitespaceLen;
+
+    llvm::StringRef Trimmed = TokText.drop_front(std::min(MaxSkip, WhitespaceLen));
+    Result += Trimmed;
+    // Lex all tokens in the rest of the line.
+    for (L.lex(Tok); Tok.isNot(comments::tok::eof); L.lex(Tok)) {
+      if (Tok.is(comments::tok::newline)) {
+        Result += "\n";
+        return true;
+      }
+      Result += L.getSpelling(Tok, SourceMgr);
+    }
+    // We've reached eof: this was the last line of the comment.
+    return false;
+  };
+
+  auto DropTrailingNewLines = [](std::string &Str) {
+    while (!Str.empty() && Str.back() == '\n') // back() on empty is UB.
+      Str.pop_back();
+  };
+
+  // Process first line separately to remember indent for the following lines.
+  if (!LexLine(/*IsFirstLine=*/true)) {
+    DropTrailingNewLines(Result);
+    return Result;
+  }
+  // Process the rest of the lines.
+  while (LexLine(/*IsFirstLine=*/false))
+    ;
+  DropTrailingNewLines(Result);
+  return Result;
+}
Index: lib/AST/CommentLexer.cpp
===================================================================
--- lib/AST/CommentLexer.cpp
+++ lib/AST/CommentLexer.cpp
@@ -291,159 +291,181 @@
 }
 
 void Lexer::lexCommentText(Token &T) {
+  if (ParseCommands) {
+    if (tryLexCommands(T))
+      return;
+  }
+  lexCommentTextWithoutCommands(T);
+}
+
+bool Lexer::tryLexCommands(Token &T) {
   assert(CommentState == LCS_InsideBCPLComment ||
          CommentState == LCS_InsideCComment);
+  if (!ParseCommands)
+    return false;
 
   switch (State) {
   case LS_Normal:
     break;
   case LS_VerbatimBlockFirstLine:
     lexVerbatimBlockFirstLine(T);
-    return;
+    return true;
   case LS_VerbatimBlockBody:
     lexVerbatimBlockBody(T);
-    return;
+    return true;
   case LS_VerbatimLineText:
     lexVerbatimLineText(T);
-    return;
+    return true;
   case LS_HTMLStartTag:
     lexHTMLStartTag(T);
-    return;
+    return true;
   case LS_HTMLEndTag:
     lexHTMLEndTag(T);
-    return;
+    return true;
   }
 
   assert(State == LS_Normal);
 
   const char *TokenPtr = BufferPtr;
   assert(TokenPtr < CommentEnd);
-  while (TokenPtr != CommentEnd) {
-    switch(*TokenPtr) {
-      case '\\':
-      case '@': {
-        // Commands that start with a backslash and commands that start with
-        // 'at' have equivalent semantics.  But we keep information about the
-        // exact syntax in AST for comments.
-        tok::TokenKind CommandKind =
-            (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
+  switch(*TokenPtr) {
+    case '\\':
+    case '@': {
+      // Commands that start with a backslash and commands that start with
+      // 'at' have equivalent semantics.  But we keep information about the
+      // exact syntax in AST for comments.
+      tok::TokenKind CommandKind =
+          (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
+      TokenPtr++;
+      if (TokenPtr == CommentEnd) {
+        formTextToken(T, TokenPtr);
+        return true;
+      }
+      char C = *TokenPtr;
+      switch (C) {
+      default:
+        break;
+
+      case '\\': case '@': case '&': case '$':
+      case '#':  case '<': case '>': case '%':
+      case '\"': case '.': case ':':
+        // This is one of \\ \@ \& \$ etc escape sequences.
         TokenPtr++;
-        if (TokenPtr == CommentEnd) {
-          formTextToken(T, TokenPtr);
-          return;
-        }
-        char C = *TokenPtr;
-        switch (C) {
-        default:
-          break;
-
-        case '\\': case '@': case '&': case '$':
-        case '#':  case '<': case '>': case '%':
-        case '\"': case '.': case ':':
-          // This is one of \\ \@ \& \$ etc escape sequences.
+        if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
+          // This is the \:: escape sequence.
           TokenPtr++;
-          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
-            // This is the \:: escape sequence.
-            TokenPtr++;
-          }
-          StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
-          formTokenWithChars(T, TokenPtr, tok::text);
-          T.setText(UnescapedText);
-          return;
         }
+        StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
+        formTokenWithChars(T, TokenPtr, tok::text);
+        T.setText(UnescapedText);
+        return true;
+      }
 
-        // Don't make zero-length commands.
-        if (!isCommandNameStartCharacter(*TokenPtr)) {
-          formTextToken(T, TokenPtr);
-          return;
-        }
+      // Don't make zero-length commands.
+      if (!isCommandNameStartCharacter(*TokenPtr)) {
+        formTextToken(T, TokenPtr);
+        return true;
+      }
 
-        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
-        unsigned Length = TokenPtr - (BufferPtr + 1);
-
-        // Hardcoded support for lexing LaTeX formula commands
-        // \f$ \f[ \f] \f{ \f} as a single command.
-        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
-          C = *TokenPtr;
-          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
-            TokenPtr++;
-            Length++;
-          }
-        }
+      TokenPtr = skipCommandName(TokenPtr, CommentEnd);
+      unsigned Length = TokenPtr - (BufferPtr + 1);
 
-        StringRef CommandName(BufferPtr + 1, Length);
-
-        const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
-        if (!Info) {
-          if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
-            StringRef CorrectedName = Info->Name;
-            SourceLocation Loc = getSourceLocation(BufferPtr);
-            SourceLocation EndLoc = getSourceLocation(TokenPtr);
-            SourceRange FullRange = SourceRange(Loc, EndLoc);
-            SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
-            Diag(Loc, diag::warn_correct_comment_command_name)
-              << FullRange << CommandName << CorrectedName
-              << FixItHint::CreateReplacement(CommandRange, CorrectedName);
-          } else {
-            formTokenWithChars(T, TokenPtr, tok::unknown_command);
-            T.setUnknownCommandName(CommandName);
-            Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
-                << SourceRange(T.getLocation(), T.getEndLocation());
-            return;
-          }
-        }
-        if (Info->IsVerbatimBlockCommand) {
-          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
-          return;
+      // Hardcoded support for lexing LaTeX formula commands
+      // \f$ \f[ \f] \f{ \f} as a single command.
+      if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
+        C = *TokenPtr;
+        if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
+          TokenPtr++;
+          Length++;
         }
-        if (Info->IsVerbatimLineCommand) {
-          setupAndLexVerbatimLine(T, TokenPtr, Info);
-          return;
+      }
+
+      StringRef CommandName(BufferPtr + 1, Length);
+
+      const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
+      if (!Info) {
+        if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
+          StringRef CorrectedName = Info->Name;
+          SourceLocation Loc = getSourceLocation(BufferPtr);
+          SourceLocation EndLoc = getSourceLocation(TokenPtr);
+          SourceRange FullRange = SourceRange(Loc, EndLoc);
+          SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
+          Diag(Loc, diag::warn_correct_comment_command_name)
+            << FullRange << CommandName << CorrectedName
+            << FixItHint::CreateReplacement(CommandRange, CorrectedName);
+        } else {
+          formTokenWithChars(T, TokenPtr, tok::unknown_command);
+          T.setUnknownCommandName(CommandName);
+          Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
+              << SourceRange(T.getLocation(), T.getEndLocation());
+          return true;
         }
-        formTokenWithChars(T, TokenPtr, CommandKind);
-        T.setCommandID(Info->getID());
-        return;
       }
+      if (Info->IsVerbatimBlockCommand) {
+        setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
+        return true;
+      }
+      if (Info->IsVerbatimLineCommand) {
+        setupAndLexVerbatimLine(T, TokenPtr, Info);
+        return true;
+      }
+      formTokenWithChars(T, TokenPtr, CommandKind);
+      T.setCommandID(Info->getID());
+      return true;
+    }
 
-      case '&':
-        lexHTMLCharacterReference(T);
-        return;
+    case '&':
+      lexHTMLCharacterReference(T);
+      return true;
 
-      case '<': {
-        TokenPtr++;
-        if (TokenPtr == CommentEnd) {
-          formTextToken(T, TokenPtr);
-          return;
-        }
-        const char C = *TokenPtr;
-        if (isHTMLIdentifierStartingCharacter(C))
-          setupAndLexHTMLStartTag(T);
-        else if (C == '/')
-          setupAndLexHTMLEndTag(T);
-        else
-          formTextToken(T, TokenPtr);
-        return;
+    case '<': {
+      TokenPtr++;
+      if (TokenPtr == CommentEnd) {
+        formTextToken(T, TokenPtr);
+        return true;
       }
+      const char C = *TokenPtr;
+      if (isHTMLIdentifierStartingCharacter(C))
+        setupAndLexHTMLStartTag(T);
+      else if (C == '/')
+        setupAndLexHTMLEndTag(T);
+      else
+        formTextToken(T, TokenPtr);
+      return true;
+    }
 
-      case '\n':
-      case '\r':
+    default:
+      // Signal lexing code for an ordinary comment should be run instead.
+      return false;
+  }
+}
+
+void Lexer::lexCommentTextWithoutCommands(Token &T) {
+  assert(CommentState == LCS_InsideBCPLComment ||
+         CommentState == LCS_InsideCComment);
+  assert(State == LS_Normal);
+
+  const char *TokenPtr = BufferPtr;
+  assert(TokenPtr < CommentEnd);
+  switch (*TokenPtr) {
+    case '\n':
+    case '\r':
         TokenPtr = skipNewline(TokenPtr, CommentEnd);
         formTokenWithChars(T, TokenPtr, tok::newline);
 
         if (CommentState == LCS_InsideCComment)
-          skipLineStartingDecorations();
+        skipLineStartingDecorations();
         return;
 
-      default: {
-        size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
-                         find_first_of("\n\r\\@&<");
+    default: { // Without command parsing only newlines end a text token.
+        size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
+                         .find_first_of(ParseCommands ? "\n\r\\@&<" : "\n\r");
         if (End != StringRef::npos)
-          TokenPtr += End;
+        TokenPtr += End;
         else
-          TokenPtr = CommentEnd;
+        TokenPtr = CommentEnd;
         formTextToken(T, TokenPtr);
         return;
-      }
     }
   }
 }
@@ -727,14 +749,13 @@
 }
 
 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
-             const CommandTraits &Traits,
-             SourceLocation FileLoc,
-             const char *BufferStart, const char *BufferEnd):
-    Allocator(Allocator), Diags(Diags), Traits(Traits),
-    BufferStart(BufferStart), BufferEnd(BufferEnd),
-    FileLoc(FileLoc), BufferPtr(BufferStart),
-    CommentState(LCS_BeforeComment), State(LS_Normal) {
-}
+             const CommandTraits &Traits, SourceLocation FileLoc,
+             const char *BufferStart, const char *BufferEnd,
+             bool ParseCommands)
+    : Allocator(Allocator), Diags(Diags), Traits(Traits),
+      BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
+      BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal),
+      ParseCommands(ParseCommands) {}
 
 void Lexer::lex(Token &T) {
 again:
Index: include/clang/AST/RawCommentList.h
===================================================================
--- include/clang/AST/RawCommentList.h
+++ include/clang/AST/RawCommentList.h
@@ -111,6 +111,29 @@
     return extractBriefText(Context);
   }
 
+  /// Returns sanitized comment text, suitable for presentation in editor UIs.
+  /// E.g. will transform:
+  ///     // This is a long multiline comment.
+  ///     //   Parts of it  might be indented.
+  ///     /* The comments styles might be mixed. */
+  ///  into
+  ///     "This is a long multiline comment.\n"
+  ///     "  Parts of it  might be indented.\n"
+  ///     "The comments styles might be mixed."
+  /// Also removes leading indentation and sanitizes some common cases:
+  ///     /* This is a first line.
+  ///      *   This is a second line. It is indented.
+  ///      * This is a third line. */
+  /// and
+  ///     /* This is a first line.
+  ///          This is a second line. It is indented.
+  ///     This is a third line. */
+  /// will both turn into:
+  ///     "This is a first line.\n"
+  ///     "  This is a second line. It is indented.\n"
+  ///     "This is a third line."
+  std::string getFormattedText(const ASTContext &Context) const;
+
   /// Parse the comment, assuming it is attached to decl \c D.
   comments::FullComment *parse(const ASTContext &Context,
                                const Preprocessor *PP, const Decl *D) const;
Index: include/clang/AST/CommentLexer.h
===================================================================
--- include/clang/AST/CommentLexer.h
+++ include/clang/AST/CommentLexer.h
@@ -281,6 +281,11 @@
   /// command, including command marker.
   SmallString<16> VerbatimBlockEndCommandName;
 
+  /// If true, the commands, html tags, etc will be parsed and reported as
+  /// separate tokens inside the comment body. If false, the comment text will
+  /// be parsed into text and newline tokens.
+  bool ParseCommands;
+
   /// Given a character reference name (e.g., "lt"), return the character that
   /// it stands for (e.g., "<").
   StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
@@ -315,12 +320,22 @@
   /// Eat string matching regexp \code \s*\* \endcode.
   void skipLineStartingDecorations();
 
-  /// Lex stuff inside comments.  CommentEnd should be set correctly.
+  /// Lex stuff inside comments: tries tryLexCommands first when ParseCommands
+  /// is set, and falls back to lexCommentTextWithoutCommands otherwise.
   void lexCommentText(Token &T);
 
-  void setupAndLexVerbatimBlock(Token &T,
-                                const char *TextBegin,
-                                char Marker, const CommandInfo *Info);
+  /// Try to lex commands, html tags and verbatim blocks inside comment text.
+  /// CommentEnd should be set correctly.
+  /// Returns true if command was lexed and no further processing is required.
+  /// Returns false if no special tokens were found and lexing of ordinary comment text needs to be performed instead.
+  bool tryLexCommands(Token &T);
+
+  /// Lex only newlines and text inside comments. CommentEnd should be set
+  /// correctly.
+  void lexCommentTextWithoutCommands(Token &T);
+
+  void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
+                                const CommandInfo *Info);
 
   void lexVerbatimBlockFirstLine(Token &T);
 
@@ -343,14 +358,13 @@
 
 public:
   Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
-        const CommandTraits &Traits,
-        SourceLocation FileLoc,
-        const char *BufferStart, const char *BufferEnd);
+        const CommandTraits &Traits, SourceLocation FileLoc,
+        const char *BufferStart, const char *BufferEnd,
+        bool ParseCommands = true);
 
   void lex(Token &T);
 
-  StringRef getSpelling(const Token &Tok,
-                        const SourceManager &SourceMgr,
+  StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr,
                         bool *Invalid = nullptr) const;
 };

_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D46000: [AST] Added a helper to extract a user-friendly text of a comment.

Reply via email to