ilya-biryukov updated this revision to Diff 143928.
ilya-biryukov marked an inline comment as done.
ilya-biryukov added a comment.
- Attempt to reuse lexing code with/without command parsing.
- Get rid of SkipWs.
Repository:
rC Clang
https://reviews.llvm.org/D46000
Files:
include/clang/AST/CommentLexer.h
include/clang/AST/RawCommentList.h
lib/AST/CommentLexer.cpp
lib/AST/RawCommentList.cpp
Index: lib/AST/RawCommentList.cpp
===================================================================
--- lib/AST/RawCommentList.cpp
+++ lib/AST/RawCommentList.cpp
@@ -335,3 +335,88 @@
BeforeThanCompare<RawComment>(SourceMgr));
std::swap(Comments, MergedComments);
}
+
+std::string RawComment::getFormattedText(const ASTContext &Ctx) const {
+ auto &SourceMgr = Ctx.getSourceManager();
+ llvm::StringRef CommentText = getRawText(SourceMgr);
+ if (CommentText.empty())
+ return ""; // We couldn't retrieve the comment.
+
+ llvm::BumpPtrAllocator Allocator;
+ comments::Lexer L(Allocator, Ctx.getDiagnostics(),
+ Ctx.getCommentCommandTraits(), getSourceRange().getBegin(),
+ CommentText.begin(), CommentText.end(),
+ /*ParseCommands=*/false);
+
+ std::string Result;
+ unsigned IndentColumn = 0;
+
+ // Processes one line of the comment and adds it to the result.
+ // Handles skipping the indent at the start of the line.
+ // Returns false when eof is reached and true otherwise.
+ auto LexLine = [&](bool IsFirstLine) -> bool {
+ comments::Token Tok;
+ // Lex the first token on the line. We handle it separately, because we need
+ // to fix up its indentation.
+ L.lex(Tok);
+ if (Tok.is(comments::tok::eof))
+ return false;
+ if (Tok.is(comments::tok::newline)) {
+ Result += "\n";
+ return true;
+ }
+ llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr);
+ bool LocInvalid = false;
+ unsigned TokColumn =
+ SourceMgr.getSpellingColumnNumber(Tok.getLocation(), &LocInvalid);
+ if (LocInvalid)
+ TokColumn = 0;
+ // Compute the length of whitespace we're allowed to skip.
+ size_t MaxSkip;
+ if (IsFirstLine) {
+ // For the first line we skip all leading ws.
+ MaxSkip = std::numeric_limits<size_t>::max();
+ } else {
+ // For the rest, we skip up to the column of the first non-ws symbol on
+ // the first line; nothing is skipped if the token starts past it.
+ MaxSkip = IndentColumn > TokColumn ? IndentColumn - TokColumn : 0;
+ }
+ // Amount of leading whitespace in TokText.
+ size_t WhitespaceLen = TokText.find_first_not_of(" \t");
+ if (WhitespaceLen == StringRef::npos)
+ WhitespaceLen = TokText.size();
+ // Remember the amount of whitespace we skipped in the first line to remove
+ // indent up to that column in the following lines.
+ if (IsFirstLine)
+ IndentColumn = TokColumn + WhitespaceLen;
+
+ llvm::StringRef Trimmed = TokText.drop_front(std::min(MaxSkip, WhitespaceLen));
+ Result += Trimmed;
+ // Lex all tokens in the rest of the line.
+ for (L.lex(Tok); Tok.isNot(comments::tok::eof); L.lex(Tok)) {
+ if (Tok.is(comments::tok::newline)) {
+ Result += "\n";
+ return true;
+ }
+ Result += L.getSpelling(Tok, SourceMgr);
+ }
+ // We've reached eof, i.e. the end of the whole comment.
+ return false;
+ };
+
+ auto DropTrailingNewLines = [](std::string &Str) {
+ while (!Str.empty() && Str.back() == '\n') // back() on "" is undefined.
+ Str.pop_back();
+ };
+
+ // Process first line separately to remember indent for the following lines.
+ if (!LexLine(/*IsFirstLine=*/true)) {
+ DropTrailingNewLines(Result);
+ return Result;
+ }
+ // Process the rest of the lines.
+ while (LexLine(/*IsFirstLine=*/false))
+ ;
+ DropTrailingNewLines(Result);
+ return Result;
+}
Index: lib/AST/CommentLexer.cpp
===================================================================
--- lib/AST/CommentLexer.cpp
+++ lib/AST/CommentLexer.cpp
@@ -291,159 +291,181 @@
}
void Lexer::lexCommentText(Token &T) {
+ // With command parsing enabled, tryLexCommands gets the first chance and
+ // returns true once it has produced a token; otherwise lex plain text.
+ if (ParseCommands && tryLexCommands(T))
+ return;
+ lexCommentTextWithoutCommands(T);
+}
+
+bool Lexer::tryLexCommands(Token &T) {
assert(CommentState == LCS_InsideBCPLComment ||
CommentState == LCS_InsideCComment);
+ if (!ParseCommands) // lexCommentText tests this too before calling.
+ return false;
switch (State) {
case LS_Normal:
break;
case LS_VerbatimBlockFirstLine:
lexVerbatimBlockFirstLine(T);
- return;
+ return true;
case LS_VerbatimBlockBody:
lexVerbatimBlockBody(T);
- return;
+ return true;
case LS_VerbatimLineText:
lexVerbatimLineText(T);
- return;
+ return true;
case LS_HTMLStartTag:
lexHTMLStartTag(T);
- return;
+ return true;
case LS_HTMLEndTag:
lexHTMLEndTag(T);
- return;
+ return true;
}
assert(State == LS_Normal);
const char *TokenPtr = BufferPtr;
assert(TokenPtr < CommentEnd);
- while (TokenPtr != CommentEnd) {
- switch(*TokenPtr) {
- case '\\':
- case '@': {
- // Commands that start with a backslash and commands that start with
- // 'at' have equivalent semantics. But we keep information about the
- // exact syntax in AST for comments.
- tok::TokenKind CommandKind =
- (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
+ switch(*TokenPtr) { // Only the first character selects the token kind.
+ case '\\':
+ case '@': {
+ // Commands that start with a backslash and commands that start with
+ // 'at' have equivalent semantics. But we keep information about the
+ // exact syntax in AST for comments.
+ tok::TokenKind CommandKind =
+ (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
+ TokenPtr++;
+ if (TokenPtr == CommentEnd) {
+ formTextToken(T, TokenPtr);
+ return true;
+ }
+ char C = *TokenPtr;
+ switch (C) {
+ default:
+ break;
+
+ case '\\': case '@': case '&': case '$':
+ case '#': case '<': case '>': case '%':
+ case '\"': case '.': case ':':
+ // This is one of \\ \@ \& \$ etc escape sequences.
TokenPtr++;
- if (TokenPtr == CommentEnd) {
- formTextToken(T, TokenPtr);
- return;
- }
- char C = *TokenPtr;
- switch (C) {
- default:
- break;
-
- case '\\': case '@': case '&': case '$':
- case '#': case '<': case '>': case '%':
- case '\"': case '.': case ':':
- // This is one of \\ \@ \& \$ etc escape sequences.
+ if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
+ // This is the \:: escape sequence.
TokenPtr++;
- if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
- // This is the \:: escape sequence.
- TokenPtr++;
- }
- StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
- formTokenWithChars(T, TokenPtr, tok::text);
- T.setText(UnescapedText);
- return;
}
+ StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
+ formTokenWithChars(T, TokenPtr, tok::text);
+ T.setText(UnescapedText);
+ return true;
+ }
- // Don't make zero-length commands.
- if (!isCommandNameStartCharacter(*TokenPtr)) {
- formTextToken(T, TokenPtr);
- return;
- }
+ // Don't make zero-length commands.
+ if (!isCommandNameStartCharacter(*TokenPtr)) {
+ formTextToken(T, TokenPtr);
+ return true;
+ }
- TokenPtr = skipCommandName(TokenPtr, CommentEnd);
- unsigned Length = TokenPtr - (BufferPtr + 1);
-
- // Hardcoded support for lexing LaTeX formula commands
- // \f$ \f[ \f] \f{ \f} as a single command.
- if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
- C = *TokenPtr;
- if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
- TokenPtr++;
- Length++;
- }
- }
+ TokenPtr = skipCommandName(TokenPtr, CommentEnd);
+ unsigned Length = TokenPtr - (BufferPtr + 1);
- StringRef CommandName(BufferPtr + 1, Length);
-
- const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
- if (!Info) {
- if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
- StringRef CorrectedName = Info->Name;
- SourceLocation Loc = getSourceLocation(BufferPtr);
- SourceLocation EndLoc = getSourceLocation(TokenPtr);
- SourceRange FullRange = SourceRange(Loc, EndLoc);
- SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
- Diag(Loc, diag::warn_correct_comment_command_name)
- << FullRange << CommandName << CorrectedName
- << FixItHint::CreateReplacement(CommandRange, CorrectedName);
- } else {
- formTokenWithChars(T, TokenPtr, tok::unknown_command);
- T.setUnknownCommandName(CommandName);
- Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
- << SourceRange(T.getLocation(), T.getEndLocation());
- return;
- }
- }
- if (Info->IsVerbatimBlockCommand) {
- setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
- return;
+ // Hardcoded support for lexing LaTeX formula commands
+ // \f$ \f[ \f] \f{ \f} as a single command.
+ if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
+ C = *TokenPtr;
+ if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
+ TokenPtr++;
+ Length++;
}
- if (Info->IsVerbatimLineCommand) {
- setupAndLexVerbatimLine(T, TokenPtr, Info);
- return;
+ }
+
+ StringRef CommandName(BufferPtr + 1, Length);
+
+ const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
+ if (!Info) {
+ if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
+ StringRef CorrectedName = Info->Name;
+ SourceLocation Loc = getSourceLocation(BufferPtr);
+ SourceLocation EndLoc = getSourceLocation(TokenPtr);
+ SourceRange FullRange = SourceRange(Loc, EndLoc);
+ SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
+ Diag(Loc, diag::warn_correct_comment_command_name)
+ << FullRange << CommandName << CorrectedName
+ << FixItHint::CreateReplacement(CommandRange, CorrectedName);
+ } else {
+ formTokenWithChars(T, TokenPtr, tok::unknown_command);
+ T.setUnknownCommandName(CommandName);
+ Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
+ << SourceRange(T.getLocation(), T.getEndLocation());
+ return true;
}
- formTokenWithChars(T, TokenPtr, CommandKind);
- T.setCommandID(Info->getID());
- return;
}
+ if (Info->IsVerbatimBlockCommand) {
+ setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
+ return true;
+ }
+ if (Info->IsVerbatimLineCommand) {
+ setupAndLexVerbatimLine(T, TokenPtr, Info);
+ return true;
+ }
+ formTokenWithChars(T, TokenPtr, CommandKind);
+ T.setCommandID(Info->getID());
+ return true;
+ }
- case '&':
- lexHTMLCharacterReference(T);
- return;
+ case '&':
+ lexHTMLCharacterReference(T);
+ return true;
- case '<': {
- TokenPtr++;
- if (TokenPtr == CommentEnd) {
- formTextToken(T, TokenPtr);
- return;
- }
- const char C = *TokenPtr;
- if (isHTMLIdentifierStartingCharacter(C))
- setupAndLexHTMLStartTag(T);
- else if (C == '/')
- setupAndLexHTMLEndTag(T);
- else
- formTextToken(T, TokenPtr);
- return;
+ case '<': {
+ TokenPtr++;
+ if (TokenPtr == CommentEnd) {
+ formTextToken(T, TokenPtr);
+ return true;
}
+ const char C = *TokenPtr;
+ if (isHTMLIdentifierStartingCharacter(C))
+ setupAndLexHTMLStartTag(T);
+ else if (C == '/')
+ setupAndLexHTMLEndTag(T);
+ else
+ formTextToken(T, TokenPtr);
+ return true;
+ }
- case '\n':
- case '\r':
+ default:
+ // Signal that the lexing code for ordinary comment text should run instead.
+ return false;
+ }
+}
+
+void Lexer::lexCommentTextWithoutCommands(Token &T) {
+ assert(CommentState == LCS_InsideBCPLComment ||
+ CommentState == LCS_InsideCComment);
+ assert(State == LS_Normal);
+
+ const char *TokenPtr = BufferPtr;
+ assert(TokenPtr < CommentEnd);
+ switch (*TokenPtr) {
+ case '\n':
+ case '\r':
TokenPtr = skipNewline(TokenPtr, CommentEnd);
formTokenWithChars(T, TokenPtr, tok::newline);
if (CommentState == LCS_InsideCComment)
- skipLineStartingDecorations();
+ skipLineStartingDecorations();
return;
- default: {
- size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
- find_first_of("\n\r\\@&<");
+ default: { // Command markers only end the token when commands are lexed.
+ size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
+ .find_first_of(ParseCommands ? "\n\r\\@&<" : "\n\r");
if (End != StringRef::npos)
- TokenPtr += End;
+ TokenPtr += End;
else
- TokenPtr = CommentEnd;
+ TokenPtr = CommentEnd;
formTextToken(T, TokenPtr);
return;
- }
}
}
}
@@ -727,14 +749,13 @@
}
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
- const CommandTraits &Traits,
- SourceLocation FileLoc,
- const char *BufferStart, const char *BufferEnd):
- Allocator(Allocator), Diags(Diags), Traits(Traits),
- BufferStart(BufferStart), BufferEnd(BufferEnd),
- FileLoc(FileLoc), BufferPtr(BufferStart),
- CommentState(LCS_BeforeComment), State(LS_Normal) {
-}
+ const CommandTraits &Traits, SourceLocation FileLoc,
+ const char *BufferStart, const char *BufferEnd,
+ bool ParseCommands)
+ : Allocator(Allocator), Diags(Diags), Traits(Traits),
+ BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
+ BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal),
+ ParseCommands(ParseCommands) {} // Declared with default ParseCommands = true.
void Lexer::lex(Token &T) {
again:
Index: include/clang/AST/RawCommentList.h
===================================================================
--- include/clang/AST/RawCommentList.h
+++ include/clang/AST/RawCommentList.h
@@ -111,6 +111,29 @@
return extractBriefText(Context);
}
+ /// Returns sanitized comment text, suitable for presentation in editor UIs.
+ /// E.g. will transform:
+ /// // This is a long multiline comment.
+ /// // Parts of it might be indented.
+ /// /* The comments styles might be mixed. */
+ /// into
+ /// "This is a long multiline comment.\n"
+ /// " Parts of it might be indented.\n"
+ /// "The comments styles might be mixed."
+ /// Also removes leading indentation and sanitizes some common cases:
+ /// /* This is a first line.
+ /// * This is a second line. It is indented.
+ /// * This is a third line. */
+ /// and
+ /// /* This is a first line.
+ /// This is a second line. It is indented.
+ /// This is a third line. */
+ /// will both turn into:
+ /// "This is a first line.\n"
+ /// " This is a second line. It is indented.\n"
+ /// "This is a third line."
+ std::string getFormattedText(const ASTContext &Context) const;
+
/// Parse the comment, assuming it is attached to decl \c D.
comments::FullComment *parse(const ASTContext &Context,
const Preprocessor *PP, const Decl *D) const;
Index: include/clang/AST/CommentLexer.h
===================================================================
--- include/clang/AST/CommentLexer.h
+++ include/clang/AST/CommentLexer.h
@@ -281,6 +281,11 @@
/// command, including command marker.
SmallString<16> VerbatimBlockEndCommandName;
+ /// If true, the commands, html tags, etc will be parsed and reported as
+ /// separate tokens inside the comment body. If false, the comment text will
+ /// be parsed into text and newline tokens.
+ bool ParseCommands;
+
/// Given a character reference name (e.g., "lt"), return the character that
/// it stands for (e.g., "<").
StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
@@ -315,12 +320,22 @@
/// Eat string matching regexp \code \s*\* \endcode.
void skipLineStartingDecorations();
- /// Lex stuff inside comments. CommentEnd should be set correctly.
+ /// Lex stuff inside comments. Uses tryLexCommands when ParseCommands is set;
+ /// if no command was lexed, falls back to lexCommentTextWithoutCommands.
void lexCommentText(Token &T);
- void setupAndLexVerbatimBlock(Token &T,
- const char *TextBegin,
- char Marker, const CommandInfo *Info);
+ /// Try to lex commands, html tags and verbatim blocks inside comment text.
+ /// CommentEnd should be set correctly.
+ /// Returns true if command was lexed and no further processing is required.
+ /// Returns false if nothing special was found and plain text must be lexed.
+ bool tryLexCommands(Token &T);
+
+ /// Lex only newlines and text inside comments. CommentEnd should be set
+ /// correctly.
+ void lexCommentTextWithoutCommands(Token &T);
+
+ void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
+ const CommandInfo *Info);
void lexVerbatimBlockFirstLine(Token &T);
@@ -343,14 +358,13 @@
public:
Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
- const CommandTraits &Traits,
- SourceLocation FileLoc,
- const char *BufferStart, const char *BufferEnd);
+ const CommandTraits &Traits, SourceLocation FileLoc,
+ const char *BufferStart, const char *BufferEnd,
+ bool ParseCommands = true);
void lex(Token &T);
- StringRef getSpelling(const Token &Tok,
- const SourceManager &SourceMgr,
+ StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr,
bool *Invalid = nullptr) const;
};
_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits