ymandel created this revision.
ymandel added a reviewer: gribozavr.
Herald added a project: clang.
This patch adds `getAssociatedRange` which, for a given decl, computes preceding
and trailing text that would conceptually be associated with the decl by the
reader. This includes comments, whitespace, and separators like ';'.
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D72153
Files:
clang/include/clang/Tooling/Transformer/SourceCode.h
clang/lib/Tooling/Transformer/SourceCode.cpp
clang/unittests/Tooling/SourceCodeTest.cpp
Index: clang/unittests/Tooling/SourceCodeTest.cpp
===================================================================
--- clang/unittests/Tooling/SourceCodeTest.cpp
+++ clang/unittests/Tooling/SourceCodeTest.cpp
@@ -17,6 +17,8 @@
using namespace clang;
using llvm::ValueIs;
+using testing::Eq;
+using tooling::getAssociatedRange;
using tooling::getExtendedText;
using tooling::getRangeForEdit;
using tooling::getText;
@@ -41,12 +43,43 @@
std::function<void(CallExpr *, ASTContext *Context)> OnCall;
};
+struct DeclaratorDeclsVisitor : TestVisitor<DeclaratorDeclsVisitor> {
+ bool VisitDeclaratorDecl(DeclaratorDecl *Decl) {
+ OnDecl(Decl, Context); // Hand the decl and the visitor's `Context` to the callback.
+ return true; // NOTE(review): presumably means "keep traversing" -- confirm against TestVisitor.
+ }
+ // Called unconditionally for each decl above, so it must be set before running. NOTE(review): the getAssociatedRange test declares its own local struct of this name.
+ std::function<void(DeclaratorDecl *, ASTContext *Context)> OnDecl;
+};
+
// Equality matcher for `clang::CharSourceRange`, which lacks `operator==`.
MATCHER_P(EqualsRange, R, "") {
return arg.isTokenRange() == R.isTokenRange() &&
arg.getBegin() == R.getBegin() && arg.getEnd() == R.getEnd();
}
+MATCHER_P2(EqualsAnnotatedRange, SM, R, "") { // Compares a source range with the annotated offsets in `R`.
+ if (arg.getBegin().isMacroID()) {
+ *result_listener << "which starts in a macro";
+ return false;
+ }
+ if (arg.getEnd().isMacroID()) {
+ *result_listener << "which ends in a macro";
+ return false;
+ }
+ // Neither endpoint is a macro location here; translate both to offsets via `SM`.
+ unsigned Begin = SM->getFileOffset(arg.getBegin());
+ unsigned End = SM->getFileOffset(arg.getEnd());
+ // Report the observed offsets, closing with "]" for token ranges and ")" otherwise.
+ *result_listener << "which is [" << Begin << ",";
+ if (arg.isTokenRange()) {
+ *result_listener << End << "]";
+ return Begin == R.Begin && End + 1 == R.End; // Matches only when `R.End` is exactly one past `End`.
+ }
+ *result_listener << End << ")";
+ return Begin == R.Begin && End == R.End;
+}
+
static ::testing::Matcher<CharSourceRange> AsRange(const SourceManager &SM,
llvm::Annotations::Range R) {
return EqualsRange(CharSourceRange::getCharRange(
@@ -122,6 +155,95 @@
Visitor.runOver("int foo() { return foo() + 3; }");
}
+TEST(SourceCodeTest, getAssociatedRange) {
+ struct DeclaratorDeclsVisitor : TestVisitor<DeclaratorDeclsVisitor> {
+ llvm::Annotations Code;
+ // `Code` is the annotated source of the current case; its "r" range is the expected result.
+ DeclaratorDeclsVisitor() : Code("$r[[]]") {}
+ bool VisitDeclaratorDecl(DeclaratorDecl *Decl) {
+ EXPECT_THAT(
+ getAssociatedRange(*Decl, *Context),
+ EqualsAnnotatedRange(&Context->getSourceManager(), Code.range("r")))
+ << Code.code();
+ return true;
+ }
+ bool runOverWithComments(StringRef Code) {
+ std::vector<std::string> Args = {"-std=c++11", "-fparse-all-comments"};
+ return tooling::runToolOnCodeWithArgs(CreateTestAction(), Code, Args);
+ }
+ };
+ // One visitor is reused for every case: each case replaces `Code` and then runs over it.
+ DeclaratorDeclsVisitor Visitor;
+
+ // Includes semicolon.
+ Visitor.Code = llvm::Annotations("$r[[int x = 4;]]");
+ Visitor.runOver(Visitor.Code.code());
+
+ // Includes newline and semicolon.
+ Visitor.Code = llvm::Annotations("$r[[int x = 4;\n]]");
+ Visitor.runOver(Visitor.Code.code());
+
+ // Includes trailing comments.
+ Visitor.Code = llvm::Annotations("$r[[int x = 4; // Comment\n]]");
+ Visitor.runOver(Visitor.Code.code());
+ Visitor.Code = llvm::Annotations("$r[[int x = 4; /* Comment */\n]]");
+ Visitor.runOver(Visitor.Code.code());
+
+ // Does *not* include trailing comments when another entity appears between
+ // the decl and the comment.
+ Visitor.Code = llvm::Annotations("$r[[int x = 4;]] class C {}; // Comment\n");
+ Visitor.runOver(Visitor.Code.code());
+
+ // Includes leading comments.
+ Visitor.Code = llvm::Annotations("$r[[// Comment.\nint x = 4;]]");
+ Visitor.runOverWithComments(Visitor.Code.code());
+ Visitor.Code = llvm::Annotations("$r[[// Comment.\nint x = 4;\n]]");
+ Visitor.runOverWithComments(Visitor.Code.code());
+ Visitor.Code = llvm::Annotations("$r[[/* Comment.*/\nint x = 4;\n]]");
+ Visitor.runOverWithComments(Visitor.Code.code());
+ // ... even when separated by multiple empty lines.
+ Visitor.Code = llvm::Annotations("$r[[// Comment.\n\n\nint x = 4;\n]]");
+ Visitor.runOverWithComments(Visitor.Code.code());
+
+ // Includes multi-line comments.
+ Visitor.Code = llvm::Annotations(R"cpp(
+ $r[[/* multi
+ * line
+ * comment
+ */
+ int x;]])cpp");
+ Visitor.runOverWithComments(Visitor.Code.code());
+ Visitor.Code = llvm::Annotations(R"cpp(
+ $r[[// multi
+ // line
+ // comment
+ int x;]])cpp");
+ Visitor.runOverWithComments(Visitor.Code.code());
+
+ // Does not include comments before a *series* of declarations.
+ Visitor.Code = llvm::Annotations("// Comment.\n$r[[int x = 4;\n]]class foo {};\n");
+ Visitor.runOverWithComments(Visitor.Code.code());
+
+ // Includes attributes.
+ Visitor.Code = llvm::Annotations(R"cpp(
+ #define ATTR __attribute__((deprecated("message")))
+ $r[[ATTR
+ int x;]])cpp");
+ Visitor.runOverWithComments(Visitor.Code.code());
+
+ // Includes attributes and comments together.
+ Visitor.Code = llvm::Annotations(R"cpp(
+ #define ATTR __attribute__((deprecated("message")))
+ $r[[ATTR
+ // Commment.
+ int x;]])cpp");
+ Visitor.runOverWithComments(Visitor.Code.code());
+
+ // Includes comments even in the presence of trailing whitespace.
+ Visitor.Code = llvm::Annotations("$r[[// Comment.\nint x = 4;]] ");
+ Visitor.runOverWithComments(Visitor.Code.code());
+}
+
TEST(SourceCodeTest, EditRangeWithMacroExpansionsShouldSucceed) {
// The call expression, whose range we are extracting, includes two macro
// expansions.
Index: clang/lib/Tooling/Transformer/SourceCode.cpp
===================================================================
--- clang/lib/Tooling/Transformer/SourceCode.cpp
+++ clang/lib/Tooling/Transformer/SourceCode.cpp
@@ -10,6 +10,13 @@
//
//===----------------------------------------------------------------------===//
#include "clang/Tooling/Transformer/SourceCode.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/Attr.h"
+#include "clang/AST/Comment.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/DeclCXX.h"
+#include "clang/AST/DeclTemplate.h"
+#include "clang/AST/Expr.h"
#include "clang/Lex/Lexer.h"
using namespace clang;
@@ -63,3 +70,282 @@
return Range;
}
+
+// Builds a lexer over the buffer that contains `Loc`, with its read position
+// set to the offset of `Loc` inside that buffer.
+static std::unique_ptr<Lexer> initLexer(const SourceManager &SM,
+                                        SourceLocation Loc,
+                                        const LangOptions &LangOpts) {
+  const auto Decomposed = SM.getDecomposedLoc(Loc);
+  bool BufferInvalid = false;
+  const llvm::StringRef Buffer =
+      SM.getBufferData(Decomposed.first, &BufferInvalid);
+  assert(!BufferInvalid && "Cannot get file/offset");
+  // Start lexing at `Loc`, not at the beginning of the buffer.
+  const char *const StartAt = Buffer.data() + Decomposed.second;
+  return std::make_unique<Lexer>(SM.getLocForStartOfFile(Decomposed.first),
+                                 LangOpts, Buffer.begin(), StartAt,
+                                 Buffer.end());
+}
+
+// Tells whether the first character at the location of `Tok` is vertical
+// whitespace.
+static bool startsWithNewline(const SourceManager &SM, const Token &Tok) {
+  const char *const Spelling = SM.getCharacterData(Tok.getLocation());
+  return isVerticalWhitespace(*Spelling);
+}
+
+// Tells whether the kind of `Tok` is a member of `Terminators`.
+static bool contains(const std::set<tok::TokenKind> &Terminators,
+                     const Token &Tok) {
+  return Terminators.find(Tok.getKind()) != Terminators.end();
+}
+
+// Returns the location of the last token that is associated with an entity
+// whose last token starts at 'EntityLast'. The returned location is an
+// expansion location.
+//
+// Associated tokens include comments, horizontal whitespace and 'Terminators'
+// -- optional tokens, which, if any are found, will be included; if
+// 'Terminators' is empty, we will not include any extra tokens beyond comments
+// and horizontal whitespace.
+static SourceLocation getEntityEndLoc(
+ const SourceManager& SM, SourceLocation EntityLast,
+ const std::set<tok::TokenKind>& Terminators,
+ const LangOptions& LangOpts) {
+ assert (EntityLast.isValid() && "Invalid end location found.");
+
+ // We remember the last location of a non-horizontal-whitespace token we have
+ // lexed; this is the location up to which we will want to delete.
+ // FIXME: Support using the spelling loc here for cases where we want to
+ // analyze the macro text.
+ SourceLocation Last = SM.getExpansionRange(EntityLast).getEnd();
+ std::unique_ptr<Lexer> Lexer = initLexer(SM, Last, LangOpts);
+ // Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown).
+ Lexer->SetKeepWhitespaceMode(true);
+
+ // Generally, the code we want to include looks like this ([] are optional),
+ // If Terminators is not empty:
+ // ... <terminator> [ <comment> ] [ <newline> ]
+ // Otherwise:
+ // [ <comment> ] [ <newline> ]
+
+ Token Tok;
+ bool Terminated = false;
+
+ // First, lex to the current token (which is the last token of the range that
+ // we know to be deleted). Then, we process the first token separately from
+ // the rest based on conditions that hold specifically for that first token.
+ //
+ // We do not search for a terminator if none is required or we've already
+ // encountered it. Also, if the original `EntityLast` location was in a macro
+ // expansion, we don't have visibility into the text, so we assume we've
+ // already terminated.
+ //
+ // FIXME: This handling of macros is too conservative. When the end of the
+ // expansion coincides with the end of the node, we can still safely
+ // analyze. But, it is more complicated, because we need to start by lexing
+ // the spelling loc for the first token and then switch to the expansion loc.
+ //
+ // (EntityLast.isMacroID() &&
+ // !Lexer::isAtEndOfMacroExpansion(EntityLast, SM, LangOpts));
+ Lexer->LexFromRawLexer(Tok);
+ if (Terminators.empty() || EntityLast.isMacroID() ||
+ contains(Terminators, Tok)) {
+ Terminated = true;
+ }
+ // Phase 1: scan forward for a terminator, giving up (returning `Last`) at eof or a separator.
+ while (!Terminated) {
+ // Lex the next token we want to possibly expand the range with.
+ Lexer->LexFromRawLexer(Tok);
+
+ switch (Tok.getKind()) {
+ case tok::eof:
+ // Unexpected separators.
+ case tok::l_brace:
+ case tok::r_brace:
+ case tok::comma:
+ return Last;
+ // Whitespace pseudo-tokens.
+ case tok::unknown:
+ if (startsWithNewline(SM, Tok))
+ // Include at least until the end of the line.
+ Last = Tok.getLocation();
+ break;
+ default:
+ if (contains(Terminators, Tok))
+ Terminated = true;
+ Last = Tok.getLocation();
+ break;
+ }
+ }
+ // Phase 2: absorb trailing comments and repeated terminators until a newline or an unrelated token.
+ do {
+ // Lex the next token we want to possibly expand the range with.
+ Lexer->LexFromRawLexer(Tok);
+
+ switch (Tok.getKind()) {
+ case tok::unknown:
+ if (startsWithNewline(SM, Tok))
+ // We're done, but include until this newline.
+ return Tok.getLocation();
+ break;
+ case tok::comment:
+ // Include any comments we find on the way.
+ Last = Tok.getLocation();
+ break;
+ // Special case including of extra semicolons or commas if any terminator
+ // is a semicolon or comma.
+ // Note that extra commas only happen when the end location is a macro
+ // location; we are safe to remove the comma, as removing the comma
+ // will not break anything that removing the entity wouldn't have
+ // already broken.
+ case tok::semi:
+ case tok::comma:
+ if (contains(Terminators, Tok)) {
+ Last = Tok.getLocation();
+ break;
+ }
+ // Found an unrelated token; stop and don't include it.
+ return Last;
+ default:
+ // Found an unrelated token; stop and don't include it.
+ return Last;
+ }
+ } while (true);
+}
+
+// Maps a declaration to the token kinds that may terminate it.
+//
+// Several kinds can be returned, because some declarations have more than one
+// possible terminator (a VarDecl, for instance, may be followed by either a
+// comma or a semicolon). An empty set means the terminator is not known.
+static std::set<tok::TokenKind> getTerminators(const Decl &D) {
+  std::set<tok::TokenKind> Kinds;
+  if (llvm::isa<RecordDecl>(D) || llvm::isa<UsingDecl>(D)) {
+    Kinds.insert(tok::semi);
+  } else if (llvm::isa<FunctionDecl>(D) || llvm::isa<LinkageSpecDecl>(D)) {
+    Kinds.insert(tok::r_brace);
+    Kinds.insert(tok::semi);
+  } else if (llvm::isa<VarDecl>(D) || llvm::isa<FieldDecl>(D)) {
+    Kinds.insert(tok::comma);
+    Kinds.insert(tok::semi);
+  }
+  return Kinds;
+}
+
+// Reports whether `End` is "separate" from any following syntactic entity:
+// either nothing follows it, or something meaningful (e.g. an empty line, a
+// comment) sits in between. This is a heuristic, so the answer is false when
+// in doubt. `End` should point at either a newline following an entity or a
+// non-whitespace token that ends an entity.
+static bool isSeparate(const SourceManager &SM, SourceLocation End,
+                       const LangOptions &LangOpts) {
+  const char *const Text = SM.getCharacterData(End);
+  Token Next;
+  if (!isVerticalWhitespace(Text[0])) {
+    // `End` is a regular token: step over it to reach the one after it.
+    llvm::Optional<Token> Found = Lexer::findNextToken(End, SM, LangOpts);
+    if (!Found)
+      return false;
+    Next = *Found;
+  } else {
+    // `End` is a newline. An empty line cannot be recognised from tokens (we
+    // would only get one whitespace token whose characters we would have to
+    // inspect anyway), so look at the raw characters: a second newline inside
+    // the same run of whitespace means an empty line follows.
+    for (const char *P = Text + 1; isWhitespace(*P); ++P) {
+      if (isVerticalWhitespace(*P))
+        return true;
+    }
+    // No empty line; lex the next token, skipping the newline at `End`.
+    if (Lexer::getRawToken(End, Next, SM, LangOpts, /*IgnoreWhiteSpace=*/true))
+      return false;
+  }
+
+  // Only a comment, a closing brace or the end of the file counts as a
+  // separator.
+  const tok::TokenKind Kind = Next.getKind();
+  return Kind == tok::comment || Kind == tok::r_brace || Kind == tok::eof;
+}
+
+// Computes the range of source text associated with `Decl`. Starting from the
+// decl's own source range, the begin is moved back over a `template<...>`
+// header, an associated preceding comment and leading attributes; the end is
+// moved forward by getEntityEndLoc over terminators, trailing comments and the
+// end of the line.
+//
+// The result is a token range whose begin has been mapped to an expansion
+// location.
+CharSourceRange tooling::getAssociatedRange(const Decl &Decl,
+                                            ASTContext &Context) {
+  const SourceManager &SM = Context.getSourceManager();
+  const LangOptions &LangOpts = Context.getLangOpts();
+  // Be sure to maintain this range as a token range when modifying its end
+  // location.
+  CharSourceRange Range = CharSourceRange::getTokenRange(Decl.getSourceRange());
+
+  // First, expand to the start of the template<> declaration if necessary.
+  if (const auto *Record = llvm::dyn_cast<CXXRecordDecl>(&Decl)) {
+    if (const auto *T = Record->getDescribedClassTemplate())
+      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
+        Range.setBegin(T->getBeginLoc());
+  } else if (const auto *F = llvm::dyn_cast<FunctionDecl>(&Decl)) {
+    if (const auto *T = F->getDescribedFunctionTemplate())
+      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
+        Range.setBegin(T->getBeginLoc());
+  }
+
+  // Next, expand the end location past trailing comments to include a potential
+  // newline at the end of the decl's line.
+  Range.setEnd(getEntityEndLoc(SM, Decl.getEndLoc(), getTerminators(Decl),
+                               LangOpts));
+
+  // Finally, expand to include preceding associated comments. We ignore any
+  // comments that are not preceding the decl, since we've already skipped
+  // trailing comments with getEntityEndLoc. The lookup goes through `Context`,
+  // the same context that supplies the source manager and language options.
+  if (const RawComment *Comment = Context.getRawCommentForDeclNoCache(&Decl)) {
+    // Only include a preceding comment if:
+    // * it is separate from any following entity (so, there are no other
+    //   entities it could refer to), and
+    // * it is not an IfThisThenThat lint check.
+    if (SM.isBeforeInTranslationUnit(Comment->getBeginLoc(),
+                                     Range.getBegin()) &&
+        isSeparate(SM, Range.getEnd(), LangOpts)) {
+      const StringRef CommentText = Comment->getRawText(SM);
+      if (!CommentText.contains("LINT.IfChange") &&
+          !CommentText.contains("LINT.ThenChange"))
+        Range.setBegin(Comment->getBeginLoc());
+    }
+  }
+
+  // Add leading attributes.
+  for (auto *Attr : Decl.attrs()) {
+    if (Attr->getLocation().isInvalid() ||
+        !SM.isBeforeInTranslationUnit(Attr->getLocation(), Range.getBegin()))
+      continue;
+    Range.setBegin(Attr->getLocation());
+
+    // Extend to the left '[[' or '__attribute__((' if we saw the attribute,
+    // unless it is not a valid location.
+    // Initialized so the check below never reads an indeterminate value.
+    bool Invalid = false;
+    StringRef Source =
+        SM.getBufferData(SM.getFileID(Range.getBegin()), &Invalid);
+    if (Invalid)
+      continue;
+    llvm::StringRef BeforeAttr =
+        Source.substr(0, SM.getFileOffset(Range.getBegin()));
+    llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim();
+
+    for (llvm::StringRef Prefix : {"[[", "__attribute__(("}) {
+      // Handle whitespace between attribute prefix and attribute value.
+      if (BeforeAttrStripped.endswith(Prefix)) {
+        // Move start to start position of prefix, which is
+        // length(BeforeAttr) - length(BeforeAttrStripped) + length(Prefix)
+        // positions to the left. The distance is computed as a non-negative
+        // value first and negated only after the conversion to int.
+        const auto Distance =
+            BeforeAttr.size() - BeforeAttrStripped.size() + Prefix.size();
+        Range.setBegin(
+            Range.getBegin().getLocWithOffset(-static_cast<int>(Distance)));
+        break;
+      }
+      // If we didn't see '[[' or '__attribute__((' it's probably coming from a
+      // macro expansion which is already handled by getExpansionRange(),
+      // below.
+    }
+  }
+
+  // Range.getEnd() is already fully un-expanded by getEntityEndLoc. But,
+  // Range.getBegin() may be inside an expansion.
+  Range.setBegin(SM.getExpansionRange(Range.getBegin()).getBegin());
+  return Range;
+}
Index: clang/include/clang/Tooling/Transformer/SourceCode.h
===================================================================
--- clang/include/clang/Tooling/Transformer/SourceCode.h
+++ clang/include/clang/Tooling/Transformer/SourceCode.h
@@ -20,9 +20,10 @@
namespace clang {
namespace tooling {
-/// Extends \p Range to include the token \p Next, if it immediately follows the
-/// end of the range. Otherwise, returns \p Range unchanged.
-CharSourceRange maybeExtendRange(CharSourceRange Range, tok::TokenKind Next,
+/// Extends \p Range to include the token \p Terminator, if it immediately
+/// follows the end of the range. Otherwise, returns \p Range unchanged.
+CharSourceRange maybeExtendRange(CharSourceRange Range,
+ tok::TokenKind Terminator,
ASTContext &Context);
/// Returns the source range spanning the node, extended to include \p Next, if
@@ -35,6 +36,11 @@
Next, Context);
}
+/// Returns the logical source range of the node, extended to include associated
+/// comments and whitespace before and after the node, and associated
+/// terminators.
+CharSourceRange getAssociatedRange(const Decl &D, ASTContext &Context);
+
/// Returns the source-code text in the specified range.
StringRef getText(CharSourceRange Range, const ASTContext &Context);
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits