[PATCH] D72153: [libTooling] Add function to determine associated text of a declaration.

Yitzhak Mandelbaum via Phabricator via cfe-commits Fri, 03 Jan 2020 07:07:39 -0800

ymandel created this revision.
ymandel added a reviewer: gribozavr.
Herald added a project: clang.


This patch adds `getAssociatedRange` which, for a given decl, computes preceding
and trailing text that would conceptually be associated with the decl by the
reader. This includes comments, whitespace, and separators like ';'.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D72153

Files:
  clang/include/clang/Tooling/Transformer/SourceCode.h
  clang/lib/Tooling/Transformer/SourceCode.cpp
  clang/unittests/Tooling/SourceCodeTest.cpp

Index: clang/unittests/Tooling/SourceCodeTest.cpp
===================================================================
--- clang/unittests/Tooling/SourceCodeTest.cpp
+++ clang/unittests/Tooling/SourceCodeTest.cpp
@@ -17,6 +17,8 @@
 using namespace clang;
 
 using llvm::ValueIs;
+using testing::Eq;
+using tooling::getAssociatedRange;
 using tooling::getExtendedText;
 using tooling::getRangeForEdit;
 using tooling::getText;
@@ -41,12 +43,43 @@
   std::function<void(CallExpr *, ASTContext *Context)> OnCall;
 };
 
+struct DeclaratorDeclsVisitor : TestVisitor<DeclaratorDeclsVisitor> {
+  bool VisitDeclaratorDecl(DeclaratorDecl *Decl) {
+    OnDecl(Decl, Context);
+    return true;
+  }
+  // Callback invoked with each visited DeclaratorDecl and the AST `Context`.
+  std::function<void(DeclaratorDecl *, ASTContext *Context)> OnDecl;
+};
+
 // Equality matcher for `clang::CharSourceRange`, which lacks `operator==`.
 MATCHER_P(EqualsRange, R, "") {
   return arg.isTokenRange() == R.isTokenRange() &&
          arg.getBegin() == R.getBegin() && arg.getEnd() == R.getEnd();
 }
 
+MATCHER_P2(EqualsAnnotatedRange, SM, R, "") {
+  if (arg.getBegin().isMacroID()) {
+    *result_listener << "which starts in a macro";
+    return false;
+  }
+  if (arg.getEnd().isMacroID()) {
+    *result_listener << "which ends in a macro";
+    return false;
+  }
+  // Compare as file offsets, matching the offset-based `R.Begin`/`R.End`.
+  unsigned Begin = SM->getFileOffset(arg.getBegin());
+  unsigned End = SM->getFileOffset(arg.getEnd());
+  // NOTE(review): `End + 1` assumes the last token is one character wide.
+  *result_listener << "which is [" << Begin << ",";
+  if (arg.isTokenRange()) {
+    *result_listener << End << "]";
+    return Begin == R.Begin && End + 1 == R.End;
+  }
+  *result_listener << End << ")";
+  return Begin == R.Begin && End == R.End;
+}
+
 static ::testing::Matcher<CharSourceRange> AsRange(const SourceManager &SM,
                                                    llvm::Annotations::Range R) {
   return EqualsRange(CharSourceRange::getCharRange(
@@ -122,6 +155,95 @@
   Visitor.runOver("int foo() { return foo() + 3; }");
 }
 
+TEST(SourceCodeTest, getAssociatedRange) {
+  struct DeclaratorDeclsVisitor : TestVisitor<DeclaratorDeclsVisitor> {
+    llvm::Annotations Code;
+
+    DeclaratorDeclsVisitor() : Code("$r[[]]") {}
+    bool VisitDeclaratorDecl(DeclaratorDecl *Decl) {
+      EXPECT_THAT(
+          getAssociatedRange(*Decl, *Context),
+          EqualsAnnotatedRange(&Context->getSourceManager(), Code.range("r")))
+          << Code.code();
+      return true;
+    }
+    bool runOverWithComments(StringRef Code) {
+      std::vector<std::string> Args = {"-std=c++11", "-fparse-all-comments"};
+      return tooling::runToolOnCodeWithArgs(CreateTestAction(), Code, Args);
+    }
+  };
+
+  DeclaratorDeclsVisitor Visitor;
+
+  // Includes semicolon.
+  Visitor.Code = llvm::Annotations("$r[[int x = 4;]]");
+  Visitor.runOver(Visitor.Code.code());
+
+  // Includes newline and semicolon.
+  Visitor.Code = llvm::Annotations("$r[[int x = 4;\n]]");
+  Visitor.runOver(Visitor.Code.code());
+
+  // Includes trailing comments.
+  Visitor.Code = llvm::Annotations("$r[[int x = 4; // Comment\n]]");
+  Visitor.runOver(Visitor.Code.code());
+  Visitor.Code = llvm::Annotations("$r[[int x = 4; /* Comment */\n]]");
+  Visitor.runOver(Visitor.Code.code());
+
+  // Does *not* include trailing comments when another entity appears between
+  // the decl and the comment.
+  Visitor.Code = llvm::Annotations("$r[[int x = 4;]] class C {}; // Comment\n");
+  Visitor.runOver(Visitor.Code.code());
+
+  // Includes leading comments.
+  Visitor.Code = llvm::Annotations("$r[[// Comment.\nint x = 4;]]");
+  Visitor.runOverWithComments(Visitor.Code.code());
+  Visitor.Code = llvm::Annotations("$r[[// Comment.\nint x = 4;\n]]");
+  Visitor.runOverWithComments(Visitor.Code.code());
+  Visitor.Code = llvm::Annotations("$r[[/* Comment.*/\nint x = 4;\n]]");
+  Visitor.runOverWithComments(Visitor.Code.code());
+  // ... even when separated by multiple empty lines.
+  Visitor.Code = llvm::Annotations("$r[[// Comment.\n\n\nint x = 4;\n]]");
+  Visitor.runOverWithComments(Visitor.Code.code());
+
+  // Includes multi-line comments.
+  Visitor.Code = llvm::Annotations(R"cpp(
+      $r[[/* multi
+       * line
+       * comment
+       */
+      int x;]])cpp");
+  Visitor.runOverWithComments(Visitor.Code.code());
+  Visitor.Code = llvm::Annotations(R"cpp(
+      $r[[// multi
+      // line
+      // comment
+      int x;]])cpp");
+  Visitor.runOverWithComments(Visitor.Code.code());
+
+  // Does not include comments before a *series* of declarations.
+  Visitor.Code = llvm::Annotations("// Comment.\n$r[[int x = 4;\n]]class foo {};\n");
+  Visitor.runOverWithComments(Visitor.Code.code());
+
+  // Includes attributes.
+  Visitor.Code = llvm::Annotations(R"cpp(
+      #define ATTR __attribute__((deprecated("message")))
+      $r[[ATTR
+      int x;]])cpp");
+  Visitor.runOverWithComments(Visitor.Code.code());
+
+  // Includes attributes and comments together.
+  Visitor.Code = llvm::Annotations(R"cpp(
+      #define ATTR __attribute__((deprecated("message")))
+      $r[[ATTR
+      // Commment.
+      int x;]])cpp");
+  Visitor.runOverWithComments(Visitor.Code.code());
+
+  // Includes comments even in the presence of trailing whitespace.
+  Visitor.Code = llvm::Annotations("$r[[// Comment.\nint x = 4;]]  ");
+  Visitor.runOverWithComments(Visitor.Code.code());
+}
+
 TEST(SourceCodeTest, EditRangeWithMacroExpansionsShouldSucceed) {
   // The call expression, whose range we are extracting, includes two macro
   // expansions.
Index: clang/lib/Tooling/Transformer/SourceCode.cpp
===================================================================
--- clang/lib/Tooling/Transformer/SourceCode.cpp
+++ clang/lib/Tooling/Transformer/SourceCode.cpp
@@ -10,6 +10,13 @@
 //
 //===----------------------------------------------------------------------===//
 #include "clang/Tooling/Transformer/SourceCode.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/Attr.h"
+#include "clang/AST/Comment.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/DeclCXX.h"
+#include "clang/AST/DeclTemplate.h"
+#include "clang/AST/Expr.h"
 #include "clang/Lex/Lexer.h"
 
 using namespace clang;
@@ -63,3 +70,282 @@
 
   return Range;
 }
+
+static std::unique_ptr<Lexer> initLexer(const SourceManager &SM,
+                                        SourceLocation Loc,
+                                        const LangOptions &LangOpts) {
+  const auto Split = SM.getDecomposedLoc(Loc);
+  bool Invalid = false;
+  const llvm::StringRef Text = SM.getBufferData(Split.first, &Invalid);
+  assert(!Invalid && "Cannot get file/offset");
+  const char *const Start = Text.data() + Split.second;
+  return std::make_unique<Lexer>(SM.getLocForStartOfFile(Split.first), LangOpts,
+                                 Text.begin(), Start, Text.end());
+}
+
+static bool startsWithNewline(const SourceManager &SM, const Token &Tok) {
+  return isVerticalWhitespace(*SM.getCharacterData(Tok.getLocation()));
+}
+
+static bool contains(const std::set<tok::TokenKind> &Terminators,
+                     const Token &Tok) {
+  return Terminators.find(Tok.getKind()) != Terminators.end();
+}
+
+// Returns the location of the last token that is associated with an entity
+// whose last token starts at 'EntityLast'. The returned location is an
+// expansion location.
+//
+// Associated tokens include comments, horizontal whitespace and 'Terminators'
+// -- optional tokens, which, if any are found, will be included; if
+// 'Terminators' is empty, we will not include any extra tokens beyond comments
+// and horizontal whitespace.
+static SourceLocation getEntityEndLoc(
+    const SourceManager& SM, SourceLocation EntityLast,
+    const std::set<tok::TokenKind>& Terminators,
+    const LangOptions& LangOpts) {
+  assert (EntityLast.isValid() && "Invalid end location found.");
+
+  // We remember the last location of a non-horizontal-whitespace token we have
+  // lexed; this is the location up to which we will want to delete.
+  // FIXME: Support using the spelling loc here for cases where we want to
+  // analyze the macro text.
+  SourceLocation Last = SM.getExpansionRange(EntityLast).getEnd();
+  std::unique_ptr<Lexer> Lexer = initLexer(SM, Last, LangOpts);
+  // Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown).
+  Lexer->SetKeepWhitespaceMode(true);
+
+  // Generally, the code we want to include looks like this ([] are optional),
+  // If Terminators is not empty:
+  // ... <terminator> [ <comment> ] [ <newline> ]
+  // Otherwise:
+  //   [ <comment> ] [ <newline> ]
+
+  Token Tok;
+  bool Terminated = false;
+
+  // First, lex to the current token (which is the last token of the range that
+  // we know to be deleted). Then, we process the first token separately from
+  // the rest based on conditions that hold specifically for that first token.
+  //
+  // We do not search for a terminator if none is required or we've already
+  // encountered it. Also, if the original `EntityLast` location was in a macro
+  // expansion, we don't have visibility into the text, so we assume we've
+  // already terminated.
+  //
+  // FIXME: This handling of macros is too conservative. When the end of the
+  // expansion coincides with the end of the node, we can still safely
+  // analyze. But, it is more complicated, because we need to start by lexing
+  // the spelling loc for the first token and then switch to the expansion loc.
+  //
+  // (EntityLast.isMacroID() &&
+  //   !Lexer::isAtEndOfMacroExpansion(EntityLast, SM, LangOpts));
+  Lexer->LexFromRawLexer(Tok);
+  if (Terminators.empty() || EntityLast.isMacroID() ||
+      contains(Terminators, Tok)) {
+    Terminated = true;
+  }
+
+  while (!Terminated) {
+    // Lex the next token we want to possibly expand the range with.
+    Lexer->LexFromRawLexer(Tok);
+
+    switch (Tok.getKind()) {
+    case tok::eof:
+    // Unexpected separators.
+    case tok::l_brace:
+    case tok::r_brace:
+    case tok::comma:
+      return Last;
+    // Whitespace pseudo-tokens.
+    case tok::unknown:
+      if (startsWithNewline(SM, Tok))
+        // Include at least until the end of the line.
+        Last = Tok.getLocation();
+      break;
+    default:
+      if (contains(Terminators, Tok))
+        Terminated = true;
+      Last = Tok.getLocation();
+      break;
+    }
+  }
+
+  do {
+    // Lex the next token we want to possibly expand the range with.
+    Lexer->LexFromRawLexer(Tok);
+
+    switch (Tok.getKind()) {
+    case tok::unknown:
+      if (startsWithNewline(SM, Tok))
+        // We're done, but include until this newline.
+        return Tok.getLocation();
+      break;
+    case tok::comment:
+      // Include any comments we find on the way.
+      Last = Tok.getLocation();
+      break;
+    // Special case including of extra semicolons or commas if any terminator
+    // is a semicolon or comma.
+    // Note that extra commas only happen when the end location is a macro
+    // location; we are safe to remove the comma, as removing the comma
+    // will not break anything that removing the entity wouldn't have
+    // already broken.
+    case tok::semi:
+    case tok::comma:
+      if (contains(Terminators, Tok)) {
+        Last = Tok.getLocation();
+        break;
+      }
+      // Found an unrelated token; stop and don't include it.
+      return Last;
+    default:
+      // Found an unrelated token; stop and don't include it.
+      return Last;
+    }
+  } while (true);
+}
+
+// Determines which token kinds may terminate the declaration `D`.
+//
+// An empty set means the correct terminator token is not known.
+//
+// Some declarations admit more than one terminator; a VarDecl, for instance,
+// may be followed by either a comma or a semicolon.
+static std::set<tok::TokenKind> getTerminators(const Decl &D) {
+  if (llvm::isa<VarDecl>(D) || llvm::isa<FieldDecl>(D))
+    return {tok::comma, tok::semi};
+
+  if (llvm::isa<FunctionDecl>(D) || llvm::isa<LinkageSpecDecl>(D))
+    return {tok::r_brace, tok::semi};
+
+  if (llvm::isa<RecordDecl>(D) || llvm::isa<UsingDecl>(D))
+    return {tok::semi};
+
+  return {};
+}
+
+// Is `End` "separate" from any following syntactic entity? That is, either
+// there is no following entity, or it is separated by something meaningful
+// (e.g. an empty line, a comment). Since this is a heuristic, we return false
+// when in doubt. `End` should point at either a newline following an entity or
+// a non-whitespace token that ends an entity.
+static bool isSeparate(const SourceManager &SM, SourceLocation End,
+                       const LangOptions &LangOpts) {
+  // If the first character is a newline, we'll check for an empty line as a
+  // separator. However, we can't identify an empty line using tokens, so we
+  // analyse the characters. If we try to use tokens, we'll just end up with a
+  // whitespace token, whose characters we'd have to analyse anyhow.
+  Token Tok;
+  const char* LocChars = SM.getCharacterData(End);
+  if (isVerticalWhitespace(LocChars[0])) {
+    for (int i = 1; isWhitespace(LocChars[i]); ++i)
+      if (isVerticalWhitespace(LocChars[i]))
+        return true;
+    // We didn't find an empty line, so lex the next token (skipping the newline
+    // at `End`).
+    bool Failed =
+        Lexer::getRawToken(End, Tok, SM, LangOpts, /*IgnoreWhiteSpace=*/true);
+    if (Failed)
+      return false;
+  } else {
+    // Skip current (non-newline) token.
+    llvm::Optional<Token> MaybeTok = Lexer::findNextToken(End, SM, LangOpts);
+    if (!MaybeTok)
+      return false;
+    Tok = *MaybeTok;
+  }
+
+  switch (Tok.getKind()) {
+  case tok::comment:
+  case tok::r_brace:
+  case tok::eof:
+    return true;
+  default:
+    return false;
+  }
+}
+
+CharSourceRange tooling::getAssociatedRange(const Decl &Decl,
+                                            ASTContext &Context) {
+  const SourceManager &SM = Context.getSourceManager();
+  const LangOptions &LangOpts = Context.getLangOpts();
+  // Be sure to maintain this range as a token range when modifying its end
+  // location.
+  CharSourceRange Range = CharSourceRange::getTokenRange(Decl.getSourceRange());
+
+  // First, expand to the start of the template<> declaration if necessary.
+  if (const auto* Record = llvm::dyn_cast<CXXRecordDecl>(&Decl)) {
+    if (const auto* T = Record->getDescribedClassTemplate())
+      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
+        Range.setBegin(T->getBeginLoc());
+  } else if (const auto *F = llvm::dyn_cast<FunctionDecl>(&Decl)) {
+    if (const auto *T = F->getDescribedFunctionTemplate())
+      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
+        Range.setBegin(T->getBeginLoc());
+  }
+
+  // Next, expand the end location past trailing comments to include a potential
+  // newline at the end of the decl's line.
+  Range.setEnd(getEntityEndLoc(SM, Decl.getEndLoc(), getTerminators(Decl),
+                              LangOpts));
+
+  // Finally, expand to include preceding associated comments. We ignore any
+  // comments that are not preceding the decl, since we've already skipped
+  // trailing comments with getEntityEndLoc.
+  if (const RawComment *Comment =
+          Context.getRawCommentForDeclNoCache(&Decl))
+    // Only include a preceding comment if:
+    // * it is separate from any following entity (so, there are no other
+    //   entities it could refer to), and
+    // * it is not an IfThisThenThat lint check.
+    if (SM.isBeforeInTranslationUnit(Comment->getBeginLoc(),
+                                     Range.getBegin()) &&
+        isSeparate(SM, Range.getEnd(), LangOpts)) {
+      const StringRef CommentText = Comment->getRawText(SM);
+      if (!CommentText.contains("LINT.IfChange") &&
+          !CommentText.contains("LINT.ThenChange"))
+        Range.setBegin(Comment->getBeginLoc());
+    }
+
+  // Add leading attributes.
+  for (auto* Attr : Decl.attrs()) {
+    if (Attr->getLocation().isInvalid() ||
+        !SM.isBeforeInTranslationUnit(Attr->getLocation(),
+                                           Range.getBegin()))
+      continue;
+    Range.setBegin(Attr->getLocation());
+
+    // Extend to the left '[[' or '__attribute__((' if we saw the attribute,
+    // unless it is not a valid location.
+    bool Invalid = false;
+    StringRef Source =
+        SM.getBufferData(SM.getFileID(Range.getBegin()), &Invalid);
+    if (Invalid)
+      continue;
+    llvm::StringRef BeforeAttr =
+        Source.substr(0, SM.getFileOffset(Range.getBegin()));
+    llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim();
+
+    for (llvm::StringRef Prefix : {"[[", "__attribute__(("}) {
+      // Handle whitespace between attribute prefix and attribute value.
+      if (BeforeAttrStripped.endswith(Prefix)) {
+        // Move start to start position of prefix, which is
+        // length(BeforeAttr) - length(BeforeAttrStripped) + length(Prefix)
+        // positions to the left.
+        Range.setBegin(Range.getBegin().getLocWithOffset(
+            -static_cast<int>(BeforeAttr.size() - BeforeAttrStripped.size() +
+                              Prefix.size())));
+        break;
+        // If we didn't see '[[' or '__attribute__((' it's probably coming from
+        // a macro expansion which is already handled by getExpansionRange(),
+        // below.
+      }
+    }
+  }
+
+  // Range.getEnd() is already fully un-expanded by getEntityEndLoc. But,
+  // Range.getBegin() may be inside an expansion.
+  Range.setBegin(SM.getExpansionRange(Range.getBegin()).getBegin());
+  return Range;
+}
Index: clang/include/clang/Tooling/Transformer/SourceCode.h
===================================================================
--- clang/include/clang/Tooling/Transformer/SourceCode.h
+++ clang/include/clang/Tooling/Transformer/SourceCode.h
@@ -20,9 +20,10 @@
 namespace clang {
 namespace tooling {
 
-/// Extends \p Range to include the token \p Next, if it immediately follows the
-/// end of the range. Otherwise, returns \p Range unchanged.
-CharSourceRange maybeExtendRange(CharSourceRange Range, tok::TokenKind Next,
+/// Extends \p Range to include the token \p Terminator, if it immediately
+/// follows the end of the range. Otherwise, returns \p Range unchanged.
+CharSourceRange maybeExtendRange(CharSourceRange Range,
+                                 tok::TokenKind Terminator,
                                  ASTContext &Context);
 
 /// Returns the source range spanning the node, extended to include \p Next, if
@@ -35,6 +36,11 @@
                           Next, Context);
 }
 
+/// Returns the logical source range of the declaration \p D, extended to
+/// include associated comments and whitespace before and after it, and
+/// associated terminators (such as ';').
+CharSourceRange getAssociatedRange(const Decl &D, ASTContext &Context);
+
 /// Returns the source-code text in the specified range.
 StringRef getText(CharSourceRange Range, const ASTContext &Context);

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D72153: [libTooling] Add function to determine associated text of a declaration.

Reply via email to