johannes updated this revision to Diff 103163.
johannes added a comment.

- Add the option to not use compilation databases.
- Add a basic test.


https://reviews.llvm.org/D34329

Files:
  include/clang/Tooling/ASTDiff/ASTDiff.h
  lib/Tooling/ASTDiff/ASTDiff.cpp
  lib/Tooling/ASTDiff/CMakeLists.txt
  lib/Tooling/CMakeLists.txt
  test/Tooling/clang-diff-basic.cpp
  tools/CMakeLists.txt
  tools/clang-diff/CMakeLists.txt
  tools/clang-diff/ClangDiff.cpp

Index: tools/clang-diff/ClangDiff.cpp
===================================================================
--- /dev/null
+++ tools/clang-diff/ClangDiff.cpp
@@ -0,0 +1,101 @@
+//===- ClangDiff.cpp - compare source files by AST nodes ------*- C++ -*- -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a tool for syntax tree based comparison using
+// Tooling/ASTDiff.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/ASTDiff/ASTDiff.h"
+#include "clang/Tooling/CommonOptionsParser.h"
+#include "clang/Tooling/Tooling.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace tooling;
+
+static cl::OptionCategory ClangDiffCategory("clang-diff options");
+
+static cl::opt<bool>
+    DumpAST("ast-dump",
+            cl::desc("Print the internal representation of the AST as JSON."),
+            cl::init(false), cl::cat(ClangDiffCategory));
+
+static cl::opt<bool> NoCompilationDatabase(
+    "no-compilation-database",
+    cl::desc(
+        "Do not attempt to load build settigns from a compilation database"),
+    cl::init(false), cl::cat(ClangDiffCategory));
+
+static cl::opt<std::string> SourcePath(cl::Positional, cl::desc("<source>"),
+                                       cl::Required,
+                                       cl::cat(ClangDiffCategory));
+
+static cl::opt<std::string> DestinationPath(cl::Positional,
+                                            cl::desc("<destination>"),
+                                            cl::Optional,
+                                            cl::cat(ClangDiffCategory));
+
+static std::unique_ptr<ASTUnit> getAST(const StringRef Filename) {
+  std::string ErrorMessage;
+  std::unique_ptr<CompilationDatabase> Compilations;
+  if (!NoCompilationDatabase)
+    Compilations =
+        CompilationDatabase::autoDetectFromSource(Filename, ErrorMessage);
+  if (!Compilations) {
+    if (!NoCompilationDatabase)
+      llvm::errs()
+          << "Error while trying to load a compilation database, running "
+             "without flags.\n"
+          << ErrorMessage;
+    Compilations.reset(
+        new FixedCompilationDatabase(".", std::vector<std::string>()));
+  }
+  std::array<std::string, 1> Files = {{Filename}};
+  ClangTool Tool(*Compilations, Files);
+  std::vector<std::unique_ptr<ASTUnit>> ASTs;
+  Tool.buildASTs(ASTs);
+  if (ASTs.size() != Files.size())
+    return nullptr;
+  return std::move(ASTs[0]);
+}
+
+int main(int argc, const char **argv) {
+  cl::HideUnrelatedOptions(ClangDiffCategory);
+  if (!cl::ParseCommandLineOptions(argc, argv)) {
+    cl::PrintOptionValues();
+    return 1;
+  }
+
+  if (DumpAST) {
+    if (!DestinationPath.empty()) {
+      llvm::errs() << "Error: Please specify exactly one filename.\n";
+      return 1;
+    }
+    std::unique_ptr<ASTUnit> AST = getAST(SourcePath);
+    if (!AST)
+      return 1;
+    clang::diff::TreeRoot Tree(AST->getASTContext());
+    Tree.printAsJson();
+    return 0;
+  }
+
+  if (DestinationPath.empty()) {
+    llvm::errs() << "Error: Exactly two paths are required.\n";
+    return 1;
+  }
+
+  std::unique_ptr<ASTUnit> Src = getAST(SourcePath);
+  std::unique_ptr<ASTUnit> Dst = getAST(DestinationPath);
+  if (!Src || !Dst)
+    return 1;
+
+  clang::diff::runDiff(Src->getASTContext(), Dst->getASTContext());
+
+  return 0;
+}
Index: tools/clang-diff/CMakeLists.txt
===================================================================
--- /dev/null
+++ tools/clang-diff/CMakeLists.txt
@@ -0,0 +1,16 @@
+set(LLVM_LINK_COMPONENTS
+  Support
+  )
+
+add_clang_executable(clang-diff
+  ClangDiff.cpp
+  )
+
+target_link_libraries(clang-diff
+  clangAST
+  clangBasic
+  clangFrontend
+  clangLex
+  clangTooling
+  clangToolingASTDiff
+  )
Index: tools/CMakeLists.txt
===================================================================
--- tools/CMakeLists.txt
+++ tools/CMakeLists.txt
@@ -2,6 +2,7 @@
 
 add_clang_subdirectory(diagtool)
 add_clang_subdirectory(driver)
+add_clang_subdirectory(clang-diff)
 add_clang_subdirectory(clang-format)
 add_clang_subdirectory(clang-format-vs)
 add_clang_subdirectory(clang-fuzzer)
Index: test/Tooling/clang-diff-basic.cpp
===================================================================
--- /dev/null
+++ test/Tooling/clang-diff-basic.cpp
@@ -0,0 +1,65 @@
+// RUN: awk '/^.. src/{f=1;next}/^.. end src/{f=0}f' %s > %T/src.cpp
+// RUN: awk '/^.. dst/{f=1;next}/^.. end dst/{f=0}f' %s > %T/dst.cpp
+// RUN: clang-diff -no-compilation-database %T/src.cpp %T/dst.cpp
+// RUN: clang-diff -no-compilation-database %T/src.cpp %T/dst.cpp | FileCheck %s
+
+// src
+
+namespace src {
+  const char str[] = "the string";
+  ;
+  ;
+  ;
+  ;
+}
+
+int on = 1 * 2 * 3 * 4;
+int b = on * 2;
+
+class X {
+  const char *foo(int i) {
+    if (i == 0)
+      return "Foo!";
+    return 0;
+  }
+
+public:
+  X(){};
+
+  int id(int i) { return i; }
+};
+// end src
+
+// dst:
+// CHECK: Match NamespaceDecl: src{{.*}} to NamespaceDecl: src
+namespace src {
+  ;
+  ;
+  ;
+  ;
+}
+// CHECK-NOT: Match NamespaceDecl: src{{.*}} to NamespaceDecl: dst
+namespace dst {
+}
+
+// CHECK: Match VarDecl: on(int){{.*}} to VarDecl: one(double)
+// CHECK: Update VarDecl: on(int){{.*}} to one(double)
+double one = 1 * 2 * 55;
+// CHECK: Update DeclRefExpr
+int b = one * 2;
+
+class X {
+  const char *foo(int i) {
+    if (i == 0)
+      return "Bar";
+    // CHECK: Insert IfStmt{{.*}} into IfStmt
+    // CHECK: Insert BinaryOperator: =={{.*}} into IfStmt
+    else if (i == -1)
+      return "Foo!";
+    return 0;
+  }
+  // CHECK: Delete AccessSpecDecl: public
+  X(){};
+  // CHECK: Delete CXXMethodDecl
+};
+// end dst
Index: lib/Tooling/CMakeLists.txt
===================================================================
--- lib/Tooling/CMakeLists.txt
+++ lib/Tooling/CMakeLists.txt
@@ -5,6 +5,7 @@
 
 add_subdirectory(Core)
 add_subdirectory(Refactoring)
+add_subdirectory(ASTDiff)
 
 add_clang_library(clangTooling
   ArgumentsAdjusters.cpp
Index: lib/Tooling/ASTDiff/CMakeLists.txt
===================================================================
--- /dev/null
+++ lib/Tooling/ASTDiff/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(LLVM_LINK_COMPONENTS
+  # Option
+  Support
+  )
+
+add_clang_library(clangToolingASTDiff
+  ASTDiff.cpp
+  LINK_LIBS
+  clangBasic
+  clangAST
+  # clangToolingCore
+  )
Index: lib/Tooling/ASTDiff/ASTDiff.cpp
===================================================================
--- /dev/null
+++ lib/Tooling/ASTDiff/ASTDiff.cpp
@@ -0,0 +1,814 @@
+//===- ASTDiff.cpp - AST differencing implementation-----------*- C++ -*- -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains definitons for the AST differencing interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/ASTDiff/ASTDiff.h"
+
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/Support/FormatVariadic.h"
+
+#include <limits>
+#include <memory>
+#include <unordered_set>
+
+using namespace llvm;
+using namespace clang;
+
+namespace clang {
+namespace diff {
+
+namespace {
+// Count the number of non-null nodes
+struct NodeCountVisitor : public RecursiveASTVisitor<NodeCountVisitor> {
+  int Count = 0;
+  const TreeRoot &Root;
+  NodeCountVisitor(const TreeRoot &Root) : Root(Root) {}
+  bool TraverseDecl(Decl *D) {
+    if (Root.discardNode(D))
+      return true;
+    ++Count;
+    RecursiveASTVisitor<NodeCountVisitor>::TraverseDecl(D);
+    return true;
+  }
+  bool TraverseStmt(Stmt *S) {
+    if (Root.discardNode(S))
+      return true;
+    ++Count;
+    RecursiveASTVisitor<NodeCountVisitor>::TraverseStmt(S);
+    return true;
+  }
+  bool TraverseType(QualType T) { return true; }
+};
+} // end anonymous namespace
+
+namespace {
+// Sets Height, Parent and Children for each node.
+struct PreorderVisitor : public RecursiveASTVisitor<PreorderVisitor> {
+  int Id = 0, Depth = 0;
+  NodeId Parent = NoNodeId;
+  TreeRoot &Root;
+
+  PreorderVisitor(TreeRoot &Root) : Root(Root) {}
+
+  template <class T> std::tuple<NodeId, NodeId> PreTraverse(T *ASTNode) {
+    NodeId MyId = Id;
+    Node &N = Root.getMutableNode(MyId);
+    N.Parent = Parent;
+    N.Depth = Depth;
+    N.ASTNode = ast_type_traits::DynTypedNode::create(*ASTNode);
+    assert(!N.ASTNode.getNodeKind().isNone() &&
+           "Expected nodes to have a valid kind.");
+    if (Parent != NoNodeId) {
+      Node &P = Root.getMutableNode(Parent);
+      P.Children.push_back(MyId);
+    }
+    Parent = MyId;
+    ++Id;
+    ++Depth;
+    return {MyId, Root.getNode(MyId).Parent};
+  }
+  void PostTraverse(std::tuple<NodeId, NodeId> State) {
+    NodeId MyId, PreviousParent;
+    std::tie(MyId, PreviousParent) = State;
+    Parent = PreviousParent;
+    --Depth;
+    if (MyId == NoNodeId)
+      return;
+    Node &N = Root.getMutableNode(MyId);
+    if (N.isLeaf())
+      Root.Leaves.push_back(MyId);
+    N.Height = 1;
+    for (NodeId Child : N.Children)
+      N.Height = std::max(N.Height, 1 + Root.getNode(Child).Height);
+  }
+  bool TraverseDecl(Decl *D) {
+    if (Root.discardNode(D))
+      return true;
+    auto SavedState = PreTraverse(D);
+    RecursiveASTVisitor<PreorderVisitor>::TraverseDecl(D);
+    PostTraverse(SavedState);
+    return true;
+  }
+  bool TraverseStmt(Stmt *S) {
+    if (Root.discardNode(S))
+      return true;
+    auto SavedState = PreTraverse(S);
+    RecursiveASTVisitor<PreorderVisitor>::TraverseStmt(S);
+    PostTraverse(SavedState);
+    return true;
+  }
+  bool TraverseType(QualType T) { return true; }
+};
+} // end anonymous namespace
+
+TreeRoot::TreeRoot(ASTContext &AST) : AST(AST) {
+  auto *TUD = AST.getTranslationUnitDecl();
+  // Run the above visitors to initialize the tree.
+  NodeCountVisitor NodeCounter(*this);
+  NodeCounter.TraverseDecl(TUD);
+  setSize(NodeCounter.Count);
+  PreorderVisitor PreorderWalker(*this);
+  PreorderWalker.TraverseDecl(TUD);
+  setLeftMostDescendant();
+  int PostorderId = 0;
+  PostorderIds.resize(Nodes.size());
+  std::function<void(NodeId)> PostorderTraverse = [&](NodeId Id) {
+    for (NodeId Child : getNode(Id).Children)
+      PostorderTraverse(Child);
+    PostorderIds[Id] = PostorderId;
+    ++PostorderId;
+  };
+  PostorderTraverse(root());
+}
+
+template <class T> bool TreeRoot::discardNode(T *N) const {
+  if (!N)
+    return true;
+  SourceLocation SLoc = N->getLocStart();
+  const SourceManager &SrcMgr = AST.getSourceManager();
+  return SLoc.isValid() && SrcMgr.isInSystemHeader(SLoc);
+}
+
+int TreeRoot::getSubtreePostorder(std::vector<NodeId> &Ids, NodeId Root) const {
+  int Leaves = 0;
+  std::function<void(NodeId)> Traverse = [&](NodeId Id) {
+    const Node &N = getNode(Id);
+    for (NodeId Child : N.Children)
+      Traverse(Child);
+    if (N.isLeaf())
+      ++Leaves;
+    Ids.push_back(Id);
+  };
+  Traverse(Root);
+  return Leaves;
+}
+
+std::vector<NodeId> TreeRoot::getSubtreeBfs(NodeId Id) const {
+  std::vector<NodeId> Ids;
+  size_t Expanded = 0;
+  Ids.push_back(Id);
+  while (Expanded < Ids.size())
+    for (NodeId Child : getNode(Ids[Expanded++]).Children)
+      Ids.push_back(Child);
+  return Ids;
+}
+
+int TreeRoot::getNumberOfDescendants(NodeId Id) const {
+  return PostorderIds[Id] - getNode(Id).LeftMostDescendant;
+}
+
+std::string TreeRoot::getSourceString(SourceLocation SourceBegin,
+                                      SourceLocation SourceEnd) const {
+  const SourceManager &SrcMgr = AST.getSourceManager();
+  const char *Begin = SrcMgr.getCharacterData(SourceBegin);
+  const char *End = SrcMgr.getCharacterData(SourceEnd);
+  return {Begin, static_cast<size_t>(End - Begin)};
+}
+
+std::string TreeRoot::label(NodeId Id) const {
+  const Node &N = getNode(Id);
+  const ast_type_traits::DynTypedNode &DTN = N.ASTNode;
+  if (auto *X = DTN.get<BinaryOperator>())
+    return X->getOpcodeStr();
+  if (auto *X = DTN.get<AccessSpecDecl>())
+    return getSourceString(X->getAccessSpecifierLoc(), X->getColonLoc());
+  if (auto *X = DTN.get<IntegerLiteral>()) {
+    SmallString<256> Str;
+    X->getValue().toString(Str, /*Radix=*/10, /*Signed=*/false);
+    return Str.str();
+  }
+  if (auto *X = DTN.get<StringLiteral>())
+    return X->getString();
+  if (auto *X = DTN.get<ValueDecl>())
+    return X->getNameAsString() + "(" + X->getType().getAsString() + ")";
+  if (auto *X = DTN.get<DeclStmt>())
+    return "";
+  if (auto *X = DTN.get<TranslationUnitDecl>())
+    return "";
+  std::string Label;
+  if (auto *X = DTN.get<DeclRefExpr>()) {
+    if (X->hasQualifier() && X->getQualifier()->getAsIdentifier())
+      Label += std::string(X->getQualifier()->getAsIdentifier()->getName());
+    Label += X->getDecl()->getNameAsString();
+    return Label;
+  }
+  if (auto *X = DTN.get<NamedDecl>())
+    Label += X->getNameAsString() + ";";
+  if (auto *X = DTN.get<TypedefNameDecl>())
+    return Label + X->getUnderlyingType().getAsString() + ";";
+  if (auto *X = DTN.get<NamespaceDecl>())
+    return Label;
+  if (auto *X = DTN.get<TypeDecl>())
+    if (X->getTypeForDecl())
+      Label +=
+          X->getTypeForDecl()->getCanonicalTypeInternal().getAsString() + ";";
+  if (auto *X = DTN.get<Decl>())
+    return Label;
+  if (auto *X = DTN.get<Stmt>())
+    return Label;
+  if (auto *X = DTN.get<QualType>())
+    llvm_unreachable("Types are not included.\n");
+  llvm_unreachable("Fatal: unhandled AST node.\n");
+}
+
+void TreeRoot::printTree(raw_ostream &OS, NodeId Id) const {
+  if (Id == NoNodeId)
+    Id = root();
+  const Node &N = getNode(Id);
+  for (int I = 0; I < N.Depth; ++I)
+    OS << " ";
+  OS << showNode(Id) << "\n";
+  for (NodeId Child : N.Children)
+    printTree(OS, Child);
+}
+
+std::string TreeRoot::showNode(NodeId Id) const {
+  if (Id == NoNodeId)
+    return "None";
+  std::string Label;
+  if (label(Id) != "")
+    Label = formatv(": {0}", label(Id));
+  return formatv("{0}{1}({2})", getNode(Id).getTypeLabel(), Label,
+                 PostorderIds[Id]);
+}
+
+void TreeRoot::printNodeAsJson(raw_ostream &OS, NodeId Id) const {
+  auto N = getNode(Id);
+  std::string Label;
+  if (label(Id) != "")
+    Label = formatv(R"(,"label":"{0}")", label(Id));
+  OS << formatv(R"({"typeLabel":"{0}"{1},"children":[)", N.getTypeLabel(),
+                Label);
+  if (N.Children.size() > 0) {
+    printNodeAsJson(OS, N.Children[0]);
+    for (size_t I = 1; I < N.Children.size(); ++I) {
+      OS << ",";
+      printNodeAsJson(OS, N.Children[I]);
+    }
+  }
+  OS << "]}";
+}
+
+void TreeRoot::printAsJson(raw_ostream &OS) const {
+  OS << R"({"root":)";
+  printNodeAsJson(OS, root());
+  OS << "}\n";
+}
+
+void TreeRoot::printAsJson() const { printAsJson(llvm::outs()); }
+
+void TreeRoot::setLeftMostDescendant() {
+  for (NodeId Id : Leaves) {
+    Node &N = getMutableNode(Id);
+    N.LeftMostDescendant = Id;
+    NodeId Cur = Id;
+    while (N.Parent != NoNodeId && getNode(N.Parent).Children.front() == Cur) {
+      Cur = N.Parent;
+      getMutableNode(Cur).LeftMostDescendant = Id;
+    }
+  }
+}
+
+Mappings::Mappings(const TreeRoot &T1, const TreeRoot &T2) : T1(T1), T2(T2) {
+  // Maximum possible size after patching one tree.
+  int Size = T1.getSize() + T2.getSize();
+  SrcToDst = llvm::make_unique<NodeId[]>(Size);
+  DstToSrc = llvm::make_unique<NodeId[]>(Size);
+  // set everything to NoNodeId == -1
+  memset(SrcToDst.get(), NoNodeId, Size * sizeof(NodeId));
+  memset(DstToSrc.get(), NoNodeId, Size * sizeof(NodeId));
+}
+
+void Mappings::link(NodeId Src, NodeId Dst) {
+  SrcToDst[Src] = Dst;
+  DstToSrc[Dst] = Src;
+}
+
+void Mappings::printMapping(raw_ostream &OS) const {
+  for (NodeId Id1 = 0, Id2; Id1 < T1.getSize(); ++Id1)
+    if ((Id2 = getDst(Id1)) != NoNodeId)
+      OS << formatv("Match {0} to {1}\n", T1.showNode(Id1), T2.showNode(Id2));
+}
+
+void Mappings::printMapping() const { printMapping(llvm::outs()); }
+
+/// Identifies a node in this subtree by its postorder offset.
+using SNodeId = int;
+
+class Subtree {
+private:
+  /// The parent tree.
+  const TreeRoot &Tree;
+  /// Maps SNodeIds to original ids.
+  std::vector<NodeId> RootIds;
+  std::vector<NodeId> LeftMostDescendants;
+
+public:
+  std::vector<NodeId> KeyRoots;
+
+  Subtree(const TreeRoot &Tree, NodeId SubtreeRoot) : Tree(Tree) {
+    int Leaves = Tree.getSubtreePostorder(RootIds, SubtreeRoot);
+    setLeftMostDescendants();
+    computeKeyRoots(Leaves);
+  }
+
+  int getSizeS() const { return RootIds.size(); }
+  NodeId getIdInRoot(SNodeId Id) const {
+    assert(Id > 0 && Id <= getSizeS() && "Invalid subtree node index.");
+    return RootIds[Id - 1];
+  }
+  const Node &getNodeS(SNodeId Id) const {
+    return Tree.getNode(getIdInRoot(Id));
+  }
+  const std::string labelS(SNodeId Id) const {
+    return Tree.label(getIdInRoot(Id));
+  }
+  SNodeId getLeftMostDescendant(SNodeId Id) const {
+    assert(Id > 0 && Id <= getSizeS() && "Invalid subtree node index.");
+    return LeftMostDescendants[Id - 1];
+  }
+
+private:
+  void setLeftMostDescendants() {
+    LeftMostDescendants.resize(getSizeS());
+    for (SNodeId I = 0; I < getSizeS(); ++I) {
+      NodeId Cur = getIdInRoot(I + 1);
+      while (!Tree.getNode(Cur).isLeaf())
+        Cur = Tree.getNode(Cur).Children[0];
+      // TODO why does this work
+      LeftMostDescendants[I] = 1 + (Tree.PostorderIds[Cur] - getIdInRoot(1));
+    }
+    assert(LeftMostDescendants[0] == 0 &&
+           "The leftmost node should have subtree index 0.");
+  }
+  void computeKeyRoots(int Leaves) {
+    KeyRoots.resize(Leaves);
+    std::unordered_set<SNodeId> Visited;
+    int K = Leaves - 1;
+    for (int I = getSizeS(); I > 0; --I) {
+      SNodeId LeftDesc = getLeftMostDescendant(I);
+      if (Visited.count(LeftDesc))
+        continue;
+      assert(K >= 0 && "K should be non-negative");
+      KeyRoots[K] = I;
+      Visited.insert(LeftDesc);
+      --K;
+    }
+  }
+};
+
+// Computes an optimal mapping between two trees
+class ZsMatcher {
+  using Pairs = std::vector<std::pair<NodeId, NodeId>>;
+
+  Subtree S1;
+  Subtree S2;
+  std::unique_ptr<std::unique_ptr<double[]>[]> TreeDist, ForestDist;
+
+public:
+  ZsMatcher(const TreeRoot &T1, const TreeRoot &T2, NodeId Id1, NodeId Id2)
+      : S1(T1, Id1), S2(T2, Id2) {
+    TreeDist =
+        llvm::make_unique<std::unique_ptr<double[]>[]>(S1.getSizeS() + 1);
+    ForestDist =
+        llvm::make_unique<std::unique_ptr<double[]>[]>(S1.getSizeS() + 1);
+    for (int I = 0; I < S1.getSizeS() + 1; ++I) {
+      TreeDist[I] = llvm::make_unique<double[]>(S2.getSizeS() + 1);
+      ForestDist[I] = llvm::make_unique<double[]>(S2.getSizeS() + 1);
+    }
+  }
+
+  Pairs matchZs() {
+    Pairs Matches;
+
+    computeTreeDist();
+
+    bool RootNodePair = true;
+    Pairs TreePairs{{S1.getSizeS(), S2.getSizeS()}};
+
+    NodeId LastRow, LastCol, FirstRow, FirstCol, Row, Col;
+    while (!TreePairs.empty()) {
+      std::tie(LastRow, LastCol) = TreePairs.back();
+      TreePairs.pop_back();
+
+      if (!RootNodePair) {
+        computeForestDist(LastRow, LastCol);
+      }
+
+      RootNodePair = false;
+
+      FirstRow = S1.getLeftMostDescendant(LastRow);
+      FirstCol = S2.getLeftMostDescendant(LastCol);
+
+      Row = LastRow;
+      Col = LastCol;
+
+      while (Row > FirstRow || Col > FirstCol) {
+        if (Row > FirstRow &&
+            ForestDist[Row - 1][Col] + 1 == ForestDist[Row][Col]) {
+          --Row;
+        } else if (Col > FirstCol &&
+                   ForestDist[Row][Col - 1] + 1 == ForestDist[Row][Col]) {
+          --Col;
+        } else {
+          NodeId LMD1 = S1.getLeftMostDescendant(Row);
+          NodeId LMD2 = S2.getLeftMostDescendant(Col);
+          if (LMD1 == S1.getLeftMostDescendant(LastRow) &&
+              LMD2 == S2.getLeftMostDescendant(LastCol)) {
+            assert(S1.getNodeS(Row).hasSameType(S2.getNodeS(Col)) &&
+                   "Must not match nodes of different kind.");
+            Matches.emplace_back(S1.getIdInRoot(Row), S2.getIdInRoot(Col));
+            --Row;
+            --Col;
+          } else {
+            TreePairs.emplace_back(Row, Col);
+            Row = LMD1;
+            Col = LMD2;
+          }
+        }
+      }
+    }
+    return Matches;
+  }
+
+private:
+  // TODO Use string editing distance if applicable.
+  double getUpdateCost(NodeId Id1, NodeId Id2) {
+    if (!S1.getNodeS(Id1).hasSameType(S2.getNodeS(Id2)))
+      return std::numeric_limits<double>::max();
+    if (S1.labelS(Id1) == S2.labelS(Id2))
+      return 0;
+    return 1;
+  }
+  void computeTreeDist() {
+    for (NodeId Id1 : S1.KeyRoots)
+      for (NodeId Id2 : S2.KeyRoots)
+        computeForestDist(Id1, Id2);
+  }
+  void dumpForestDist() const {
+    for (int I = 0; I < S1.getSizeS() + 1; ++I) {
+      llvm::errs() << "[";
+      for (int J = 0; J < S2.getSizeS() + 1; ++J)
+        llvm::errs() << formatv("{0}{1: 2}", (J == 0 ? "" : " "),
+                                ForestDist[I][J]);
+      llvm::errs() << "]\n";
+    }
+  }
+  void computeForestDist(SNodeId Id1, SNodeId Id2) {
+    assert(Id1 > 0 && Id2 > 0 && "Expecting offsets greater than 0.");
+    SNodeId LMD1 = S1.getLeftMostDescendant(Id1);
+    SNodeId LMD2 = S2.getLeftMostDescendant(Id2);
+
+    ForestDist[LMD1][LMD2] = 0;
+    for (SNodeId D1 = LMD1 + 1; D1 <= Id1; ++D1) {
+      double DeletionCost = 1.0;
+      ForestDist[D1][LMD2] = ForestDist[D1 - 1][LMD2] + DeletionCost;
+      for (SNodeId D2 = LMD2 + 1; D2 <= Id2; ++D2) {
+        double InsertionCost = 1;
+        ForestDist[LMD1][D2] = ForestDist[LMD1][D2 - 1] + InsertionCost;
+        SNodeId DLMD1 = S1.getLeftMostDescendant(D1);
+        SNodeId DLMD2 = S2.getLeftMostDescendant(D2);
+        if (DLMD1 == LMD1 && DLMD2 == LMD2) {
+          double UpdateCost = getUpdateCost(D1, D2);
+          ForestDist[D1][D2] =
+              std::min(std::min(ForestDist[D1 - 1][D2] + DeletionCost,
+                                ForestDist[D1][D2 - 1] + InsertionCost),
+                       ForestDist[D1 - 1][D2 - 1] + UpdateCost);
+          TreeDist[D1][D2] = ForestDist[D1][D2];
+        } else {
+          ForestDist[D1][D2] =
+              std::min(std::min(ForestDist[D1 - 1][D2] + DeletionCost,
+                                ForestDist[D1][D2 - 1] + InsertionCost),
+                       ForestDist[DLMD1][DLMD2] + TreeDist[D1][D2]);
+        }
+      }
+    }
+  }
+};
+
+namespace {
+// Compares nodes by their depth.
+struct HeightLess {
+  const TreeRoot &Tree;
+  HeightLess(const TreeRoot &Tree) : Tree(Tree) {}
+  bool operator()(NodeId Id1, NodeId Id2) const {
+    return Tree.getNode(Id1).Height < Tree.getNode(Id2).Height;
+  }
+};
+} // end anonymous namespace
+
+// Priority queue for nodes, sorted descendingly by their height.
+class PriorityList {
+  const TreeRoot &Tree;
+  HeightLess Comparator;
+  std::vector<NodeId> Container;
+  PriorityQueue<NodeId, std::vector<NodeId>, HeightLess> List;
+
+public:
+  PriorityList(const TreeRoot &Tree)
+      : Tree(Tree), Comparator(Tree), List(Comparator, Container) {}
+
+  void push(NodeId id) { List.push(id); }
+
+  std::vector<NodeId> pop() {
+    int Max = peekMax();
+    std::vector<NodeId> Result;
+    if (Max == 0)
+      return Result;
+    while (peekMax() == Max) {
+      Result.push_back(List.top());
+      List.pop();
+    }
+    // TODO this is here to get a stable output, not a good heuristic
+    std::sort(Result.begin(), Result.end());
+    return Result;
+  }
+  int peekMax() const {
+    if (List.empty())
+      return 0;
+    return Tree.getNode(List.top()).Height;
+  }
+  void open(NodeId Id) {
+    for (NodeId Child : Tree.getNode(Id).Children)
+      push(Child);
+  }
+};
+
+/// Computes the differences between two ASTs
+class ClangDiff {
+private:
+  TreeRoot &T1;
+  TreeRoot &T2;
+
+  // Returns true if the two subtrees have the same structure
+  // and equal node kinds (does not compare labels).
+  bool isomorphic(NodeId Id1, NodeId Id2) const {
+    const Node &N1 = T1.getNode(Id1);
+    const Node &N2 = T2.getNode(Id2);
+    if (!N1.hasSameType(N2) || N1.Children.size() != N2.Children.size() ||
+        T1.label(Id1) != T2.label(Id2))
+      return false;
+    for (size_t Id = 0; Id < N1.Children.size(); ++Id)
+      if (!isomorphic(N1.Children[Id], N2.Children[Id]))
+        return false;
+    return true;
+  }
+
+  // TODO This is too restrictive, we want to allow multiple mapping
+  // candidates
+  // for nodes and resolve the ambiguity later.
+  bool isMappingAllowed(const Mappings &M, NodeId Id1, NodeId Id2) const {
+    const Node &N1 = T1.getNode(Id1);
+    const Node &N2 = T2.getNode(Id2);
+    bool AnyMapped = M.hasSrc(Id1) || M.hasDst(Id2);
+    bool SameType = N1.hasSameType(N2);
+    NodeId P1 = N1.Parent;
+    NodeId P2 = N2.Parent;
+    bool ParentsSameType = (P1 == NoNodeId && P2 == NoNodeId) ||
+                           (P1 != NoNodeId && P2 != NoNodeId &&
+                            T1.getNode(P1).hasSameType(T2.getNode(P2)));
+    return !AnyMapped && SameType && ParentsSameType;
+  }
+
+  // Adds all corresponding subtrees of the two nodes to the mappings.
+  // The two nodes must be isomorphic.
+  void addIsomorphicSubTrees(Mappings &M, NodeId Id1, NodeId Id2) const {
+    assert(isomorphic(Id1, Id2) &&
+           "Can only be called on isomorphic subtrees.");
+    M.link(Id1, Id2);
+    const Node &N1 = T1.getNode(Id1);
+    const Node &N2 = T2.getNode(Id2);
+    for (size_t Id = 0; Id < N1.Children.size(); ++Id)
+      addIsomorphicSubTrees(M, N1.Children[Id], N2.Children[Id]);
+  }
+
+  // Uses an optimal albeit slow algorithm to compute mappings for two
+  // subtrees, but only if both have fewer nodes than MaxSize.
+  void addOptimalMappings(Mappings &M, NodeId Id1, NodeId Id2) const {
+    if (std::max(T1.getNumberOfDescendants(Id1),
+                 T2.getNumberOfDescendants(Id2)) >= MaxSize)
+      return;
+    ZsMatcher Matcher(T1, T2, Id1, Id2);
+    std::vector<std::pair<NodeId, NodeId>> R = Matcher.matchZs();
+    for (const auto Tuple : R) {
+      NodeId Src = Tuple.first;
+      NodeId Dst = Tuple.second;
+      if (isMappingAllowed(M, Src, Dst))
+        M.link(Src, Dst);
+    }
+  }
+
+  // Computes the ratio of common descendants between the two nodes.
+  // Descendants are only considered to be equal when they are mapped in M.
+  double dice(const Mappings &M, NodeId Id1, NodeId Id2) const {
+    if (Id1 == NoNodeId || Id2 == NoNodeId)
+      return 0.0;
+    int CommonDescendants = 0;
+    const Node &N1 = T1.getNode(Id1);
+    for (NodeId Id = N1.LeftMostDescendant; Id < Id1; ++Id)
+      CommonDescendants += int(M.hasSrc(Id));
+    return 2.0 * CommonDescendants /
+           (T1.getNumberOfDescendants(Id1) + T2.getNumberOfDescendants(Id2));
+  }
+
+  // Returns the node that has the highest degree of similarity.
+  NodeId findCandidate(const Mappings &M, NodeId Id1) const {
+    NodeId Candidate = NoNodeId;
+    double BestDiceValue = 0.0;
+    const Node &N1 = T1.getNode(Id1);
+    for (NodeId Id2 = 0; Id2 < T2.getSize(); ++Id2) {
+      const Node &N2 = T2.getNode(Id2);
+      if (!N1.hasSameType(N2))
+        continue;
+      if (M.hasDst(Id2))
+        continue;
+      double DiceValue = dice(M, Id1, Id2);
+      if (DiceValue > BestDiceValue) {
+        BestDiceValue = DiceValue;
+        Candidate = Id2;
+      }
+    }
+    return Candidate;
+  }
+
+  // Tries to match any yet unmapped nodes, in a bottom-up fashion.
+  void matchBottomUp(Mappings &M) const {
+    for (NodeId Id1 = 0; Id1 < T1.getSize(); ++Id1) {
+      if (Id1 == T1.root()) {
+        M.link(T1.root(), T2.root());
+        addOptimalMappings(M, T1.root(), T2.root());
+        break;
+      }
+      const Node &N1 = T1.getNode(Id1);
+      bool Matched = M.hasSrc(Id1);
+      bool MatchedChildren =
+          std::any_of(N1.Children.begin(), N1.Children.end(),
+                      [&](NodeId Child) { return M.hasSrc(Child); });
+      if (Matched || !MatchedChildren)
+        continue;
+      NodeId Id2 = findCandidate(M, Id1);
+      if (Id2 == NoNodeId || dice(M, Id1, Id2) < MinDice ||
+          !isMappingAllowed(M, Id1, Id2))
+        continue;
+      M.link(Id1, Id2);
+      addOptimalMappings(M, Id1, Id2);
+    }
+  }
+
+  // Returns a set of mappings of isomorphic subtrees.
+  Mappings matchTopDown() const {
+    PriorityList L1(T1);
+    PriorityList L2(T2);
+
+    Mappings M(T1, T2);
+
+    L1.push(T1.root());
+    L2.push(T2.root());
+
+    int Max1, Max2;
+    while (std::min(Max1 = L1.peekMax(), Max2 = L2.peekMax()) > MinHeight) {
+      if (Max1 > Max2) {
+        for (NodeId Id : L1.pop())
+          L1.open(Id);
+        continue;
+      }
+      if (Max2 > Max1) {
+        for (NodeId Id : L2.pop())
+          L2.open(Id);
+        continue;
+      }
+      std::vector<NodeId> H1, H2;
+      H1 = L1.pop();
+      H2 = L2.pop();
+      for (NodeId Id1 : H1)
+        for (NodeId Id2 : H2)
+          if (isomorphic(Id1, Id2) && isMappingAllowed(M, Id1, Id2))
+            addIsomorphicSubTrees(M, Id1, Id2);
+      for (NodeId Id1 : H1)
+        if (!M.hasSrc(Id1))
+          L1.open(Id1);
+      for (NodeId Id2 : H2)
+        if (!M.hasDst(Id2))
+          L2.open(Id2);
+    }
+    return M;
+  }
+
+  // Finds an edit script that converts T1 to T2.
+  std::vector<Change> computeChanges(Mappings &M) {
+    std::vector<Change> Changes;
+    for (NodeId Id2 : T2.getSubtreeBfs(T2.root())) {
+      const Node &N2 = T2.getNode(Id2);
+      NodeId Id1 = M.getSrc(Id2);
+      if (Id1 != NoNodeId) {
+        assert(T1.getNode(Id1).hasSameType(N2) &&
+               "Matched nodes with different kinds.");
+        if (T1.label(Id1) != T2.label(Id2)) {
+          Changes.emplace_back(Update, Id1, Id2, /*UNUSED Position=*/0);
+        }
+        continue;
+      }
+      NodeId P2 = N2.Parent;
+      NodeId P1 = M.getSrc(P2);
+      assert(P1 != NoNodeId);
+      Node &Parent1 = T1.getMutableNode(P1);
+      const Node &Parent2 = T2.getNode(P2);
+      auto &Siblings1 = Parent1.Children;
+      const auto &Siblings2 = Parent2.Children;
+      size_t Position;
+      for (Position = 0; Position < Siblings2.size(); ++Position)
+        if (Siblings2[Position] == Id2 || Position >= Siblings1.size())
+          break;
+      Changes.emplace_back(Insert, Id2, P2, Position);
+      Node PatchNode;
+      PatchNode.Parent = P1;
+      PatchNode.LeftMostDescendant = N2.LeftMostDescendant;
+      PatchNode.Depth = N2.Depth;
+      PatchNode.ASTNode = N2.ASTNode;
+      // TODO update Depth if needed
+      NodeId PatchNodeId = T1.getSize();
+      // TODO maybe choose a different data structure for Children.
+      Siblings1.insert(Siblings1.begin() + Position, PatchNodeId);
+      T1.addNode(PatchNode);
+      M.link(PatchNodeId, Id2);
+    }
+    for (NodeId Id1 = 0; Id1 < T1.getSize(); ++Id1) {
+      NodeId Id2 = M.getDst(Id1);
+      if (Id2 == NoNodeId)
+        Changes.emplace_back(Delete, Id1, Id2, /*UNUSED Position=*/0);
+    }
+    return Changes;
+  }
+
+  // Prints an edit action.
+  void printChange(const Change &Chg) const { printChange(llvm::outs(), Chg); }
+  void printChange(raw_ostream &OS, const Change &Chg) const {
+    ChangeKind Kind;
+    NodeId Id1, Id2;
+    size_t Position;
+    std::tie(Kind, Id1, Id2, Position) = Chg;
+    std::string S;
+    switch (Kind) {
+    case Delete:
+      S = formatv("Delete {0}", T1.showNode(Id1));
+      break;
+    case Update:
+      S = formatv("Update {0} to {1}", T1.showNode(Id1), T2.label(Id2));
+      break;
+    case Insert:
+      S = formatv("Insert {0} into {1} at {2}", T2.showNode(Id1),
+                  T2.showNode(Id2), Position);
+      break;
+    case Move:
+      llvm_unreachable("TODO");
+      break;
+    };
+    OS << S << "\n";
+  }
+
+  Mappings match() const {
+    Mappings M = matchTopDown();
+    matchBottomUp(M);
+    return M;
+  }
+
+public:
+  // During top-down matching, only consider nodes of at least this height.
+  int MinHeight = 2;
+  // During bottom-up matching, match only nodes with at least this value as
+  // the ratio of their common descendants.
+  double MinDice = 0.2;
+  // Whenever two subtrees are matched in the bottom-up phase, the optimal
+  // mapping is computed, unless the size of either subtrees exceeds this.
+  int MaxSize = 100;
+
+  ClangDiff(TreeRoot &T1, TreeRoot &T2) : T1(T1), T2(T2) {}
+
+  void printDiff() {
+    Mappings M = match();
+    M.printMapping();
+    auto Changes = computeChanges(M);
+    for (const auto &C : Changes)
+      printChange(C);
+  }
+};
+
+void runDiff(ASTContext &AST1, ASTContext &AST2) {
+  TreeRoot T1(AST1);
+  TreeRoot T2(AST2);
+  ClangDiff DiffTool(T1, T2);
+  DiffTool.printDiff();
+}
+
+} // end namespace diff
+} // end namespace clang
Index: include/clang/Tooling/ASTDiff/ASTDiff.h
===================================================================
--- /dev/null
+++ include/clang/Tooling/ASTDiff/ASTDiff.h
@@ -0,0 +1,128 @@
+//===- ASTDiff.h - AST differencing API -----------------------*- C++ -*- -===//
+//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file specifies an interface that can be used to compare C++ syntax
+// trees.
+//
+// We use the gumtree algorithm which combines a heuristic top-down search that
+// is able to match large subtrees that are equivalent, with an optimal
+// algorithm to match small subtrees.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_ASTDIFF_ASTDIFF_H
+#define LLVM_CLANG_TOOLING_ASTDIFF_ASTDIFF_H
+
+#include "clang/AST/ASTTypeTraits.h"
+
+using namespace llvm;
+using namespace clang;
+
+namespace clang {
+namespace diff {
+
+/// Within a tree, this identifies a node by its preorder offset.
+using NodeId = int;
+
+/// Sentinel value for invalid nodes.
+const NodeId NoNodeId = -1;
+
+/// Represents a Clang AST node, alongside some additional information.
+struct Node {
+  NodeId Parent, LeftMostDescendant;
+  int Depth, Height;
+  ast_type_traits::DynTypedNode ASTNode;
+  // Maybe there is a better way to store children than this.
+  SmallVector<NodeId, 4> Children;
+
+  ast_type_traits::ASTNodeKind getType() const { return ASTNode.getNodeKind(); }
+  bool hasSameType(const Node &Other) const {
+    return getType().isSame(Other.getType()) ||
+           (getType().isNone() && Other.getType().isNone());
+  }
+  const StringRef getTypeLabel() const { return getType().asStringRef(); }
+  bool isLeaf() const { return Children.empty(); }
+};
+
+/// Represents the AST of a TranslationUnit.
+class TreeRoot {
+private:
+  /// Nodes in preorder.
+  std::vector<Node> Nodes;
+  void setRightMostDescendant();
+  void setLeftMostDescendant();
+  void setSize(size_t Size) { Nodes.resize(Size); }
+  std::string getSourceString(SourceLocation SourceBegin,
+                              SourceLocation SourceEnd) const;
+
+public:
+  ASTContext &AST;
+  std::vector<NodeId> Leaves;
+  std::vector<int> PostorderIds;
+
+  TreeRoot(ASTContext &AST);
+
+  int getSize() const { return Nodes.size(); }
+  NodeId root() const { return 0; }
+
+  const Node &getNode(NodeId Id) const { return Nodes[Id]; }
+  Node &getMutableNode(NodeId Id) { return Nodes[Id]; }
+  bool isValidNodeId(NodeId Id) const { return Id >= 0 && Id < getSize(); }
+  void addNode(Node &N) { Nodes.push_back(N); }
+  template <class T> bool discardNode(T *N) const;
+
+  int getSubtreePostorder(std::vector<NodeId> &Ids, NodeId Root) const;
+  std::vector<NodeId> getSubtreeBfs(NodeId Id) const;
+  int getNumberOfDescendants(NodeId Id) const;
+
+  /// Returns the value of the node.
+  std::string label(NodeId Id) const;
+  void printTree(raw_ostream &OS, NodeId Id = NoNodeId) const;
+  /// Return the node as "<kind>[: <label>](<postorder-id)"
+  std::string showNode(NodeId Id) const;
+  void printNodeAsJson(raw_ostream &OS, NodeId Id) const;
+  void printAsJson() const;
+  void printAsJson(raw_ostream &OS) const;
+};
+
+/// Maps nodes of the left tree to ones on the right, and vice versa.
+// Supports fast insertion and lookup.
+// TODO allow multiple mappings
+class Mappings {
+  std::unique_ptr<NodeId[]> SrcToDst, DstToSrc;
+  const TreeRoot &T1, &T2;
+
+public:
+  Mappings(const TreeRoot &T1, const TreeRoot &T2);
+
+  void link(NodeId Src, NodeId Dst);
+
+  NodeId getDst(NodeId Src) const { return SrcToDst[Src]; }
+  NodeId getSrc(NodeId Dst) const { return DstToSrc[Dst]; }
+  bool hasSrc(NodeId Src) const { return SrcToDst[Src] != NoNodeId; }
+  bool hasDst(NodeId Dst) const { return DstToSrc[Dst] != NoNodeId; }
+
+  void printMapping() const;
+  void printMapping(raw_ostream &OS) const;
+};
+
+/// (Update, src, dst, _  ): update the value of node src to match dst.
+/// (Insert, src, dst, pos): insert src as child of dst at offset pos.
+/// (Delete, src, _,   _  ): delete node src.
+/// (Move,   src, dst, pos): move src to a child of dst at offset pos.
+enum ChangeKind { Delete, Update, Insert, Move };
+using Change = std::tuple<ChangeKind, NodeId, NodeId, size_t>;
+
+void runDiff(ASTContext &AST1, ASTContext &AST2);
+
+} // end namespace diff
+} // end namespace clang
+
+#endif
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to