johannes updated this revision to Diff 103147.
johannes added a comment.

- style fixes
- do not compare nodes from system headers


https://reviews.llvm.org/D34329

Files:
  include/clang/Tooling/ASTDiff/ASTDiff.h
  lib/Tooling/ASTDiff/ASTDiff.cpp
  lib/Tooling/ASTDiff/CMakeLists.txt
  lib/Tooling/CMakeLists.txt
  tools/CMakeLists.txt
  tools/clang-diff/CMakeLists.txt
  tools/clang-diff/ClangDiff.cpp

Index: tools/clang-diff/ClangDiff.cpp
===================================================================
--- /dev/null
+++ tools/clang-diff/ClangDiff.cpp
@@ -0,0 +1,94 @@
+//===- ClangDiff.cpp - compare source files by AST nodes ------*- C++ -*- -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief
+///
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/ASTDiff/ASTDiff.h"
+#include "clang/Tooling/CommonOptionsParser.h"
+#include "clang/Tooling/Tooling.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace tooling;
+
+static cl::OptionCategory ClangDiffCategory("clang-diff options");
+
+static cl::opt<bool>
+    DumpAST("ast-dump",
+            cl::desc("Print the internal representation of the AST as JSON."),
+            cl::init(false), cl::cat(ClangDiffCategory));
+
+static cl::opt<std::string> SourcePath(cl::Positional, cl::desc("<source>"),
+                                       cl::Required,
+                                       cl::cat(ClangDiffCategory));
+
+static cl::opt<std::string> DestinationPath(cl::Positional,
+                                            cl::desc("<destination>"),
+                                            cl::Optional,
+                                            cl::cat(ClangDiffCategory));
+
+static std::unique_ptr<ASTUnit> getAST(const StringRef Filename) {
+  std::string ErrorMessage;
+  std::unique_ptr<CompilationDatabase> Compilations =
+      CompilationDatabase::autoDetectFromSource(Filename, ErrorMessage);
+  if (!Compilations) {
+    errs() << "Error while trying to load a compilation database, running "
+              "without flags.\n"
+           << ErrorMessage;
+    Compilations.reset(
+        new FixedCompilationDatabase(".", std::vector<std::string>()));
+  }
+  std::array<std::string, 1> Files = {{Filename}};
+  ClangTool Tool(*Compilations, Files);
+  std::vector<std::unique_ptr<ASTUnit>> ASTs;
+  Tool.buildASTs(ASTs);
+  if (ASTs.size() != Files.size()) {
+    return nullptr;
+  }
+  return std::move(ASTs[0]);
+}
+
+int main(int argc, const char **argv) {
+  cl::HideUnrelatedOptions(ClangDiffCategory);
+  if (!cl::ParseCommandLineOptions(argc, argv)) {
+    cl::PrintOptionValues();
+    return 1;
+  }
+
+  if (DumpAST) {
+    if (!DestinationPath.empty()) {
+      errs() << "Error: Please specify exactly one filename.\n";
+      return 1;
+    }
+    std::unique_ptr<ASTUnit> AST = getAST(SourcePath);
+    if (!AST) {
+      return 1;
+    }
+    clang::diff::TreeRoot Tree(*AST);
+    Tree.dumpAsJson();
+    return 0;
+  }
+
+  if (DestinationPath.empty()) {
+    errs() << "Error: Exactly two paths are required.\n";
+    return 1;
+  }
+
+  std::unique_ptr<ASTUnit> Src = getAST(SourcePath);
+  std::unique_ptr<ASTUnit> Dst = getAST(DestinationPath);
+  if (!Src || !Dst) {
+    return 1;
+  }
+
+  clang::diff::runDiff(*Src, *Dst);
+
+  return 0;
+}
Index: tools/clang-diff/CMakeLists.txt
===================================================================
--- /dev/null
+++ tools/clang-diff/CMakeLists.txt
@@ -0,0 +1,16 @@
+set(LLVM_LINK_COMPONENTS
+  Support
+  )
+
+add_clang_executable(clang-diff
+  ClangDiff.cpp
+  )
+
+target_link_libraries(clang-diff
+  clangAST
+  clangBasic
+  clangFrontend
+  clangLex
+  clangTooling
+  clangToolingASTDiff
+  )
Index: tools/CMakeLists.txt
===================================================================
--- tools/CMakeLists.txt
+++ tools/CMakeLists.txt
@@ -2,6 +2,7 @@
 
 add_clang_subdirectory(diagtool)
 add_clang_subdirectory(driver)
+add_clang_subdirectory(clang-diff)
 add_clang_subdirectory(clang-format)
 add_clang_subdirectory(clang-format-vs)
 add_clang_subdirectory(clang-fuzzer)
Index: lib/Tooling/CMakeLists.txt
===================================================================
--- lib/Tooling/CMakeLists.txt
+++ lib/Tooling/CMakeLists.txt
@@ -5,6 +5,7 @@
 
 add_subdirectory(Core)
 add_subdirectory(Refactoring)
+add_subdirectory(ASTDiff)
 
 add_clang_library(clangTooling
   ArgumentsAdjusters.cpp
Index: lib/Tooling/ASTDiff/CMakeLists.txt
===================================================================
--- /dev/null
+++ lib/Tooling/ASTDiff/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(LLVM_LINK_COMPONENTS
+  # Option
+  Support
+  )
+
+add_clang_library(clangToolingASTDiff
+  ASTDiff.cpp
+  LINK_LIBS
+  clangBasic
+  clangAST
+  # clangToolingCore
+  )
Index: lib/Tooling/ASTDiff/ASTDiff.cpp
===================================================================
--- /dev/null
+++ lib/Tooling/ASTDiff/ASTDiff.cpp
@@ -0,0 +1,836 @@
+//===- ASTDiff.cpp - AST differencing implementation-----------*- C++ -*- -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief
+///
+//===----------------------------------------------------------------------===//
+
+#include "clang/Tooling/ASTDiff/ASTDiff.h"
+
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/Support/FormatVariadic.h"
+
+#include <limits>
+#include <memory>
+#include <unordered_set>
+
+using namespace llvm;
+using namespace clang;
+
+namespace clang {
+namespace diff {
+
+static bool isRelevantNode(Decl *D) { return D != nullptr; }
+static bool isRelevantNode(Stmt *S) { return S != nullptr; }
+
+namespace {
+// Count the number of non-null nodes
+struct NodeCountVisitor : public RecursiveASTVisitor<NodeCountVisitor> {
+  int Count = 0;
+  bool TraverseDecl(Decl *D) {
+    if (isRelevantNode(D)) {
+      ++Count;
+      RecursiveASTVisitor<NodeCountVisitor>::TraverseDecl(D);
+    }
+    return true;
+  }
+  bool TraverseStmt(Stmt *S) {
+    if (isRelevantNode(S)) {
+      ++Count;
+      RecursiveASTVisitor<NodeCountVisitor>::TraverseStmt(S);
+    }
+    return true;
+  }
+  bool TraverseType(QualType T) { return true; }
+};
+} // namespace
+
+namespace {
+// Sets Height, Parent and Children for each node.
+struct PreorderVisitor : public RecursiveASTVisitor<PreorderVisitor> {
+  int Id = 0, Depth = 0;
+  NodeId Parent = NoNodeId;
+  TreeRoot &Root;
+
+  PreorderVisitor(TreeRoot &Root) : Root(Root) {}
+
+  template <class T> std::tuple<NodeId, NodeId> PreTraverse(T *ASTNode) {
+    assert(ASTNode != nullptr);
+    NodeId MyId = Id;
+    Node &N = Root.getMutableNode(MyId);
+    N.Parent = Parent;
+    N.Depth = Depth;
+    N.ASTNode = ast_type_traits::DynTypedNode::create(*ASTNode);
+    assert(!N.ASTNode.getNodeKind().isNone());
+    if (Parent != NoNodeId) {
+      Node &P = Root.getMutableNode(Parent);
+      P.Children.push_back(MyId);
+    }
+    Parent = MyId;
+    ++Id;
+    ++Depth;
+    return {MyId, Root.getNode(MyId).Parent};
+  }
+  void PostTraverse(std::tuple<NodeId, NodeId> State) {
+    NodeId MyId, PreviousParent;
+    std::tie(MyId, PreviousParent) = State;
+    Parent = PreviousParent;
+    --Depth;
+    if (MyId != NoNodeId) {
+      Node &N = Root.getMutableNode(MyId);
+      if (N.isLeaf()) {
+        Root.Leaves.push_back(MyId);
+      }
+      N.Height = 1;
+      for (NodeId Child : N.Children) {
+        N.Height = std::max(N.Height, 1 + Root.getNode(Child).Height);
+      }
+    }
+  }
+  bool TraverseDecl(Decl *D) {
+    if (isRelevantNode(D)) {
+      auto SavedState = PreTraverse(D);
+      RecursiveASTVisitor<PreorderVisitor>::TraverseDecl(D);
+      PostTraverse(SavedState);
+    }
+    return true;
+  }
+  bool TraverseStmt(Stmt *S) {
+    if (isRelevantNode(S)) {
+      auto SavedState = PreTraverse(S);
+      RecursiveASTVisitor<PreorderVisitor>::TraverseStmt(S);
+      PostTraverse(SavedState);
+    }
+    return true;
+  }
+  bool TraverseType(QualType T) { return true; }
+};
+} // namespace
+
+TreeRoot::TreeRoot(ASTUnit &AST) : AST(AST) {
+  auto *TUD = AST.getASTContext().getTranslationUnitDecl();
+  // Run the above visitors to initialize the tree.
+  NodeCountVisitor NodeCounter;
+  NodeCounter.TraverseDecl(TUD);
+  setSize(NodeCounter.Count);
+  PreorderVisitor PreorderWalker(*this);
+  PreorderWalker.TraverseDecl(TUD);
+  setLeftMostDescendant();
+  int PostorderId = 0;
+  PostorderIds.resize(Nodes.size());
+  std::function<void(NodeId)> PostorderTraverse = [&](NodeId Id) {
+    for (NodeId Child : getNode(Id).Children) {
+      PostorderTraverse(Child);
+    }
+    PostorderIds[Id] = PostorderId;
+    ++PostorderId;
+  };
+  PostorderTraverse(root());
+}
+
+int TreeRoot::getSubtreePostorder(std::vector<NodeId> &Ids, NodeId Root) const {
+  int Leaves = 0;
+  std::function<void(NodeId)> Traverse = [&](NodeId Id) {
+    const Node &N = getNode(Id);
+    for (NodeId Child : N.Children) {
+      Traverse(Child);
+    }
+    if (N.isLeaf()) {
+      ++Leaves;
+    }
+    Ids.push_back(Id);
+  };
+  Traverse(Root);
+  return Leaves;
+}
+std::vector<NodeId> TreeRoot::getSubtreeBfs(NodeId Id) const {
+  std::vector<NodeId> Ids;
+  size_t Expanded = 0;
+  Ids.push_back(Id);
+  while (Expanded < Ids.size()) {
+    for (NodeId Child : getNode(Ids[Expanded++]).Children) {
+      Ids.push_back(Child);
+    }
+  }
+  return Ids;
+}
+
+int TreeRoot::getNumberOfDescendants(NodeId Id) const {
+  return PostorderIds[Id] - getNode(Id).LeftMostDescendant;
+}
+
+std::string TreeRoot::label(NodeId Id) const {
+  assert(isValidNodeId(Id));
+  const Node &N = getNode(Id);
+  const ast_type_traits::DynTypedNode &DTN = N.ASTNode;
+  if (auto *X = DTN.get<FieldDecl>()) {
+    return X->getName();
+  }
+  if (auto *X = DTN.get<BinaryOperator>()) {
+    return X->getOpcodeStr();
+  }
+  if (auto *X = DTN.get<IntegerLiteral>()) {
+    auto Tmp = X->getValue();
+    SmallString<256> Str;
+    Tmp.toString(Str, /*Radix=*/10, /*Signed=*/false);
+    return Str.str();
+  }
+  if (auto *X = DTN.get<StringLiteral>()) {
+    return X->getString();
+  }
+  if (auto *X = DTN.get<CompoundStmt>()) {
+    return "";
+  }
+  if (auto *X = DTN.get<DeclStmt>()) {
+    return "";
+  }
+  if (auto *X = DTN.get<DeclRefExpr>()) {
+    return "";
+  }
+  if (auto *X = DTN.get<Decl>()) {
+    return "";
+  }
+  if (auto *X = DTN.get<Stmt>()) {
+    return "";
+  }
+  if (auto *X = DTN.get<QualType>()) {
+    return "";
+  }
+  llvm_unreachable("Fatal: unhandled AST node\n");
+}
+void TreeRoot::dumpTree(NodeId Id) const {
+  if (Id == NoNodeId) {
+    Id = root();
+  }
+  const Node &N = getNode(Id);
+  for (int I = 0; I < N.Depth; ++I) {
+    outs() << " ";
+  }
+  outs() << showNode(Id) << "\n";
+  for (NodeId Child : N.Children) {
+    dumpTree(Child);
+  }
+}
+std::string TreeRoot::showNode(NodeId Id) const {
+  if (Id == NoNodeId) {
+    return "None";
+  }
+  std::string Label;
+  if (label(Id) != "") {
+    Label = formatv(": {0}", label(Id));
+  }
+  return formatv("{0}{1}({2})", getNode(Id).getTypeLabel(), Label,
+                 PostorderIds[Id]);
+}
+void TreeRoot::dumpNodeAsJson(NodeId Id) const {
+  auto N = getNode(Id);
+  std::string Label;
+  if (label(Id) != "") {
+    Label = formatv(R"(,"label":"{0}")", label(Id));
+  }
+  outs() << formatv(R"({"typeLabel":"{0}"{1},"children":[)", N.getTypeLabel(),
+                    Label);
+  if (N.Children.size() > 0) {
+    dumpNodeAsJson(N.Children[0]);
+    for (size_t I = 1; I < N.Children.size(); ++I) {
+      outs() << ",";
+      dumpNodeAsJson(N.Children[I]);
+    }
+  }
+  outs() << "]}";
+}
+void TreeRoot::dumpAsJson() const {
+  outs() << R"({"root":)";
+  dumpNodeAsJson(root());
+  outs() << "}\n";
+}
+
+void TreeRoot::setLeftMostDescendant() {
+  for (NodeId Id : Leaves) {
+    Node &N = getMutableNode(Id);
+    N.LeftMostDescendant = Id;
+    NodeId Cur = Id;
+    while (N.Parent != NoNodeId && getNode(N.Parent).Children.front() == Cur) {
+      Cur = N.Parent;
+      getMutableNode(Cur).LeftMostDescendant = Id;
+    }
+  }
+}
+
+Mappings::Mappings(const TreeRoot &T1, const TreeRoot &T2) : T1(T1), T2(T2) {
+  int Size = T1.getSize() + T2.getSize();
+  SrcToDst = new NodeId[Size];
+  DstToSrc = new NodeId[Size];
+  // set everything to NoNodeId == -1
+  memset(SrcToDst, NoNodeId, Size * sizeof(*SrcToDst));
+  memset(DstToSrc, NoNodeId, Size * sizeof(*DstToSrc));
+}
+Mappings::~Mappings() {
+  delete[] SrcToDst;
+  delete[] DstToSrc;
+}
+void Mappings::link(NodeId Src, NodeId Dst) {
+  SrcToDst[Src] = Dst;
+  DstToSrc[Dst] = Src;
+}
+
+void Mappings::dumpMapping() const {
+  for (NodeId Id1 = 0, Id2; Id1 < T1.getSize(); ++Id1) {
+    if ((Id2 = getDst(Id1)) != NoNodeId) {
+      outs() << formatv("Match {0} to {1}\n", T1.showNode(Id1),
+                        T2.showNode(Id2));
+    }
+  }
+}
+
+class Subtree {
+private:
+  /// Identifies a node in this subtree by its postorder offset.
+  using SNodeId = int;
+  /// The parent tree.
+  const TreeRoot &Tree;
+  /// Maps SNodeIds to original ids.
+  std::vector<NodeId> RootIds;
+  std::vector<NodeId> LeftMostDescendants;
+
+public:
+  std::vector<NodeId> KeyRoots;
+
+  Subtree(const TreeRoot &Tree, NodeId SubtreeRoot) : Tree(Tree) {
+    int Leaves = Tree.getSubtreePostorder(RootIds, SubtreeRoot);
+    setLeftMostDescendants();
+    computeKeyRoots(Leaves);
+  }
+
+  int getSizeS() const { return RootIds.size(); }
+  NodeId getIdInRoot(SNodeId Id) const {
+    assert(Id > 0 && Id <= getSizeS());
+    return RootIds[Id - 1];
+  }
+  const Node &getNodeS(SNodeId Id) const {
+    return Tree.getNode(getIdInRoot(Id));
+  }
+  const std::string labelS(SNodeId Id) const {
+    return Tree.label(getIdInRoot(Id));
+  }
+  SNodeId getLeftMostDescendant(SNodeId Id) const {
+    assert(Id > 0 && Id <= getSizeS());
+    return LeftMostDescendants[Id - 1];
+  }
+
+private:
+  void setLeftMostDescendants() {
+    LeftMostDescendants.resize(getSizeS());
+    for (SNodeId I = 0; I < getSizeS(); ++I) {
+      NodeId Cur = getIdInRoot(I + 1);
+      while (!Tree.getNode(Cur).isLeaf()) {
+        Cur = Tree.getNode(Cur).Children[0];
+      }
+      // TODO why does this work
+      LeftMostDescendants[I] = 1 + (Tree.PostorderIds[Cur] - getIdInRoot(1));
+    }
+    assert(LeftMostDescendants[0] == 0);
+  }
+  void computeKeyRoots(int Leaves) {
+    KeyRoots.resize(Leaves);
+    std::unordered_set<SNodeId> Visited;
+    int K = Leaves - 1;
+    for (int I = getSizeS(); I > 0; --I) {
+      SNodeId LeftDesc = getLeftMostDescendant(I);
+      if (Visited.count(LeftDesc) != 0u) {
+        continue;
+      }
+      assert(K >= 0);
+      KeyRoots[K] = I;
+      Visited.insert(LeftDesc);
+      --K;
+    }
+  }
+};
+
+// Computes an optimal mapping between two trees
+class ZsMatcher {
+  using Pairs = std::vector<std::pair<NodeId, NodeId>>;
+
+private:
+  Subtree S1;
+  Subtree S2;
+  double **TreeDist;
+  double **ForestDist;
+
+  // TODO Use string editing distance if applicable.
+  double getUpdateCost(NodeId Id1, NodeId Id2) {
+    if (!S1.getNodeS(Id1).hasSameType(S2.getNodeS(Id2))) {
+      return std::numeric_limits<double>::max();
+    }
+    if (S1.labelS(Id1) == S2.labelS(Id2)) {
+      return 0;
+    }
+    return 1;
+  }
+  void computeTreeDist() {
+    for (NodeId Id1 : S1.KeyRoots) {
+      for (NodeId Id2 : S2.KeyRoots) {
+        computeForestDist(Id1, Id2);
+      }
+    }
+  }
+  void dumpForestDist() const {
+    for (int I = 0; I < S1.getSizeS() + 1; ++I) {
+      outs() << "[";
+      for (int J = 0; J < S2.getSizeS() + 1; ++J) {
+        outs() << formatv("{0}{1: 2}", (J == 0 ? "" : " "), ForestDist[I][J]);
+      }
+      outs() << "]\n";
+    }
+  }
+  void computeForestDist(NodeId Id1, NodeId Id2) {
+    assert(Id1 > 0 && Id2 > 0);
+    NodeId LMD1 = S1.getLeftMostDescendant(Id1);
+    NodeId LMD2 = S2.getLeftMostDescendant(Id2);
+
+    ForestDist[LMD1][LMD2] = 0;
+    for (NodeId D1 = LMD1 + 1; D1 <= Id1; ++D1) {
+      double DeletionCost = 1.0;
+      ForestDist[D1][LMD2] = ForestDist[D1 - 1][LMD2] + DeletionCost;
+      for (NodeId D2 = LMD2 + 1; D2 <= Id2; ++D2) {
+        double InsertionCost = 1;
+        ForestDist[LMD1][D2] = ForestDist[LMD1][D2 - 1] + InsertionCost;
+        NodeId DLMD1 = S1.getLeftMostDescendant(D1);
+        NodeId DLMD2 = S2.getLeftMostDescendant(D2);
+        if (DLMD1 == LMD1 && DLMD2 == LMD2) {
+          double UpdateCost = getUpdateCost(D1, D2);
+          ForestDist[D1][D2] =
+              std::min(std::min(ForestDist[D1 - 1][D2] + DeletionCost,
+                                ForestDist[D1][D2 - 1] + InsertionCost),
+                       ForestDist[D1 - 1][D2 - 1] + UpdateCost);
+          TreeDist[D1][D2] = ForestDist[D1][D2];
+        } else {
+          ForestDist[D1][D2] =
+              std::min(std::min(ForestDist[D1 - 1][D2] + DeletionCost,
+                                ForestDist[D1][D2 - 1] + InsertionCost),
+                       ForestDist[DLMD1][DLMD2] + TreeDist[D1][D2]);
+        }
+      }
+    }
+  }
+
+public:
+  ZsMatcher(const TreeRoot &T1, const TreeRoot &T2, NodeId Id1, NodeId Id2)
+      : S1(T1, Id1), S2(T2, Id2) {
+    TreeDist = new double *[S1.getSizeS() + 1];
+    ForestDist = new double *[S1.getSizeS() + 1];
+    for (int I = 0; I < S1.getSizeS() + 1; ++I) {
+      TreeDist[I] = new double[S2.getSizeS() + 1]();
+      ForestDist[I] = new double[S2.getSizeS() + 1]();
+    }
+  }
+
+  ~ZsMatcher() {
+    for (int I = 0; I < S1.getSizeS() + 1; ++I) {
+      delete[] TreeDist[I];
+      delete[] ForestDist[I];
+    }
+    delete[] TreeDist;
+    delete[] ForestDist;
+  }
+
+  Pairs matchZs() {
+    Pairs Matches;
+
+    computeTreeDist();
+
+    bool RootNodePair = true;
+    Pairs TreePairs{{S1.getSizeS(), S2.getSizeS()}};
+
+    NodeId LastRow, LastCol, FirstRow, FirstCol, Row, Col;
+    while (!TreePairs.empty()) {
+      std::tie(LastRow, LastCol) = TreePairs.back();
+      TreePairs.pop_back();
+
+      if (!RootNodePair) {
+        computeForestDist(LastRow, LastCol);
+      }
+
+      RootNodePair = false;
+
+      FirstRow = S1.getLeftMostDescendant(LastRow);
+      FirstCol = S2.getLeftMostDescendant(LastCol);
+
+      Row = LastRow;
+      Col = LastCol;
+
+      while (Row > FirstRow || Col > FirstCol) {
+        if (Row > FirstRow &&
+            ForestDist[Row - 1][Col] + 1 == ForestDist[Row][Col]) {
+          --Row;
+        } else if (Col > FirstCol &&
+                   ForestDist[Row][Col - 1] + 1 == ForestDist[Row][Col]) {
+          --Col;
+        } else {
+          NodeId LMD1 = S1.getLeftMostDescendant(Row);
+          NodeId LMD2 = S2.getLeftMostDescendant(Col);
+          if (LMD1 == S1.getLeftMostDescendant(LastRow) &&
+              LMD2 == S2.getLeftMostDescendant(LastCol)) {
+            assert(S1.getNodeS(Row).hasSameType(S2.getNodeS(Col)));
+            Matches.emplace_back(S1.getIdInRoot(Row), S2.getIdInRoot(Col));
+            --Row;
+            --Col;
+          } else {
+            TreePairs.emplace_back(Row, Col);
+            Row = LMD1;
+            Col = LMD2;
+          }
+        }
+      }
+    }
+    return Matches;
+  }
+};
+
+namespace {
+// Compares nodes by their depth.
+struct HeightLess {
+  const TreeRoot &Tree;
+  HeightLess(const TreeRoot &Tree) : Tree(Tree) {}
+  bool operator()(NodeId Id1, NodeId Id2) const {
+    return Tree.getNode(Id1).Height < Tree.getNode(Id2).Height;
+  }
+};
+} // namespace
+
+// Priority queue for nodes, sorted descendingly by their height.
+class PriorityList {
+  const TreeRoot &Tree;
+  HeightLess Comparator;
+  std::vector<NodeId> Container;
+  PriorityQueue<NodeId, std::vector<NodeId>, HeightLess> List;
+
+public:
+  PriorityList(const TreeRoot &Tree)
+      : Tree(Tree), Comparator(Tree), List(Comparator, Container) {}
+
+  void push(NodeId id) { List.push(id); }
+
+  std::vector<NodeId> pop() {
+    int Max = peekMax();
+    std::vector<NodeId> Result;
+    if (Max != 0) {
+      while (peekMax() == Max) {
+        Result.push_back(List.top());
+        List.pop();
+      }
+    }
+    // TODO this is here to get a stable output, not a good heuristic
+    std::sort(Result.begin(), Result.end());
+    return Result;
+  }
+
+  int peekMax() const {
+    if (List.empty()) {
+      return 0;
+    }
+    return Tree.getNode(List.top()).Height;
+  }
+
+  void open(NodeId Id) {
+    for (NodeId Child : Tree.getNode(Id).Children) {
+      push(Child);
+    }
+  }
+};
+
+/// Computes the differences between two ASTs
+class ClangDiff {
+private:
+  TreeRoot &T1;
+  TreeRoot &T2;
+
+  // Returns true if the two subtrees have the same structure
+  // and equal node kinds (does not compare labels).
+  bool isomorphic(NodeId Id1, NodeId Id2) const {
+    const Node &N1 = T1.getNode(Id1);
+    const Node &N2 = T2.getNode(Id2);
+    if (!N1.hasSameType(N2) || N1.Children.size() != N2.Children.size() ||
+        T1.label(Id1) != T2.label(Id2)) {
+      return false;
+    }
+    for (size_t Id = 0; Id < N1.Children.size(); ++Id) {
+      if (!isomorphic(N1.Children[Id], N2.Children[Id])) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // TODO This is too restrictive, we want to allow multiple mapping candidates
+  // for nodes and resolve the ambiguity later.
+  bool isMappingAllowed(const Mappings &M, NodeId Id1, NodeId Id2) const {
+    const Node &N1 = T1.getNode(Id1);
+    const Node &N2 = T2.getNode(Id2);
+    bool AnyMapped = M.hasSrc(Id1) || M.hasDst(Id2);
+    bool SameType = N1.hasSameType(N2);
+    NodeId P1 = N1.Parent;
+    NodeId P2 = N2.Parent;
+    bool ParentsSameType = (P1 == NoNodeId && P2 == NoNodeId) ||
+                           (P1 != NoNodeId && P2 != NoNodeId &&
+                            T1.getNode(P1).hasSameType(T2.getNode(P2)));
+    return !AnyMapped && SameType && ParentsSameType;
+  }
+
+  // Adds all corresponding subtrees of the two nodes to the mappings.
+  // The two nodes must be isomorphic.
+  void addIsomorphicSubTrees(Mappings &M, NodeId Id1, NodeId Id2) const {
+    M.link(Id1, Id2);
+    const Node &N1 = T1.getNode(Id1);
+    const Node &N2 = T2.getNode(Id2);
+    assert(isomorphic(Id1, Id2));
+    assert(N1.Children.size() == N2.Children.size());
+    for (size_t Id = 0; Id < N1.Children.size(); ++Id) {
+      addIsomorphicSubTrees(M, N1.Children[Id], N2.Children[Id]);
+    }
+  }
+
+  // Uses an optimal albeit slow algorithm to compute mappings for two
+  // subtrees, but only if both have fewer nodes than MaxSize.
+  void addOptimalMappings(Mappings &M, NodeId Id1, NodeId Id2) const {
+    if (std::max(T1.getNumberOfDescendants(Id1),
+                 T2.getNumberOfDescendants(Id2)) < MaxSize) {
+      ZsMatcher Matcher(T1, T2, Id1, Id2);
+      std::vector<std::pair<NodeId, NodeId>> R = Matcher.matchZs();
+      for (const auto Tuple : R) {
+        NodeId Src = Tuple.first;
+        NodeId Dst = Tuple.second;
+        if (isMappingAllowed(M, Src, Dst)) {
+          M.link(Src, Dst);
+        }
+      }
+    }
+  }
+
+  // Computes the ratio of common descendants between the two nodes.
+  // Descendants are only considered to be equal when they are mapped in M.
+  double dice(const Mappings &M, NodeId Id1, NodeId Id2) const {
+    if (Id1 == NoNodeId || Id2 == NoNodeId) {
+      return 0.0;
+    }
+    int CommonDescendants = 0;
+    const Node &N1 = T1.getNode(Id1);
+    for (NodeId Id = N1.LeftMostDescendant; Id < Id1; ++Id) {
+      CommonDescendants += int(M.hasSrc(Id));
+    }
+    return 2.0 * CommonDescendants /
+           (T1.getNumberOfDescendants(Id1) + T2.getNumberOfDescendants(Id2));
+  }
+
+  // Returns the node that has the highest degree of similarity.
+  NodeId findCandidate(const Mappings &M, NodeId Id1) const {
+    NodeId Candidate = NoNodeId;
+    double BestDiceValue = 0.0;
+    const Node &N1 = T1.getNode(Id1);
+    for (NodeId Id2 = 0; Id2 < T2.getSize(); ++Id2) {
+      const Node &N2 = T2.getNode(Id2);
+      if (!N1.hasSameType(N2)) {
+        continue;
+      }
+      if (M.hasDst(Id2)) {
+        continue;
+      }
+      double DiceValue = dice(M, Id1, Id2);
+      if (DiceValue > BestDiceValue) {
+        BestDiceValue = DiceValue;
+        Candidate = Id2;
+      }
+    }
+    return Candidate;
+  }
+
+  // Tries to match any yet unmapped nodes, in a bottom-up fashion.
+  void matchBottomUp(Mappings &M) const {
+    for (NodeId Id1 = 0; Id1 < T1.getSize(); ++Id1) {
+      if (Id1 == T1.root()) {
+        M.link(T1.root(), T2.root());
+        addOptimalMappings(M, T1.root(), T2.root());
+        break;
+      }
+      assert(Id1 != NoNodeId);
+      const Node &N1 = T1.getNode(Id1);
+      bool Matched = M.hasSrc(Id1);
+      bool MatchedChildren =
+          std::any_of(N1.Children.begin(), N1.Children.end(),
+                      [&](NodeId Child) { return M.hasSrc(Child); });
+      if (Matched || !MatchedChildren) {
+        continue;
+      }
+      NodeId Id2 = findCandidate(M, Id1);
+      if (Id2 == NoNodeId || dice(M, Id1, Id2) < MinDice ||
+          !isMappingAllowed(M, Id1, Id2)) {
+        continue;
+      }
+      M.link(Id1, Id2);
+      addOptimalMappings(M, Id1, Id2);
+    }
+  }
+
+  // Returns a set of mappings of isomorphic subtrees.
+  Mappings matchTopDown() const {
+    PriorityList L1(T1);
+    PriorityList L2(T2);
+
+    Mappings M(T1, T2);
+
+    L1.push(T1.root());
+    L2.push(T2.root());
+
+    int Max1, Max2;
+    while (std::min(Max1 = L1.peekMax(), Max2 = L2.peekMax()) > MinHeight) {
+      if (Max1 > Max2) {
+        for (NodeId Id : L1.pop()) {
+          L1.open(Id);
+        }
+        continue;
+      }
+      if (Max2 > Max1) {
+        for (NodeId Id : L2.pop()) {
+          L2.open(Id);
+        }
+        continue;
+      }
+      std::vector<NodeId> H1, H2;
+      H1 = L1.pop();
+      H2 = L2.pop();
+      for (NodeId Id1 : H1) {
+        for (NodeId Id2 : H2) {
+          if (isomorphic(Id1, Id2) && isMappingAllowed(M, Id1, Id2)) {
+            addIsomorphicSubTrees(M, Id1, Id2);
+          }
+        }
+      }
+      for (NodeId Id1 : H1) {
+        if (!M.hasSrc(Id1)) {
+          L1.open(Id1);
+        }
+      }
+      for (NodeId Id2 : H2) {
+        if (!M.hasDst(Id2)) {
+          L2.open(Id2);
+        }
+      }
+    }
+    return M;
+  }
+
+  // Finds an edit script that converts T1 to T2.
+  std::vector<Change> computeChanges(Mappings &M) {
+    std::vector<Change> Changes;
+    for (NodeId Id2 : T2.getSubtreeBfs(T2.root())) {
+      const Node &N2 = T2.getNode(Id2);
+      NodeId Id1 = M.getSrc(Id2);
+      if (Id1 != NoNodeId) {
+        assert(T1.getNode(Id1).hasSameType(N2));
+        if (T1.label(Id1) != T2.label(Id2)) {
+          Changes.emplace_back(Update, Id1, Id2, /*UNUSED Position=*/0);
+        }
+        continue;
+      }
+      NodeId P2 = N2.Parent;
+      NodeId P1 = M.getSrc(P2);
+      assert(P1 != NoNodeId);
+      Node &Parent1 = T1.getMutableNode(P1);
+      const Node &Parent2 = T2.getNode(P2);
+      auto &Siblings1 = Parent1.Children;
+      const auto &Siblings2 = Parent2.Children;
+      size_t Position;
+      for (Position = 0; Position < Siblings2.size(); ++Position) {
+        if (Siblings2[Position] == Id2 || Position >= Siblings1.size()) {
+          break;
+        }
+      }
+      Changes.emplace_back(Insert, Id2, P2, Position);
+      Node PatchNode;
+      PatchNode.Parent = P1;
+      PatchNode.LeftMostDescendant = N2.LeftMostDescendant;
+      PatchNode.Depth = N2.Depth;
+      PatchNode.ASTNode = N2.ASTNode;
+      // TODO update Depth if needed
+      NodeId PatchNodeId = T1.getSize();
+      assert(Position <= Siblings1.size());
+      // TODO maybe choose a different data structure for Children.
+      Siblings1.insert(Siblings1.begin() + Position, PatchNodeId);
+      T1.addNode(PatchNode);
+      M.link(PatchNodeId, Id2);
+    }
+    for (NodeId Id1 = 0; Id1 < T1.getSize(); ++Id1) {
+      NodeId Id2 = M.getDst(Id1);
+      if (Id2 == NoNodeId) {
+        Changes.emplace_back(Delete, Id1, Id2, /*UNUSED Position=*/0);
+      }
+    }
+    return Changes;
+  }
+
+  // Prints an edit action.
+  void dumpChange(const Change &Chg) const {
+    ChangeKind Kind;
+    NodeId Id1, Id2;
+    size_t Position;
+    std::tie(Kind, Id1, Id2, Position) = Chg;
+    std::string S;
+    switch (Kind) {
+    case Delete:
+      S = formatv("Delete {0}", T1.showNode(Id1));
+      break;
+    case Update:
+      S = formatv("Update {0} to {1}", T1.showNode(Id1), T2.label(Id2));
+      break;
+    case Insert:
+      S = formatv("Insert {0} into {1} at {2}", T2.showNode(Id1),
+                  T2.showNode(Id2), Position);
+      break;
+    case Move:
+      llvm_unreachable("TODO");
+      break;
+    };
+    outs() << S << "\n";
+  }
+
+  Mappings match() const {
+    Mappings M = matchTopDown();
+    matchBottomUp(M);
+    return M;
+  }
+
+public:
+  int MinHeight = 2;
+  double MinDice = 0.2;
+  int MaxSize = 100;
+
+  ClangDiff(TreeRoot &T1, TreeRoot &T2) : T1(T1), T2(T2) {}
+
+  void printDiff() {
+    Mappings M = match();
+    M.dumpMapping();
+    auto Changes = computeChanges(M);
+    for (const auto &C : Changes) {
+      dumpChange(C);
+    }
+  }
+};
+
+void runDiff(ASTUnit &AST1, ASTUnit &AST2) {
+  TreeRoot T1(AST1);
+  TreeRoot T2(AST2);
+  ClangDiff DiffTool(T1, T2);
+  DiffTool.printDiff();
+}
+
+} // namespace diff
+} // namespace clang
Index: include/clang/Tooling/ASTDiff/ASTDiff.h
===================================================================
--- /dev/null
+++ include/clang/Tooling/ASTDiff/ASTDiff.h
@@ -0,0 +1,121 @@
+//===- ASTDiff.h - AST differencing API -----------------------*- C++ -*- -===//
+//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLING_ASTDIFF_ASTDIFF_H
+#define LLVM_CLANG_TOOLING_ASTDIFF_ASTDIFF_H
+
+#include "clang/AST/ASTTypeTraits.h"
+#include "clang/Frontend/ASTUnit.h"
+
+using namespace llvm;
+using namespace clang;
+
+namespace clang {
+namespace diff {
+
+/// Within a tree, this identifies a node by its preorder offset.
+using NodeId = int;
+
+/// Sentinel value for invalid nodes.
+const NodeId NoNodeId = -1;
+
+/// Represents a Clang AST node, alongside some additional information.
+struct Node {
+  NodeId Parent, LeftMostDescendant;
+  int Depth, Height;
+  ast_type_traits::DynTypedNode ASTNode;
+  // Maybe there is a better way to store children than this.
+  SmallVector<NodeId, 4> Children;
+
+  ast_type_traits::ASTNodeKind getType() const { return ASTNode.getNodeKind(); }
+  bool hasSameType(const Node &Other) const {
+    return getType().isSame(Other.getType()) ||
+           (getType().isNone() && Other.getType().isNone());
+  }
+  const StringRef getTypeLabel() const { return getType().asStringRef(); }
+  bool isLeaf() const { return Children.empty(); }
+};
+
+/// Represents the AST of a TranslationUnit.
+class TreeRoot {
+private:
+  /// Nodes in preorder.
+  std::vector<Node> Nodes;
+  void setRightMostDescendant();
+  void setLeftMostDescendant();
+  void setSize(size_t Size) { Nodes.resize(Size); }
+
+public:
+  ASTUnit &AST;
+  std::vector<NodeId> Leaves;
+  std::vector<int> PostorderIds;
+
+  TreeRoot(ASTUnit &AST);
+
+  int getSize() const { return Nodes.size(); }
+  NodeId root() const { return 0; }
+
+  const Node &getNode(NodeId Id) const { return Nodes[Id]; }
+  Node &getMutableNode(NodeId Id) { return Nodes[Id]; }
+  bool isValidNodeId(NodeId Id) const { return Id >= 0 && Id < getSize(); }
+  void addNode(Node &N) { Nodes.push_back(N); }
+
+  int getSubtreePostorder(std::vector<NodeId> &Ids, NodeId Root) const;
+  std::vector<NodeId> getSubtreeBfs(NodeId Id) const;
+  int getNumberOfDescendants(NodeId Id) const;
+
+  /// Returns the value of the node.
+  std::string label(NodeId Id) const;
+  void dumpTree(NodeId Id = NoNodeId) const;
+  /// Return the node as <kind>[: <label>](<postorder-id)
+  std::string showNode(NodeId Id) const;
+  void dumpNodeAsJson(NodeId Id) const;
+  void dumpAsJson() const;
+};
+
+/// Maps nodes of the left tree to ones on the right, and vice versa.
+// Supports fast insertion and lookup.
+// TODO allow multiple mappings
+class Mappings {
+  NodeId *SrcToDst, *DstToSrc;
+  const TreeRoot &T1, &T2;
+
+public:
+  Mappings(const TreeRoot &T1, const TreeRoot &T2);
+  ~Mappings();
+
+  void link(NodeId Src, NodeId Dst);
+
+  NodeId getDst(NodeId Src) const { return SrcToDst[Src]; }
+  NodeId getSrc(NodeId Dst) const { return DstToSrc[Dst]; }
+  bool hasSrc(NodeId Src) const { return SrcToDst[Src] != NoNodeId; }
+  bool hasDst(NodeId Dst) const { return DstToSrc[Dst] != NoNodeId; }
+
+  void dumpMapping() const;
+};
+
+/// (Update, src, dst, _  ): update the value of node src to match dst.
+/// (Insert, src, dst, pos): insert src as child of dst at offset pos.
+/// (Delete, src, _,   _  ): delete node src.
+/// (Move,   src, dst, pos): move src to a child of dst at offset pos.
+enum ChangeKind { Delete, Update, Insert, Move };
+using Change = std::tuple<ChangeKind, NodeId, NodeId, size_t>;
+
+void runDiff(ASTUnit &AST1, ASTUnit &AST2);
+
+} // namespace diff
+} // namespace clang
+
+#endif
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to