[PATCH] D40731: Integrate CHash into CLang

2018-02-21 Thread Christian Dietrich via Phabricator via cfe-commits
stettberger updated this revision to Diff 135227.
stettberger added a comment.

Rebased to HEAD, Run (external) clang-hash testsuite and ASTTest


Repository:
  rC Clang

https://reviews.llvm.org/D40731

Files:
  include/clang/AST/AttrDataCollectors.td
  include/clang/AST/CHashVisitor.h
  include/clang/AST/CMakeLists.txt
  include/clang/AST/DataCollection.h
  include/clang/AST/DeclDataCollectors.td
  include/clang/AST/StmtDataCollectors.td
  include/clang/AST/TypeDataCollectors.td
  unittests/AST/CHashTest.cpp
  unittests/AST/CMakeLists.txt

Index: unittests/AST/CMakeLists.txt
===
--- unittests/AST/CMakeLists.txt
+++ unittests/AST/CMakeLists.txt
@@ -17,6 +17,7 @@
   NamedDeclPrinterTest.cpp
   SourceLocationTest.cpp
   StmtPrinterTest.cpp
+  CHashTest.cpp
   )
 
 target_link_libraries(ASTTests
Index: unittests/AST/CHashTest.cpp
===
--- /dev/null
+++ unittests/AST/CHashTest.cpp
@@ -0,0 +1,91 @@
+//===- unittests/AST/DataCollectionTest.cpp ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===--===//
+//
+// This file contains tests for the DataCollection module.
+//
+// They work by hashing the collected data of two nodes and asserting that the
+// hash values are equal iff the nodes are considered equal.
+//
+//===--===//
+
+#include "clang/AST/CHashVisitor.h"
+#include "clang/Tooling/Tooling.h"
+#include "gtest/gtest.h"
+#include 
+
+using namespace clang;
+using namespace tooling;
+
+
+class CHashConsumer : public ASTConsumer {
+CompilerInstance &CI;
+llvm::MD5::MD5Result *ASTHash;
+
+public:
+
+CHashConsumer(CompilerInstance &CI, llvm::MD5::MD5Result *ASTHash)
+: CI(CI), ASTHash(ASTHash){}
+
+virtual void HandleTranslationUnit(clang::ASTContext &Context) override {
+TranslationUnitDecl *TU = Context.getTranslationUnitDecl();
+
+// Traversing the translation unit decl via a RecursiveASTVisitor
+// will visit all nodes in the AST.
+CHashVisitor<> Visitor(Context);
+Visitor.TraverseDecl(TU);
+// Copy Away the resulting hash
+*ASTHash = *Visitor.getHash(TU);
+
+}
+
+~CHashConsumer() override {}
+};
+
+struct CHashAction : public ASTFrontendAction {
+llvm::MD5::MD5Result *Hash;
+
+CHashAction(llvm::MD5::MD5Result *Hash) : Hash(Hash) {}
+
+std::unique_ptr CreateASTConsumer(CompilerInstance &CI,
+   StringRef) override {
+return std::unique_ptr(new CHashConsumer(CI, Hash));
+}
+};
+
+static testing::AssertionResult
+isASTHashEqual(StringRef Code1, StringRef Code2) {
+llvm::MD5::MD5Result Hash1, Hash2;
+if (!runToolOnCode(new CHashAction(&Hash1), Code1)) {
+return testing::AssertionFailure()
+<< "Parsing error in (A)\"" << Code1.str() << "\"";
+}
+if (!runToolOnCode(new CHashAction(&Hash2), Code2)) {
+return testing::AssertionFailure()
+<< "Parsing error in (B) \"" << Code2.str() << "\"";
+}
+return testing::AssertionResult(Hash1 == Hash2);
+}
+
+TEST(CHashVisitor, TestRecordTypes) {
+ASSERT_TRUE(isASTHashEqual( // Unused struct
+ "struct foobar { int a0; char a1; unsigned long a2; };",
+ "struct foobar { int a0; char a1;};"
+ ));
+
+}
+
+TEST(CHashVisitor, TestSourceStructure) {
+ASSERT_FALSE(isASTHashEqual(
+ "void foo() { int c; if (0) { c = 1; } }",
+ "void foo() { int c; if (0) { } c = 1; }"));
+
+ASSERT_FALSE(isASTHashEqual(
+ "void f1() {} void f2() {   }",
+ "void f1() {} void f2() { f1(); }"));
+}
Index: include/clang/AST/TypeDataCollectors.td
===
--- include/clang/AST/TypeDataCollectors.td
+++ include/clang/AST/TypeDataCollectors.td
@@ -0,0 +1,78 @@
+//--- Types ---//
+
+class Type {
+  code Code = [{
+ addData(llvm::hash_value(S->getTypeClass()));
+  }];
+}
+
+class BuiltinType {
+   code Code = [{
+  addData(S->getKind());
+   }];
+}
+
+class ArrayType  {
+   code Code = [{
+  addData(S->getSizeModifier());
+  addData(S->getIndexTypeCVRQualifiers());
+   }];
+}
+
+class ConstantArrayType {
+   code Code = [{
+  addData(S->getSize().getZExtValue());
+   }];
+}
+
+class VectorType {
+   code Code = [{
+  addData(S->getNumElements());
+  addData(S->getVectorKind());
+   }];
+}
+
+class FunctionType {
+   code Code = [{
+  addData(S->getRegParmType

[PATCH] D40731: Integrate CHash into CLang

2018-02-23 Thread Christian Dietrich via Phabricator via cfe-commits
stettberger updated this revision to Diff 135598.
stettberger added a comment.

[CHash] Stable TU-level AST Hashing for local and global hashes

This patch integrates the AST hashing parts of the CHash project[1][2] into
Clang. The extension provided by this patch accomplishes two things:

- local hashes: It extends the StmtDataCollector infrastructure by hash rules for 
declarations, types, and attributes. These rules are expressed in terms of 
TableGen files and capture what parts of an AST node are important to its 
semantics. Thereby, the rules only include information that is local to the 
respective node (no pointers to other AST nodes should be followed).

  Since these DataCollectors do not impose rules how the AST nodes relate to 
each other, they provide a mechanism for "local" hashes.

- global hashes: The CHashVisitor combines the local hashes via the 
RecursiveAST Visitor into a global semantic hash that includes (for a given AST 
node) all local hashes of all AST nodes that can influence the compilation of 
this AST node.

  With these global hashes, we can deduce (early) in the compilation process 
whether this compiler run will produce the same object file. This 
mechanism is *NOT* part of this commit.

[1] https://www.sra.uni-hannover.de/Research/cHash/
[2] "cHash: Detection of Redundant Recompilations via AST hashing", USENIX 2017


Repository:
  rC Clang

https://reviews.llvm.org/D40731

Files:
  include/clang/AST/AttrDataCollectors.td
  include/clang/AST/CHashVisitor.h
  include/clang/AST/CMakeLists.txt
  include/clang/AST/DataCollection.h
  include/clang/AST/DeclDataCollectors.td
  include/clang/AST/StmtDataCollectors.td
  include/clang/AST/TypeDataCollectors.td
  unittests/AST/CHashTest.cpp
  unittests/AST/CMakeLists.txt

Index: unittests/AST/CMakeLists.txt
===
--- unittests/AST/CMakeLists.txt
+++ unittests/AST/CMakeLists.txt
@@ -17,6 +17,7 @@
   NamedDeclPrinterTest.cpp
   SourceLocationTest.cpp
   StmtPrinterTest.cpp
+  CHashTest.cpp
   )
 
 target_link_libraries(ASTTests
Index: unittests/AST/CHashTest.cpp
===
--- /dev/null
+++ unittests/AST/CHashTest.cpp
@@ -0,0 +1,91 @@
+//===- unittests/AST/DataCollectionTest.cpp ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===--===//
+//
+// This file contains tests for the DataCollection module.
+//
+// They work by hashing the collected data of two nodes and asserting that the
+// hash values are equal iff the nodes are considered equal.
+//
+//===--===//
+
+#include "clang/AST/CHashVisitor.h"
+#include "clang/Tooling/Tooling.h"
+#include "gtest/gtest.h"
+#include 
+
+using namespace clang;
+using namespace tooling;
+
+
+class CHashConsumer : public ASTConsumer {
+CompilerInstance &CI;
+llvm::MD5::MD5Result *ASTHash;
+
+public:
+
+CHashConsumer(CompilerInstance &CI, llvm::MD5::MD5Result *ASTHash)
+: CI(CI), ASTHash(ASTHash){}
+
+virtual void HandleTranslationUnit(clang::ASTContext &Context) override {
+TranslationUnitDecl *TU = Context.getTranslationUnitDecl();
+
+// Traversing the translation unit decl via a RecursiveASTVisitor
+// will visit all nodes in the AST.
+CHashVisitor<> Visitor(Context);
+Visitor.TraverseDecl(TU);
+// Copy Away the resulting hash
+*ASTHash = *Visitor.getHash(TU);
+
+}
+
+~CHashConsumer() override {}
+};
+
+struct CHashAction : public ASTFrontendAction {
+llvm::MD5::MD5Result *Hash;
+
+CHashAction(llvm::MD5::MD5Result *Hash) : Hash(Hash) {}
+
+std::unique_ptr CreateASTConsumer(CompilerInstance &CI,
+   StringRef) override {
+return std::unique_ptr(new CHashConsumer(CI, Hash));
+}
+};
+
+static testing::AssertionResult
+isASTHashEqual(StringRef Code1, StringRef Code2) {
+llvm::MD5::MD5Result Hash1, Hash2;
+if (!runToolOnCode(new CHashAction(&Hash1), Code1)) {
+return testing::AssertionFailure()
+<< "Parsing error in (A)\"" << Code1.str() << "\"";
+}
+if (!runToolOnCode(new CHashAction(&Hash2), Code2)) {
+return testing::AssertionFailure()
+<< "Parsing error in (B) \"" << Code2.str() << "\"";
+}
+return testing::AssertionResult(Hash1 == Hash2);
+}
+
+TEST(CHashVisitor, TestRecordTypes) {
+ASSERT_TRUE(isASTHashEqual( // Unused struct
+ "struct foobar { int a0; char a1; unsigned long a2; };",
+ "struct foobar { int a0; char a1;};"
+ ));
+
+}
+
+TEST(CHashVisitor, TestSourceStructure) {
+ASS

[PATCH] D40731: Integrate CHash into CLang

2017-12-14 Thread Christian Dietrich via Phabricator via cfe-commits
stettberger updated this revision to Diff 126925.
stettberger added a comment.

@rtrieu: Fixed the checking of Decl::hasAttrs() before using Decl::attrs()


Repository:
  rC Clang

https://reviews.llvm.org/D40731

Files:
  include/clang/AST/AttrDataCollectors.td
  include/clang/AST/CHashVisitor.h
  include/clang/AST/CMakeLists.txt
  include/clang/AST/DataCollection.h
  include/clang/AST/DeclDataCollectors.td
  include/clang/AST/StmtDataCollectors.td
  include/clang/AST/TypeDataCollectors.td
  unittests/AST/CHashTest.cpp
  unittests/AST/CMakeLists.txt

Index: unittests/AST/CMakeLists.txt
===
--- unittests/AST/CMakeLists.txt
+++ unittests/AST/CMakeLists.txt
@@ -18,6 +18,7 @@
   PostOrderASTVisitor.cpp
   SourceLocationTest.cpp
   StmtPrinterTest.cpp
+  CHashTest.cpp
   )
 
 target_link_libraries(ASTTests
Index: unittests/AST/CHashTest.cpp
===
--- /dev/null
+++ unittests/AST/CHashTest.cpp
@@ -0,0 +1,91 @@
+//===- unittests/AST/DataCollectionTest.cpp ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===--===//
+//
+// This file contains tests for the DataCollection module.
+//
+// They work by hashing the collected data of two nodes and asserting that the
+// hash values are equal iff the nodes are considered equal.
+//
+//===--===//
+
+#include "clang/AST/CHashVisitor.h"
+#include "clang/Tooling/Tooling.h"
+#include "gtest/gtest.h"
+#include 
+
+using namespace clang;
+using namespace tooling;
+
+
+class CHashConsumer : public ASTConsumer {
+CompilerInstance &CI;
+llvm::MD5::MD5Result *ASTHash;
+
+public:
+
+CHashConsumer(CompilerInstance &CI, llvm::MD5::MD5Result *ASTHash)
+: CI(CI), ASTHash(ASTHash){}
+
+virtual void HandleTranslationUnit(clang::ASTContext &Context) override {
+TranslationUnitDecl *TU = Context.getTranslationUnitDecl();
+
+// Traversing the translation unit decl via a RecursiveASTVisitor
+// will visit all nodes in the AST.
+CHashVisitor<> Visitor(Context);
+Visitor.TraverseDecl(TU);
+// Copy Away the resulting hash
+*ASTHash = *Visitor.getHash(TU);
+
+}
+
+~CHashConsumer() override {}
+};
+
+struct CHashAction : public ASTFrontendAction {
+llvm::MD5::MD5Result *Hash;
+
+CHashAction(llvm::MD5::MD5Result *Hash) : Hash(Hash) {}
+
+std::unique_ptr CreateASTConsumer(CompilerInstance &CI,
+   StringRef) override {
+return std::unique_ptr(new CHashConsumer(CI, Hash));
+}
+};
+
+static testing::AssertionResult
+isASTHashEqual(StringRef Code1, StringRef Code2) {
+llvm::MD5::MD5Result Hash1, Hash2;
+if (!runToolOnCode(new CHashAction(&Hash1), Code1)) {
+return testing::AssertionFailure()
+<< "Parsing error in (A)\"" << Code1.str() << "\"";
+}
+if (!runToolOnCode(new CHashAction(&Hash2), Code2)) {
+return testing::AssertionFailure()
+<< "Parsing error in (B) \"" << Code2.str() << "\"";
+}
+return testing::AssertionResult(Hash1 == Hash2);
+}
+
+TEST(CHashVisitor, TestRecordTypes) {
+ASSERT_TRUE(isASTHashEqual( // Unused struct
+ "struct foobar { int a0; char a1; unsigned long a2; };",
+ "struct foobar { int a0; char a1;};"
+ ));
+
+}
+
+TEST(CHashVisitor, TestSourceStructure) {
+ASSERT_FALSE(isASTHashEqual(
+ "void foo() { int c; if (0) { c = 1; } }",
+ "void foo() { int c; if (0) { } c = 1; }"));
+
+ASSERT_FALSE(isASTHashEqual(
+ "void f1() {} void f2() {   }",
+ "void f1() {} void f2() { f1(); }"));
+}
Index: include/clang/AST/TypeDataCollectors.td
===
--- include/clang/AST/TypeDataCollectors.td
+++ include/clang/AST/TypeDataCollectors.td
@@ -0,0 +1,78 @@
+//--- Types ---//
+
+class Type {
+  code Code = [{
+ addData(llvm::hash_value(S->getTypeClass()));
+  }];
+}
+
+class BuiltinType {
+   code Code = [{
+  addData(S->getKind());
+   }];
+}
+
+class ArrayType  {
+   code Code = [{
+  addData(S->getSizeModifier());
+  addData(S->getIndexTypeCVRQualifiers());
+   }];
+}
+
+class ConstantArrayType {
+   code Code = [{
+  addData(S->getSize().getZExtValue());
+   }];
+}
+
+class VectorType {
+   code Code = [{
+  addData(S->getNumElements());
+  addData(S->getVectorKind());
+   }];
+}
+
+class FunctionType {
+   code Code = [{
+  addData(S->getRe

[PATCH] D40731: Integrate CHash into CLang

2018-03-12 Thread Christian Dietrich via Phabricator via cfe-commits
stettberger updated this revision to Diff 138019.
stettberger marked an inline comment as done.
stettberger added a comment.

Addressed comments, for more details, please see the mail on cfe-devel


Repository:
  rC Clang

https://reviews.llvm.org/D40731

Files:
  include/clang/AST/AttrDataCollectors.td
  include/clang/AST/CHashVisitor.h
  include/clang/AST/CMakeLists.txt
  include/clang/AST/DataCollection.h
  include/clang/AST/DeclDataCollectors.td
  include/clang/AST/StmtDataCollectors.td
  include/clang/AST/TypeDataCollectors.td
  unittests/AST/CHashTest.cpp
  unittests/AST/CMakeLists.txt

Index: unittests/AST/CMakeLists.txt
===
--- unittests/AST/CMakeLists.txt
+++ unittests/AST/CMakeLists.txt
@@ -17,6 +17,7 @@
   NamedDeclPrinterTest.cpp
   SourceLocationTest.cpp
   StmtPrinterTest.cpp
+  CHashTest.cpp
   )
 
 target_link_libraries(ASTTests
Index: unittests/AST/CHashTest.cpp
===
--- /dev/null
+++ unittests/AST/CHashTest.cpp
@@ -0,0 +1,116 @@
+//===- unittests/AST/DataCollectionTest.cpp ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===--===//
+//
+// This file contains tests for the DataCollection module.
+//
+// They work by hashing the collected data of two nodes and asserting that the
+// hash values are equal iff the nodes are considered equal.
+//
+//===--===//
+
+#include "clang/AST/CHashVisitor.h"
+#include "clang/Tooling/Tooling.h"
+#include "gtest/gtest.h"
+#include 
+
+using namespace clang;
+using namespace tooling;
+
+
+class CHashConsumer : public ASTConsumer {
+CompilerInstance &CI;
+llvm::MD5::MD5Result *ASTHash;
+
+public:
+
+CHashConsumer(CompilerInstance &CI, llvm::MD5::MD5Result *ASTHash)
+: CI(CI), ASTHash(ASTHash){}
+
+virtual void HandleTranslationUnit(clang::ASTContext &Context) override {
+TranslationUnitDecl *TU = Context.getTranslationUnitDecl();
+
+// Traversing the translation unit decl via a RecursiveASTVisitor
+// will visit all nodes in the AST.
+CHashVisitor<> Visitor(Context);
+Visitor.TraverseDecl(TU);
+// Copy Away the resulting hash
+*ASTHash = *Visitor.getHash(TU);
+
+}
+
+~CHashConsumer() override {}
+};
+
+struct CHashAction : public ASTFrontendAction {
+llvm::MD5::MD5Result *Hash;
+
+CHashAction(llvm::MD5::MD5Result *Hash) : Hash(Hash) {}
+
+std::unique_ptr CreateASTConsumer(CompilerInstance &CI,
+   StringRef) override {
+return std::unique_ptr(new CHashConsumer(CI, Hash));
+}
+};
+
+static testing::AssertionResult
+isASTHashEqual(StringRef Code1, StringRef Code2) {
+llvm::MD5::MD5Result Hash1, Hash2;
+if (!runToolOnCode(new CHashAction(&Hash1), Code1)) {
+return testing::AssertionFailure()
+<< "Parsing error in (A)\"" << Code1.str() << "\"";
+}
+if (!runToolOnCode(new CHashAction(&Hash2), Code2)) {
+return testing::AssertionFailure()
+<< "Parsing error in (B) \"" << Code2.str() << "\"";
+}
+return testing::AssertionResult(Hash1 == Hash2);
+}
+
+TEST(CHashVisitor, TestRecordTypes) {
+ASSERT_TRUE(isASTHashEqual( // Unused record definition
+ "struct foobar { int a0; char a1; unsigned long a2; };",
+ "struct foobar { int a0; char a1;};"
+ ));
+
+// Recursive Record Types
+ASSERT_FALSE(isASTHashEqual( // Names still matter
+"struct ll { struct ll* foo; }; struct ll a;",
+"struct ll { struct ll* bar; }; struct ll a;"
+));
+
+ASSERT_TRUE(isASTHashEqual( // Order does not matter
+ "struct b; struct a { struct b* o; }; struct b { struct a* o; }; struct a X;",
+ "struct a; struct b { struct a* o; }; struct a { struct b* o; }; struct a X;"
+ ));
+
+// The initialization of a struct with a reference to itself should not crash
+ASSERT_FALSE(isASTHashEqual(
+"struct foo { int N; }; struct foo a = { sizeof(a) };",
+"struct foo { int N; }; struct foo a = { sizeof(struct foo) };"
+));
+}
+
+TEST(CHashVisitor, TypedefTest) {
+ASSERT_FALSE(isASTHashEqual(
+"typedef int bar_t;  bar_t x;",
+"typedef char bar_t; bar_t x;"
+));
+}
+
+
+TEST(CHashVisitor, TestSourceStructure) {
+ASSERT_FALSE(isASTHashEqual(
+ "void foo() { int c; if (0) { c = 1; } }",
+ "void foo() 

[PATCH] D40731: Integrate CHash into CLang

2018-03-12 Thread Christian Dietrich via Phabricator via cfe-commits
stettberger added inline comments.



Comment at: include/clang/AST/CHashVisitor.h:72-79
+  template
+  void addData(const llvm::iterator_range &x) {
+addData(std::distance(x.begin(), x.end()));
+  }
+  template
+  void addData(const llvm::ArrayRef &x) {
+addData(x.size());

rtrieu wrote:
> Should these also be storing the hashes of the elements in addition to 
> storing the size?
Ok. So this is called from 

- FunctionProtoTypes { addData(S->exceptions()); }
- DecompositionDecl { addData(S->bindings()); }
- ObjCObjectType  { addData(S->getTypeArgsAsWritten()); }

All of these occurrences are visited by the RecursiveASTVisitor. However, we 
have no idea how many children are visited by these for loops. Therefore, we 
additionally call addData(ArrayRef<...> x) to give the actual user of 
StmtCollector* the possibility to add these lengths to the hash. This is also 
the reason why we add only the length (std::distance) to the hash. I will add a 
comment to this.



Comment at: include/clang/AST/CHashVisitor.h:211
+
+  bool VisitTypeDecl(TypeDecl *D) {
+// If we would hash the resulting type for a typedef, we

rtrieu wrote:
> I thought the idea was to have all the code in the *Collectors.td but here 
> you have a bunch of Visit* functions for AST nodes.  This seems counter to 
> your point of having them generated.
See Mail.



Comment at: include/clang/AST/CHashVisitor.h:212-213
+  bool VisitTypeDecl(TypeDecl *D) {
+// If we would hash the resulting type for a typedef, we
+// would get into an endless recursion.
+if (!isa(D) && !isa(D) && !isa(D)) {

rtrieu wrote:
> I don't see this example in the test.
It is now reflected in the tests.



Comment at: include/clang/AST/CHashVisitor.h:226
+}
+if (isa(ValDecl)) {
+  /* We emulate TraverseDecl here for VarDecl, because we

rtrieu wrote:
> Can't you query the hash for the Decl instead of doing work here?
How would I do that? As the example shows, there is no hash for a yet when we 
try to calculate the hash for a. However, I included a unit test to cover this.



Comment at: include/clang/AST/CHashVisitor.h:253
+  bool VisitValueDecl(ValueDecl *D) {
+/* Field Declarations can induce recursions */
+if (isa(D)) {

rtrieu wrote:
> Is this possible recursion included in your test?
It  now is.



Comment at: include/clang/AST/CHashVisitor.h:255
+if (isa(D)) {
+  addData(std::string(D->getType().getAsString()));
+} else {

rtrieu wrote:
> Shouldn't this be handled in the VisitFieldDecl function?
I must avoid calling addData(D->getType()) in VisitValueDecl for 
FieldDeclarations. So there would also be a conditional.



Comment at: include/clang/AST/DeclDataCollectors.td:5-7
+  // Every Declaration gets a tag field in the hash stream. It is
+  // hashed to add additional randomness to the hash
+  addData(llvm::hash_value(S->getKind()));

rtrieu wrote:
> Why is this randomness needed?
It is there for my personal feeling of fewer hash collisions. In former versions 
of this patch there was a large random constant for every type of AST node to 
decrease the likelihood of collisions when we left out some information. Small 
numbers (like enum values) often occur in source code. A 32-bit pseudo-random 
value is unlikely to be seen in real source code. However, if you dislike it, 
I can remove the llvm::hash_value here.



Comment at: include/clang/AST/DeclDataCollectors.td:9
+
+  // CrossRef
+  addData(S->hasAttrs());

rtrieu wrote:
> What do you mean by "CrossRef"?
CrossRef indicates that the following fields are not local to the node. 
However, we give them to the user of the DataCollector to give it a chance to 
include the number of children into its hash. This directly relates to the 
addData(ArrayRef<...>) question above.


Repository:
  rC Clang

https://reviews.llvm.org/D40731



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D40731: Integrate CHash into CLang

2017-12-01 Thread Christian Dietrich via Phabricator via cfe-commits
stettberger created this revision.
Herald added subscribers: cfe-commits, aprantl, mgorny.

The CHashVisitor can be used to determine a unique hash for a translation unit. 
The hash is stable across compiler invocations, and if two translation units 
have the same hash, the resulting object files are semantically equal (code, 
data, etc. are equal; debug information may be different).


Repository:
  rC Clang

https://reviews.llvm.org/D40731

Files:
  include/clang/AST/AttrDataCollectors.td
  include/clang/AST/CHashVisitor.h
  include/clang/AST/CMakeLists.txt
  include/clang/AST/DeclDataCollectors.td
  include/clang/AST/StmtDataCollectors.td
  include/clang/AST/TypeDataCollectors.td
  unittests/AST/CHashTest.cpp
  unittests/AST/CMakeLists.txt

Index: unittests/AST/CMakeLists.txt
===
--- unittests/AST/CMakeLists.txt
+++ unittests/AST/CMakeLists.txt
@@ -18,6 +18,7 @@
   PostOrderASTVisitor.cpp
   SourceLocationTest.cpp
   StmtPrinterTest.cpp
+  CHashTest.cpp
   )
 
 target_link_libraries(ASTTests
Index: unittests/AST/CHashTest.cpp
===
--- /dev/null
+++ unittests/AST/CHashTest.cpp
@@ -0,0 +1,91 @@
+//===- unittests/AST/DataCollectionTest.cpp ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===--===//
+//
+// This file contains tests for the DataCollection module.
+//
+// They work by hashing the collected data of two nodes and asserting that the
+// hash values are equal iff the nodes are considered equal.
+//
+//===--===//
+
+#include "clang/AST/CHashVisitor.h"
+#include "clang/Tooling/Tooling.h"
+#include "gtest/gtest.h"
+#include 
+
+using namespace clang;
+using namespace tooling;
+
+
+class CHashConsumer : public ASTConsumer {
+CompilerInstance &CI;
+llvm::MD5::MD5Result *ASTHash;
+
+public:
+CHashConsumer(CompilerInstance &CI, llvm::MD5::MD5Result *ASTHash)
+: CI(CI), ASTHash(ASTHash){}
+
+virtual void HandleTranslationUnit(clang::ASTContext &Context) override {
+TranslationUnitDecl *TU = Context.getTranslationUnitDecl();
+
+// Traversing the translation unit decl via a RecursiveASTVisitor
+// will visit all nodes in the AST.
+CHashVisitor<> Visitor(Context);
+Visitor.TraverseDecl(TU);
+// Copy Away the resulting hash
+*ASTHash = *Visitor.getHash(TU);
+
+}
+~CHashConsumer() override {}
+
+};
+
+class CHashAction : public ASTFrontendAction {
+public: // We are all friends
+llvm::MD5::MD5Result *Hash;
+
+CHashAction(llvm::MD5::MD5Result *Hash) : Hash(Hash) {}
+
+std::unique_ptr CreateASTConsumer(CompilerInstance &CI,
+   StringRef) override {
+return std::unique_ptr(new CHashConsumer(CI, Hash));
+}
+};
+
+static testing::AssertionResult
+isASTHashEqual(StringRef Code1, StringRef Code2) {
+llvm::MD5::MD5Result Hash1, Hash2;
+if (!runToolOnCode(new CHashAction(&Hash1), Code1)) {
+return testing::AssertionFailure()
+<< "Parsing error in (A)\"" << Code1.str() << "\"";
+}
+if (!runToolOnCode(new CHashAction(&Hash2), Code2)) {
+return testing::AssertionFailure()
+<< "Parsing error in (B) \"" << Code2.str() << "\"";
+}
+return testing::AssertionResult(Hash1 == Hash2);
+}
+
+TEST(CHashVisitor, TestRecordTypes) {
+ASSERT_TRUE(isASTHashEqual( // Unused struct
+ "struct foobar { int a0; char a1; unsigned long a2; };",
+ "struct foobar { int a0; char a1;};"
+ ));
+
+}
+
+TEST(CHashVisitor, TestSourceStructure) {
+ASSERT_FALSE(isASTHashEqual(
+ "void foo() { int c; if (0) { c = 1; } }",
+ "void foo() { int c; if (0) { } c = 1; }"));
+
+ASSERT_FALSE(isASTHashEqual(
+ "void f1() {} void f2() {   }",
+ "void f1() {} void f2() { f1(); }"));
+}
Index: include/clang/AST/TypeDataCollectors.td
===
--- include/clang/AST/TypeDataCollectors.td
+++ include/clang/AST/TypeDataCollectors.td
@@ -0,0 +1,66 @@
+//--- Types ---//
+class Type {
+  code Code = [{
+ addData(S->getTypeClass());
+  }];
+}
+
+class BuiltinType {
+   code Code = [{
+  addData(S->getKind());
+   }];
+}
+
+class ArrayType  {
+   code Code = [{
+  addData(S->getSizeModifier());
+  addData(S->getIndexTypeCVRQualifiers());
+   }];
+}
+
+class ConstantArrayType {
+   code Code = [{
+  addData(S->getSize().getZExtValue());
+

[PATCH] D40731: Integrate CHash into CLang

2017-12-04 Thread Christian Dietrich via Phabricator via cfe-commits
stettberger updated this revision to Diff 125309.
stettberger marked 9 inline comments as done.
stettberger added a comment.

@Eugene.Zelenko Thank you for pointing out these issues. I ran clang-tidy 
and clang-format on CHashVisitor.h.

@rsmith You're right, there is already more than one implementation of 
{partial,unstable} AST hashes within LLVM, as we already discussed on cfe-dev 
in August[1]. Therefore, I rewrote our original CHash implementation to extend 
the already existing StmtDataCollectors approach. However, you are right, 
efforts should be coordinated to get an AST hashing implementation that can be 
adapted to various use-case scenarios.

[1] http://lists.llvm.org/pipermail/cfe-dev/2017-August/054911.html


Repository:
  rC Clang

https://reviews.llvm.org/D40731

Files:
  include/clang/AST/AttrDataCollectors.td
  include/clang/AST/CHashVisitor.h
  include/clang/AST/CMakeLists.txt
  include/clang/AST/DeclDataCollectors.td
  include/clang/AST/StmtDataCollectors.td
  include/clang/AST/TypeDataCollectors.td
  unittests/AST/CHashTest.cpp
  unittests/AST/CMakeLists.txt

Index: unittests/AST/CMakeLists.txt
===
--- unittests/AST/CMakeLists.txt
+++ unittests/AST/CMakeLists.txt
@@ -18,6 +18,7 @@
   PostOrderASTVisitor.cpp
   SourceLocationTest.cpp
   StmtPrinterTest.cpp
+  CHashTest.cpp
   )
 
 target_link_libraries(ASTTests
Index: unittests/AST/CHashTest.cpp
===
--- /dev/null
+++ unittests/AST/CHashTest.cpp
@@ -0,0 +1,91 @@
+//===- unittests/AST/DataCollectionTest.cpp ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===--===//
+//
+// This file contains tests for the DataCollection module.
+//
+// They work by hashing the collected data of two nodes and asserting that the
+// hash values are equal iff the nodes are considered equal.
+//
+//===--===//
+
+#include "clang/AST/CHashVisitor.h"
+#include "clang/Tooling/Tooling.h"
+#include "gtest/gtest.h"
+#include 
+
+using namespace clang;
+using namespace tooling;
+
+
+class CHashConsumer : public ASTConsumer {
+CompilerInstance &CI;
+llvm::MD5::MD5Result *ASTHash;
+
+public:
+
+CHashConsumer(CompilerInstance &CI, llvm::MD5::MD5Result *ASTHash)
+: CI(CI), ASTHash(ASTHash){}
+
+virtual void HandleTranslationUnit(clang::ASTContext &Context) override {
+TranslationUnitDecl *TU = Context.getTranslationUnitDecl();
+
+// Traversing the translation unit decl via a RecursiveASTVisitor
+// will visit all nodes in the AST.
+CHashVisitor<> Visitor(Context);
+Visitor.TraverseDecl(TU);
+// Copy Away the resulting hash
+*ASTHash = *Visitor.getHash(TU);
+
+}
+
+~CHashConsumer() override {}
+};
+
+struct CHashAction : public ASTFrontendAction {
+llvm::MD5::MD5Result *Hash;
+
+CHashAction(llvm::MD5::MD5Result *Hash) : Hash(Hash) {}
+
+std::unique_ptr CreateASTConsumer(CompilerInstance &CI,
+   StringRef) override {
+return std::unique_ptr(new CHashConsumer(CI, Hash));
+}
+};
+
+static testing::AssertionResult
+isASTHashEqual(StringRef Code1, StringRef Code2) {
+llvm::MD5::MD5Result Hash1, Hash2;
+if (!runToolOnCode(new CHashAction(&Hash1), Code1)) {
+return testing::AssertionFailure()
+<< "Parsing error in (A)\"" << Code1.str() << "\"";
+}
+if (!runToolOnCode(new CHashAction(&Hash2), Code2)) {
+return testing::AssertionFailure()
+<< "Parsing error in (B) \"" << Code2.str() << "\"";
+}
+return testing::AssertionResult(Hash1 == Hash2);
+}
+
+TEST(CHashVisitor, TestRecordTypes) {
+ASSERT_TRUE(isASTHashEqual( // Unused struct
+ "struct foobar { int a0; char a1; unsigned long a2; };",
+ "struct foobar { int a0; char a1;};"
+ ));
+
+}
+
+TEST(CHashVisitor, TestSourceStructure) {
+ASSERT_FALSE(isASTHashEqual(
+ "void foo() { int c; if (0) { c = 1; } }",
+ "void foo() { int c; if (0) { } c = 1; }"));
+
+ASSERT_FALSE(isASTHashEqual(
+ "void f1() {} void f2() {   }",
+ "void f1() {} void f2() { f1(); }"));
+}
Index: include/clang/AST/TypeDataCollectors.td
===
--- include/clang/AST/TypeDataCollectors.td
+++ include/clang/AST/TypeDataCollectors.td
@@ -0,0 +1,67 @@
+//--- Types ---//
+
+class Type {
+  code Code = [{
+ addData(S->getTypeClass());
+

[PATCH] D40731: Integrate CHash into CLang

2017-12-04 Thread Christian Dietrich via Phabricator via cfe-commits
stettberger added a comment.

For my changes to StmtDataCollectors: I tried to leave the already existing 
entries in StmtDataCollector.td untouched and the test-cases still work OK. If 
there is something somebody could break by changing these files it would be 
good to have a test case for it.

However, some of the collectors are not node local, which seems a little bit 
inconsistent to me. For example, DeclStmt includes information about its 
Decl children. In my opinion every entry in the *DataCollector.td should 
include only information that can be directly retrieved from this particular 
AST node. Cross-referencing between nodes should be done on the user side of 
the collectors (as it is done in CHashVisitor.h, which follows pointers in a 
cross-tree manner). With this separation of concerns it was quite easy for me 
to base CHash on the TableGen'ed headers. Furthermore, a purely syntactical 
hash (only the children, no cross-tree dependencies) could be derived easily.

@johannes For the different types of information that should be included for 
one AST node (e.g., Type II clones): We could use a hierarchy of "What should be 
considered here". But we should clearly define what the definition of 
equivalence is for each level. For example, if you woke me up at night I could 
not tell you the difference between a type I and a type II clone.


Repository:
  rC Clang

https://reviews.llvm.org/D40731



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D40731: Integrate CHash into CLang

2017-12-12 Thread Christian Dietrich via Phabricator via cfe-commits
stettberger updated this revision to Diff 126523.
stettberger added a comment.

I removed the magic constants and added the number of children for various AST 
nodes. This should avoid hash collisions.


Repository:
  rC Clang

https://reviews.llvm.org/D40731

Files:
  include/clang/AST/AttrDataCollectors.td
  include/clang/AST/CHashVisitor.h
  include/clang/AST/CMakeLists.txt
  include/clang/AST/DataCollection.h
  include/clang/AST/DeclDataCollectors.td
  include/clang/AST/StmtDataCollectors.td
  include/clang/AST/TypeDataCollectors.td
  unittests/AST/CHashTest.cpp
  unittests/AST/CMakeLists.txt

Index: unittests/AST/CMakeLists.txt
===
--- unittests/AST/CMakeLists.txt
+++ unittests/AST/CMakeLists.txt
@@ -18,6 +18,7 @@
   PostOrderASTVisitor.cpp
   SourceLocationTest.cpp
   StmtPrinterTest.cpp
+  CHashTest.cpp
   )
 
 target_link_libraries(ASTTests
Index: unittests/AST/CHashTest.cpp
===
--- /dev/null
+++ unittests/AST/CHashTest.cpp
@@ -0,0 +1,91 @@
+//===- unittests/AST/DataCollectionTest.cpp ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===--===//
+//
+// This file contains tests for the DataCollection module.
+//
+// They work by hashing the collected data of two nodes and asserting that the
+// hash values are equal iff the nodes are considered equal.
+//
+//===--===//
+
+#include "clang/AST/CHashVisitor.h"
+#include "clang/Tooling/Tooling.h"
+#include "gtest/gtest.h"
+#include 
+
+using namespace clang;
+using namespace tooling;
+
+
+class CHashConsumer : public ASTConsumer {
+CompilerInstance &CI;
+llvm::MD5::MD5Result *ASTHash;
+
+public:
+
+CHashConsumer(CompilerInstance &CI, llvm::MD5::MD5Result *ASTHash)
+: CI(CI), ASTHash(ASTHash){}
+
+virtual void HandleTranslationUnit(clang::ASTContext &Context) override {
+TranslationUnitDecl *TU = Context.getTranslationUnitDecl();
+
+// Traversing the translation unit decl via a RecursiveASTVisitor
+// will visit all nodes in the AST.
+CHashVisitor<> Visitor(Context);
+Visitor.TraverseDecl(TU);
+// Copy Away the resulting hash
+*ASTHash = *Visitor.getHash(TU);
+
+}
+
+~CHashConsumer() override {}
+};
+
+struct CHashAction : public ASTFrontendAction {
+llvm::MD5::MD5Result *Hash;
+
+CHashAction(llvm::MD5::MD5Result *Hash) : Hash(Hash) {}
+
+std::unique_ptr CreateASTConsumer(CompilerInstance &CI,
+   StringRef) override {
+return std::unique_ptr(new CHashConsumer(CI, Hash));
+}
+};
+
+static testing::AssertionResult
+isASTHashEqual(StringRef Code1, StringRef Code2) {
+llvm::MD5::MD5Result Hash1, Hash2;
+if (!runToolOnCode(new CHashAction(&Hash1), Code1)) {
+return testing::AssertionFailure()
+<< "Parsing error in (A)\"" << Code1.str() << "\"";
+}
+if (!runToolOnCode(new CHashAction(&Hash2), Code2)) {
+return testing::AssertionFailure()
+<< "Parsing error in (B) \"" << Code2.str() << "\"";
+}
+return testing::AssertionResult(Hash1 == Hash2);
+}
+
+TEST(CHashVisitor, TestRecordTypes) {
+ASSERT_TRUE(isASTHashEqual( // Unused struct
+ "struct foobar { int a0; char a1; unsigned long a2; };",
+ "struct foobar { int a0; char a1;};"
+ ));
+
+}
+
+TEST(CHashVisitor, TestSourceStructure) {
+ASSERT_FALSE(isASTHashEqual(
+ "void foo() { int c; if (0) { c = 1; } }",
+ "void foo() { int c; if (0) { } c = 1; }"));
+
+ASSERT_FALSE(isASTHashEqual(
+ "void f1() {} void f2() {   }",
+ "void f1() {} void f2() { f1(); }"));
+}
Index: include/clang/AST/TypeDataCollectors.td
===
--- include/clang/AST/TypeDataCollectors.td
+++ include/clang/AST/TypeDataCollectors.td
@@ -0,0 +1,78 @@
+//--- Types ---//
+
+class Type {
+  code Code = [{
+ addData(llvm::hash_value(S->getTypeClass()));
+  }];
+}
+
+class BuiltinType {
+   code Code = [{
+  addData(S->getKind());
+   }];
+}
+
+class ArrayType  {
+   code Code = [{
+  addData(S->getSizeModifier());
+  addData(S->getIndexTypeCVRQualifiers());
+   }];
+}
+
+class ConstantArrayType {
+   code Code = [{
+  addData(S->getSize().getZExtValue());
+   }];
+}
+
+class VectorType {
+   code Code = [{
+  addData(S->getNumElements());
+  addData(S->getVectorKind());
+   }];
+}
+
+class FunctionTy