From 601674bd6b50e8aa3b51f6eb1e2cff5fd5f5de9a Mon Sep 17 00:00:00 2001
From: Christian Dietrich <christian.dietrich@informatik.uni-erlangen.de>
Date: Fri, 23 Feb 2018 09:31:47 +0100
Subject: [PATCH] [CHash] Stable TU-level AST Hashing for local and global
 hashes

This patch does integrate the AST hashing parts of the CHash project[1][2] into
CLang. The extension provided by this patch accomplishes 2 things:

- local hashes: It extends StmtDataCollector infrastructure by hash rules for
  declarations, types, and attributes. These rules are expressed in terms of
  TableGen files and capture what parts of a AST node are important to its
  semantic. Thereby, the rules only include information that is local to the
  respecting node (no pointers to other AST nodes should be followed).

  Since these DataCollectors do not impose rules how the AST nodes relate to
  each other, they provide a mechanism for "local" hashes.

- global hashes: The CHashVisitor combines the local hashes via the
  RecursiveAST Visitor into a global semantic hash that includes (for a given
  AST node) all local hashes of all AST nodes that can influence the
  compilation of this AST node.

  With these global hashes, we can deduce (early) in the compilation process
  wether the result of this compiler run will produce the same object file.
  This mechanism is *NOT* part of this commit.

[1] https://www.sra.uni-hannover.de/Research/cHash/
[2] "cHash: Detection of Redunandt Recompilations via AST hashing", USENIX 2017
---
 include/clang/AST/AttrDataCollectors.td |  10 +
 include/clang/AST/CHashVisitor.h        | 331 ++++++++++++++++++++++++
 include/clang/AST/CMakeLists.txt        |  12 +
 include/clang/AST/DataCollection.h      |  10 +
 include/clang/AST/DeclDataCollectors.td | 205 +++++++++++++++
 include/clang/AST/StmtDataCollectors.td |  78 +++++-
 include/clang/AST/TypeDataCollectors.td |  78 ++++++
 unittests/AST/CHashTest.cpp             | 116 +++++++++
 unittests/AST/CMakeLists.txt            |   1 +
 9 files changed, 837 insertions(+), 4 deletions(-)
 create mode 100644 include/clang/AST/AttrDataCollectors.td
 create mode 100644 include/clang/AST/CHashVisitor.h
 create mode 100644 include/clang/AST/DeclDataCollectors.td
 create mode 100644 include/clang/AST/TypeDataCollectors.td
 create mode 100644 unittests/AST/CHashTest.cpp

diff --git a/include/clang/AST/AttrDataCollectors.td b/include/clang/AST/AttrDataCollectors.td
new file mode 100644
index 00000000000..cbf51a31c97
--- /dev/null
+++ b/include/clang/AST/AttrDataCollectors.td
@@ -0,0 +1,10 @@
+//--- Attributes ---------------------------------------------------------//
+class Attr {
+  code Code = [{
+    std::string AttrString;      
+    llvm::raw_string_ostream OS(AttrString);
+    S->printPretty(OS, Context.getLangOpts());
+    OS.flush();
+    addData(AttrString);
+  }];
+}
diff --git a/include/clang/AST/CHashVisitor.h b/include/clang/AST/CHashVisitor.h
new file mode 100644
index 00000000000..78fad4aa6c6
--- /dev/null
+++ b/include/clang/AST/CHashVisitor.h
@@ -0,0 +1,331 @@
+//===--- CHashVisitor.h - Stable AST Hashing -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines the CHashVisitor class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_AST_CHASHVISITOR_H
+#define LLVM_CLANG_AST_CHASHVISITOR_H
+
+#include "clang/AST/AST.h"
+#include "clang/AST/ASTConsumer.h"
+#include "clang/AST/DataCollection.h"
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "llvm/Support/MD5.h"
+#include <map>
+#include <string>
+
+namespace clang {
+
+
+template <typename H = llvm::MD5, typename HR = llvm::MD5::MD5Result>
+class CHashVisitor : public clang::RecursiveASTVisitor<CHashVisitor<H, HR>> {
+
+  using Inherited = clang::RecursiveASTVisitor<CHashVisitor<H, HR>>;
+
+public:
+  using Hash = H;
+  using HashResult = HR;
+
+  /// Configure the RecursiveASTVisitor
+  bool shouldWalkTypesOfTypeLocs() const { return false; }
+
+protected:
+  ASTContext &Context;
+
+  // For the DataCollector, we implement a few addData() functions
+  void addData(uint64_t data) { topHash().update(data); }
+  void addData(const StringRef &str) { topHash().update(str); }
+  // On our way down, we meet a lot of qualified types.
+  void addData(const QualType &T) {
+    // 1. Hash referenced type
+    const Type *const ActualType = T.getTypePtr();
+    assert(ActualType != nullptr);
+
+    // FIXME: Structural hash
+    // 1.1 Was it already hashed?
+    const HashResult *const SavedDigest = getHash(ActualType);
+    if (SavedDigest) {
+      // 1.1.1 Use cached value
+      topHash().update(SavedDigest->Bytes);
+    } else {
+      // 1.1.2 Calculate hash for type
+      const Hash *const CurrentHash = pushHash();
+      Inherited::TraverseType(T); // Uses getTypePtr() internally
+      const HashResult TypeDigest = popHash(CurrentHash);
+      topHash().update(TypeDigest.Bytes);
+
+      // Store hash for underlying type
+      storeHash(ActualType, TypeDigest);
+    }
+
+    // Add the qulaifiers at this specific usage of the type
+    addData(T.getCVRQualifiers());
+  }
+
+  /// The RecursiveASTVisitor often visits several children of a node
+  /// in a for loop. However, we also have to include the number of
+  /// children into the hash in order to avoid collisions. Therefore,
+  /// the following addData(..) methods only include the lengths into
+  /// the hash. The actual children are visited by the
+  /// RecursiveASTVistor.
+  template<typename T>
+  void addData(const llvm::iterator_range<T> &x) {
+    addData(std::distance(x.begin(), x.end()));
+  }
+  template<typename T>
+  void addData(const llvm::ArrayRef<T> &x) {
+    addData(x.size());
+  }
+
+public:
+#define DEF_ADD_DATA_STORED(CLASS, CODE)                                       \
+  template <class = void> bool Visit##CLASS(const CLASS *S) {                  \
+    CODE;                                                                      \
+    return true;                                                               \
+  }
+#define DEF_ADD_DATA(CLASS, CODE) DEF_ADD_DATA_STORED(CLASS, CODE)
+#include "clang/AST/StmtDataCollectors.inc"
+#define DEF_ADD_DATA(CLASS, CODE) DEF_ADD_DATA_STORED(CLASS, CODE)
+#include "clang/AST/AttrDataCollectors.inc"
+#define DEF_ADD_DATA(CLASS, CODE) DEF_ADD_DATA_STORED(CLASS, CODE)
+#include "clang/AST/DeclDataCollectors.inc"
+#define DEF_ADD_DATA(CLASS, CODE) DEF_ADD_DATA_STORED(CLASS, CODE)
+#include "clang/AST/TypeDataCollectors.inc"
+
+  CHashVisitor(ASTContext &Context) : Context(Context) {}
+
+  /// For some special nodes, override the traverse function, since
+  /// we need both pre- and post order traversal
+  bool TraverseTranslationUnitDecl(TranslationUnitDecl *TU) {
+    if (!TU)
+      return true;
+    // First, we push a new hash onto the hashing stack. This hash
+    // will capture everythin within the TU*/
+    Hash *CurrentHash = pushHash();
+
+    Inherited::WalkUpFromTranslationUnitDecl(TU);
+
+    // Do recursion on our own, since we want to exclude some children
+    const auto DC = cast<DeclContext>(TU);
+    for (auto *Child : DC->noload_decls()) {
+      if (isa<TypedefDecl>(Child) || isa<RecordDecl>(Child) ||
+          isa<EnumDecl>(Child))
+        continue;
+
+      // Extern variable definitions at the top-level
+      if (const auto VD = dyn_cast<VarDecl>(Child)) {
+        if (VD->hasExternalStorage()) {
+          continue;
+        }
+      }
+
+      if (const auto FD = dyn_cast<FunctionDecl>(Child)) {
+        // We try to avoid hashing of declarations that have no definition
+        if (!FD->isThisDeclarationADefinition()) {
+          bool doHashing = false;
+          // HOWEVER! If this declaration is an alias Declaration, we
+          // hash it no matter what
+          if (FD->hasAttrs()) {
+            for (const Attr *const A : FD->getAttrs()) {
+              if (A->getKind() == attr::Kind::Alias) {
+                doHashing = true;
+                break;
+              }
+            }
+          }
+          if (!doHashing)
+            continue;
+        }
+      }
+
+      TraverseDecl(Child);
+    }
+
+    storeHash(TU, popHash(CurrentHash));
+
+    return true;
+  }
+
+  bool TraverseDecl(Decl *D) {
+    if (!D)
+      return true;
+    /* For some declarations, we store the calculated hash value. */
+    bool CacheHash = false;
+    if (isa<FunctionDecl>(D) && cast<FunctionDecl>(D)->isDefined())
+      CacheHash = true;
+    if (isa<VarDecl>(D) && cast<VarDecl>(D)->hasGlobalStorage())
+      CacheHash = true;
+    if (isa<RecordDecl>(D) && dyn_cast<RecordDecl>(D)->isCompleteDefinition())
+      CacheHash = true;
+
+    if (!CacheHash) {
+      return Inherited::TraverseDecl(D);
+    }
+
+    const HashResult *const SavedDigest = getHash(D);
+    if (SavedDigest) {
+      topHash().update(SavedDigest->Bytes);
+      return true;
+    }
+    Hash *CurrentHash = pushHash();
+    bool Ret = Inherited::TraverseDecl(D);
+    HashResult CurrentHashResult = popHash(CurrentHash);
+    storeHash(D, CurrentHashResult);
+    if (!isa<TranslationUnitDecl>(D)) {
+      topHash().update(CurrentHashResult.Bytes);
+    }
+
+    return Ret;
+  }
+
+  /// When doing a semantic hash, we have to use cross-tree links to
+  /// other parts of the AST, here we establish these links
+
+#define DEF_TYPE_GOTO_DECL(CLASS, EXPR)                                 \
+  bool Visit##CLASS(CLASS *T) {                                                \
+    Inherited::Visit##CLASS(T);                                                \
+    return TraverseDecl(EXPR);                                                 \
+  }
+
+  DEF_TYPE_GOTO_DECL(TypedefType, T->getDecl());
+  DEF_TYPE_GOTO_DECL(RecordType, T->getDecl());
+  // The EnumType forwards to the declaration. The declaration does
+  // not hand back to the type.
+  DEF_TYPE_GOTO_DECL(EnumType, T->getDecl());
+  bool TraverseEnumDecl(EnumDecl *E) {
+    /* In the original RecursiveASTVisitor
+       > if (D->getTypeForDecl()) {
+       >    TRY_TO(TraverseType(QualType(D->getTypeForDecl(), 0)));
+       > }
+       => NO, NO, NO, to avoid endless recursion
+    */
+    return Inherited::WalkUpFromEnumDecl(E);
+  }
+
+  bool VisitTypeDecl(TypeDecl *D) {
+    // If we would hash the resulting type for a typedef, we
+    // would get into an endless recursion.
+    if (!isa<TypedefNameDecl>(D) && !isa<RecordDecl>(D) && !isa<EnumDecl>(D)) {
+      addData(QualType(D->getTypeForDecl(), 0));
+    }
+    return true;
+  }
+
+  bool VisitDeclRefExpr(DeclRefExpr *E) {
+    ValueDecl *ValDecl = E->getDecl();
+    // Function Declarations are handled in VisitCallExpr
+    if (!ValDecl) {
+      return true;
+    }
+    if (isa<VarDecl>(ValDecl)) {
+      /* We emulate TraverseDecl here for VarDecl, because we
+       * are not allowed to call TraverseDecl here, since the
+       * initial expression of a DeclRefExpr might reference a
+       * sourronding Declaration itself. For example:
+       *
+       * struct foo {int N;}
+       * struct foo a = { sizeof(a) };
+       */
+      VarDecl *VD = static_cast<VarDecl *>(ValDecl);
+      VisitNamedDecl(VD);
+      Inherited::TraverseType(VD->getType());
+      VisitVarDecl(VD);
+    } else if (isa<FunctionDecl>(ValDecl)) {
+      /* Hash Functions without their body */
+      FunctionDecl *FD = static_cast<FunctionDecl *>(ValDecl);
+      Stmt *Body = FD->getBody();
+      FD->setBody(nullptr);
+      TraverseDecl(FD);
+      FD->setBody(Body);
+    } else {
+      TraverseDecl(ValDecl);
+    }
+    return true;
+  }
+
+  bool VisitValueDecl(ValueDecl *D) {
+    /* Field Declarations can induce recursions */
+    if (isa<FieldDecl>(D)) {
+      addData(std::string(D->getType().getAsString()));
+    } else {
+      addData(D->getType());
+    }
+    addData(D->isWeak());
+    return true;
+  }
+
+  bool VisitExpr(Expr *E) {
+    // We specially treat the CXXThisExpr, as we cannot call
+    // addData(QualType) directly on this, because it would reference
+    // back to the enclosing CXXRecord and result in a recursion.
+    if (isa<CXXThisExpr>(E)) {
+      addData(std::string(E->getType().getAsString()));
+    } else {
+      Inherited::VisitExpr(E);
+    }
+    return true;
+  }
+
+  /// For performance reasons, we cache some of the hashes for types
+  /// and declarations.
+
+public:
+  // We store hashes for declarations and types in separate maps.
+  std::map<const Type *, HashResult> TypeSilo;
+  std::map<const Decl *, HashResult> DeclSilo;
+
+  void storeHash(const Type *Obj, HashResult Dig) { TypeSilo[Obj] = Dig; }
+
+  void storeHash(const Decl *Obj, HashResult Dig) { DeclSilo[Obj] = Dig; }
+
+  const HashResult *getHash(const Type *Obj) {
+    if (TypeSilo.find(Obj) != TypeSilo.end()) {
+      return &TypeSilo[Obj];
+    }
+    return nullptr;
+  }
+
+  const HashResult *getHash(const Decl *Obj) {
+    if (DeclSilo.find(Obj) != DeclSilo.end()) {
+      return &DeclSilo[Obj];
+    }
+    return nullptr;
+  }
+
+  /// In order to produce hashes for subtrees on the way, a hash
+  /// stack is used. When a new subhash is meant to be calculated,
+  /// we push a new stack on the hash. All hashing functions use
+  /// always the top of the hashing stack.
+
+protected:
+  llvm::SmallVector<Hash, 32> HashStack;
+
+public:
+  Hash *pushHash() {
+    HashStack.push_back(Hash());
+    return &HashStack.back();
+  }
+
+  HashResult popHash(const Hash *ShouldBe = nullptr) {
+    assert(!ShouldBe || ShouldBe == &HashStack.back());
+
+    // Finalize the Hash and return the digest.
+    HashResult CurrentDigest;
+    topHash().final(CurrentDigest);
+    HashStack.pop_back();
+    return CurrentDigest;
+  }
+
+  Hash &topHash() { return HashStack.back(); }
+};
+
+} // namespace clang
+#endif // LLVM_CLANG_AST_CHASHVISITOR_H
diff --git a/include/clang/AST/CMakeLists.txt b/include/clang/AST/CMakeLists.txt
index 942d08d585f..ef42a98859e 100644
--- a/include/clang/AST/CMakeLists.txt
+++ b/include/clang/AST/CMakeLists.txt
@@ -53,3 +53,15 @@ clang_tablegen(CommentCommandList.inc -gen-clang-comment-command-list
 clang_tablegen(StmtDataCollectors.inc -gen-clang-data-collectors
   SOURCE StmtDataCollectors.td
   TARGET StmtDataCollectors)
+
+clang_tablegen(DeclDataCollectors.inc -gen-clang-data-collectors
+  SOURCE DeclDataCollectors.td
+  TARGET DeclDataCollectors)
+
+clang_tablegen(AttrDataCollectors.inc -gen-clang-data-collectors
+  SOURCE AttrDataCollectors.td
+  TARGET AttrDataCollectors)
+
+clang_tablegen(TypeDataCollectors.inc -gen-clang-data-collectors
+  SOURCE TypeDataCollectors.td
+  TARGET TypeDataCollectors)
diff --git a/include/clang/AST/DataCollection.h b/include/clang/AST/DataCollection.h
index 229ac2bd0f2..9be04e422a2 100644
--- a/include/clang/AST/DataCollection.h
+++ b/include/clang/AST/DataCollection.h
@@ -59,6 +59,16 @@ addDataToConsumer(T &DataConsumer, Type Data) {
   DataConsumer.update(StringRef(reinterpret_cast<char *>(&Data), sizeof(Data)));
 }
 
+template <class T, class Type>
+void addDataToConsumer(T &DataConsumer, const llvm::iterator_range<Type> &R) {
+  addDataToConsumer(DataConsumer, std::distance(R.begin(), R.end()));
+}
+
+template <class T, class Type>
+void addDataToConsumer(T &DataConsumer, const llvm::ArrayRef<Type> &R) {
+  addDataToConsumer(DataConsumer, R.size());
+}
+
 } // end namespace data_collection
 } // end namespace clang
 
diff --git a/include/clang/AST/DeclDataCollectors.td b/include/clang/AST/DeclDataCollectors.td
new file mode 100644
index 00000000000..5fdb98fdf2f
--- /dev/null
+++ b/include/clang/AST/DeclDataCollectors.td
@@ -0,0 +1,205 @@
+//--- Declarations -------------------------------------------------------//
+
+class Decl {
+  code Code = [{
+      // Every Declaration gets a tag field in the hash stream. It is
+      // hashed to add additional randomness to the hash
+      addData(llvm::hash_value(S->getKind()));
+
+      // CrossRef
+      addData(S->hasAttrs());
+      if (S->hasAttrs())
+        addData(S->attrs());
+  }];
+}
+
+class DeclContext  {
+   code Code = [{
+      // CrossRef
+      addData(S->decls());
+   }];
+}
+
+class BlockDecl  {
+   code Code = [{
+      // CrossRef
+      auto it = llvm::make_range(S->capture_begin(), S->capture_end());
+      addData(it);
+   }];
+}
+
+
+class ValueDecl  {
+  code Code = [{
+      addData(S->getType());
+      addData(S->isWeak());
+  }];
+}
+
+class NamedDecl {
+  code Code = [{
+      addData(S->getNameAsString());
+  }];
+}
+
+class TypeDecl {
+  code Code = [{
+      addData(QualType(S->getTypeForDecl(),0));
+  }];
+}
+
+class EnumDecl {
+  code Code = [{
+      addData(S->getNumPositiveBits());
+      addData(S->getNumNegativeBits());
+  }];
+}
+
+class EnumConstantDecl {
+  code Code = [{
+       /* Not every enum has a init expression. Therefore, 
+          we extract the actual enum value from it. */
+       addData(S->getInitVal().getExtValue());
+  }];
+}
+
+class TagDecl {
+  code Code = [{
+     addData(S->getTagKind());
+  }];
+}
+
+
+class TypedefNameDecl {
+  code Code = [{
+     addData(S->getUnderlyingType());
+  }];
+}
+
+class VarDecl {
+  code Code = [{
+      addData(S->getStorageClass());
+      addData(S->getTLSKind());
+      addData(S->isModulePrivate());
+      addData(S->isNRVOVariable());
+  }];
+}
+
+class ParmVarDecl {
+  code Code = [{
+       addData(S->isParameterPack());
+       addData(S->getOriginalType());
+  }];
+}
+
+class ImplicitParamDecl {
+  code Code = [{
+       addData(S->getParameterKind());
+  }];
+}
+
+class FunctionDecl {
+  code Code = [{
+       addData(S->isExternC());
+       addData(S->isGlobal());
+       addData(S->isNoReturn());
+       addData(S->getStorageClass());
+       addData(S->isInlineSpecified());
+       addData(S->isInlined());
+
+       // CrossRef
+       auto range = llvm::make_range(S->param_begin(), S->param_end());
+       addData(range);
+  }];
+}
+
+class LabelDecl {
+  code Code = [{
+       addData(S->isGnuLocal());
+       addData(S->isMSAsmLabel());
+       if (S->isMSAsmLabel()) {
+          addData(S->getMSAsmLabel());
+       }
+  }];
+}
+
+class CXXRecordDecl {
+  code Code = [{
+       // CrossRef
+       if (S->isCompleteDefinition()) {
+           addData(S->bases());
+       }
+  }];
+}
+
+class CXXConstructorDecl {
+  code Code = [{
+       // CrossRef
+       addData(S->inits());
+  }];
+}
+
+class FieldDecl {
+  code Code = [{
+      addData(S->isBitField());
+  }];
+}
+
+class CapturedDecl {
+  code Code = [{
+      addData(S->isNothrow());
+  }];
+}
+
+class DecompositionDecl {
+  code Code = [{
+      // CrossRef
+      addData(S->bindings());
+  }];
+}
+
+
+//--- Obj-C ---------------------------------------------------------//
+
+class ObjCCategoryDecl {
+  code Code = [{
+     // CrossRef
+     if (auto *it = S->getTypeParamList()) {
+       auto range = llvm::make_range(it->begin(), it->end());
+       addData(range);
+     }
+  }];
+}
+
+class ObjCInterfaceDecl {
+  code Code = [{
+     // CrossRef
+     if (auto *it = S->getTypeParamListAsWritten()) {
+       auto range = llvm::make_range(it->begin(), it->end());
+       addData(range);
+     }
+  }];
+}
+
+class ObjCMethodDecl {
+  code Code = [{
+     // CrossRef
+     auto range = llvm::make_range(S->param_begin(), S->param_end());
+     addData(range);
+  }];
+}
+
+//--- Templates -----------------------------------------------------//
+
+class FriendTemplateDecl {
+  code Code = [{
+      // CrossRef
+      addData(S->getNumTemplateParameters());
+      for (unsigned I = 0, E = S->getNumTemplateParameters(); I < E; ++I) {
+          auto TPL = S->getTemplateParameterList(I);
+          auto it  = llvm::make_range(TPL->begin(), TPL->end());
+          addData(it);
+      }
+  }];
+}
+
diff --git a/include/clang/AST/StmtDataCollectors.td b/include/clang/AST/StmtDataCollectors.td
index bf5f8c21707..5492a4be5b1 100644
--- a/include/clang/AST/StmtDataCollectors.td
+++ b/include/clang/AST/StmtDataCollectors.td
@@ -5,6 +5,9 @@ class Stmt {
     // macro-generated code.
     addData(data_collection::getMacroStack(S->getLocStart(), Context));
     addData(data_collection::getMacroStack(S->getLocEnd(), Context));
+
+    // CrossRef
+    addData(S->children());
   }];
 }
 
@@ -28,16 +31,29 @@ class ExpressionTraitExpr {
 class PredefinedExpr {
   code Code = [{
     addData(S->getIdentType());
+    addData(S->getFunctionName()->getString());
   }];
 }
 class TypeTraitExpr {
   code Code = [{
     addData(S->getTrait());
+    // CrossRef
+    addData(S->getNumArgs());
     for (unsigned i = 0; i < S->getNumArgs(); ++i)
       addData(S->getArg(i)->getType());
   }];
 }
 
+class UnaryExprOrTypeTraitExpr {
+  code Code = [{
+    addData(S->getKind());
+    if (S->isArgumentType()) {
+       addData(S->getArgumentType());
+    }
+  }];
+}
+
+
 //--- Calls --------------------------------------------------------------//
 class CallExpr {
   code Code = [{
@@ -72,6 +88,7 @@ class DeclRefExpr {
 }
 class MemberExpr {
   code Code = [{
+    // I suspect this should be included: addData(S->isArrow());
     addData(S->getMemberDecl()->getName());
   }];
 }
@@ -124,6 +141,12 @@ class ObjCBridgedCastExpr {
   }];
 }
 
+class CastExpr {
+  code Code = [{
+    addData(S->getCastKind());
+  }];
+}
+
 //--- Miscellaneous Exprs ------------------------------------------------//
 class BinaryOperator {
   code Code = [{
@@ -136,6 +159,12 @@ class UnaryOperator {
   }];
 }
 
+class VAArgExpr {
+  code Code = [{
+       addData(S->isMicrosoftABI());
+  }];
+}
+
 //--- Control flow -------------------------------------------------------//
 class GotoStmt {
   code Code = [{
@@ -189,27 +218,48 @@ class CXXFoldExpr {
 }
 class GenericSelectionExpr {
   code Code = [{
+    // CrossRef
+    addData(S->getNumAssocs());
     for (unsigned i = 0; i < S->getNumAssocs(); ++i) {
       addData(S->getAssocType(i));
     }
   }];
 }
+
+class PseudoObjectExpr {
+  code Code = [{
+    // CrossRef
+    addData(S->semantics());
+  }];
+}
+
 class LambdaExpr {
   code Code = [{
+    addData(S->isGenericLambda());
+    addData(S->isMutable());
+    addData(S->hasExplicitParameters());
+    addData(S->hasExplicitResultType());
+
+
+    // CrossRef
+    addData(S->captures());
+    addData(S->explicit_captures());
     for (const LambdaCapture &C : S->captures()) {
       addData(C.isPackExpansion());
       addData(C.getCaptureKind());
       if (C.capturesVariable())
         addData(C.getCapturedVar()->getType());
     }
-    addData(S->isGenericLambda());
-    addData(S->isMutable());
+    
+
   }];
 }
 class DeclStmt {
   code Code = [{
-    auto numDecls = std::distance(S->decl_begin(), S->decl_end());
-    addData(static_cast<unsigned>(numDecls));
+    // CrossRef
+    addData(S->decls());
+
+    // FIXME? As this should be done by a using visitor
     for (const Decl *D : S->decls()) {
       if (const VarDecl *VD = dyn_cast<VarDecl>(D)) {
         addData(VD->getType());
@@ -222,6 +272,12 @@ class AsmStmt {
     addData(S->isSimple());
     addData(S->isVolatile());
     addData(S->generateAsmString(Context));
+
+    // CrossRef + FIXME
+    addData(S->getNumInputs());
+    addData(S->getNumOutputs());
+    addData(S->getNumClobbers());
+
     for (unsigned i = 0; i < S->getNumInputs(); ++i) {
       addData(S->getInputConstraint(i));
     }
@@ -236,7 +292,21 @@ class AsmStmt {
 class AttributedStmt {
   code Code = [{
     for (const Attr *A : S->getAttrs()) {
+       // We duplicate class Attr here to not rely on being integrated
+       // into a RecursiveASTVisitor.
+       std::string AttrString;      
+       llvm::raw_string_ostream OS(AttrString);
+       A->printPretty(OS, Context.getLangOpts());
+       OS.flush();
       addData(std::string(A->getSpelling()));
     }
   }];
 }
+
+class CompoundStmt {
+  code Code = [{
+    // CrossRef
+    addData(S->size());
+  }];
+}
+
diff --git a/include/clang/AST/TypeDataCollectors.td b/include/clang/AST/TypeDataCollectors.td
new file mode 100644
index 00000000000..74befd555db
--- /dev/null
+++ b/include/clang/AST/TypeDataCollectors.td
@@ -0,0 +1,78 @@
+//--- Types ---------------------------------------------------------------//
+
+class Type {
+  code Code = [{
+     addData(llvm::hash_value(S->getTypeClass()));
+  }];
+}
+
+class BuiltinType {
+   code Code = [{
+      addData(S->getKind());
+   }];
+}
+
+class ArrayType  {
+   code Code = [{
+      addData(S->getSizeModifier());
+      addData(S->getIndexTypeCVRQualifiers());
+   }];
+}
+
+class ConstantArrayType {
+   code Code = [{
+      addData(S->getSize().getZExtValue());
+   }];
+}
+
+class VectorType {
+   code Code = [{
+      addData(S->getNumElements());
+      addData(S->getVectorKind());
+   }];
+}
+
+class FunctionType {
+   code Code = [{
+      addData(S->getRegParmType());
+      addData(S->getCallConv());
+   }];
+}
+
+class FunctionProtoType {
+   code Code = [{
+      addData(S->getExceptionSpecType());
+      addData(S->isVariadic());
+      addData(S->getRefQualifier());
+      addData(S->hasTrailingReturn());
+
+      addData(S->param_types());
+      addData(S->exceptions());
+   }];
+}
+
+class UnaryTransformType {
+   code Code = [{
+        addData(S->getUTTKind());
+   }];
+}
+
+class AttributedType {
+   code Code = [{
+        addData(S->getAttrKind());
+   }];
+}
+
+class ElaboratedType {
+   code Code = [{
+        addData(S->getKeyword());
+   }];
+}
+
+class ObjCObjectType {
+   code Code = [{
+        addData(S->getTypeArgsAsWritten());
+   }];
+}
+
+
diff --git a/unittests/AST/CHashTest.cpp b/unittests/AST/CHashTest.cpp
new file mode 100644
index 00000000000..46dbd721da7
--- /dev/null
+++ b/unittests/AST/CHashTest.cpp
@@ -0,0 +1,116 @@
+//===- unittests/AST/DataCollectionTest.cpp -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains tests for the DataCollection module.
+//
+// They work by hashing the collected data of two nodes and asserting that the
+// hash values are equal iff the nodes are considered equal.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/AST/CHashVisitor.h"
+#include "clang/Tooling/Tooling.h"
+#include "gtest/gtest.h"
+#include <memory>
+
+using namespace clang;
+using namespace tooling;
+
+
+class CHashConsumer : public ASTConsumer {
+    CompilerInstance &CI;
+    llvm::MD5::MD5Result *ASTHash;
+
+public:
+
+    CHashConsumer(CompilerInstance &CI, llvm::MD5::MD5Result *ASTHash)
+        : CI(CI), ASTHash(ASTHash){}
+
+    virtual void HandleTranslationUnit(clang::ASTContext &Context) override {
+        TranslationUnitDecl *TU = Context.getTranslationUnitDecl();
+
+        // Traversing the translation unit decl via a RecursiveASTVisitor
+        // will visit all nodes in the AST.
+        CHashVisitor<> Visitor(Context);
+        Visitor.TraverseDecl(TU);
+        // Copy Away the resulting hash
+        *ASTHash = *Visitor.getHash(TU);
+
+    }
+
+    ~CHashConsumer() override {}
+};
+
+struct CHashAction : public ASTFrontendAction {
+    llvm::MD5::MD5Result *Hash;
+
+    CHashAction(llvm::MD5::MD5Result *Hash) : Hash(Hash) {}
+
+    std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &CI,
+                                                   StringRef) override {
+        return std::unique_ptr<ASTConsumer>(new CHashConsumer(CI, Hash));
+    }
+};
+
+static testing::AssertionResult
+isASTHashEqual(StringRef Code1, StringRef Code2) {
+    llvm::MD5::MD5Result Hash1, Hash2;
+    if (!runToolOnCode(new CHashAction(&Hash1), Code1)) {
+        return testing::AssertionFailure()
+            << "Parsing error in (A)\"" << Code1.str() << "\"";
+    }
+    if (!runToolOnCode(new CHashAction(&Hash2), Code2)) {
+        return testing::AssertionFailure()
+            << "Parsing error in (B) \"" << Code2.str() << "\"";
+    }
+    return testing::AssertionResult(Hash1 == Hash2);
+}
+
+TEST(CHashVisitor, TestRecordTypes) {
+    ASSERT_TRUE(isASTHashEqual( // Unused record definition
+                     "struct foobar { int a0; char a1; unsigned long a2; };",
+                     "struct foobar { int a0; char a1;};"
+                     ));
+
+    // Recursive Record Types
+    ASSERT_FALSE(isASTHashEqual( // Names still matter
+                    "struct ll { struct ll* foo; }; struct ll a;",
+                    "struct ll { struct ll* bar; }; struct ll a;"
+                    ));
+
+    ASSERT_TRUE(isASTHashEqual( // Order does not matter
+                     "struct b; struct a { struct b* o; }; struct b { struct a* o; }; struct a X;",
+                     "struct a; struct b { struct a* o; }; struct a { struct b* o; }; struct a X;"
+                     ));
+
+    // The initialization of a struct with a reference to itself should not crash
+    ASSERT_FALSE(isASTHashEqual(
+                    "struct foo { int N; }; struct foo a = { sizeof(a) };",
+                    "struct foo { int N; }; struct foo a = { sizeof(struct foo) };"
+                    ));
+}
+
+TEST(CHashVisitor, TypedefTest) {
+    ASSERT_FALSE(isASTHashEqual(
+                    "typedef int bar_t;  bar_t x;",
+                    "typedef char bar_t; bar_t x;"
+                    ));
+}
+
+
+TEST(CHashVisitor, TestSourceStructure) {
+    ASSERT_FALSE(isASTHashEqual(
+                     "void foo() { int c; if (0) { c = 1; } }",
+                     "void foo() { int c; if (0) { } c = 1; }"));
+
+    ASSERT_FALSE(isASTHashEqual(
+                     "void f1() {} void f2() {       }",
+                     "void f1() {} void f2() { f1(); }"));
+
+}
diff --git a/unittests/AST/CMakeLists.txt b/unittests/AST/CMakeLists.txt
index 89590c11066..9505023e594 100644
--- a/unittests/AST/CMakeLists.txt
+++ b/unittests/AST/CMakeLists.txt
@@ -17,6 +17,7 @@ add_clang_unittest(ASTTests
   NamedDeclPrinterTest.cpp
   SourceLocationTest.cpp
   StmtPrinterTest.cpp
+  CHashTest.cpp
   )
 
 target_link_libraries(ASTTests
-- 
GitLab