[analyzer] Make StmtDataCollector customizable

Summary:
This moves the data collection macro calls for Stmt nodes
to lib/AST/StmtDataCollectors.inc.

Users can subclass ConstStmtVisitor and include StmtDataCollectors.inc
to define visitor methods for each Stmt subclass. This also makes it
possible to customize the visit methods, as exemplified in
lib/Analysis/CloneDetection.cpp.

Move helper methods for data collection to a new module,
AST/DataCollection.

Add data collection for DeclRefExpr, MemberExpr and some literals.

Reviewers: arphaman, teemperor!

Subscribers: mgorny, xazax.hun, cfe-commits

Differential Revision: https://reviews.llvm.org/D36664

llvm-svn: 311569
diff --git a/clang/lib/Analysis/CloneDetection.cpp b/clang/lib/Analysis/CloneDetection.cpp
index 5ea7498..1cce6a4 100644
--- a/clang/lib/Analysis/CloneDetection.cpp
+++ b/clang/lib/Analysis/CloneDetection.cpp
@@ -13,16 +13,12 @@
 
 #include "clang/Analysis/CloneDetection.h"
 
-#include "clang/AST/ASTContext.h"
-#include "clang/AST/RecursiveASTVisitor.h"
-#include "clang/AST/Stmt.h"
-#include "clang/Lex/Lexer.h"
+#include "clang/AST/DataCollection.h"
+#include "clang/AST/DeclTemplate.h"
 #include "llvm/Support/MD5.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Path.h"
 
 using namespace clang;
-using namespace clang::clone_detection;
 
 StmtSequence::StmtSequence(const CompoundStmt *Stmt, const Decl *D,
                            unsigned StartIndex, unsigned EndIndex)
@@ -91,34 +87,6 @@
   return SourceRange(getStartLoc(), getEndLoc());
 }
 
-/// Prints the macro name that contains the given SourceLocation into the given
-/// raw_string_ostream.
-static void printMacroName(llvm::raw_string_ostream &MacroStack,
-                           ASTContext &Context, SourceLocation Loc) {
-  MacroStack << Lexer::getImmediateMacroName(Loc, Context.getSourceManager(),
-                                             Context.getLangOpts());
-
-  // Add an empty space at the end as a padding to prevent
-  // that macro names concatenate to the names of other macros.
-  MacroStack << " ";
-}
-
-std::string clone_detection::getMacroStack(SourceLocation Loc,
-                                           ASTContext &Context) {
-  std::string MacroStack;
-  llvm::raw_string_ostream MacroStackStream(MacroStack);
-  SourceManager &SM = Context.getSourceManager();
-
-  // Iterate over all macros that expanded into the given SourceLocation.
-  while (Loc.isMacroID()) {
-    // Add the macro name to the stream.
-    printMacroName(MacroStackStream, Context, Loc);
-    Loc = SM.getImmediateMacroCallerLoc(Loc);
-  }
-  MacroStackStream.flush();
-  return MacroStack;
-}
-
 void CloneDetector::analyzeCodeBody(const Decl *D) {
   assert(D);
   assert(D->hasBody());
@@ -184,16 +152,17 @@
   }
 }
 
-bool FilenamePatternConstraint::isAutoGenerated(const CloneDetector::CloneGroup &Group) {
+bool FilenamePatternConstraint::isAutoGenerated(
+    const CloneDetector::CloneGroup &Group) {
   std::string Error;
-  if (IgnoredFilesPattern.empty() || Group.empty() || 
+  if (IgnoredFilesPattern.empty() || Group.empty() ||
       !IgnoredFilesRegex->isValid(Error))
     return false;
 
   for (const StmtSequence &S : Group) {
     const SourceManager &SM = S.getASTContext().getSourceManager();
-    StringRef Filename = llvm::sys::path::filename(SM.getFilename(
-        S.getContainingDecl()->getLocation()));
+    StringRef Filename = llvm::sys::path::filename(
+        SM.getFilename(S.getContainingDecl()->getLocation()));
     if (IgnoredFilesRegex->match(Filename))
       return true;
   }
@@ -201,6 +170,59 @@
   return false;
 }
 
+namespace {
+/// Defines what a type II code clone is: if the same data is collected for
+/// two statements, those two statements are considered to be clones of each
+/// other.
+///
+/// All collected data is forwarded to the given data consumer of the type T.
+/// The data consumer class needs to provide a member method with the signature:
+///   update(StringRef Str)
+template <class T>
+class CloneTypeIIStmtDataCollector
+    : public ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>> {
+  ASTContext &Context;
+  /// The data sink to which all data is forwarded.
+  T &DataConsumer;
+
+  template <class Ty> void addData(const Ty &Data) {
+    data_collection::addDataToConsumer(DataConsumer, Data);
+  }
+
+public:
+  CloneTypeIIStmtDataCollector(const Stmt *S, ASTContext &Context,
+                               T &DataConsumer)
+      : Context(Context), DataConsumer(DataConsumer) {
+    this->Visit(S);
+  }
+
+// Define a visit method for each class to collect data and subsequently visit
+// all parent classes. The methods are templates so that the non-template
+// Visit methods defined via SKIP below take precedence over them.
+#define DEF_ADD_DATA(CLASS, CODE)                                              \
+  template <class = void> void Visit##CLASS(const CLASS *S) {                  \
+    CODE;                                                                      \
+    ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>>::Visit##CLASS(S);        \
+  }
+
+#include "../AST/StmtDataCollectors.inc"
+
+// Type II clones ignore variable names and literals, so let's skip them.
+#define SKIP(CLASS)                                                            \
+  void Visit##CLASS(const CLASS *S) {                                          \
+    ConstStmtVisitor<CloneTypeIIStmtDataCollector<T>>::Visit##CLASS(S);        \
+  }
+  SKIP(DeclRefExpr)
+  SKIP(MemberExpr)
+  SKIP(IntegerLiteral)
+  SKIP(FloatingLiteral)
+  SKIP(StringLiteral)
+  SKIP(CXXBoolLiteralExpr)
+  SKIP(CharacterLiteral)
+#undef SKIP
+};
+} // end anonymous namespace
+
 static size_t createHash(llvm::MD5 &Hash) {
   size_t HashCode;
 
@@ -222,7 +244,7 @@
   llvm::MD5 Hash;
   ASTContext &Context = D->getASTContext();
 
-  StmtDataCollector<llvm::MD5>(S, Context, Hash);
+  CloneTypeIIStmtDataCollector<llvm::MD5>(S, Context, Hash);
 
   auto CS = dyn_cast<CompoundStmt>(S);
   SmallVector<size_t, 8> ChildHashes;
@@ -288,8 +310,8 @@
 static void CollectStmtSequenceData(const StmtSequence &Sequence,
                                     FoldingSetNodeIDWrapper &OutputData) {
   for (const Stmt *S : Sequence) {
-    StmtDataCollector<FoldingSetNodeIDWrapper>(S, Sequence.getASTContext(),
-                                               OutputData);
+    CloneTypeIIStmtDataCollector<FoldingSetNodeIDWrapper>(
+        S, Sequence.getASTContext(), OutputData);
 
     for (const Stmt *Child : S->children()) {
       if (!Child)
@@ -339,7 +361,7 @@
     // Sort hash_codes in StmtsByHash.
     std::stable_sort(StmtsByHash.begin(), StmtsByHash.end(),
                      [](std::pair<size_t, StmtSequence> LHS,
-                            std::pair<size_t, StmtSequence> RHS) {
+                        std::pair<size_t, StmtSequence> RHS) {
                        return LHS.first < RHS.first;
                      });
 
@@ -393,8 +415,10 @@
   ASTContext &Context = Seq.getASTContext();
 
   // Look up what macros expanded into the current statement.
-  std::string StartMacroStack = getMacroStack(Seq.getStartLoc(), Context);
-  std::string EndMacroStack = getMacroStack(Seq.getEndLoc(), Context);
+  std::string StartMacroStack =
+      data_collection::getMacroStack(Seq.getStartLoc(), Context);
+  std::string EndMacroStack =
+      data_collection::getMacroStack(Seq.getEndLoc(), Context);
 
   // First, check if ParentMacroStack is not empty which means we are currently
   // dealing with a parent statement which was expanded from a macro.