Add a -rewrite-includes option, which is similar to -rewrite-macros, but only expands #include directives.

Patch contributed by Lubos Lunak (l.lunax@suse.cz).
Review by Matt Beaumont-Gay (matthewbg@google.com).

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@158093 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Rewrite/CMakeLists.txt b/lib/Rewrite/CMakeLists.txt
index 2a05040..091af99 100644
--- a/lib/Rewrite/CMakeLists.txt
+++ b/lib/Rewrite/CMakeLists.txt
@@ -6,6 +6,7 @@
   FrontendActions.cpp
   HTMLPrint.cpp
   HTMLRewrite.cpp
+  InclusionRewriter.cpp
   RewriteMacros.cpp
   RewriteModernObjC.cpp
   RewriteObjC.cpp
diff --git a/lib/Rewrite/FrontendActions.cpp b/lib/Rewrite/FrontendActions.cpp
index 1753325..fa58575 100644
--- a/lib/Rewrite/FrontendActions.cpp
+++ b/lib/Rewrite/FrontendActions.cpp
@@ -181,3 +181,12 @@
 
   DoRewriteTest(CI.getPreprocessor(), OS);
 }
+
+void RewriteIncludesAction::ExecuteAction() {
+  CompilerInstance &CI = getCompilerInstance();
+  raw_ostream *OS = CI.createDefaultOutputFile(true, getCurrentFile());
+  if (!OS) return;
+
+  RewriteIncludesInInput(CI.getPreprocessor(), OS,
+                         CI.getPreprocessorOutputOpts());
+}
diff --git a/lib/Rewrite/InclusionRewriter.cpp b/lib/Rewrite/InclusionRewriter.cpp
new file mode 100644
index 0000000..439d777
--- /dev/null
+++ b/lib/Rewrite/InclusionRewriter.cpp
@@ -0,0 +1,370 @@
+//===--- InclusionRewriter.cpp - Rewrite includes into their expansions ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This code rewrites include invocations into their expansions.  This gives you
+// a file with all included files merged into it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Rewrite/Rewriters.h"
+#include "clang/Lex/Preprocessor.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Frontend/PreprocessorOutputOptions.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace clang;
+using namespace llvm;
+
+namespace {
+
+class InclusionRewriter : public PPCallbacks {
+  /// Information about which #includes were actually performed,
+  /// created by preprocessor callbacks.
+  struct FileChange {
+    SourceLocation From;
+    FileID Id;
+    SrcMgr::CharacteristicKind FileType;
+    FileChange(SourceLocation From) : From(From) {
+    }
+  };
+  Preprocessor &PP; //< Used to find inclusion directives.
+  SourceManager &SM; //< Used to read and manage source files.
+  raw_ostream &OS; //< The destination stream for rewritten contents.
+  bool ShowLineMarkers; //< Show #line markers.
+  bool UseLineDirective; //< Use of line directives or line markers.
+  typedef std::map<unsigned, FileChange> FileChangeMap;
+  FileChangeMap FileChanges; /// Tracks which files were included where.
+  /// Used transitively for building up the FileChanges mapping over the
+  /// various \c PPCallbacks callbacks.
+  FileChangeMap::iterator LastInsertedFileChange;
+public:
+  InclusionRewriter(Preprocessor &PP, raw_ostream &OS, bool ShowLineMarkers);
+  bool Process(FileID FileId, SrcMgr::CharacteristicKind FileType);
+private:
+  virtual void FileChanged(SourceLocation Loc, FileChangeReason Reason,
+                           SrcMgr::CharacteristicKind FileType,
+                           FileID PrevFID);
+  virtual void FileSkipped(const FileEntry &ParentFile,
+                           const Token &FilenameTok,
+                           SrcMgr::CharacteristicKind FileType);
+  virtual void InclusionDirective(SourceLocation HashLoc,
+                                  const Token &IncludeTok,
+                                  StringRef FileName,
+                                  bool IsAngled,
+                                  const FileEntry *File,
+                                  SourceLocation EndLoc,
+                                  StringRef SearchPath,
+                                  StringRef RelativePath);
+  void WriteLineInfo(const char *Filename, int Line,
+                     SrcMgr::CharacteristicKind FileType,
+                     StringRef EOL, StringRef Extra = StringRef());
+  void OutputContentUpTo(const MemoryBuffer &FromFile,
+                         unsigned &WriteFrom, unsigned WriteTo,
+                         StringRef EOL, int &lines,
+                         bool EnsureNewline = false);
+  void CommentOutDirective(Lexer &DirectivesLex, const Token &StartToken,
+                           const MemoryBuffer &FromFile, StringRef EOL,
+                           unsigned &NextToWrite, int &Lines);
+  const FileChange *FindFileChangeLocation(SourceLocation Loc) const;
+  StringRef NextIdentifierName(Lexer &RawLex, Token &RawToken);
+};
+
+}  // end anonymous namespace
+
+/// Initializes an InclusionRewriter with a \p PP source and \p OS destination.
+InclusionRewriter::InclusionRewriter(Preprocessor &PP, raw_ostream &OS,
+                                     bool ShowLineMarkers)
+    : PP(PP), SM(PP.getSourceManager()), OS(OS),
+    ShowLineMarkers(ShowLineMarkers),
+    LastInsertedFileChange(FileChanges.end()) {
+  // If we're in microsoft mode, use normal #line instead of line markers.
+  UseLineDirective = PP.getLangOpts().MicrosoftExt;
+}
+
+/// Write appropriate line information as either #line directives or GNU line
+/// markers depending on what mode we're in, including the \p Filename and
+/// \p Line we are located at, using the specified \p EOL line separator, and
+/// any \p Extra context specifiers in GNU line directives.
+void InclusionRewriter::WriteLineInfo(const char *Filename, int Line,
+                                      SrcMgr::CharacteristicKind FileType,
+                                      StringRef EOL, StringRef Extra) {
+  if (!ShowLineMarkers)
+    return;
+  if (UseLineDirective) {
+    OS << "#line" << ' ' << Line << ' ' << '"' << Filename << '"';
+  } else {
+    // Use GNU linemarkers as described here:
+    // http://gcc.gnu.org/onlinedocs/cpp/Preprocessor-Output.html
+    OS << '#' << ' ' << Line << ' ' << '"' << Filename << '"';
+    if (!Extra.empty())
+      OS << Extra;
+    if (FileType == SrcMgr::C_System)
+      // "`3' This indicates that the following text comes from a system header
+      // file, so certain warnings should be suppressed."
+      OS << " 3";
+    else if (FileType == SrcMgr::C_ExternCSystem)
+      // as above for `3', plus "`4' This indicates that the following text
+      // should be treated as being wrapped in an implicit extern "C" block."
+      OS << " 3 4";
+  }
+  OS << EOL;
+}
+
+/// FileChanged - Whenever the preprocessor enters or exits a #include file
+/// it invokes this handler.
+void InclusionRewriter::FileChanged(SourceLocation Loc,
+                                    FileChangeReason Reason,
+                                    SrcMgr::CharacteristicKind NewFileType,
+                                    FileID) {
+  if (Reason != EnterFile)
+    return;
+  if (LastInsertedFileChange == FileChanges.end())
+    // we didn't reach this file (eg: the main file) via an inclusion directive
+    return;
+  LastInsertedFileChange->second.Id = FullSourceLoc(Loc, SM).getFileID();
+  LastInsertedFileChange->second.FileType = NewFileType;
+  LastInsertedFileChange = FileChanges.end();
+}
+
+/// Called whenever an inclusion is skipped due to canonical header protection
+/// macros.
+void InclusionRewriter::FileSkipped(const FileEntry &/*ParentFile*/,
+                                    const Token &/*FilenameTok*/,
+                                    SrcMgr::CharacteristicKind /*FileType*/) {
+  assert(LastInsertedFileChange != FileChanges.end() && "A file, that wasn't "
+    "found via an inclusion directive, was skipped");
+  FileChanges.erase(LastInsertedFileChange);
+  LastInsertedFileChange = FileChanges.end();
+}
+
+/// This should be called whenever the preprocessor encounters include
+/// directives. It does not say whether the file has been included, but it
+/// provides more information about the directive (hash location instead
+/// of location inside the included file). It is assumed that the matching
+/// FileChanged() or FileSkipped() is called after this.
+void InclusionRewriter::InclusionDirective(SourceLocation HashLoc,
+                                           const Token &/*IncludeTok*/,
+                                           StringRef /*FileName*/,
+                                           bool /*IsAngled*/,
+                                           const FileEntry * /*File*/,
+                                           SourceLocation /*EndLoc*/,
+                                           StringRef /*SearchPath*/,
+                                           StringRef /*RelativePath*/) {
+  assert(LastInsertedFileChange == FileChanges.end() && "Another inclusion "
+    "directive was found before the previous one was processed");
+  std::pair<FileChangeMap::iterator, bool> p = FileChanges.insert(
+    std::make_pair(HashLoc.getRawEncoding(), FileChange(HashLoc)));
+  assert(p.second && "Unexpected revisitation of the same include directive");
+  LastInsertedFileChange = p.first;
+}
+
+/// Simple lookup for a SourceLocation (specifically one denoting the hash in
+/// an inclusion directive) in the map of inclusion information, FileChanges.
+const InclusionRewriter::FileChange *
+InclusionRewriter::FindFileChangeLocation(SourceLocation Loc) const {
+  FileChangeMap::const_iterator I = FileChanges.find(Loc.getRawEncoding());
+  if (I != FileChanges.end())
+    return &I->second;
+  return NULL;
+}
+
+/// Count the raw \\n characters in the \p Len characters from \p Pos.
+inline unsigned CountNewLines(const char *Pos, int Len) {
+  const char *End = Pos + Len;
+  unsigned Lines = 0;
+  --Pos;
+  while ((Pos = static_cast<const char*>(memchr(Pos + 1, '\n', End - Pos - 1))))
+    ++Lines;
+  return Lines;
+}
+
+/// Detect the likely line ending style of \p FromFile by examining the first
+/// newline found within it.
+static StringRef DetectEOL(const MemoryBuffer &FromFile) {
+  // detect what line endings the file uses, so that added content does not mix
+  // the style
+  const char *Pos = strchr(FromFile.getBufferStart(), '\n');
+  if (Pos == NULL)
+    return "\n";
+  if (Pos + 1 < FromFile.getBufferEnd() && Pos[1] == '\r')
+    return "\n\r";
+  if (Pos - 1 >= FromFile.getBufferStart() && Pos[-1] == '\r')
+    return "\r\n";
+  return "\n";
+}
+
+/// Writes out bytes from \p FromFile, starting at \p NextToWrite and ending at
+/// \p WriteTo - 1.
+void InclusionRewriter::OutputContentUpTo(const MemoryBuffer &FromFile,
+                                          unsigned &WriteFrom, unsigned WriteTo,
+                                          StringRef EOL, int &Line,
+                                          bool EnsureNewline) {
+  if (WriteTo <= WriteFrom)
+    return;
+  OS.write(FromFile.getBufferStart() + WriteFrom, WriteTo - WriteFrom);
+  // count lines manually, it's faster than getPresumedLoc()
+  Line += CountNewLines(FromFile.getBufferStart() + WriteFrom,
+                        WriteTo - WriteFrom);
+  if (EnsureNewline) {
+    char LastChar = FromFile.getBufferStart()[WriteTo - 1];
+    if (LastChar != '\n' && LastChar != '\r')
+      OS << EOL;
+  }
+  WriteFrom = WriteTo;
+}
+
+/// Print characters from \p FromFile starting at \p NextToWrite up until the
+/// inclusion directive at \p StartToken, then print out the inclusion
+/// inclusion directive disabled by a #if directive, updating \p NextToWrite
+/// and \p Line to track the number of source lines visited and the progress
+/// through the \p FromFile buffer.
+void InclusionRewriter::CommentOutDirective(Lexer &DirectiveLex,
+                                            const Token &StartToken,
+                                            const MemoryBuffer &FromFile,
+                                            StringRef EOL,
+                                            unsigned &NextToWrite, int &Line) {
+  OutputContentUpTo(FromFile, NextToWrite,
+    SM.getFileOffset(StartToken.getLocation()), EOL, Line);
+  Token DirectiveToken;
+  do {
+    DirectiveLex.LexFromRawLexer(DirectiveToken);
+  } while (!DirectiveToken.is(tok::eod) && DirectiveToken.isNot(tok::eof));
+  OS << "#if 0 /* expanded by -rewrite-includes */" << EOL;
+  OutputContentUpTo(FromFile, NextToWrite,
+    SM.getFileOffset(DirectiveToken.getLocation()) + DirectiveToken.getLength(),
+    EOL, Line);
+  OS << "#endif /* expanded by -rewrite-includes */" << EOL;
+}
+
+/// Find the next identifier in the pragma directive specified by \p RawToken.
+StringRef InclusionRewriter::NextIdentifierName(Lexer &RawLex,
+                                                Token &RawToken) {
+  RawLex.LexFromRawLexer(RawToken);
+  if (RawToken.is(tok::raw_identifier))
+    PP.LookUpIdentifierInfo(RawToken);
+  if (RawToken.is(tok::identifier))
+    return RawToken.getIdentifierInfo()->getName();
+  return StringRef();
+}
+
+/// Use a raw lexer to analyze \p FileId, inccrementally copying parts of it
+/// and including content of included files recursively.
+bool InclusionRewriter::Process(FileID FileId,
+                                SrcMgr::CharacteristicKind FileType)
+{
+  bool Invalid;
+  const MemoryBuffer &FromFile = *SM.getBuffer(FileId, &Invalid);
+  assert(!Invalid && "Invalid FileID while trying to rewrite includes");
+  const char *FileName = FromFile.getBufferIdentifier();
+  Lexer RawLex(FileId, &FromFile, PP.getSourceManager(), PP.getLangOpts());
+  RawLex.SetCommentRetentionState(false);
+
+  StringRef EOL = DetectEOL(FromFile);
+
+  // Per the GNU docs: "1" indicates the start of a new file.
+  WriteLineInfo(FileName, 1, FileType, EOL, " 1");
+
+  if (SM.getFileIDSize(FileId) == 0)
+    return true;
+
+  // The next byte to be copied from the source file
+  unsigned NextToWrite = 0;
+  int Line = 1; // The current input file line number.
+
+  Token RawToken;
+  RawLex.LexFromRawLexer(RawToken);
+
+  // TODO: Consider adding a switch that strips possibly unimportant content,
+  // such as comments, to reduce the size of repro files.
+  while (RawToken.isNot(tok::eof)) {
+    if (RawToken.is(tok::hash) && RawToken.isAtStartOfLine()) {
+      RawLex.setParsingPreprocessorDirective(true);
+      Token HashToken = RawToken;
+      RawLex.LexFromRawLexer(RawToken);
+      if (RawToken.is(tok::raw_identifier))
+        PP.LookUpIdentifierInfo(RawToken);
+      if (RawToken.is(tok::identifier)) {
+        switch (RawToken.getIdentifierInfo()->getPPKeywordID()) {
+          case tok::pp_include:
+          case tok::pp_include_next:
+          case tok::pp_import: {
+            CommentOutDirective(RawLex, HashToken, FromFile, EOL, NextToWrite,
+              Line);
+            if (const FileChange *Change = FindFileChangeLocation(
+                HashToken.getLocation())) {
+              // now include and recursively process the file
+              if (Process(Change->Id, Change->FileType))
+                // and set lineinfo back to this file, if the nested one was
+                // actually included
+                // `2' indicates returning to a file (after having included
+                // another file.
+                WriteLineInfo(FileName, Line, FileType, EOL, " 2");
+            } else
+              // fix up lineinfo (since commented out directive changed line
+              // numbers) for inclusions that were skipped due to header guards
+              WriteLineInfo(FileName, Line, FileType, EOL);
+            break;
+          }
+          case tok::pp_pragma: {
+            StringRef Identifier = NextIdentifierName(RawLex, RawToken);
+            if (Identifier == "clang" || Identifier == "GCC") {
+              if (NextIdentifierName(RawLex, RawToken) == "system_header") {
+                // keep the directive in, commented out
+                CommentOutDirective(RawLex, HashToken, FromFile, EOL,
+                  NextToWrite, Line);
+                // update our own type
+                FileType = SM.getFileCharacteristic(RawToken.getLocation());
+                WriteLineInfo(FileName, Line, FileType, EOL);
+              }
+            } else if (Identifier == "once") {
+              // keep the directive in, commented out
+              CommentOutDirective(RawLex, HashToken, FromFile, EOL,
+                NextToWrite, Line);
+              WriteLineInfo(FileName, Line, FileType, EOL);
+            }
+            break;
+          }
+          default:
+            break;
+        }
+      }
+      RawLex.setParsingPreprocessorDirective(false);
+    }
+    RawLex.LexFromRawLexer(RawToken);
+  }
+  OutputContentUpTo(FromFile, NextToWrite,
+    SM.getFileOffset(SM.getLocForEndOfFile(FileId)) + 1, EOL, Line,
+    /*EnsureNewline*/true);
+  return true;
+}
+
+/// InclusionRewriterInInput - Implement -rewrite-includes mode.
+void clang::RewriteIncludesInInput(Preprocessor &PP, raw_ostream *OS,
+                                   const PreprocessorOutputOptions &Opts) {
+  SourceManager &SM = PP.getSourceManager();
+  InclusionRewriter *Rewrite = new InclusionRewriter(PP, *OS,
+                                                     Opts.ShowLineMarkers);
+  PP.addPPCallbacks(Rewrite);
+
+  // First let the preprocessor process the entire file and call callbacks.
+  // Callbacks will record which #include's were actually performed.
+  PP.EnterMainSourceFile();
+  Token Tok;
+  // Only preprocessor directives matter here, so disable macro expansion
+  // everywhere else as an optimization.
+  // TODO: It would be even faster if the preprocessor could be switched
+  // to a mode where it would parse only preprocessor directives and comments,
+  // nothing else matters for parsing or processing.
+  PP.SetMacroExpansionOnlyInDirectives();
+  do {
+    PP.Lex(Tok);
+  } while (Tok.isNot(tok::eof));
+  Rewrite->Process(SM.getMainFileID(), SrcMgr::C_User);
+  OS->flush();
+}