Introduce a new lexer function to compute the "preamble" of a file,
which is the part of the file that contains all of the initial
comments, includes, and preprocessor directives that occur before any
of the actual code. Add a new -print-preamble cc1 action that is
only used for testing.


git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@108913 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp
index 00363d9..b007ac4 100644
--- a/lib/Frontend/CompilerInvocation.cpp
+++ b/lib/Frontend/CompilerInvocation.cpp
@@ -331,6 +331,7 @@
   case frontend::ParsePrintCallbacks:    return "-parse-print-callbacks";
   case frontend::ParseSyntaxOnly:        return "-fsyntax-only";
   case frontend::PrintDeclContext:       return "-print-decl-contexts";
+  case frontend::PrintPreamble:          return "-print-preamble";
   case frontend::PrintPreprocessedInput: return "-E";
   case frontend::RewriteMacros:          return "-rewrite-macros";
   case frontend::RewriteObjC:            return "-rewrite-objc";
@@ -989,6 +990,8 @@
       Opts.ProgramAction = frontend::ParseSyntaxOnly; break;
     case OPT_print_decl_contexts:
       Opts.ProgramAction = frontend::PrintDeclContext; break;
+    case OPT_print_preamble:
+      Opts.ProgramAction = frontend::PrintPreamble; break;
     case OPT_E:
       Opts.ProgramAction = frontend::PrintPreprocessedInput; break;
     case OPT_rewrite_macros:
diff --git a/lib/Frontend/FrontendActions.cpp b/lib/Frontend/FrontendActions.cpp
index 3a53dee..4db9c11 100644
--- a/lib/Frontend/FrontendActions.cpp
+++ b/lib/Frontend/FrontendActions.cpp
@@ -19,6 +19,7 @@
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "clang/Frontend/Utils.h"
 #include "llvm/ADT/OwningPtr.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace clang;
 
@@ -192,3 +193,32 @@
   DoPrintPreprocessedInput(CI.getPreprocessor(), OS,
                            CI.getPreprocessorOutputOpts());
 }
+
+void PrintPreambleAction::ExecuteAction() {
+  switch (getCurrentFileKind()) {
+  case IK_C:
+  case IK_CXX:
+  case IK_ObjC:
+  case IK_ObjCXX:
+  case IK_OpenCL:
+    break;  // These input kinds proceed to the preamble printing below.
+      
+  case IK_None:
+  case IK_Asm:
+  case IK_PreprocessedC:
+  case IK_PreprocessedCXX:
+  case IK_PreprocessedObjC:
+  case IK_PreprocessedObjCXX:
+  case IK_AST:
+  case IK_LLVM_IR:
+    // We can't do anything with these; return without printing anything.
+    return;
+  }
+  
+  // Write the preamble bytes to llvm::outs(); do nothing if Buffer is null.
+  llvm::MemoryBuffer *Buffer = llvm::MemoryBuffer::getFile(getCurrentFile());
+  if (Buffer) {
+    unsigned Preamble = Lexer::ComputePreamble(Buffer);
+    llvm::outs().write(Buffer->getBufferStart(), Preamble);
+    delete Buffer;
+  }
+}
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 91b14f6..2f11c37 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -28,6 +28,7 @@
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Lex/LexDiagnostic.h"
 #include "clang/Basic/SourceManager.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include <cctype>
@@ -247,6 +248,130 @@
   return TheTok.getLength();
 }
 
+namespace {
+  enum PreambleDirectiveKind {  // Classifies a '#' directive by its name.
+    PDK_Skipped,  // Recognized directive that is simply skipped over.
+    PDK_StartIf,  // "if", "ifdef", "ifndef": opens a conditional block.
+    PDK_EndIf,    // "endif": closes a conditional block.
+    PDK_Unknown   // Any directive name that is not recognized.
+  };
+}
+
+unsigned Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer) {
+  // Returns the offset into Buffer at which the preamble ends. We raw-lex from
+  // the start of the buffer using a "fake" source location at offset 1, so a
+  // token's raw location encoding minus 1 is its offset within the buffer.
+  const unsigned StartOffset = 1;
+  SourceLocation StartLoc = SourceLocation::getFromRawEncoding(StartOffset);
+  LangOptions LangOpts;
+  Lexer TheLexer(StartLoc, LangOpts, Buffer->getBufferStart(), 
+                 Buffer->getBufferStart(), Buffer->getBufferEnd());
+  
+  bool InPreprocessorDirective = false;
+  Token TheTok;
+  Token IfStartTok;
+  unsigned IfCount = 0;
+  do {
+    TheLexer.LexFromRawLexer(TheTok);
+
+    if (InPreprocessorDirective) {
+      // If we've hit the end of the file, we're done.
+      if (TheTok.getKind() == tok::eof) {
+        InPreprocessorDirective = false;
+        break;
+      }
+      
+      // If we haven't hit the end of the preprocessor directive, skip this
+      // token.
+      if (!TheTok.isAtStartOfLine())
+        continue;
+        
+      // We've passed the end of the preprocessor directive, and will look
+      // at this token again below.
+      InPreprocessorDirective = false;
+    }
+    
+    // Comments are okay; skip over them.
+    if (TheTok.getKind() == tok::comment)
+      continue;
+    
+    if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
+      // This is the start of a preprocessor directive. 
+      Token HashTok = TheTok;
+      InPreprocessorDirective = true;
+      
+      // Figure out which directive this is. Since we're lexing raw tokens,
+      // we don't have an identifier table available. Instead, just look at
+      // the raw identifier to recognize and categorize preprocessor directives.
+      TheLexer.LexFromRawLexer(TheTok);
+      if (TheTok.getKind() == tok::identifier && !TheTok.needsCleaning()) {
+        const char *IdStart = Buffer->getBufferStart() 
+                            + TheTok.getLocation().getRawEncoding() - 1;
+        llvm::StringRef Keyword(IdStart, TheTok.getLength());
+        PreambleDirectiveKind PDK
+          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
+              .Case("include", PDK_Skipped)
+              .Case("__include_macros", PDK_Skipped)
+              .Case("define", PDK_Skipped)
+              .Case("undef", PDK_Skipped)
+              .Case("line", PDK_Skipped)
+              .Case("error", PDK_Skipped)
+              .Case("pragma", PDK_Skipped)
+              .Case("import", PDK_Skipped)
+              .Case("include_next", PDK_Skipped)
+              .Case("warning", PDK_Skipped)
+              .Case("ident", PDK_Skipped)
+              .Case("sccs", PDK_Skipped)
+              .Case("assert", PDK_Skipped)
+              .Case("unassert", PDK_Skipped)
+              .Case("if", PDK_StartIf)
+              .Case("ifdef", PDK_StartIf)
+              .Case("ifndef", PDK_StartIf)
+              .Case("elif", PDK_Skipped)
+              .Case("else", PDK_Skipped)
+              .Case("endif", PDK_EndIf)
+              .Default(PDK_Unknown);
+
+        switch (PDK) {
+        case PDK_Skipped:
+          continue;
+
+        case PDK_StartIf:
+          if (IfCount == 0)
+            IfStartTok = HashTok;
+            
+          ++IfCount;
+          continue;
+            
+        case PDK_EndIf:
+          // Mismatched #endif. The preamble ends here.
+          if (IfCount == 0)
+            break;
+
+          --IfCount;
+          continue;
+            
+        case PDK_Unknown:
+          // We don't know what this directive is; stop at the '#'.
+          break;
+        }
+      }
+      
+      // We only end up here if we didn't recognize the preprocessor
+      // directive or it was one that can't occur in the preamble at this
+      // point. Roll back the current token to the location of the '#'.
+      InPreprocessorDirective = false;
+      TheTok = HashTok;
+    }
+
+    // We hit a token that cannot be part of the preamble; stop here.
+    break;
+  } while (true);
+  
+  // If a conditional is still open, end the preamble before its first '#'.
+  SourceLocation End = IfCount? IfStartTok.getLocation() : TheTok.getLocation();
+  return End.getRawEncoding() - StartLoc.getRawEncoding();
+}
+
 //===----------------------------------------------------------------------===//
 // Character information.
 //===----------------------------------------------------------------------===//