Structured comment parsing, first step.

* Retain comments in the AST
* Serialize/deserialize comments
* Find comments attached to a certain Decl
* Expose raw comment text and SourceRange via libclang


git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@158771 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/ARCMigrate/TransEmptyStatementsAndDealloc.cpp b/lib/ARCMigrate/TransEmptyStatementsAndDealloc.cpp
index 0f6c799..d8fabcd 100644
--- a/lib/ARCMigrate/TransEmptyStatementsAndDealloc.cpp
+++ b/lib/ARCMigrate/TransEmptyStatementsAndDealloc.cpp
@@ -44,7 +44,7 @@
   SourceManager &SM = Ctx.getSourceManager();
   std::vector<SourceLocation>::iterator
     I = std::upper_bound(MacroLocs.begin(), MacroLocs.end(), SemiLoc,
-                         SourceManager::LocBeforeThanCompare(SM));
+                         BeforeThanCompare<SourceLocation>(SM));
   --I;
   SourceLocation
       AfterMacroLoc = I->getLocWithOffset(getARCMTMacroName().size());
diff --git a/lib/AST/ASTContext.cpp b/lib/AST/ASTContext.cpp
index 9ad3a7f..d8677c2 100644
--- a/lib/AST/ASTContext.cpp
+++ b/lib/AST/ASTContext.cpp
@@ -53,6 +53,107 @@
   HalfRank, FloatRank, DoubleRank, LongDoubleRank
 };
 
+const RawComment *ASTContext::getRawCommentForDeclNoCache(const Decl *D) const {
+  if (!CommentsLoaded && ExternalSource) {
+    ExternalSource->ReadComments();
+    CommentsLoaded = true;
+  }
+
+  assert(D);
+
+  // TODO: handle comments for function parameters properly; none for now.
+  if (isa<ParmVarDecl>(D))
+    return NULL;
+
+  ArrayRef<RawComment> RawComments = Comments.getComments();
+
+  // If there are no comments anywhere, we won't find anything.
+  if (RawComments.empty())
+    return NULL;
+
+  // If the declaration doesn't map directly to a location in a file, we
+  // can't find the comment.
+  SourceLocation DeclLoc = D->getLocation();
+  if (DeclLoc.isInvalid() || !DeclLoc.isFileID())
+    return NULL;
+
+  // Find the comment that occurs just after this declaration (or end()).
+  ArrayRef<RawComment>::iterator Comment
+      = std::lower_bound(RawComments.begin(),
+                         RawComments.end(),
+                         SourceRange(DeclLoc),
+                         BeforeThanCompare<RawComment>(SourceMgr));
+
+  // Decompose the location for the declaration into the file it lives in and
+  // the offset within that file.
+  std::pair<FileID, unsigned> DeclLocDecomp = SourceMgr.getDecomposedLoc(DeclLoc);
+
+  // First check for a trailing comment; tags and namespaces never take one.
+  if (Comment != RawComments.end() &&
+      Comment->isDoxygen() && Comment->isTrailingComment() &&
+      !isa<TagDecl>(D) && !isa<NamespaceDecl>(D)) {
+    std::pair<FileID, unsigned> CommentBeginDecomp
+      = SourceMgr.getDecomposedLoc(Comment->getSourceRange().getBegin());
+    // Check that Doxygen trailing comment comes after the declaration, starts
+    // on the same line and in the same file as the declaration.
+    if (DeclLocDecomp.first == CommentBeginDecomp.first &&
+        SourceMgr.getLineNumber(DeclLocDecomp.first, DeclLocDecomp.second)
+          == SourceMgr.getLineNumber(CommentBeginDecomp.first,
+                                     CommentBeginDecomp.second)) {
+      return &*Comment;
+    }
+  }
+
+  // The comment just after the declaration was not a trailing comment.
+  // Let's look at the previous comment.
+  if (Comment == RawComments.begin())
+    return NULL;
+  --Comment;
+
+  // Check that we actually have a non-trailing Doxygen comment.
+  if (!Comment->isDoxygen() || Comment->isTrailingComment())
+    return NULL;
+
+  // Decompose the end of the comment.
+  std::pair<FileID, unsigned> CommentEndDecomp
+    = SourceMgr.getDecomposedLoc(Comment->getSourceRange().getEnd());
+
+  // If the comment and the declaration aren't in the same file, then they
+  // aren't related.
+  if (DeclLocDecomp.first != CommentEndDecomp.first)
+    return NULL;
+
+  // Get the corresponding buffer.
+  bool Invalid = false;
+  const char *Buffer = SourceMgr.getBufferData(DeclLocDecomp.first,
+                                               &Invalid).data();
+  if (Invalid)
+    return NULL;
+
+  // Extract text between the comment and declaration.
+  StringRef Text(Buffer + CommentEndDecomp.second,
+                 DeclLocDecomp.second - CommentEndDecomp.second);
+
+  // There should be no other declarations between comment and declaration.
+  if (Text.find_first_of(",;{}") != StringRef::npos)
+    return NULL;
+
+  return &*Comment;
+}
+
+const RawComment *ASTContext::getRawCommentForDecl(const Decl *D) const {
+  // Answer from the per-declaration cache when possible.  A cached NULL
+  // ("no comment") counts as a hit, so the search runs once per declaration.
+  typedef llvm::DenseMap<const Decl *, const RawComment *> CacheTy;
+  CacheTy::iterator Cached = DeclComments.find(D);
+  if (Cached == DeclComments.end()) {
+    const RawComment *Found = getRawCommentForDeclNoCache(D);
+    DeclComments[D] = Found;
+    return Found;
+  }
+  return Cached->second;
+}
+
 void 
 ASTContext::CanonicalTemplateTemplateParm::Profile(llvm::FoldingSetNodeID &ID, 
                                                TemplateTemplateParmDecl *Parm) {
@@ -244,6 +345,7 @@
     BuiltinInfo(builtins),
     DeclarationNames(*this),
     ExternalSource(0), Listener(0),
+    Comments(SM), CommentsLoaded(false),
     LastSDM(0, 0),
     UniqueBlockByRefTypeID(0) 
 {
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index dfb9d61..228b43b 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -15,3 +15,4 @@
 add_subdirectory(FrontendTool)
 add_subdirectory(Tooling)
 add_subdirectory(StaticAnalyzer)
+add_subdirectory(Comments)
diff --git a/lib/Comments/CMakeLists.txt b/lib/Comments/CMakeLists.txt
new file mode 100644
index 0000000..f9561c6
--- /dev/null
+++ b/lib/Comments/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(LLVM_USED_LIBS clangBasic clangAST clangLex)
+
+add_clang_library(clangComments
+  CommentLexer.cpp
+  RawCommentList.cpp
+  )
+
diff --git a/lib/Comments/Makefile b/lib/Comments/Makefile
new file mode 100644
index 0000000..0783f1f
--- /dev/null
+++ b/lib/Comments/Makefile
@@ -0,0 +1,14 @@
+##===- clang/lib/Comments/Makefile -------------------------*- Makefile -*-===##
+# 
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+
+CLANG_LEVEL := ../..
+LIBRARYNAME := clangComments
+
+include $(CLANG_LEVEL)/Makefile
+
diff --git a/lib/Comments/RawCommentList.cpp b/lib/Comments/RawCommentList.cpp
new file mode 100644
index 0000000..7db9175
--- /dev/null
+++ b/lib/Comments/RawCommentList.cpp
@@ -0,0 +1,207 @@
+//===--- RawCommentList.cpp - Processing raw comments -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Comments/RawCommentList.h"
+#include "clang/AST/ASTContext.h"
+#include "llvm/ADT/STLExtras.h"
+
+using namespace clang;
+
+namespace {
+/// Classify raw comment text; the bool flags a Doxygen trailing comment.
+std::pair<RawComment::CommentKind, bool> getCommentKind(StringRef Comment) {
+  // The shortest text we can classify is three characters, e.g. "//x".
+  if (Comment.size() < 3 || Comment[0] != '/')
+    return std::make_pair(RawComment::CK_Invalid, false);
+
+  RawComment::CommentKind K;
+  if (Comment[1] == '/') {
+    // BCPL comment: the third character selects the Doxygen flavor, if any.
+    if (Comment[2] == '/')
+      K = RawComment::CK_BCPLSlash;
+    else if (Comment[2] == '!')
+      K = RawComment::CK_BCPLExcl;
+    else
+      return std::make_pair(RawComment::CK_OrdinaryBCPL, false);
+  } else {
+    // A complete C comment is at least "/**/"; shorter text such as "/*/"
+    // would make the two markers overlap.  Comment lexer does not understand
+    // escapes in comment markers, so pretend that this is not a comment.
+    if (Comment.size() < 4 ||
+        Comment[1] != '*' ||
+        Comment[Comment.size() - 2] != '*' ||
+        Comment[Comment.size() - 1] != '/')
+      return std::make_pair(RawComment::CK_Invalid, false);
+
+    if (Comment[2] == '*')
+      K = RawComment::CK_JavaDoc;
+    else if (Comment[2] == '!')
+      K = RawComment::CK_Qt;
+    else
+      return std::make_pair(RawComment::CK_OrdinaryC, false);
+  }
+  // A '<' right after the three-character marker makes the comment trailing.
+  const bool TrailingComment = (Comment.size() > 3) && (Comment[3] == '<');
+  return std::make_pair(K, TrailingComment);
+}
+
+bool mergedCommentIsTrailingComment(StringRef Comment) {
+  return Comment.size() >= 4 && Comment[3] == '<';  // fourth character is '<'
+}
+} // unnamed namespace
+
+RawComment::RawComment(const SourceManager &SourceMgr, SourceRange SR,
+                       bool Merged) :
+    Range(SR), RawTextValid(false), IsAlmostTrailingComment(false),
+    BeginLineValid(false), EndLineValid(false) {
+  // No raw text means an invalid comment; still define IsTrailingComment.
+  if (getRawText(SourceMgr).empty()) {
+    Kind = CK_Invalid;
+    IsTrailingComment = false;
+    return;
+  }
+
+  if (!Merged) {
+    // Guess comment kind.
+    std::pair<CommentKind, bool> K = getCommentKind(RawText);
+    Kind = K.first;
+    IsTrailingComment = K.second;
+    IsAlmostTrailingComment = RawText.startswith("//<") ||
+                                 RawText.startswith("/*<");
+  } else {
+    Kind = CK_Merged;
+    IsTrailingComment = mergedCommentIsTrailingComment(RawText);
+  }
+}
+
+unsigned RawComment::getBeginLine(const SourceManager &SM) const {
+  // Resolved lazily: the first call computes the line, later calls reuse it.
+  if (!BeginLineValid) {
+    std::pair<FileID, unsigned> Begin = SM.getDecomposedLoc(Range.getBegin());
+    BeginLine = SM.getLineNumber(Begin.first, Begin.second);
+    BeginLineValid = true;
+  }
+  return BeginLine;
+}
+
+unsigned RawComment::getEndLine(const SourceManager &SM) const {
+  // Resolved lazily: the first call computes the line, later calls reuse it.
+  if (!EndLineValid) {
+    std::pair<FileID, unsigned> End = SM.getDecomposedLoc(Range.getEnd());
+    EndLine = SM.getLineNumber(End.first, End.second);
+    EndLineValid = true;
+  }
+  return EndLine;
+}
+
+StringRef RawComment::getRawTextSlow(const SourceManager &SourceMgr) const {
+  FileID BeginFileID, EndFileID;
+  unsigned BeginOffset, EndOffset;
+
+  llvm::tie(BeginFileID, BeginOffset) =
+      SourceMgr.getDecomposedLoc(Range.getBegin());
+  llvm::tie(EndFileID, EndOffset) =
+      SourceMgr.getDecomposedLoc(Range.getEnd());
+
+  // Compare before subtracting: a reversed range must not wrap the unsigned
+  // length around to a huge value.  Anything under two characters is empty.
+  if (EndOffset < BeginOffset || EndOffset - BeginOffset < 2)
+    return StringRef();
+
+  // The comment can't begin in one file and end in another.
+  assert(BeginFileID == EndFileID);
+  if (BeginFileID != EndFileID)
+    return StringRef();
+
+  bool Invalid = false;
+  const char *BufferStart = SourceMgr.getBufferData(BeginFileID,
+                                                    &Invalid).data();
+  if (Invalid)
+    return StringRef();
+  return StringRef(BufferStart + BeginOffset, EndOffset - BeginOffset);
+}
+
+namespace {
+bool containsOnlyWhitespace(StringRef Str) {  // vacuously true for empty Str
+  return StringRef::npos == Str.find_first_not_of(" \t\f\v\r\n");
+}
+
+bool onlyWhitespaceBetweenComments(SourceManager &SM,
+                                   const RawComment &C1, const RawComment &C2) {
+  FileID C1File, C2File;
+  unsigned GapBegin, GapEnd;
+  llvm::tie(C1File, GapBegin) =
+      SM.getDecomposedLoc(C1.getSourceRange().getEnd());
+  llvm::tie(C2File, GapEnd) =
+      SM.getDecomposedLoc(C2.getSourceRange().getBegin());
+
+  // Question does not make sense if comments are located in different files.
+  if (C1File != C2File)
+    return false;
+
+  bool Invalid = false;
+  const char *FileStart = SM.getBufferData(C1File, &Invalid).data();
+  if (Invalid)
+    return false;
+  // Whatever lies between the end of C1 and the start of C2 must be blank.
+  const StringRef Gap(FileStart + GapBegin, GapEnd - GapBegin);
+  return containsOnlyWhitespace(Gap);
+}
+} // unnamed namespace
+
+void RawCommentList::addComment(const RawComment &RC, ASTContext &Context) {
+  if (RC.isInvalid())
+    return;
+
+  // Comments must arrive in source order.  Compare against the most recently
+  // stored comment; the first one cannot detect a later out-of-order arrival.
+  assert((Comments.empty() ||
+          SourceMgr.isBeforeInTranslationUnit(
+              Comments.back().getSourceRange().getEnd(),
+              RC.getSourceRange().getBegin())) &&
+         "comments are not coming in source order");
+
+  // Note whether non-whitespace text separates RC from the previous comment.
+  if (OnlyWhitespaceSeen &&
+      !onlyWhitespaceBetweenComments(SourceMgr, LastComment, RC))
+    OnlyWhitespaceSeen = false;
+  LastComment = RC;
+
+  // Ordinary comments are not interesting for us.
+  if (RC.isOrdinary())
+    return;
+
+  // The first Doxygen comment has nothing to merge with, so just save it.
+  if (Comments.empty()) {
+    Comments.push_back(RC);
+    OnlyWhitespaceSeen = true;
+    return;
+  }
+
+  const RawComment &C1 = Comments.back();
+  const RawComment &C2 = RC;
+
+  // Merge comments only if there is only whitespace between them.
+  // Can't merge trailing and non-trailing comments.
+  // Merge trailing comments if they are on same or consecutive lines.
+  if (OnlyWhitespaceSeen &&
+      (C1.isTrailingComment() == C2.isTrailingComment()) &&
+      (!C1.isTrailingComment() ||
+       C1.getEndLine(SourceMgr) + 1 >= C2.getBeginLine(SourceMgr))) {
+    SourceRange MergedRange(C1.getSourceRange().getBegin(),
+                            C2.getSourceRange().getEnd());
+    RawComment Merged(SourceMgr, MergedRange, true);
+    Comments.pop_back();
+    Comments.push_back(Merged);
+  } else
+    Comments.push_back(RC);
+
+  OnlyWhitespaceSeen = true;
+}
+
diff --git a/lib/Lex/Preprocessor.cpp b/lib/Lex/Preprocessor.cpp
index 5509f5f..70be2cf 100644
--- a/lib/Lex/Preprocessor.cpp
+++ b/lib/Lex/Preprocessor.cpp
@@ -623,14 +623,14 @@
                                      /*IsIncludeDirective=*/false);
 }
 
-void Preprocessor::AddCommentHandler(CommentHandler *Handler) {
+void Preprocessor::addCommentHandler(CommentHandler *Handler) {
   assert(Handler && "NULL comment handler");
   assert(std::find(CommentHandlers.begin(), CommentHandlers.end(), Handler) ==
          CommentHandlers.end() && "Comment handler already registered");
   CommentHandlers.push_back(Handler);
 }
 
-void Preprocessor::RemoveCommentHandler(CommentHandler *Handler) {
+void Preprocessor::removeCommentHandler(CommentHandler *Handler) {
   std::vector<CommentHandler *>::iterator Pos
   = std::find(CommentHandlers.begin(), CommentHandlers.end(), Handler);
   assert(Pos != CommentHandlers.end() && "Comment handler not registered");
diff --git a/lib/Makefile b/lib/Makefile
index 2eb72a9..0e81af1 100755
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -10,7 +10,7 @@
 
 PARALLEL_DIRS = Headers Basic Lex Parse AST Sema CodeGen Analysis \
                 StaticAnalyzer Edit Rewrite ARCMigrate Serialization Frontend \
-                FrontendTool Tooling Driver
+                FrontendTool Tooling Driver Comments
 
 include $(CLANG_LEVEL)/Makefile
 
diff --git a/lib/Parse/Parser.cpp b/lib/Parse/Parser.cpp
index 0d7d047..10fb8eb 100644
--- a/lib/Parse/Parser.cpp
+++ b/lib/Parse/Parser.cpp
@@ -23,6 +23,20 @@
 #include "clang/AST/ASTConsumer.h"
 using namespace clang;
 
+namespace {
+/// \brief A comment handler that passes comments found by the preprocessor
+/// to the parser action.  File-local, hence the anonymous namespace.
+class ActionCommentHandler : public CommentHandler {
+  Sema &S;
+public:
+  explicit ActionCommentHandler(Sema &S) : S(S) { }
+  virtual bool HandleComment(Preprocessor &PP, SourceRange Comment) {
+    S.ActOnComment(Comment);
+    return false;
+  }
+};
+} // end anonymous namespace
+
 IdentifierInfo *Parser::getSEHExceptKeyword() {
   // __except is accepted as a (contextual) keyword 
   if (!Ident__except && (getLangOpts().MicrosoftExt || getLangOpts().Borland))
@@ -77,7 +91,10 @@
 
     PP.AddPragmaHandler("OPENCL", FPContractHandler.get());
   }
-      
+
+  CommentHandler.reset(new ActionCommentHandler(actions));
+  PP.addCommentHandler(CommentHandler.get());
+
   PP.setCodeCompletionHandler(*this);
 }
 
@@ -422,6 +439,9 @@
 
   PP.RemovePragmaHandler("STDC", FPContractHandler.get());
   FPContractHandler.reset();
+
+  PP.removeCommentHandler(CommentHandler.get());
+
   PP.clearCodeCompletionHandler();
 
   assert(TemplateIds.empty() && "Still alive TemplateIdAnnotations around?");
diff --git a/lib/Sema/Sema.cpp b/lib/Sema/Sema.cpp
index 6323589..9e4b291 100644
--- a/lib/Sema/Sema.cpp
+++ b/lib/Sema/Sema.cpp
@@ -1014,6 +1014,11 @@
   return dyn_cast<LambdaScopeInfo>(FunctionScopes.back());  
 }
 
+void Sema::ActOnComment(SourceRange Comment) {
+  RawComment RC(SourceMgr, Comment);  // wrap the source range as a raw comment
+  Context.addComment(RC);             // and hand it over to the ASTContext
+}
+
 // Pin this vtable to this file.
 ExternalSemaSource::~ExternalSemaSource() {}
 
diff --git a/lib/Sema/SemaType.cpp b/lib/Sema/SemaType.cpp
index d6c8d92..4bd9b14 100644
--- a/lib/Sema/SemaType.cpp
+++ b/lib/Sema/SemaType.cpp
@@ -2550,7 +2550,7 @@
         //  RemovalLocs.push_back(Chunk.Fun.getRestrictQualifierLoc());
         if (!RemovalLocs.empty()) {
           std::sort(RemovalLocs.begin(), RemovalLocs.end(),
-                    SourceManager::LocBeforeThanCompare(S.getSourceManager()));
+                    BeforeThanCompare<SourceLocation>(S.getSourceManager()));
           RemovalRange = SourceRange(RemovalLocs.front(), RemovalLocs.back());
           Loc = RemovalLocs.front();
         }
diff --git a/lib/Serialization/ASTReader.cpp b/lib/Serialization/ASTReader.cpp
index eb74566..f5aa74e 100644
--- a/lib/Serialization/ASTReader.cpp
+++ b/lib/Serialization/ASTReader.cpp
@@ -1737,6 +1737,17 @@
         }
         break;
 
+      case COMMENTS_BLOCK_ID: {
+        llvm::BitstreamCursor C = Stream;
+        if (Stream.SkipBlock() ||
+            ReadBlockAbbrevs(C, COMMENTS_BLOCK_ID)) {
+          Error("malformed comments block in AST file");
+          return Failure;
+        }
+        CommentsCursors.push_back(std::make_pair(C, &F));
+        break;
+      }
+
       default:
         if (!Stream.SkipBlock())
           break;
@@ -6258,6 +6269,61 @@
   CurrSwitchCaseStmts->clear();
 }
 
+void ASTReader::ReadComments() {
+  typedef SmallVectorImpl<std::pair<llvm::BitstreamCursor,
+                                    serialization::ModuleFile *> > CursorList;
+
+  // Collect the raw comments of every saved cursor first, then hand them to
+  // the context in one go.
+  std::vector<RawComment> Comments;
+  for (CursorList::iterator I = CommentsCursors.begin(),
+                            E = CommentsCursors.end();
+       I != E; ++I) {
+    llvm::BitstreamCursor &Cursor = I->first;
+    serialization::ModuleFile &F = *I->second;
+    SavedStreamPosition SavedPosition(Cursor);
+
+    RecordData Record;
+    for (;;) {
+      const unsigned Code = Cursor.ReadCode();
+      if (Code == llvm::bitc::END_BLOCK)
+        break;
+
+      if (Code == llvm::bitc::ENTER_SUBBLOCK) {
+        // No known subblocks, always skip them.
+        Cursor.ReadSubBlockID();
+        if (Cursor.SkipBlock()) {
+          Error("malformed block record in AST file");
+          return;
+        }
+        continue;
+      }
+
+      if (Code == llvm::bitc::DEFINE_ABBREV) {
+        Cursor.ReadAbbrevRecord();
+        continue;
+      }
+
+      // Read a record; anything that is not a raw comment is ignored.
+      Record.clear();
+      if (Cursor.ReadRecord(Code, Record) != COMMENTS_RAW_COMMENT)
+        continue;
+
+      // Fields come in the order ASTWriter::WriteComments() emits them:
+      // source range, kind, trailing flag, almost-trailing flag.
+      unsigned Idx = 0;
+      const SourceRange SR = ReadSourceRange(F, Record, Idx);
+      const RawComment::CommentKind Kind =
+          (RawComment::CommentKind) Record[Idx++];
+      const bool IsTrailingComment = Record[Idx++];
+      const bool IsAlmostTrailingComment = Record[Idx++];
+      Comments.push_back(RawComment(SR, Kind, IsTrailingComment,
+                                    IsAlmostTrailingComment));
+    }
+  }
+  Context.Comments.addCommentsToFront(Comments);
+}
+
 void ASTReader::finishPendingActions() {
   while (!PendingIdentifierInfos.empty() || !PendingDeclChains.empty()) {
     // If any identifiers with corresponding top-level declarations have
diff --git a/lib/Serialization/ASTWriter.cpp b/lib/Serialization/ASTWriter.cpp
index de5816d..1f96180 100644
--- a/lib/Serialization/ASTWriter.cpp
+++ b/lib/Serialization/ASTWriter.cpp
@@ -2240,6 +2240,23 @@
   Stream.EmitRecordWithBlob(AbbrevCode, Record, data(FileSortedIDs));
 }
 
+void ASTWriter::WriteComments() {
+  // One COMMENTS_RAW_COMMENT record per comment, all inside a single block.
+  Stream.EnterSubblock(COMMENTS_BLOCK_ID, 3);
+  const ArrayRef<RawComment> All = Context->Comments.getComments();
+  RecordData Record;
+  for (size_t N = 0, Count = All.size(); N != Count; ++N) {
+    const RawComment &RC = All[N];
+    Record.clear();
+    AddSourceRange(RC.getSourceRange(), Record);
+    Record.push_back(RC.getKind());
+    Record.push_back(RC.isTrailingComment());
+    Record.push_back(RC.isAlmostTrailingComment());
+    Stream.EmitRecord(COMMENTS_RAW_COMMENT, Record);
+  }
+  Stream.ExitBlock();
+}
+
 //===----------------------------------------------------------------------===//
 // Global Method Pool and Selector Serialization
 //===----------------------------------------------------------------------===//
@@ -3415,6 +3432,7 @@
 
   WriteFileDeclIDsMap();
   WriteSourceManagerBlock(Context.getSourceManager(), PP, isysroot);
+  WriteComments();
   
   if (Chain) {
     // Write the mapping information describing our module dependencies and how