Implement AST classes for comments, a real parser for Doxygen comments, and a
very simple semantic analysis that just builds the AST; also make minor changes
to the lexer to pick up source locations I had not accounted for before.

The comment AST is modelled after the HTML AST: block and inline content.

* Block content is a paragraph, a command that has a paragraph as an argument,
  or a verbatim command.
* Inline content is placed within some block.  Inline content includes plain
  text, inline commands and HTML as tag soup.



git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@159790 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/AST/CommentLexer.cpp b/lib/AST/CommentLexer.cpp
index c3a801d..77d2a9b 100644
--- a/lib/AST/CommentLexer.cpp
+++ b/lib/AST/CommentLexer.cpp
@@ -122,6 +122,7 @@
 }
 
 namespace {
+/// Returns pointer to the first newline character in the string.
 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
     const char C = *BufferPtr;
@@ -270,6 +271,9 @@
   case LS_HTMLOpenTag:
     lexHTMLOpenTag(T);
     return;
+  case LS_HTMLCloseTag:
+    lexHTMLCloseTag(T);
+    return;
   }
 
   assert(State == LS_Normal);
@@ -356,7 +360,7 @@
         if (isHTMLIdentifierCharacter(C))
           setupAndLexHTMLOpenTag(T);
         else if (C == '/')
-          lexHTMLCloseTag(T);
+          setupAndLexHTMLCloseTag(T);
         else {
           StringRef Text(BufferPtr, TokenPtr - BufferPtr);
           formTokenWithChars(T, TokenPtr, tok::text);
@@ -404,6 +408,18 @@
   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
   T.setVerbatimBlockName(Name);
 
+  // If there is a newline following the verbatim opening command, skip the
+  // newline so that we don't create a tok::verbatim_block_line with empty
+  // text content.
+  if (BufferPtr != CommentEnd) {
+    const char C = *BufferPtr;
+    if (C == '\n' || C == '\r') {
+      BufferPtr = skipNewline(BufferPtr, CommentEnd);
+      State = LS_VerbatimBlockBody;
+      return;
+    }
+  }
+
   State = LS_VerbatimBlockFirstLine;
 }
 
@@ -419,9 +435,11 @@
 
   // Look for end command in current line.
   size_t Pos = Line.find(VerbatimBlockEndCommandName);
+  const char *TextEnd;
   const char *NextLine;
   if (Pos == StringRef::npos) {
     // Current line is completely verbatim.
+    TextEnd = Newline;
     NextLine = skipNewline(Newline, CommentEnd);
   } else if (Pos == 0) {
     // Current line contains just an end command.
@@ -433,10 +451,11 @@
     return;
   } else {
     // There is some text, followed by end command.  Extract text first.
-    NextLine = BufferPtr + Pos;
+    TextEnd = BufferPtr + Pos;
+    NextLine = TextEnd;
   }
 
-  StringRef Text(BufferPtr, NextLine - BufferPtr);
+  StringRef Text(BufferPtr, TextEnd - BufferPtr);
   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
   T.setVerbatimBlockText(Text);
 
@@ -542,18 +561,26 @@
   }
 }
 
-void Lexer::lexHTMLCloseTag(Token &T) {
+void Lexer::setupAndLexHTMLCloseTag(Token &T) {
   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
 
   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
 
   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
-  if (End != CommentEnd && *End == '>')
-    End++;
 
   formTokenWithChars(T, End, tok::html_tag_close);
   T.setHTMLTagCloseName(StringRef(TagNameBegin, TagNameEnd - TagNameBegin));
+
+  if (BufferPtr != CommentEnd && *BufferPtr == '>')
+    State = LS_HTMLCloseTag;
+}
+
+void Lexer::lexHTMLCloseTag(Token &T) {
+  assert(BufferPtr != CommentEnd && *BufferPtr == '>');
+
+  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
+  State = LS_Normal;
 }
 
 Lexer::Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts,
@@ -595,7 +622,8 @@
         BufferPtr++;
 
       CommentState = LCS_InsideBCPLComment;
-      State = LS_Normal;
+      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
+        State = LS_Normal;
       CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
       goto again;
     }
@@ -628,7 +656,7 @@
       EndWhitespace++;
 
     // Turn any whitespace between comments (and there is only whitespace
-    // between them) into a newline.  We have two newlines between comments
+    // between them) into a newline.  We have two newlines between C comments
     // in total (first one was synthesized after a comment).
     formTokenWithChars(T, EndWhitespace, tok::newline);