Implement AST classes for comments, a real parser for Doxygen comments and a
very simple semantic analysis that just builds the AST; minor changes for lexer
to pick up source locations I didn't think about before.

The comment AST is modelled on the ideas of an HTML AST: block and inline content.

* Block content is a paragraph, a command that has a paragraph as an argument,
  or a verbatim command.
* Inline content is placed within some block.  Inline content includes plain
  text, inline commands and HTML as tag soup.



git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@159790 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/AST/CommentParser.cpp b/lib/AST/CommentParser.cpp
new file mode 100644
index 0000000..701b6fa
--- /dev/null
+++ b/lib/AST/CommentParser.cpp
@@ -0,0 +1,414 @@
+//===--- CommentParser.cpp - Doxygen comment parser -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/AST/CommentParser.h"
+#include "clang/AST/CommentSema.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace clang {
+namespace comments {
+
+Parser::Parser(Lexer &L, Sema &S, llvm::BumpPtrAllocator &Allocator)
+    : L(L), S(S), Allocator(Allocator) {
+  consumeToken(); // Prime Tok (presumably): parse methods read it right away.
+}
+
+/// Parse the arguments of a parameter command: an optional direction
+/// specification in square brackets, then an optional single word.
+///
+/// Returns the command node as updated by the Sema callbacks.
+ParamCommandComment *Parser::parseParamCommandArgs(
+    ParamCommandComment *PC,
+    TextTokenRetokenizer &Retokenizer) {
+  Token Arg;
+  // Check if argument looks like direction specification: [dir]
+  // e.g., [in], [out], [in,out]
+  if (Retokenizer.lexDelimitedSeq(Arg, '[', ']'))
+    PC = S.actOnParamCommandArg(PC, Arg.getLocation(), Arg.getEndLocation(),
+                                Arg.getText(), /* IsDirection = */ true);
+
+  // Only report a word argument if one was actually lexed; otherwise Arg is
+  // stale or default-constructed and must not be handed to Sema.
+  if (Retokenizer.lexWord(Arg)) {
+    PC = S.actOnParamCommandArg(PC, Arg.getLocation(), Arg.getEndLocation(),
+                                Arg.getText(), /* IsDirection = */ false);
+  }
+
+  return PC;
+}
+
+/// Lex up to NumArgs word arguments from Retokenizer into an array allocated
+/// from Allocator and pass the ones actually found to Sema.
+BlockCommandComment *Parser::parseBlockCommandArgs(
+    BlockCommandComment *BC,
+    TextTokenRetokenizer &Retokenizer,
+    unsigned NumArgs) {
+  typedef BlockCommandComment::Argument Argument;
+  Argument *const Storage = new (Allocator) Argument[NumArgs];
+  unsigned Count = 0;
+  // Stop at the first missing word; the tail of Storage then stays unused.
+  for (Token Word; Count < NumArgs && Retokenizer.lexWord(Word); ++Count) {
+    const SourceRange Range(Word.getLocation(), Word.getEndLocation());
+    Storage[Count] = Argument(Range, Word.getText());
+  }
+
+  return S.actOnBlockCommandArgs(BC, llvm::makeArrayRef(Storage, Count));
+}
+
+/// Parse a block command together with its arguments and the paragraph that
+/// follows it.  Expects Tok to be the tok::command token naming the command.
+BlockCommandComment *Parser::parseBlockCommand() {
+  assert(Tok.is(tok::command));
+
+  ParamCommandComment *PC = 0;
+  BlockCommandComment *BC = 0;
+  bool IsParam = false;
+  unsigned NumArgs = 0;
+  if (S.isParamCommand(Tok.getCommandName())) {
+    IsParam = true;
+    PC = S.actOnParamCommandStart(Tok.getLocation(), Tok.getEndLocation(),
+                                  Tok.getCommandName());
+  } else {
+    NumArgs = S.getBlockCommandNumArgs(Tok.getCommandName());
+    BC = S.actOnBlockCommandStart(Tok.getLocation(), Tok.getEndLocation(),
+                                  Tok.getCommandName());
+  }
+  consumeToken();
+
+  if (Tok.is(tok::command) && S.isBlockCommand(Tok.getCommandName())) {
+    // Block command ahead.  We can't nest block commands, so finish the node
+    // started above (PC or BC, never both) with an empty paragraph.
+    // TODO: Diag() Warn empty arg to block command
+    ParagraphComment *Paragraph = S.actOnParagraphComment(
+                                ArrayRef<InlineContentComment *>());
+    if (IsParam)
+      return S.actOnParamCommandFinish(PC, Paragraph);
+    return S.actOnBlockCommandFinish(BC, Paragraph);
+  }
+
+  if (IsParam || NumArgs > 0) {
+    // In order to parse command arguments we need to retokenize a few
+    // following text tokens.
+    TextTokenRetokenizer Retokenizer(Allocator);
+    // Tok only advances on success, so stop once a token is refused.
+    while (Tok.is(tok::text) && Retokenizer.addToken(Tok))
+      consumeToken();
+
+    if (IsParam)
+      PC = parseParamCommandArgs(PC, Retokenizer);
+    else
+      BC = parseBlockCommandArgs(BC, Retokenizer, NumArgs);
+
+    // Put back tokens we didn't use.
+    Token Text;
+    while (Retokenizer.lexText(Text))
+      putBack(Text);
+  }
+
+  BlockContentComment *Block = parseParagraphOrBlockCommand();
+  // Since we have checked for a block command, we should have parsed a
+  // paragraph.
+  if (IsParam)
+    return S.actOnParamCommandFinish(PC, cast<ParagraphComment>(Block));
+  return S.actOnBlockCommandFinish(BC, cast<ParagraphComment>(Block));
+}
+
+/// Parse an inline command (Tok must be its tok::command token) together with
+/// an optional one-word argument taken from the text that follows.
+InlineCommandComment *Parser::parseInlineCommand() {
+  assert(Tok.is(tok::command));
+
+  const Token CommandTok = Tok;
+  consumeToken();
+
+  TextTokenRetokenizer Retokenizer(Allocator);
+  // Tok advances only when a token is accepted, so stop on the first refusal
+  // rather than looping on the same token forever.
+  while (Tok.is(tok::text) && Retokenizer.addToken(Tok))
+    consumeToken();
+
+  Token ArgTok;
+  InlineCommandComment *IC;
+  if (Retokenizer.lexWord(ArgTok)) {
+    IC = S.actOnInlineCommand(CommandTok.getLocation(),
+                              CommandTok.getEndLocation(),
+                              CommandTok.getCommandName(),
+                              ArgTok.getLocation(), ArgTok.getEndLocation(),
+                              ArgTok.getText());
+  } else {
+    IC = S.actOnInlineCommand(CommandTok.getLocation(),
+                              CommandTok.getEndLocation(),
+                              CommandTok.getCommandName());
+  }
+
+  // Put back tokens we didn't use.
+  Token Text;
+  while (Retokenizer.lexText(Text))
+    putBack(Text);
+
+  return IC;
+}
+
+/// Parse an HTML open tag (Tok must be tok::html_tag_open): the tag name and
+/// its attributes, up to tok::html_greater or the first unrelated token.
+HTMLOpenTagComment *Parser::parseHTMLOpenTag() {
+  assert(Tok.is(tok::html_tag_open));
+  typedef HTMLOpenTagComment::Attribute Attribute;
+  HTMLOpenTagComment *Tag = S.actOnHTMLOpenTagStart(Tok.getLocation(),
+                                                    Tok.getHTMLTagOpenName());
+  consumeToken();
+
+  SmallVector<Attribute, 2> Attrs;
+  for (;;) {
+    switch (Tok.getKind()) {
+    case tok::html_ident: {
+      Token Name = Tok;
+      consumeToken();
+      if (Tok.is(tok::html_equals)) {
+        Token Equals = Tok;
+        consumeToken();
+        if (Tok.is(tok::html_quoted_string)) {
+          // Complete attribute: name, '=' and a quoted value.
+          Attrs.push_back(Attribute(Name.getLocation(), Name.getHTMLIdent(),
+                                    Equals.getLocation(),
+                                    SourceRange(Tok.getLocation(),
+                                                Tok.getEndLocation()),
+                                    Tok.getHTMLQuotedString()));
+          consumeToken();
+          continue;
+        }
+        // TODO: Diag() expected quoted string
+      }
+      // No value, or '=' without a quoted string: record the name only.
+      Attrs.push_back(Attribute(Name.getLocation(), Name.getHTMLIdent()));
+      continue;
+    }
+    case tok::html_greater:
+      Tag = S.actOnHTMLOpenTagFinish(Tag, copyArray(llvm::makeArrayRef(Attrs)),
+                                     Tok.getLocation());
+      consumeToken();
+      return Tag;
+    case tok::html_equals:
+    case tok::html_quoted_string:
+      // TODO: Diag() Err expected ident
+      do
+        consumeToken();
+      while (Tok.is(tok::html_equals) || Tok.is(tok::html_quoted_string));
+      continue;
+    default:
+      // Not a token from HTML open tag.  Thus HTML tag prematurely ended.
+      // TODO: Diag() Err HTML tag prematurely ended
+      return S.actOnHTMLOpenTagFinish(Tag,
+                                      copyArray(llvm::makeArrayRef(Attrs)),
+                                      SourceLocation());
+    }
+  }
+}
+
+/// Parse an HTML close tag.  The location passed for the closing token stays
+/// default-constructed when no tok::html_greater follows the tag name.
+HTMLCloseTagComment *Parser::parseHTMLCloseTag() {
+  assert(Tok.is(tok::html_tag_close));
+  Token CloseTag = Tok;
+  consumeToken();
+  SourceLocation GreaterLoc;
+  if (Tok.is(tok::html_greater)) {
+    GreaterLoc = Tok.getLocation();
+    consumeToken();
+  }
+  return S.actOnHTMLCloseTag(CloseTag.getLocation(), GreaterLoc,
+                             CloseTag.getHTMLTagCloseName());
+}
+
+/// Collect inline content into a paragraph.  If a block command is seen
+/// before any inline content was collected, parse and return it instead.
+/// A paragraph ends at a blank line, at EOF or before block-level content.
+BlockContentComment *Parser::parseParagraphOrBlockCommand() {
+  SmallVector<InlineContentComment *, 8> Content;
+
+  for (;;) {
+    switch (Tok.getKind()) {
+    case tok::verbatim_block_begin:
+    case tok::verbatim_line_name:
+    case tok::eof:
+      assert(Content.size() != 0);
+      break; // Block content or EOF ahead, finish this paragraph.
+
+    case tok::command:
+      if (S.isBlockCommand(Tok.getCommandName())) {
+        if (Content.empty())
+          return parseBlockCommand();
+        break; // Block command ahead, finish this paragraph.
+      }
+      if (S.isInlineCommand(Tok.getCommandName())) {
+        Content.push_back(parseInlineCommand());
+        continue;
+      }
+
+      // Not a block command, not an inline command ==> an unknown command.
+      Content.push_back(S.actOnUnknownCommand(
+          Tok.getLocation(), Tok.getEndLocation(), Tok.getCommandName()));
+      consumeToken();
+      continue;
+
+    case tok::newline:
+      consumeToken();
+      if (Tok.is(tok::newline) || Tok.is(tok::eof)) {
+        consumeToken();
+        break; // Two newlines -- end of paragraph.
+      }
+      if (!Content.empty())
+        Content.back()->addTrailingNewline();
+      continue;
+
+    // Don't deal with HTML tag soup now.
+    case tok::html_tag_open:
+      Content.push_back(parseHTMLOpenTag());
+      continue;
+
+    case tok::html_tag_close:
+      Content.push_back(parseHTMLCloseTag());
+      continue;
+
+    case tok::text:
+      Content.push_back(S.actOnText(Tok.getLocation(), Tok.getEndLocation(),
+                                    Tok.getText()));
+      consumeToken();
+      continue;
+
+    case tok::verbatim_block_line:
+    case tok::verbatim_block_end:
+    case tok::verbatim_line_text:
+    case tok::html_ident:
+    case tok::html_equals:
+    case tok::html_quoted_string:
+    case tok::html_greater:
+      llvm_unreachable("should not see this token");
+    }
+    break; // Every case that did not 'continue' ends the paragraph.
+  }
+
+  return S.actOnParagraphComment(copyArray(llvm::makeArrayRef(Content)));
+}
+
+/// Parse a verbatim block.  Expects Tok to be tok::verbatim_block_begin and
+/// collects one line node per tok::verbatim_block_line or empty line, up to
+/// the tok::verbatim_block_end token, which is asserted to follow.
+VerbatimBlockComment *Parser::parseVerbatimBlock() {
+  assert(Tok.is(tok::verbatim_block_begin));
+
+  VerbatimBlockComment *Block =
+      S.actOnVerbatimBlockStart(Tok.getLocation(), Tok.getVerbatimBlockName());
+  consumeToken();
+
+  // Don't create an empty line if verbatim opening command is followed
+  // by a newline.
+  if (Tok.is(tok::newline))
+    consumeToken();
+
+  SmallVector<VerbatimBlockLineComment *, 8> Lines;
+  for (;;) {
+    if (Tok.is(tok::verbatim_block_line)) {
+      Lines.push_back(S.actOnVerbatimBlockLine(Tok.getLocation(),
+                                               Tok.getVerbatimBlockText()));
+      consumeToken();
+      // The newline that ends this line does not make a line of its own.
+      if (Tok.is(tok::newline))
+        consumeToken();
+    } else if (Tok.is(tok::newline)) {
+      // Empty line, just a tok::newline.
+      Lines.push_back(S.actOnVerbatimBlockLine(Tok.getLocation(), ""));
+      consumeToken();
+    } else {
+      break;
+    }
+  }
+
+  assert(Tok.is(tok::verbatim_block_end));
+  Block = S.actOnVerbatimBlockFinish(Block, Tok.getLocation(),
+                                     Tok.getVerbatimBlockName(),
+                                     copyArray(llvm::makeArrayRef(Lines)));
+  consumeToken();
+
+  return Block;
+}
+
+/// Parse a verbatim line command: its name token plus, if present, the text
+/// token that follows it.
+/// Returns the node produced by S.actOnVerbatimLine.
+VerbatimLineComment *Parser::parseVerbatimLine() {
+  assert(Tok.is(tok::verbatim_line_name));
+
+  Token NameTok = Tok;
+  consumeToken();
+
+  // Next token might not be a tok::verbatim_line_text if verbatim line
+  // starting command comes just before a newline or comment end.  In that
+  // case the text is empty and starts where the command name ends.
+  const bool HasText = Tok.is(tok::verbatim_line_text);
+  SourceLocation TextBegin =
+      HasText ? Tok.getLocation() : NameTok.getEndLocation();
+  StringRef Text = HasText ? Tok.getVerbatimLineText() : StringRef("");
+
+  VerbatimLineComment *VL = S.actOnVerbatimLine(NameTok.getLocation(),
+                                                NameTok.getVerbatimLineName(),
+                                                TextBegin,
+                                                Text);
+  // NOTE: this consumes the current token even when it was not line text.
+  consumeToken();
+  return VL;
+}
+
+/// Dispatch on the current token to the parser for the block content that
+/// starts there.  Any other token kind ends in llvm_unreachable.
+BlockContentComment *Parser::parseBlockContent() {
+  switch (Tok.getKind()) {
+  case tok::verbatim_block_begin:
+    return parseVerbatimBlock();
+  case tok::verbatim_line_name:
+    return parseVerbatimLine();
+  case tok::text:
+  case tok::command:
+  case tok::html_tag_open:
+  case tok::html_tag_close:
+    return parseParagraphOrBlockCommand();
+
+  case tok::eof:
+  case tok::newline:
+  case tok::verbatim_block_line:
+  case tok::verbatim_block_end:
+  case tok::verbatim_line_text:
+  case tok::html_ident:
+  case tok::html_equals:
+  case tok::html_quoted_string:
+  case tok::html_greater:
+    llvm_unreachable("should not see this token");
+  }
+}
+
+/// Parse the whole comment: a sequence of block content items separated by
+/// any number of newlines, up to tok::eof.
+FullComment *Parser::parseFullComment() {
+  SmallVector<BlockContentComment *, 8> Blocks;
+  for (;;) {
+    // Skip newlines before the first block and after each following one.
+    while (Tok.is(tok::newline))
+      consumeToken();
+    if (Tok.is(tok::eof))
+      break;
+    Blocks.push_back(parseBlockContent());
+  }
+
+  return S.actOnFullComment(copyArray(llvm::makeArrayRef(Blocks)));
+}
+
+} // end namespace comments
+} // end namespace clang
+
+