Implement resolving of HTML character references (named: &, decimal: *,
hex: ) during comment parsing.

Now internal representation of plain text in comment AST does not contain
character references, but the characters themselves.


git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@160891 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/AST/CommentLexer.cpp b/lib/AST/CommentLexer.cpp
index 3146832..dde4845 100644
--- a/lib/AST/CommentLexer.cpp
+++ b/lib/AST/CommentLexer.cpp
@@ -1,4 +1,5 @@
 #include "clang/AST/CommentLexer.h"
+#include "clang/Basic/ConvertUTF.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/ErrorHandling.h"
 
@@ -87,6 +88,71 @@
   return false;
 }
 
+namespace {
+bool isHTMLNamedCharacterReferenceCharacter(char C) {
+  return (C >= 'a' && C <= 'z') ||
+         (C >= 'A' && C <= 'Z');
+}
+
+bool isHTMLDecimalCharacterReferenceCharacter(char C) {
+  return C >= '0' && C <= '9';
+}
+
+bool isHTMLHexCharacterReferenceCharacter(char C) {
+  return (C >= '0' && C <= '9') ||
+         (C >= 'a' && C <= 'f') ||
+         (C >= 'A' && C <= 'F');
+}
+} // unnamed namespace
+
+StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
+  return llvm::StringSwitch<StringRef>(Name)
+      .Case("amp", "&")
+      .Case("lt", "<")
+      .Case("gt", ">")
+      .Case("quot", "\"")
+      .Case("apos", "\'")
+      .Default("");
+}
+
+StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
+  unsigned CodePoint = 0;
+  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
+    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
+    CodePoint *= 10;
+    CodePoint += Name[i] - '0';
+  }
+
+  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
+  char *ResolvedPtr = Resolved;
+  if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
+    return StringRef(Resolved, ResolvedPtr - Resolved);
+  else
+    return StringRef();
+}
+
+StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
+  unsigned CodePoint = 0;
+  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
+    CodePoint *= 16;
+    const char C = Name[i];
+    assert(isHTMLHexCharacterReferenceCharacter(C));
+    if (C >= '0' && C <= '9')
+      CodePoint += Name[i] - '0';
+    else if (C >= 'a' && C <= 'f')
+      CodePoint += Name[i] - 'a' + 10;
+    else
+      CodePoint += Name[i] - 'A' + 10;
+  }
+
+  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
+  char *ResolvedPtr = Resolved;
+  if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
+    return StringRef(Resolved, ResolvedPtr - Resolved);
+  else
+    return StringRef();
+}
+
 void Lexer::skipLineStartingDecorations() {
   // This function should be called only for C comments
   assert(CommentState == LCS_InsideCComment);
@@ -147,6 +213,33 @@
   return BufferPtr;
 }
 
+const char *skipNamedCharacterReference(const char *BufferPtr,
+                                        const char *BufferEnd) {
+  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+    if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
+      return BufferPtr;
+  }
+  return BufferEnd;
+}
+
+const char *skipDecimalCharacterReference(const char *BufferPtr,
+                                          const char *BufferEnd) {
+  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+    if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
+      return BufferPtr;
+  }
+  return BufferEnd;
+}
+
+const char *skipHexCharacterReference(const char *BufferPtr,
+                                          const char *BufferEnd) {
+  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+    if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
+      return BufferPtr;
+  }
+  return BufferEnd;
+}
+
 bool isHTMLIdentifierStartingCharacter(char C) {
   return (C >= 'a' && C <= 'z') ||
          (C >= 'A' && C <= 'Z');
@@ -295,9 +388,7 @@
       case '@': {
         TokenPtr++;
         if (TokenPtr == CommentEnd) {
-          StringRef Text(BufferPtr, TokenPtr - BufferPtr);
-          formTokenWithChars(T, TokenPtr, tok::text);
-          T.setText(Text);
+          formTextToken(T, TokenPtr);
           return;
         }
         char C = *TokenPtr;
@@ -322,9 +413,7 @@
 
         // Don't make zero-length commands.
         if (!isCommandNameCharacter(*TokenPtr)) {
-          StringRef Text(BufferPtr, TokenPtr - BufferPtr);
-          formTokenWithChars(T, TokenPtr, tok::text);
-          T.setText(Text);
+          formTextToken(T, TokenPtr);
           return;
         }
 
@@ -357,12 +446,14 @@
         return;
       }
 
+      case '&':
+        lexHTMLCharacterReference(T);
+        return;
+
       case '<': {
         TokenPtr++;
         if (TokenPtr == CommentEnd) {
-          StringRef Text(BufferPtr, TokenPtr - BufferPtr);
-          formTokenWithChars(T, TokenPtr, tok::text);
-          T.setText(Text);
+          formTextToken(T, TokenPtr);
           return;
         }
         const char C = *TokenPtr;
@@ -370,11 +461,9 @@
           setupAndLexHTMLStartTag(T);
         else if (C == '/')
           setupAndLexHTMLEndTag(T);
-        else {
-          StringRef Text(BufferPtr, TokenPtr - BufferPtr);
-          formTokenWithChars(T, TokenPtr, tok::text);
-          T.setText(Text);
-        }
+        else
+          formTextToken(T, TokenPtr);
+
         return;
       }
 
@@ -394,12 +483,10 @@
             break;
           const char C = *TokenPtr;
           if(C == '\n' || C == '\r' ||
-             C == '\\' || C == '@' || C == '<')
+             C == '\\' || C == '@' || C == '&' || C == '<')
             break;
         }
-        StringRef Text(BufferPtr, TokenPtr - BufferPtr);
-        formTokenWithChars(T, TokenPtr, tok::text);
-        T.setText(Text);
+        formTextToken(T, TokenPtr);
         return;
       }
     }
@@ -506,6 +593,69 @@
   State = LS_Normal;
 }
 
+void Lexer::lexHTMLCharacterReference(Token &T) {
+  const char *TokenPtr = BufferPtr;
+  assert(*TokenPtr == '&');
+  TokenPtr++;
+  if (TokenPtr == CommentEnd) {
+    formTextToken(T, TokenPtr);
+    return;
+  }
+  const char *NamePtr;
+  bool isNamed = false;
+  bool isDecimal = false;
+  char C = *TokenPtr;
+  if (isHTMLNamedCharacterReferenceCharacter(C)) {
+    NamePtr = TokenPtr;
+    TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
+    isNamed = true;
+  } else if (C == '#') {
+    TokenPtr++;
+    if (TokenPtr == CommentEnd) {
+      formTextToken(T, TokenPtr);
+      return;
+    }
+    C = *TokenPtr;
+    if (isHTMLDecimalCharacterReferenceCharacter(C)) {
+      NamePtr = TokenPtr;
+      TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
+      isDecimal = true;
+    } else if (C == 'x' || C == 'X') {
+      TokenPtr++;
+      NamePtr = TokenPtr;
+      TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
+    } else {
+      formTextToken(T, TokenPtr);
+      return;
+    }
+  } else {
+    formTextToken(T, TokenPtr);
+    return;
+  }
+  if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
+      *TokenPtr != ';') {
+    formTextToken(T, TokenPtr);
+    return;
+  }
+  StringRef Name(NamePtr, TokenPtr - NamePtr);
+  TokenPtr++; // Skip semicolon.
+  StringRef Resolved;
+  if (isNamed)
+    Resolved = resolveHTMLNamedCharacterReference(Name);
+  else if (isDecimal)
+    Resolved = resolveHTMLDecimalCharacterReference(Name);
+  else
+    Resolved = resolveHTMLHexCharacterReference(Name);
+
+  if (Resolved.empty()) {
+    formTextToken(T, TokenPtr);
+    return;
+  }
+  formTokenWithChars(T, TokenPtr, tok::text);
+  T.setText(Resolved);
+  return;
+}
+
 void Lexer::setupAndLexHTMLStartTag(Token &T) {
   assert(BufferPtr[0] == '<' &&
          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
@@ -561,11 +711,9 @@
       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
         TokenPtr++;
         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
-      } else {
-        StringRef Text(BufferPtr, TokenPtr - BufferPtr);
-        formTokenWithChars(T, TokenPtr, tok::text);
-        T.setText(Text);
-      }
+      } else
+        formTextToken(T, TokenPtr);
+
       State = LS_Normal;
       return;
     }
@@ -609,8 +757,10 @@
   State = LS_Normal;
 }
 
-Lexer::Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts,
+Lexer::Lexer(llvm::BumpPtrAllocator &Allocator,
+             SourceLocation FileLoc, const CommentOptions &CommOpts,
              const char *BufferStart, const char *BufferEnd):
+    Allocator(Allocator),
     BufferStart(BufferStart), BufferEnd(BufferEnd),
     FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart),
     CommentState(LCS_BeforeComment), State(LS_Normal) {