Implement resolving of HTML character references (named: &, decimal: *,
hex: ) during comment parsing.
Now internal representation of plain text in comment AST does not contain
character references, but the characters themselves.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@160891 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/unittests/AST/CommentLexer.cpp b/unittests/AST/CommentLexer.cpp
index dd92df4..8b5d0c8 100644
--- a/unittests/AST/CommentLexer.cpp
+++ b/unittests/AST/CommentLexer.cpp
@@ -37,6 +37,7 @@
IntrusiveRefCntPtr<DiagnosticIDs> DiagID;
DiagnosticsEngine Diags;
SourceManager SourceMgr;
+ llvm::BumpPtrAllocator Allocator;
void lexString(const char *Source, std::vector<Token> &Toks);
};
@@ -47,7 +48,7 @@
FileID File = SourceMgr.createFileIDForMemBuffer(Buf);
SourceLocation Begin = SourceMgr.getLocForStartOfFile(File);
- comments::Lexer L(Begin, CommentOptions(),
+ comments::Lexer L(Allocator, Begin, CommentOptions(),
Source, Source + strlen(Source));
while (1) {
@@ -1272,6 +1273,324 @@
}
}
+TEST_F(CommentLexerTest, HTMLCharacterReferences1) {
+ const char *Source = "// &";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(3U, Toks.size());
+
+ ASSERT_EQ(tok::text, Toks[0].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef("&"), Toks[1].getText());
+
+ ASSERT_EQ(tok::newline, Toks[2].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences2) {
+ const char *Source = "// &!";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(4U, Toks.size());
+
+ ASSERT_EQ(tok::text, Toks[0].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef("&"), Toks[1].getText());
+
+ ASSERT_EQ(tok::text, Toks[2].getKind());
+ ASSERT_EQ(StringRef("!"), Toks[2].getText());
+
+ ASSERT_EQ(tok::newline, Toks[3].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences3) {
+ const char *Source = "// &";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(3U, Toks.size());
+
+ ASSERT_EQ(tok::text, Toks[0].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef("&"), Toks[1].getText());
+
+ ASSERT_EQ(tok::newline, Toks[2].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences4) {
+ const char *Source = "// &!";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(4U, Toks.size());
+
+ ASSERT_EQ(tok::text, Toks[0].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef("&"), Toks[1].getText());
+
+ ASSERT_EQ(tok::text, Toks[2].getKind());
+ ASSERT_EQ(StringRef("!"), Toks[2].getText());
+
+ ASSERT_EQ(tok::newline, Toks[3].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences5) {
+ const char *Source = "// &#";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(3U, Toks.size());
+
+ ASSERT_EQ(tok::text, Toks[0].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef("&#"), Toks[1].getText());
+
+ ASSERT_EQ(tok::newline, Toks[2].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences6) {
+ const char *Source = "// &#a";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(4U, Toks.size());
+
+ ASSERT_EQ(tok::text, Toks[0].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef("&#"), Toks[1].getText());
+
+ ASSERT_EQ(tok::text, Toks[2].getKind());
+ ASSERT_EQ(StringRef("a"), Toks[2].getText());
+
+ ASSERT_EQ(tok::newline, Toks[3].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences7) {
+ const char *Source = "// *";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(3U, Toks.size());
+
+ ASSERT_EQ(tok::text, Toks[0].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef("*"), Toks[1].getText());
+
+ ASSERT_EQ(tok::newline, Toks[2].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences8) {
+ const char *Source = "// *a";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(4U, Toks.size());
+
+ ASSERT_EQ(tok::text, Toks[0].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef("*"), Toks[1].getText());
+
+ ASSERT_EQ(tok::text, Toks[2].getKind());
+ ASSERT_EQ(StringRef("a"), Toks[2].getText());
+
+ ASSERT_EQ(tok::newline, Toks[3].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences9) {
+ const char *Source = "// &#x";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(3U, Toks.size());
+
+ ASSERT_EQ(tok::text, Toks[0].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef("&#x"), Toks[1].getText());
+
+ ASSERT_EQ(tok::newline, Toks[2].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences10) {
+ const char *Source = "// &#xz";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(4U, Toks.size());
+
+ ASSERT_EQ(tok::text, Toks[0].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef("&#x"), Toks[1].getText());
+
+ ASSERT_EQ(tok::text, Toks[2].getKind());
+ ASSERT_EQ(StringRef("z"), Toks[2].getText());
+
+ ASSERT_EQ(tok::newline, Toks[3].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences11) {
+ const char *Source = "// «";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(3U, Toks.size());
+
+ ASSERT_EQ(tok::text, Toks[0].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef("«"), Toks[1].getText());
+
+ ASSERT_EQ(tok::newline, Toks[2].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences12) {
+ const char *Source = "// «z";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(4U, Toks.size());
+
+ ASSERT_EQ(tok::text, Toks[0].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef("«"), Toks[1].getText());
+
+ ASSERT_EQ(tok::text, Toks[2].getKind());
+ ASSERT_EQ(StringRef("z"), Toks[2].getText());
+
+ ASSERT_EQ(tok::newline, Toks[3].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences13) {
+ const char *Source = "// &";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(3U, Toks.size());
+
+ ASSERT_EQ(tok::text, Toks[0].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef("&"), Toks[1].getText());
+
+ ASSERT_EQ(tok::newline, Toks[2].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences14) {
+ const char *Source = "// &<";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(4U, Toks.size());
+
+ ASSERT_EQ(tok::text, Toks[0].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef("&"), Toks[1].getText());
+
+ ASSERT_EQ(tok::text, Toks[2].getKind());
+ ASSERT_EQ(StringRef("<"), Toks[2].getText());
+
+ ASSERT_EQ(tok::newline, Toks[3].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences15) {
+ const char *Source = "// & meow";
+
+ std::vector<Token> Toks;
+
+ lexString(Source, Toks);
+
+ ASSERT_EQ(4U, Toks.size());
+
+ ASSERT_EQ(tok::text, Toks[0].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef("&"), Toks[1].getText());
+
+ ASSERT_EQ(tok::text, Toks[2].getKind());
+ ASSERT_EQ(StringRef(" meow"), Toks[2].getText());
+
+ ASSERT_EQ(tok::newline, Toks[3].getKind());
+}
+
+TEST_F(CommentLexerTest, HTMLCharacterReferences16) {
+ const char *Sources[] = {
+ "// =",
+ "// =",
+ "// ="
+ };
+
+ for (size_t i = 0, e = array_lengthof(Sources); i != e; i++) {
+ std::vector<Token> Toks;
+
+ lexString(Sources[i], Toks);
+
+ ASSERT_EQ(3U, Toks.size());
+
+ ASSERT_EQ(tok::text, Toks[0].getKind());
+ ASSERT_EQ(StringRef(" "), Toks[0].getText());
+
+ ASSERT_EQ(tok::text, Toks[1].getKind());
+ ASSERT_EQ(StringRef("="), Toks[1].getText());
+
+ ASSERT_EQ(tok::newline, Toks[2].getKind());
+ }
+}
+
TEST_F(CommentLexerTest, MultipleComments) {
const char *Source =
"// Aaa\n"