Comment parsing: parse "<blah" as an HTML tag only if "blah" is a known tag
name. This should reduce the number of false-positive warnings about bad HTML
in comments when the comment author intended to put a reference to a template.
This change will also enable us to parse the comment as intended in these cases.
Fixes part 1 of PR13374.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@162407 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/AST/CommentLexer.cpp b/lib/AST/CommentLexer.cpp
index b6516ec..870db2b 100644
--- a/lib/AST/CommentLexer.cpp
+++ b/lib/AST/CommentLexer.cpp
@@ -28,6 +28,33 @@
(C >= 'a' && C <= 'f') ||
(C >= 'A' && C <= 'F');
}
+
+bool isHTMLTagName(StringRef Name) { // Is Name one of the tag names listed below?
+ return llvm::StringSwitch<bool>(Name) // NOTE(review): matching is presumably case-sensitive ("BR" != "br"); confirm.
+ .Cases("em", "strong", true)
+ .Cases("tt", "i", "b", "big", "small", true)
+ .Cases("strike", "s", "u", "font", true)
+ .Case("a", true)
+ .Case("hr", true)
+ .Cases("div", "span", true)
+ .Cases("h1", "h2", "h3", true)
+ .Cases("h4", "h5", "h6", true)
+ .Case("code", true)
+ .Case("blockquote", true)
+ .Cases("sub", "sup", true)
+ .Case("img", true)
+ .Case("p", true)
+ .Case("br", true)
+ .Case("pre", true)
+ .Cases("ins", "del", true)
+ .Cases("ul", "ol", "li", true)
+ .Cases("dl", "dt", "dd", true)
+ .Cases("table", "caption", true)
+ .Cases("thead", "tfoot", "tbody", true)
+ .Cases("colgroup", "col", true)
+ .Cases("tr", "th", "td", true)
+ .Default(false); // Unlisted name: the lexer hunks in this patch then form a text token instead of a tag token.
+}
} // unnamed namespace
StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
@@ -585,8 +612,12 @@
assert(BufferPtr[0] == '<' &&
isHTMLIdentifierStartingCharacter(BufferPtr[1]));
const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
-
StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
+ if (!isHTMLTagName(Name)) {
+ formTextToken(T, TagNameEnd);
+ return;
+ }
+
formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
T.setHTMLTagStartName(Name);
@@ -665,11 +696,16 @@
const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
+ StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
+ if (!isHTMLTagName(Name)) {
+ formTextToken(T, TagNameEnd);
+ return;
+ }
const char *End = skipWhitespace(TagNameEnd, CommentEnd);
formTokenWithChars(T, End, tok::html_end_tag);
- T.setHTMLTagEndName(StringRef(TagNameBegin, TagNameEnd - TagNameBegin));
+ T.setHTMLTagEndName(Name);
if (BufferPtr != CommentEnd && *BufferPtr == '>')
State = LS_HTMLEndTag;