Issue #6662: Fix parsing of malformed character references (e.g. &#bad;)
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
index 2cbc2ec..7cee47a 100644
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -175,6 +175,9 @@
i = self.updatepos(i, k)
continue
else:
+ if ";" in rawdata[i:]: # malformed charref: bail by consuming only the "&#" found at offset i
+ self.handle_data(rawdata[i:i+2]) # emit the "&#" at i, not the first two characters of the buffer
+ i = self.updatepos(i, i + 2) # step past the two consumed characters, relative to i
break
elif startswith('&', i):
match = entityref.match(rawdata, i)
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 810af6c..c45cf00 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -313,6 +313,13 @@
("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")])
])
+ def test_malformatted_charref(self):
+ self._run_check("<p>&#bad;</p>", [
+ ("starttag", "p", []),
+ ("data", "&#bad;"),
+ ("endtag", "p"),
+ ])
+
def test_main():
test_support.run_unittest(HTMLParserTestCase)