Issue #6662: Fix parsing of malformed charref (&#bad;)
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
index 2cbc2ec..7cee47a 100644
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -175,6 +175,9 @@
i = self.updatepos(i, k)
continue
else:
+ if ";" in rawdata[i:]: # malformed charref: bail by consuming the "&#" at offset i as plain data
+ self.handle_data(rawdata[i:i+2]) # emit the two characters at the current position, not the buffer start
+ i = self.updatepos(i, i+2) # step past exactly the two characters just emitted
break
elif startswith('&', i):
match = entityref.match(rawdata, i)
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 810af6c..c45cf00 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -313,6 +313,13 @@
("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")])
])
+ def test_malformatted_charref(self):
+ self._run_check("<p>&#bad;</p>", [  # "&#bad;" is not a valid charref; expect it back verbatim as data
+ ("starttag", "p", []),
+ ("data", "&#bad;"),
+ ("endtag", "p"),
+ ])
+
def test_main():
test_support.run_unittest(HTMLParserTestCase)
diff --git a/Misc/NEWS b/Misc/NEWS
index b094126..e2f0f0c 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -29,6 +29,8 @@
Library
-------
+- Issue #6662: Fix parsing of malformed charref (&#bad;)
+
- Issue #8016: Add the CP858 codec.
- Issue #3924: Ignore cookies with invalid "version" field in cookielib.