blob: f0f8ad6d322f6b56e66252e52dfaf349254f918c [file] [log] [blame]
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001import htmlentitydefs
Fred Drake19ff4ac2001-07-16 18:52:40 +00002import pprint
Thomas Wouters0e3f5912006-08-11 14:57:12 +00003import re
Fred Drake19ff4ac2001-07-16 18:52:40 +00004import sgmllib
Fred Drake19ff4ac2001-07-16 18:52:40 +00005import unittest
Barry Warsaw04f357c2002-07-23 19:04:11 +00006from test import test_support
Fred Drake19ff4ac2001-07-16 18:52:40 +00007
8
class EventCollector(sgmllib.SGMLParser):
    """SGML parser that logs every handler callback as an event tuple.

    Each handler records a tuple whose first item names the kind of
    event ("starttag", "endtag", "comment", "charref", "data", "decl",
    "entityref", "pi" or "unknown decl"), followed by the arguments the
    handler received.
    """

    def __init__(self):
        self.events = []
        sgmllib.SGMLParser.__init__(self)

    def append(self, event):
        """Record *event* on the current event list.

        This is a method rather than a cached ``self.events.append``
        because get_events() rebinds ``self.events``; a cached bound
        method would keep appending to the discarded list.
        """
        self.events.append(event)

    def get_events(self):
        """Return the recorded events with adjacent "data" events merged.

        The merged list also replaces ``self.events``.
        """
        # Normalize the list of events so that buffer artefacts don't
        # separate runs of contiguous characters.
        merged = []
        previous_kind = None
        for event in self.events:
            kind = event[0]
            if kind == previous_kind == "data":
                merged[-1] = ("data", merged[-1][1] + event[1])
            else:
                merged.append(event)
            previous_kind = kind
        self.events = merged
        return merged

    # structure markup

    def unknown_starttag(self, tag, attrs):
        self.append(("starttag", tag, attrs))

    def unknown_endtag(self, tag):
        self.append(("endtag", tag))

    # all other markup

    def handle_comment(self, data):
        self.append(("comment", data))

    def handle_charref(self, data):
        self.append(("charref", data))

    def handle_data(self, data):
        self.append(("data", data))

    def handle_decl(self, decl):
        self.append(("decl", decl))

    def handle_entityref(self, data):
        self.append(("entityref", data))

    def handle_pi(self, data):
        self.append(("pi", data))

    def unknown_decl(self, decl):
        self.append(("unknown decl", decl))
61
Fred Drake19ff4ac2001-07-16 18:52:40 +000062
class CDATAEventCollector(EventCollector):
    """Event collector with a dedicated start handler for <cdata> tags."""

    def start_cdata(self, attrs):
        """Record the <cdata> start tag, then call self.setliteral()."""
        event = ("starttag", "cdata", attrs)
        self.append(event)
        self.setliteral()
67
68
class HTMLEntityCollector(EventCollector):
    """Event collector that also logs calls to the convert_* hooks."""

    # Matches a named entity reference, or a character reference that is
    # either decimal or prefixed with "x"; the trailing ";" is optional.
    entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
                                   '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)')

    def convert_charref(self, name):
        """Log the call; delegate unless *name* starts with "x"."""
        self.append(("charref", "convert", name))
        if name[0] == "x":
            # "x"-prefixed references are logged but not converted.
            return None
        return EventCollector.convert_charref(self, name)

    def convert_codepoint(self, codepoint):
        """Log the call and invoke the inherited conversion.

        The inherited result is discarded, so this returns None.
        """
        self.append(("codepoint", "convert", codepoint))
        EventCollector.convert_codepoint(self, codepoint)

    def convert_entityref(self, name):
        """Log the call and return the inherited conversion result."""
        self.append(("entityref", "convert", name))
        converted = EventCollector.convert_entityref(self, name)
        return converted

    # The two handlers below record that they were called, then pass the
    # call along to the sgmllib default implementation so that its
    # actions can be recorded as well.

    def handle_charref(self, data):
        event = ("charref", data)
        self.append(event)
        sgmllib.SGMLParser.handle_charref(self, data)

    def handle_entityref(self, data):
        event = ("entityref", data)
        self.append(event)
        sgmllib.SGMLParser.handle_entityref(self, data)
98
99
class SGMLParserTestCase(unittest.TestCase):
    """Tests that compare the events a collector records with expectations."""

    # Parser class instantiated by the helper methods.  Individual tests
    # override it on the instance when they need a specialised collector.
    collector = EventCollector

    def get_events(self, source):
        """Feed *source* to a fresh collector and return its events.

        *source* is iterated and every item is fed separately, so a plain
        string is fed one character at a time while a list of strings is
        fed chunk by chunk.
        """
        parser = self.collector()
        for chunk in source:
            parser.feed(chunk)
        parser.close()
        return parser.get_events()

    def check_events(self, source, expected_events):
        """Fail unless parsing *source* yields exactly *expected_events*."""
        events = self.get_events(source)
        if events != expected_events:
            self.fail("received events did not match expected events\n"
                      "Expected:\n" + pprint.pformat(expected_events) +
                      "\nReceived:\n" + pprint.pformat(events))

    def check_parse_error(self, source):
        """Fail unless feeding *source* raises sgmllib.SGMLParseError.

        Unlike get_events(), *source* is fed to the parser in one call.
        """
        parser = self.collector()
        try:
            parser.feed(source)
            parser.close()
        except sgmllib.SGMLParseError:
            pass
        else:
            self.fail("expected SGMLParseError for %r\nReceived:\n%s"
                      % (source, pprint.pformat(parser.get_events())))

    def test_doctype_decl_internal(self):
        inside = """\
DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
             SYSTEM 'http://www.w3.org/TR/html401/strict.dtd' [
  <!ELEMENT html - O EMPTY>
  <!ATTLIST html
      version CDATA #IMPLIED
      profile CDATA 'DublinCore'>
  <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
  <!ENTITY myEntity 'internal parsed entity'>
  <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
  <!ENTITY % paramEntity 'name|name|name'>
  %paramEntity;
  <!-- comment -->
]"""
        self.check_events(["<!%s>" % inside], [
            ("decl", inside),
            ])

    def test_doctype_decl_external(self):
        inside = "DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'"
        self.check_events("<!%s>" % inside, [
            ("decl", inside),
            ])

    def test_underscore_in_attrname(self):
        # SF bug #436621
        """Make sure attribute names with underscores are accepted"""
        self.check_events("<a has_under _under>", [
            ("starttag", "a", [("has_under", "has_under"),
                               ("_under", "_under")]),
            ])

    def test_underscore_in_tagname(self):
        # SF bug #436621
        """Make sure tag names with underscores are accepted"""
        self.check_events("<has_under></has_under>", [
            ("starttag", "has_under", []),
            ("endtag", "has_under"),
            ])

    def test_quotes_in_unquoted_attrs(self):
        # SF bug #436621
        """Be sure quotes in unquoted attributes are made part of the value"""
        self.check_events("<a href=foo'bar\"baz>", [
            ("starttag", "a", [("href", "foo'bar\"baz")]),
            ])

    def test_xhtml_empty_tag(self):
        """Handling of XHTML-style empty start tags"""
        self.check_events("<br />text<i></i>", [
            ("starttag", "br", []),
            ("data", "text"),
            ("starttag", "i", []),
            ("endtag", "i"),
            ])

    def test_processing_instruction_only(self):
        self.check_events("<?processing instruction>", [
            ("pi", "processing instruction"),
            ])

    def test_bad_nesting(self):
        self.check_events("<a><b></a></b>", [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
            ])

    def test_bare_ampersands(self):
        self.check_events("this text & contains & ampersands &", [
            ("data", "this text & contains & ampersands &"),
            ])

    def test_bare_pointy_brackets(self):
        self.check_events("this < text > contains < bare>pointy< brackets", [
            ("data", "this < text > contains < bare>pointy< brackets"),
            ])

    def test_attr_syntax(self):
        output = [
          ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", "e")])
          ]
        self.check_events("""<a b='v' c="v" d=v e>""", output)
        self.check_events("""<a b = 'v' c = "v" d = v e>""", output)
        self.check_events("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
        self.check_events("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)

    def test_attr_values(self):
        self.check_events("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
                          [("starttag", "a", [("b", "xxx\n\txxx"),
                                              ("c", "yyy\t\nyyy"),
                                              ("d", "\txyz\n")])
                           ])
        self.check_events("""<a b='' c="">""", [
            ("starttag", "a", [("b", ""), ("c", "")]),
            ])
        # URL construction stuff from RFC 1808:
        safe = "$-_.+"
        extra = "!*'(),"
        reserved = ";/?:@&="
        url = "http://example.com:8080/path/to/file?%s%s%s" % (
            safe, extra, reserved)
        self.check_events("""<e a=%s>""" % url, [
            ("starttag", "e", [("a", url)]),
            ])
        # Regression test for SF patch #669683.
        self.check_events("<e a=rgb(1,2,3)>", [
            ("starttag", "e", [("a", "rgb(1,2,3)")]),
            ])

    def test_attr_values_entities(self):
        """Substitution of entities and charrefs in attribute values"""
        # SF bug #1452246
        self.check_events("""<a b=&lt; c=&lt;&gt; d=&lt-&gt; e='&lt; '
                                f="&xxx;" g='&#32;&#33;' h='&#500;'
                                i='x?a=b&c=d;'
                                j='&amp;#42;' k='&#38;#42;'>""",
            [("starttag", "a", [("b", "<"),
                                ("c", "<>"),
                                ("d", "&lt->"),
                                ("e", "< "),
                                ("f", "&xxx;"),
                                ("g", " !"),
                                ("h", "&#500;"),
                                ("i", "x?a=b&c=d;"),
                                ("j", "&#42;"),
                                ("k", "&#42;"),
                                ])])

    def test_convert_overrides(self):
        # This checks that the character and entity reference
        # conversion helpers are called at the documented times.  No
        # attempt is made to really change what the parser accepts.
        #
        self.collector = HTMLEntityCollector
        self.check_events(('<a title="&ldquo;test&#x201d;">foo</a>'
                           '&foobar;&#42;'), [
            ('entityref', 'convert', 'ldquo'),
            ('charref', 'convert', 'x201d'),
            ('starttag', 'a', [('title', '&ldquo;test&#x201d;')]),
            ('data', 'foo'),
            ('endtag', 'a'),
            ('entityref', 'foobar'),
            ('entityref', 'convert', 'foobar'),
            ('charref', '42'),
            ('charref', 'convert', '42'),
            ('codepoint', 'convert', 42),
            ])

    def test_attr_funky_names(self):
        self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
            ])

    def test_attr_value_ip6_url(self):
        # http://www.python.org/sf/853506
        self.check_events(("<a href='http://[1080::8:800:200C:417A]/'>"
                           "<a href=http://[1080::8:800:200C:417A]/>"), [
            ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
            ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
            ])

    def test_illegal_declarations(self):
        s = 'abc<!spacer type="block" height="25">def'
        self.check_events(s, [
            ("data", "abc"),
            ("unknown decl", 'spacer type="block" height="25"'),
            ("data", "def"),
            ])

    def test_weird_starttags(self):
        self.check_events("<a<a>", [
            ("starttag", "a", []),
            ("starttag", "a", []),
            ])
        self.check_events("</a<a>", [
            ("endtag", "a"),
            ("starttag", "a", []),
            ])

    def test_declaration_junk_chars(self):
        self.check_parse_error("<!DOCTYPE foo $ >")

    def test_get_starttag_text(self):
        s = """<foobar \n one="1"\ttwo=2 >"""
        self.check_events(s, [
            ("starttag", "foobar", [("one", "1"), ("two", "2")]),
            ])

    def test_cdata_content(self):
        s = ("<cdata> <!-- not a comment --> &not-an-entity-ref; </cdata>"
             "<notcdata> <!-- comment --> </notcdata>")
        self.collector = CDATAEventCollector
        self.check_events(s, [
            ("starttag", "cdata", []),
            ("data", " <!-- not a comment --> &not-an-entity-ref; "),
            ("endtag", "cdata"),
            ("starttag", "notcdata", []),
            ("data", " "),
            ("comment", " comment "),
            ("data", " "),
            ("endtag", "notcdata"),
            ])
        s = """<cdata> <not a='start tag'> </cdata>"""
        self.check_events(s, [
            ("starttag", "cdata", []),
            ("data", " <not a='start tag'> "),
            ("endtag", "cdata"),
            ])

    # NOTE: a second, byte-for-byte identical definition of
    # test_illegal_declarations used to follow here; it silently replaced
    # the one above and has been removed.

    def test_enumerated_attr_type(self):
        s = "<!DOCTYPE doc [<!ATTLIST doc attr (a | b) >]>"
        self.check_events(s, [
            ('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a | b) >]'),
            ])

    def test_read_chunks(self):
        # SF bug #1541697, this caused sgml parser to hang
        # Just verify this code doesn't cause a hang.
        CHUNK = 1024  # increasing this to 8212 makes the problem go away

        # The with-block guarantees the input file is closed even if
        # feeding the parser raises.
        with open(test_support.findfile('sgml_input.html'),
                  encoding="latin-1") as f:
            fp = sgmllib.SGMLParser()
            while True:
                data = f.read(CHUNK)
                fp.feed(data)
                # A short read means the end of the file was reached.
                if len(data) != CHUNK:
                    break

    # XXX These tests have been disabled by prefixing their names with
    # an underscore.  The first two exercise outstanding bugs in the
    # sgmllib module, and the third exhibits questionable behavior
    # that needs to be carefully considered before changing it.

    def _test_starttag_end_boundary(self):
        self.check_events("<a b='<'>", [("starttag", "a", [("b", "<")])])
        self.check_events("<a b='>'>", [("starttag", "a", [("b", ">")])])

    def _test_buffer_artefacts(self):
        output = [("starttag", "a", [("b", "<")])]
        self.check_events(["<a b='<'>"], output)
        self.check_events(["<a ", "b='<'>"], output)
        self.check_events(["<a b", "='<'>"], output)
        self.check_events(["<a b=", "'<'>"], output)
        self.check_events(["<a b='<", "'>"], output)
        self.check_events(["<a b='<'", ">"], output)

        output = [("starttag", "a", [("b", ">")])]
        self.check_events(["<a b='>'>"], output)
        self.check_events(["<a ", "b='>'>"], output)
        self.check_events(["<a b", "='>'>"], output)
        self.check_events(["<a b=", "'>'>"], output)
        self.check_events(["<a b='>", "'>"], output)
        self.check_events(["<a b='>'", ">"], output)

        output = [("comment", "abc")]
        self.check_events(["", "<!--abc-->"], output)
        self.check_events(["<", "!--abc-->"], output)
        self.check_events(["<!", "--abc-->"], output)
        self.check_events(["<!-", "-abc-->"], output)
        self.check_events(["<!--", "abc-->"], output)
        self.check_events(["<!--a", "bc-->"], output)
        self.check_events(["<!--ab", "c-->"], output)
        self.check_events(["<!--abc", "-->"], output)
        self.check_events(["<!--abc-", "->"], output)
        self.check_events(["<!--abc--", ">"], output)
        self.check_events(["<!--abc-->", ""], output)

    def _test_starttag_junk_chars(self):
        self.check_parse_error("<")
        self.check_parse_error("<>")
        self.check_parse_error("</$>")
        self.check_parse_error("</")
        self.check_parse_error("</a")
        self.check_parse_error("<$")
        self.check_parse_error("<$>")
        self.check_parse_error("<!")
        self.check_parse_error("<a $>")
        self.check_parse_error("<a")
        self.check_parse_error("<a foo='bar'")
        self.check_parse_error("<a foo='bar")
        self.check_parse_error("<a foo='>'")
        self.check_parse_error("<a foo='>")
        self.check_parse_error("<a foo=>")
432
433
def test_main():
    """Run SGMLParserTestCase through test_support.run_unittest()."""
    case_classes = (SGMLParserTestCase,)
    test_support.run_unittest(*case_classes)
436
437
# Run the tests when this module is executed directly as a script.
if __name__ == '__main__':
    test_main()