blob: 28a21a466bb7cdece529e14742e5ed7b9ba5a3b6 [file] [log] [blame]
Fred Drake2f99da62006-06-23 06:03:45 +00001import htmlentitydefs
Fred Drake19ff4ac2001-07-16 18:52:40 +00002import pprint
Fred Drake2f99da62006-06-23 06:03:45 +00003import re
Fred Drake19ff4ac2001-07-16 18:52:40 +00004import sgmllib
Fred Drake19ff4ac2001-07-16 18:52:40 +00005import unittest
Barry Warsaw04f357c2002-07-23 19:04:11 +00006from test import test_support
Fred Drake19ff4ac2001-07-16 18:52:40 +00007
8
class EventCollector(sgmllib.SGMLParser):
    """SGML parser that records each callback as a tuple in ``self.events``.

    Every handler appends a tuple whose first item names the kind of
    event; the remaining items are the arguments the parser passed in.
    """

    def __init__(self):
        # The event list and its bound ``append`` are set up before the
        # base-class initialiser runs.
        recorded = []
        self.events = recorded
        self.append = recorded.append
        sgmllib.SGMLParser.__init__(self)

    def get_events(self):
        """Return the recorded events with adjacent "data" events merged.

        The merged list also replaces ``self.events``.
        """
        # Buffer artefacts can split one run of contiguous characters
        # into several "data" events; join each run back together.
        merged = []
        for event in self.events:
            kind = event[0]
            if kind == "data" and merged and merged[-1][0] == "data":
                previous = merged.pop()
                merged.append(("data", previous[1] + event[1]))
            else:
                merged.append(event)
        self.events = merged
        return merged

    # structure markup

    def unknown_starttag(self, tag, attrs):
        """Record a start tag together with its attribute list."""
        self.append(("starttag", tag, attrs))

    def unknown_endtag(self, tag):
        """Record an end tag."""
        self.append(("endtag", tag))

    # all other markup

    def handle_comment(self, data):
        """Record the text of a comment."""
        self.append(("comment", data))

    def handle_charref(self, data):
        """Record a character reference without converting it."""
        self.append(("charref", data))

    def handle_data(self, data):
        """Record a chunk of character data."""
        self.append(("data", data))

    def handle_decl(self, decl):
        """Record the text of a declaration."""
        self.append(("decl", decl))

    def handle_entityref(self, data):
        """Record an entity reference without converting it."""
        self.append(("entityref", data))

    def handle_pi(self, data):
        """Record the text of a processing instruction."""
        self.append(("pi", data))

    def unknown_decl(self, decl):
        """Record a declaration reported through unknown_decl()."""
        self.append(("unknown decl", decl))
61
Fred Drake19ff4ac2001-07-16 18:52:40 +000062
class CDATAEventCollector(EventCollector):
    """Collector with a dedicated handler for ``<cdata>`` start tags."""

    def start_cdata(self, attrs):
        """Record the ``<cdata>`` start tag, then call ``setliteral()``."""
        event = ("starttag", "cdata", attrs)
        self.append(event)
        self.setliteral()
67
68
class HTMLEntityCollector(EventCollector):
    """Collector that also records calls to the ``convert_*`` hooks."""

    entity_or_charref = re.compile(
        '(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
        '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)'
    )

    def convert_charref(self, name):
        """Record the call; delegate unless *name* starts with "x"."""
        self.append(("charref", "convert", name))
        if name[0] == "x":
            # References starting with "x" are recorded only; they are
            # not handed to the inherited conversion.
            return None
        return EventCollector.convert_charref(self, name)

    def convert_codepoint(self, codepoint):
        """Record the call and invoke the inherited implementation."""
        self.append(("codepoint", "convert", codepoint))
        # The inherited result is not returned to the caller.
        EventCollector.convert_codepoint(self, codepoint)

    def convert_entityref(self, name):
        """Record the call and return the inherited conversion result."""
        self.append(("entityref", "convert", name))
        return EventCollector.convert_entityref(self, name)

    # These record that they were called, then pass the call along to
    # the default implementation so that its actions can be recorded.

    def handle_charref(self, data):
        """Record the reference, then run sgmllib's default handling."""
        self.append(("charref", data))
        sgmllib.SGMLParser.handle_charref(self, data)

    def handle_entityref(self, data):
        """Record the reference, then run sgmllib's default handling."""
        self.append(("entityref", data))
        sgmllib.SGMLParser.handle_entityref(self, data)
Fred Drakefab461a2006-06-16 23:45:06 +000098
99
class SGMLParserTestCase(unittest.TestCase):
    """Event-level tests for sgmllib.SGMLParser.

    Each test feeds markup to a collector parser and compares the list
    of recorded events with the expected list.
    """

    # Parser class instantiated by get_events(); individual tests rebind
    # this on the instance when they need a specialised collector.
    collector = EventCollector

    def get_events(self, source):
        """Feed every item of *source* to a new collector; return its events.

        *source* is iterated, so a plain string is fed one character at
        a time while a list of strings is fed chunk by chunk.
        """
        parser = self.collector()
        for s in source:
            parser.feed(s)
        parser.close()
        return parser.get_events()

    def check_events(self, source, expected_events):
        """Fail unless parsing *source* produces exactly *expected_events*."""
        events = self.get_events(source)
        if events != expected_events:
            self.fail("received events did not match expected events\n"
                      "Expected:\n" + pprint.pformat(expected_events) +
                      "\nReceived:\n" + pprint.pformat(events))

    def check_parse_error(self, source):
        """Fail unless feeding *source* raises sgmllib.SGMLParseError."""
        parser = EventCollector()
        try:
            parser.feed(source)
            parser.close()
        except sgmllib.SGMLParseError:
            pass
        else:
            self.fail("expected SGMLParseError for %r\nReceived:\n%s"
                      % (source, pprint.pformat(parser.get_events())))

    def test_doctype_decl_internal(self):
        inside = """\
DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
 SYSTEM 'http://www.w3.org/TR/html401/strict.dtd' [
 <!ELEMENT html - O EMPTY>
 <!ATTLIST html
 version CDATA #IMPLIED
 profile CDATA 'DublinCore'>
 <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
 <!ENTITY myEntity 'internal parsed entity'>
 <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
 <!ENTITY % paramEntity 'name|name|name'>
 %paramEntity;
 <!-- comment -->
]"""
        self.check_events(["<!%s>" % inside], [
            ("decl", inside),
            ])

    def test_doctype_decl_external(self):
        inside = "DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'"
        self.check_events("<!%s>" % inside, [
            ("decl", inside),
            ])

    def test_underscore_in_attrname(self):
        # SF bug #436621
        """Make sure attribute names with underscores are accepted"""
        self.check_events("<a has_under _under>", [
            ("starttag", "a", [("has_under", "has_under"),
                               ("_under", "_under")]),
            ])

    def test_underscore_in_tagname(self):
        # SF bug #436621
        """Make sure tag names with underscores are accepted"""
        self.check_events("<has_under></has_under>", [
            ("starttag", "has_under", []),
            ("endtag", "has_under"),
            ])

    def test_quotes_in_unquoted_attrs(self):
        # SF bug #436621
        """Be sure quotes in unquoted attributes are made part of the value"""
        self.check_events("<a href=foo'bar\"baz>", [
            ("starttag", "a", [("href", "foo'bar\"baz")]),
            ])

    def test_xhtml_empty_tag(self):
        """Handling of XHTML-style empty start tags"""
        self.check_events("<br />text<i></i>", [
            ("starttag", "br", []),
            ("data", "text"),
            ("starttag", "i", []),
            ("endtag", "i"),
            ])

    def test_processing_instruction_only(self):
        self.check_events("<?processing instruction>", [
            ("pi", "processing instruction"),
            ])

    def test_bad_nesting(self):
        self.check_events("<a><b></a></b>", [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
            ])

    def test_bare_ampersands(self):
        self.check_events("this text & contains & ampersands &", [
            ("data", "this text & contains & ampersands &"),
            ])

    def test_bare_pointy_brackets(self):
        self.check_events("this < text > contains < bare>pointy< brackets", [
            ("data", "this < text > contains < bare>pointy< brackets"),
            ])

    def test_attr_syntax(self):
        output = [
            ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", "e")])
            ]
        self.check_events("""<a b='v' c="v" d=v e>""", output)
        self.check_events("""<a b = 'v' c = "v" d = v e>""", output)
        self.check_events("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
        self.check_events("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)

    def test_attr_values(self):
        self.check_events("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
                          [("starttag", "a", [("b", "xxx\n\txxx"),
                                              ("c", "yyy\t\nyyy"),
                                              ("d", "\txyz\n")])
                           ])
        self.check_events("""<a b='' c="">""", [
            ("starttag", "a", [("b", ""), ("c", "")]),
            ])
        # URL construction stuff from RFC 1808:
        safe = "$-_.+"
        extra = "!*'(),"
        reserved = ";/?:@&="
        url = "http://example.com:8080/path/to/file?%s%s%s" % (
            safe, extra, reserved)
        self.check_events("""<e a=%s>""" % url, [
            ("starttag", "e", [("a", url)]),
            ])
        # Regression test for SF patch #669683.
        self.check_events("<e a=rgb(1,2,3)>", [
            ("starttag", "e", [("a", "rgb(1,2,3)")]),
            ])

    def test_attr_values_entities(self):
        """Substitution of entities and charrefs in attribute values"""
        # SF bug #1452246
        self.check_events("""<a b=&lt; c=&lt;&gt; d=&lt-&gt; e='&lt; '
 f="&xxx;" g='&#32;&#33;' h='&#500;'
 i='x?a=b&c=d;'
 j='&amp;#42;' k='&#38;#42;'>""",
            [("starttag", "a", [("b", "<"),
                                ("c", "<>"),
                                ("d", "&lt->"),
                                ("e", "< "),
                                ("f", "&xxx;"),
                                ("g", " !"),
                                ("h", "&#500;"),
                                ("i", "x?a=b&c=d;"),
                                ("j", "&#42;"),
                                ("k", "&#42;"),
                                ])])

    def test_convert_overrides(self):
        # This checks that the character and entity reference
        # conversion helpers are called at the documented times.  No
        # attempt is made to really change what the parser accepts.
        #
        self.collector = HTMLEntityCollector
        self.check_events(('<a title="&ldquo;test&#x201d;">foo</a>'
                           '&foobar;&#42;'), [
            ('entityref', 'convert', 'ldquo'),
            ('charref', 'convert', 'x201d'),
            ('starttag', 'a', [('title', '&ldquo;test&#x201d;')]),
            ('data', 'foo'),
            ('endtag', 'a'),
            ('entityref', 'foobar'),
            ('entityref', 'convert', 'foobar'),
            ('charref', '42'),
            ('charref', 'convert', '42'),
            ('codepoint', 'convert', 42),
            ])

    def test_attr_values_quoted_markup(self):
        """Multi-line and markup in attribute values"""
        self.check_events("""<a title='foo\n<br>bar'>text</a>""",
            [("starttag", "a", [("title", "foo\n<br>bar")]),
             ("data", "text"),
             ("endtag", "a")])
        self.check_events("""<a title='less < than'>text</a>""",
            [("starttag", "a", [("title", "less < than")]),
             ("data", "text"),
             ("endtag", "a")])
        self.check_events("""<a title='greater > than'>text</a>""",
            [("starttag", "a", [("title", "greater > than")]),
             ("data", "text"),
             ("endtag", "a")])

    def test_attr_funky_names(self):
        self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
            ])

    def test_attr_value_ip6_url(self):
        # http://www.python.org/sf/853506
        self.check_events(("<a href='http://[1080::8:800:200C:417A]/'>"
                           "<a href=http://[1080::8:800:200C:417A]/>"), [
            ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
            ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
            ])

    # This test used to be defined twice in this class with identical
    # bodies; the later definition shadowed the earlier one, so a single
    # copy is kept.
    def test_illegal_declarations(self):
        s = 'abc<!spacer type="block" height="25">def'
        self.check_events(s, [
            ("data", "abc"),
            ("unknown decl", 'spacer type="block" height="25"'),
            ("data", "def"),
            ])

    def test_weird_starttags(self):
        self.check_events("<a<a>", [
            ("starttag", "a", []),
            ("starttag", "a", []),
            ])
        self.check_events("</a<a>", [
            ("endtag", "a"),
            ("starttag", "a", []),
            ])

    def test_declaration_junk_chars(self):
        self.check_parse_error("<!DOCTYPE foo $ >")

    def test_get_starttag_text(self):
        s = """<foobar \n one="1"\ttwo=2 >"""
        self.check_events(s, [
            ("starttag", "foobar", [("one", "1"), ("two", "2")]),
            ])

    def test_cdata_content(self):
        s = ("<cdata> <!-- not a comment --> &not-an-entity-ref; </cdata>"
             "<notcdata> <!-- comment --> </notcdata>")
        self.collector = CDATAEventCollector
        self.check_events(s, [
            ("starttag", "cdata", []),
            ("data", " <!-- not a comment --> &not-an-entity-ref; "),
            ("endtag", "cdata"),
            ("starttag", "notcdata", []),
            ("data", " "),
            ("comment", " comment "),
            ("data", " "),
            ("endtag", "notcdata"),
            ])
        s = """<cdata> <not a='start tag'> </cdata>"""
        self.check_events(s, [
            ("starttag", "cdata", []),
            ("data", " <not a='start tag'> "),
            ("endtag", "cdata"),
            ])

    def test_enumerated_attr_type(self):
        s = "<!DOCTYPE doc [<!ATTLIST doc attr (a | b) >]>"
        self.check_events(s, [
            ('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a | b) >]'),
            ])

    # XXX These tests have been disabled by prefixing their names with
    # an underscore.  The first two exercise outstanding bugs in the
    # sgmllib module, and the third exhibits questionable behavior
    # that needs to be carefully considered before changing it.

    def _test_starttag_end_boundary(self):
        self.check_events("<a b='<'>", [("starttag", "a", [("b", "<")])])
        self.check_events("<a b='>'>", [("starttag", "a", [("b", ">")])])

    def _test_buffer_artefacts(self):
        output = [("starttag", "a", [("b", "<")])]
        self.check_events(["<a b='<'>"], output)
        self.check_events(["<a ", "b='<'>"], output)
        self.check_events(["<a b", "='<'>"], output)
        self.check_events(["<a b=", "'<'>"], output)
        self.check_events(["<a b='<", "'>"], output)
        self.check_events(["<a b='<'", ">"], output)

        output = [("starttag", "a", [("b", ">")])]
        self.check_events(["<a b='>'>"], output)
        self.check_events(["<a ", "b='>'>"], output)
        self.check_events(["<a b", "='>'>"], output)
        self.check_events(["<a b=", "'>'>"], output)
        self.check_events(["<a b='>", "'>"], output)
        self.check_events(["<a b='>'", ">"], output)

        output = [("comment", "abc")]
        self.check_events(["", "<!--abc-->"], output)
        self.check_events(["<", "!--abc-->"], output)
        self.check_events(["<!", "--abc-->"], output)
        self.check_events(["<!-", "-abc-->"], output)
        self.check_events(["<!--", "abc-->"], output)
        self.check_events(["<!--a", "bc-->"], output)
        self.check_events(["<!--ab", "c-->"], output)
        self.check_events(["<!--abc", "-->"], output)
        self.check_events(["<!--abc-", "->"], output)
        self.check_events(["<!--abc--", ">"], output)
        self.check_events(["<!--abc-->", ""], output)

    def _test_starttag_junk_chars(self):
        self.check_parse_error("<")
        self.check_parse_error("<>")
        self.check_parse_error("</$>")
        self.check_parse_error("</")
        self.check_parse_error("</a")
        self.check_parse_error("<$")
        self.check_parse_error("<$>")
        self.check_parse_error("<!")
        self.check_parse_error("<a $>")
        self.check_parse_error("<a")
        self.check_parse_error("<a foo='bar'")
        self.check_parse_error("<a foo='bar")
        self.check_parse_error("<a foo='>'")
        self.check_parse_error("<a foo='>")
        self.check_parse_error("<a foo=>")
434
435
def test_main():
    """Run the SGMLParserTestCase suite through test_support."""
    test_support.run_unittest(SGMLParserTestCase)
438
439
# Run the suite when this file is executed as a script.
if __name__ == "__main__":
    test_main()