blob: a37696d4458f1ff29b77990c5de336240c4f575a [file] [log] [blame]
Fred Drake19ff4ac2001-07-16 18:52:40 +00001import pprint
2import sgmllib
3import test_support
4import unittest
5
6
class EventCollector(sgmllib.SGMLParser):
    """SGML parser that records every handler callback as an event tuple.

    Each handler appends a tuple to ``self.events``.  The first item of
    the tuple names the kind of event ("starttag", "endtag", "comment",
    "charref", "data", "decl", "entityref" or "pi"); the remaining items
    are the arguments the handler received.
    """

    def __init__(self):
        self.events = []
        # Shortcut used by every handler below; it is bound to the
        # current event list.
        self.append = self.events.append
        sgmllib.SGMLParser.__init__(self)

    def get_events(self):
        """Return the recorded events with adjacent "data" events merged.

        The merged list also replaces ``self.events``, and the list
        returned is that same object.
        """
        # Normalize the list of events so that buffer artefacts don't
        # separate runs of contiguous characters.
        merged = []
        prevkind = None
        for event in self.events:
            kind = event[0]
            if kind == prevkind == "data":
                merged[-1] = ("data", merged[-1][1] + event[1])
            else:
                merged.append(event)
            prevkind = kind
        self.events = merged
        # Re-point the shortcut at the new list; otherwise any event
        # reported after this call would be appended to the discarded
        # list and never show up in self.events.
        self.append = merged.append
        return merged

    # structure markup

    def unknown_starttag(self, tag, attrs):
        """Record a start tag together with its attribute list."""
        self.append(("starttag", tag, attrs))

    def unknown_endtag(self, tag):
        """Record an end tag."""
        self.append(("endtag", tag))

    # all other markup

    def handle_comment(self, data):
        """Record a comment."""
        self.append(("comment", data))

    def handle_charref(self, data):
        """Record a character reference."""
        self.append(("charref", data))

    def handle_data(self, data):
        """Record a chunk of character data."""
        self.append(("data", data))

    def handle_decl(self, decl):
        """Record a declaration."""
        self.append(("decl", decl))

    def handle_entityref(self, data):
        """Record an entity reference."""
        self.append(("entityref", data))

    def handle_pi(self, data):
        """Record a processing instruction."""
        self.append(("pi", data))
56
57
class CDATAEventCollector(EventCollector):
    """Event collector with a dedicated handler for ``<cdata>`` start tags.

    The handler records the start tag like any other and then calls the
    inherited setliteral() method.
    """

    def start_cdata(self, attrs):
        """Record the ``<cdata>`` start tag, then call setliteral()."""
        event = ("starttag", "cdata", attrs)
        self.append(event)
        self.setliteral()
62
63
class SGMLParserTestCase(unittest.TestCase):
    """Tests comparing the events reported by the parser with expectations."""

    # Collector class instantiated by the check_* helpers.  A test may
    # override it on the instance (test_cdata_content does).
    collector = EventCollector

    def check_events(self, source, expected_events):
        """Parse source and fail unless the events match expected_events.

        source is iterated and every item is passed to feed() on its own:
        a string is therefore fed one character at a time, a list of
        strings one chunk at a time.  The events are normalized with
        get_events() before the comparison.
        """
        parser = self.collector()
        for s in source:
            parser.feed(s)
        parser.close()
        events = parser.get_events()
        if events != expected_events:
            self.fail("received events did not match expected events\n"
                      "Expected:\n" + pprint.pformat(expected_events) +
                      "\nReceived:\n" + pprint.pformat(events))

    def check_parse_error(self, source):
        """Fail unless parsing source raises sgmllib.SGMLParseError.

        source is passed to feed() in a single call.
        """
        # Use the configurable collector class, as check_events() does,
        # instead of hard-coding EventCollector.
        parser = self.collector()
        try:
            parser.feed(source)
            parser.close()
        except sgmllib.SGMLParseError:
            pass
        else:
            self.fail("expected SGMLParseError for %r\nReceived:\n%s"
                      % (source, pprint.pformat(parser.get_events())))

    def test_underscore_in_attrname(self):
        # SF bug #436621
        """Make sure attribute names with underscores are accepted"""
        self.check_events("<a has_under _under>", [
            ("starttag", "a", [("has_under", "has_under"),
                               ("_under", "_under")]),
            ])

    def test_underscore_in_tagname(self):
        # SF bug #436621
        """Make sure tag names with underscores are accepted"""
        self.check_events("<has_under></has_under>", [
            ("starttag", "has_under", []),
            ("endtag", "has_under"),
            ])

    def test_quotes_in_unquoted_attrs(self):
        # SF bug #436621
        """Be sure quotes in unquoted attributes are made part of the value"""
        self.check_events("<a href=foo'bar\"baz>", [
            ("starttag", "a", [("href", "foo'bar\"baz")]),
            ])

    def test_xhtml_empty_tag(self):
        """Handling of XHTML-style empty start tags"""
        self.check_events("<br />text<i></i>", [
            ("starttag", "br", []),
            ("data", "text"),
            ("starttag", "i", []),
            ("endtag", "i"),
            ])

    def test_processing_instruction_only(self):
        # A lone processing instruction yields a single "pi" event.
        self.check_events("<?processing instruction>", [
            ("pi", "processing instruction"),
            ])

    def test_bad_nesting(self):
        # End tags are expected in source order even when badly nested.
        self.check_events("<a><b></a></b>", [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
            ])

    def test_attr_syntax(self):
        # Quoting style and whitespace around "=" must not change the
        # result; a bare attribute name gets its own name as its value.
        output = [
            ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", "e")])
            ]
        self.check_events("""<a b='v' c="v" d=v e>""", output)
        self.check_events("""<a b = 'v' c = "v" d = v e>""", output)
        self.check_events("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
        self.check_events("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)

    def test_attr_values(self):
        # Whitespace inside quoted values is preserved, and empty quoted
        # values are allowed.
        self.check_events("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
                          [("starttag", "a", [("b", "xxx\n\txxx"),
                                              ("c", "yyy\t\nyyy"),
                                              ("d", "\txyz\n")])
                           ])
        self.check_events("""<a b='' c="">""", [
            ("starttag", "a", [("b", ""), ("c", "")]),
            ])

    def test_attr_funky_names(self):
        # Attribute names may contain ".", ":" and "-".
        self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
            ])

    def test_weird_starttags(self):
        # A "<" before the closing ">" of a tag starts a new tag.
        self.check_events("<a<a>", [
            ("starttag", "a", []),
            ("starttag", "a", []),
            ])
        self.check_events("</a<a>", [
            ("endtag", "a"),
            ("starttag", "a", []),
            ])

    def test_declaration_junk_chars(self):
        # A "$" inside a declaration is expected to be a parse error.
        self.check_parse_error("<!DOCTYPE foo $ >")

    def test_get_starttag_text(self):
        # Newline, tab and trailing space inside the start tag are accepted.
        s = """<foobar \n one="1"\ttwo=2 >"""
        self.check_events(s, [
            ("starttag", "foobar", [("one", "1"), ("two", "2")]),
            ])

    def test_cdata_content(self):
        # With CDATAEventCollector, markup inside <cdata> is expected to
        # be reported as plain data, while other elements parse normally.
        s = ("<cdata> <!-- not a comment --> &not-an-entity-ref; </cdata>"
             "<notcdata> <!-- comment --> </notcdata>")
        self.collector = CDATAEventCollector
        self.check_events(s, [
            ("starttag", "cdata", []),
            ("data", " <!-- not a comment --> &not-an-entity-ref; "),
            ("endtag", "cdata"),
            ("starttag", "notcdata", []),
            ("data", " "),
            ("comment", " comment "),
            ("data", " "),
            ("endtag", "notcdata"),
            ])
        s = """<cdata> <not a='start tag'> </cdata>"""
        self.check_events(s, [
            ("starttag", "cdata", []),
            ("data", " <not a='start tag'> "),
            ("endtag", "cdata"),
            ])

    # XXX These tests have been disabled by prefixing their names with
    # an underscore.  The first two exercise outstanding bugs in the
    # sgmllib module, and the third exhibits questionable behavior
    # that needs to be carefully considered before changing it.

    def _test_starttag_end_boundary(self):
        self.check_events("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
        self.check_events("""<a b='>'>""", [("starttag", "a", [("b", ">")])])

    def _test_buffer_artefacts(self):
        # Each source is a list, so it is fed chunk by chunk; the split
        # point moves through the tag.
        output = [("starttag", "a", [("b", "<")])]
        self.check_events(["<a b='<'>"], output)
        self.check_events(["<a ", "b='<'>"], output)
        self.check_events(["<a b", "='<'>"], output)
        self.check_events(["<a b=", "'<'>"], output)
        self.check_events(["<a b='<", "'>"], output)
        self.check_events(["<a b='<'", ">"], output)

        output = [("starttag", "a", [("b", ">")])]
        self.check_events(["<a b='>'>"], output)
        self.check_events(["<a ", "b='>'>"], output)
        self.check_events(["<a b", "='>'>"], output)
        self.check_events(["<a b=", "'>'>"], output)
        self.check_events(["<a b='>", "'>"], output)
        self.check_events(["<a b='>'", ">"], output)

    def _test_starttag_junk_chars(self):
        self.check_parse_error("<")
        self.check_parse_error("<>")
        self.check_parse_error("</$>")
        self.check_parse_error("</")
        self.check_parse_error("</a")
        self.check_parse_error("<$")
        self.check_parse_error("<$>")
        self.check_parse_error("<!")
        self.check_parse_error("<a $>")
        self.check_parse_error("<a")
        self.check_parse_error("<a foo='bar'")
        self.check_parse_error("<a foo='bar")
        self.check_parse_error("<a foo='>'")
        self.check_parse_error("<a foo='>")
        self.check_parse_error("<a foo=>")
241
242
# Hand the test case class to test_support.run_unittest().  This is a
# module-level statement, so it runs when the module is imported as well
# as when it is executed directly.
test_support.run_unittest(SGMLParserTestCase)