# blob: 0f3cf801faa64f18b4e6ce69dbeac856bc9c158d
# blame: Fred Drake bd3090d 2001-05-18 15:32:59 +0000
"""Tests for HTMLParser.py."""
2
3import HTMLParser
4import sys
5import test_support
6import unittest
7
8
class EventCollector(HTMLParser.HTMLParser):
    """HTMLParser subclass that records every handler callback.

    Each callback appends a tuple to ``self.events`` whose first item is
    the event kind ("starttag", "data", ...) followed by the arguments the
    parser passed to the handler.
    """

    def __init__(self):
        self.events = []
        # Cache the bound method so the handlers can call self.append(...).
        self.append = self.events.append
        HTMLParser.HTMLParser.__init__(self)

    def get_events(self):
        """Return the recorded events with adjacent "data" events merged.

        Normalizing the list this way means buffer artefacts don't
        separate runs of contiguous characters.  The merged result also
        replaces the stored events, so calling this repeatedly is safe.
        """
        merged = []
        prev_kind = None
        for event in self.events:
            kind = event[0]
            if kind == prev_kind == "data":
                merged[-1] = ("data", merged[-1][1] + event[1])
            else:
                merged.append(event)
            prev_kind = kind
        # Update the list in place: self.append is bound to this very list
        # object, so rebinding self.events to a new list would make events
        # recorded after this call land in the discarded list.
        self.events[:] = merged
        return self.events

    # structure markup

    def handle_starttag(self, tag, attrs):
        self.append(("starttag", tag, attrs))

    def handle_startendtag(self, tag, attrs):
        self.append(("startendtag", tag, attrs))

    def handle_endtag(self, tag):
        self.append(("endtag", tag))

    # all other markup

    def handle_comment(self, data):
        self.append(("comment", data))

    def handle_charref(self, data):
        self.append(("charref", data))

    def handle_data(self, data):
        self.append(("data", data))

    def handle_decl(self, data):
        self.append(("decl", data))

    def handle_entityref(self, data):
        self.append(("entityref", data))

    def handle_pi(self, data):
        self.append(("pi", data))
61
62
class EventCollectorExtra(EventCollector):
    """EventCollector that also records the raw text of each start tag."""

    def handle_starttag(self, tag, attrs):
        # Record the normal "starttag" event first, then a second event
        # carrying whatever get_starttag_text() reports for this tag.
        EventCollector.handle_starttag(self, tag, attrs)
        self.append(("starttag_text", self.get_starttag_text()))
68
69
class TestCaseBase(unittest.TestCase):
    """Shared helpers for driving a parser and checking what it reports."""

    # Constant pieces of source and events
    prologue = ""
    epilogue = ""
    initial_events = []
    final_events = []

    def _run_check(self, source, events, collector=EventCollector):
        """Parse `source` and require that exactly `events` are reported.

        `source` is iterated and each item is fed to the parser separately,
        so a string is fed one character at a time while a list of strings
        is fed chunk by chunk.  The class-level prologue is fed first (in
        one piece) and the epilogue last (item by item); the expected list
        is wrapped in initial_events / final_events accordingly.
        `collector` is the parser class to instantiate.
        """
        parser = collector()
        parser.feed(self.prologue)
        for s in source:
            parser.feed(s)
        for c in self.epilogue:
            parser.feed(c)
        parser.close()
        expected = self.initial_events + events + self.final_events
        # assertEqual reports both lists when they differ.
        self.assertEqual(parser.get_events(), expected)

    def _run_check_extra(self, source, events):
        """Like _run_check(), but collect events with EventCollectorExtra."""
        self._run_check(source, events, EventCollectorExtra)

    def _parse_error(self, source):
        """Require that parsing `source` raises HTMLParser.HTMLParseError."""
        # `source` is bound as a default argument so the nested function
        # does not depend on nested-scope lookup.
        def parse(source=source):
            parser = HTMLParser.HTMLParser()
            parser.feed(source)
            parser.close()
        self.assertRaises(HTMLParser.HTMLParseError, parse)
99
100
class HTMLParserTestCase(TestCaseBase):
    # Event-level checks for HTMLParser, driven through the TestCaseBase
    # helpers.  Comments are used instead of docstrings so that the test
    # descriptions reported by unittest stay unchanged.
    #
    # NOTE(review): the methods are named check_*, while the default
    # unittest loader prefix is "test" -- confirm that
    # test_support.run_unittest actually collects these.

    def check_processing_instruction_only(self):
        # A lone processing instruction yields a single "pi" event.
        self._run_check("<?processing instruction>", [
            ("pi", "processing instruction"),
            ])

    def check_simple_html(self):
        # A small mixed document: declaration, tags, entity and character
        # references, comments containing markup-like text, and data.
        self._run_check("""
<!DOCTYPE html PUBLIC 'foo'>
<HTML>&entity;&#32;
<!--comment1a
-></foo><bar>&lt;<?pi?></foo<bar
comment1b-->
<Img sRc='Bar' isMAP>sample
text
<!--comment2a-- --comment2b-->
</Html>
""", [
            ("data", "\n"),
            ("decl", "DOCTYPE html PUBLIC 'foo'"),
            ("data", "\n"),
            ("starttag", "html", []),
            ("entityref", "entity"),
            ("charref", "32"),
            ("data", "\n"),
            ("comment", "comment1a\n-></foo><bar>&lt;<?pi?></foo<bar\ncomment1b"),
            ("data", "\n"),
            ("starttag", "img", [("src", "Bar"), ("ismap", None)]),
            ("data", "sample\ntext\n"),
            ("comment", "comment2a-- --comment2b"),
            ("data", "\n"),
            ("endtag", "html"),
            ("data", "\n"),
            ])

    def check_bad_nesting(self):
        # Improperly nested tags are expected to be reported in source
        # order, without any error.
        self._run_check("<a><b></a></b>", [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
            ])

    def check_attr_syntax(self):
        # The same attributes written with different quoting and with
        # spaces, newlines or tabs around "=" must give identical events.
        output = [
            ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
            ]
        self._run_check("""<a b='v' c="v" d=v e>""", output)
        self._run_check("""<a b = 'v' c = "v" d = v e>""", output)
        self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
        self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)

    def check_attr_values(self):
        # Whitespace inside quoted values is expected to be preserved, and
        # empty quoted values to be reported as empty strings.
        self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
                        [("starttag", "a", [("b", "xxx\n\txxx"),
                                            ("c", "yyy\t\nyyy"),
                                            ("d", "\txyz\n")])
                         ])
        self._run_check("""<a b='' c="">""", [
            ("starttag", "a", [("b", ""), ("c", "")]),
            ])

    def check_attr_entity_replacement(self):
        # Entity references inside an attribute value are expected to be
        # replaced by the characters they name.
        self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [
            ("starttag", "a", [("b", "&><\"'")]),
            ])

    def check_attr_funky_names(self):
        # Attribute names may contain ".", ":" and "-".
        self._run_check("""<a a.b='v' c:d=v e-f=v>""", [
            ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
            ])

    def check_starttag_end_boundary(self):
        # "<" or ">" inside a quoted value must not end the start tag.
        self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
        self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])

    def check_buffer_artefacts(self):
        # Lists are passed so that each item is fed separately; the result
        # must not depend on where the input is split between feeds.
        output = [("starttag", "a", [("b", "<")])]
        self._run_check(["<a b='<'>"], output)
        self._run_check(["<a ", "b='<'>"], output)
        self._run_check(["<a b", "='<'>"], output)
        self._run_check(["<a b=", "'<'>"], output)
        self._run_check(["<a b='<", "'>"], output)
        self._run_check(["<a b='<'", ">"], output)

        output = [("starttag", "a", [("b", ">")])]
        self._run_check(["<a b='>'>"], output)
        self._run_check(["<a ", "b='>'>"], output)
        self._run_check(["<a b", "='>'>"], output)
        self._run_check(["<a b=", "'>'>"], output)
        self._run_check(["<a b='>", "'>"], output)
        self._run_check(["<a b='>'", ">"], output)

    def check_starttag_junk_chars(self):
        # Each of these malformed or truncated fragments is expected to
        # raise HTMLParseError by the time the parser is closed.
        self._parse_error("<")
        self._parse_error("<>")
        self._parse_error("</>")
        self._parse_error("</$>")
        self._parse_error("</")
        self._parse_error("</a")
        self._parse_error("<a<a>")
        self._parse_error("</a<a>")
        self._parse_error("<$")
        self._parse_error("<$>")
        self._parse_error("<!")
        self._parse_error("<a $>")
        self._parse_error("<a")
        self._parse_error("<a foo='bar'")
        self._parse_error("<a foo='bar")
        self._parse_error("<a foo='>'")
        self._parse_error("<a foo='>")
        self._parse_error("<a foo=>")

    def check_declaration_junk_chars(self):
        # A stray "$" inside a declaration is expected to be an error.
        self._parse_error("<!DOCTYPE foo $ >")

    def check_startendtag(self):
        # "<p/>" is one "startendtag" event, while "<p></p>" is a separate
        # start tag and end tag.
        self._run_check("<p/>", [
            ("startendtag", "p", []),
            ])
        self._run_check("<p></p>", [
            ("starttag", "p", []),
            ("endtag", "p"),
            ])
        self._run_check("<p><img src='foo' /></p>", [
            ("starttag", "p", []),
            ("startendtag", "img", [("src", "foo")]),
            ("endtag", "p"),
            ])

    def check_get_starttag_text(self):
        # get_starttag_text() is expected to return the start tag exactly
        # as written, including its internal whitespace.
        s = """<foo:bar \n one="1"\ttwo=2 >"""
        self._run_check_extra(s, [
            ("starttag", "foo:bar", [("one", "1"), ("two", "2")]),
            ("starttag_text", s)])

    def check_cdata_content(self):
        # Inside <script>, text that looks like a comment, an entity
        # reference or a start tag is expected to be reported as data.
        s = """<script> <!-- not a comment --> &not-an-entity-ref; </script>"""
        self._run_check(s, [
            ("starttag", "script", []),
            ("data", " <!-- not a comment --> &not-an-entity-ref; "),
            ("endtag", "script"),
            ])
        s = """<script> <not a='start tag'> </script>"""
        self._run_check(s, [
            ("starttag", "script", []),
            ("data", " <not a='start tag'> "),
            ("endtag", "script"),
            ])
251
252
# Module-level call (no __main__ guard): the test case class is handed to
# test_support.run_unittest whenever this module is executed or imported.
test_support.run_unittest(HTMLParserTestCase)