blob: 44647dbf026c054c563a78dee19020927cfd5d88 [file] [log] [blame]
"""HTML 2.0 parser.

See the HTML 2.0 specification:
http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
"""
Guido van Rossum7c750e11995-02-27 13:16:55 +00006
# Announce at import time that this module has no Python 3 counterpart.
# stacklevel=2 attributes the warning to the code importing this module
# rather than to this file.  The helper name is deleted afterwards so it
# does not remain in the module namespace.
from warnings import warnpy3k
warnpy3k(
    "the htmllib module has been removed in Python 3.0",
    stacklevel=2
)
del warnpy3k
11
Fred Drake583359e2004-09-09 02:24:13 +000012import sgmllib
Guido van Rossum7c750e11995-02-27 13:16:55 +000013
Guido van Rossumf54d9671995-08-07 20:07:44 +000014from formatter import AS_IS
Guido van Rossum7c750e11995-02-27 13:16:55 +000015
# Names exported by a star-import of this module.
__all__ = [
    "HTMLParser",
    "HTMLParseError",
]
Guido van Rossum7c750e11995-02-27 13:16:55 +000017
Fred Drake583359e2004-09-09 02:24:13 +000018
19class HTMLParseError(sgmllib.SGMLParseError):
20 """Error raised when an HTML document can't be parsed."""
21
22
23class HTMLParser(sgmllib.SGMLParser):
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000024 """This is the basic HTML parser class.
25
Andrew M. Kuchling0f103432003-10-27 15:47:48 +000026 It supports all entity names required by the XHTML 1.0 Recommendation.
27 It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
28 elements.
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000029
30 """
Guido van Rossum7c750e11995-02-27 13:16:55 +000031
Fred Draked995e112008-05-20 06:08:38 +000032 from htmlentitydefs import entitydefs
Guido van Rossum65126d51995-09-27 16:22:17 +000033
Guido van Rossum453534a1995-09-22 00:55:50 +000034 def __init__(self, formatter, verbose=0):
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000035 """Creates an instance of the HTMLParser class.
36
37 The formatter parameter is the formatter instance associated with
38 the parser.
39
40 """
Fred Drake583359e2004-09-09 02:24:13 +000041 sgmllib.SGMLParser.__init__(self, verbose)
Guido van Rossumf54d9671995-08-07 20:07:44 +000042 self.formatter = formatter
Martin v. Löwiscbe81f22003-09-12 16:38:00 +000043
Fred Drake583359e2004-09-09 02:24:13 +000044 def error(self, message):
45 raise HTMLParseError(message)
46
Martin v. Löwiscbe81f22003-09-12 16:38:00 +000047 def reset(self):
Fred Drake583359e2004-09-09 02:24:13 +000048 sgmllib.SGMLParser.reset(self)
Guido van Rossumf54d9671995-08-07 20:07:44 +000049 self.savedata = None
50 self.isindex = 0
51 self.title = None
52 self.base = None
53 self.anchor = None
54 self.anchorlist = []
55 self.nofill = 0
56 self.list_stack = []
Guido van Rossum7c750e11995-02-27 13:16:55 +000057
Guido van Rossumf54d9671995-08-07 20:07:44 +000058 # ------ Methods used internally; some may be overridden
Guido van Rossum7c750e11995-02-27 13:16:55 +000059
Guido van Rossumf54d9671995-08-07 20:07:44 +000060 # --- Formatter interface, taking care of 'savedata' mode;
61 # shouldn't need to be overridden
Guido van Rossum7c750e11995-02-27 13:16:55 +000062
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +000063 def handle_data(self, data):
Guido van Rossumf54d9671995-08-07 20:07:44 +000064 if self.savedata is not None:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000065 self.savedata = self.savedata + data
Guido van Rossumf54d9671995-08-07 20:07:44 +000066 else:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000067 if self.nofill:
68 self.formatter.add_literal_data(data)
69 else:
70 self.formatter.add_flowing_data(data)
Guido van Rossum7c750e11995-02-27 13:16:55 +000071
Guido van Rossumf54d9671995-08-07 20:07:44 +000072 # --- Hooks to save data; shouldn't need to be overridden
Guido van Rossum7c750e11995-02-27 13:16:55 +000073
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +000074 def save_bgn(self):
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000075 """Begins saving character data in a buffer instead of sending it
76 to the formatter object.
77
78 Retrieve the stored data via the save_end() method. Use of the
79 save_bgn() / save_end() pair may not be nested.
80
81 """
Guido van Rossumf54d9671995-08-07 20:07:44 +000082 self.savedata = ''
Guido van Rossum7c750e11995-02-27 13:16:55 +000083
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +000084 def save_end(self):
Raymond Hettingeraef22fb2002-05-29 16:18:42 +000085 """Ends buffering character data and returns all data saved since
86 the preceding call to the save_bgn() method.
87
88 If the nofill flag is false, whitespace is collapsed to single
89 spaces. A call to this method without a preceding call to the
90 save_bgn() method will raise a TypeError exception.
91
92 """
Guido van Rossumf54d9671995-08-07 20:07:44 +000093 data = self.savedata
94 self.savedata = None
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000095 if not self.nofill:
Eric S. Raymond373c55e2001-02-09 08:25:29 +000096 data = ' '.join(data.split())
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000097 return data
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +000098
Guido van Rossumf54d9671995-08-07 20:07:44 +000099 # --- Hooks for anchors; should probably be overridden
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000100
101 def anchor_bgn(self, href, name, type):
Raymond Hettingeraef22fb2002-05-29 16:18:42 +0000102 """This method is called at the start of an anchor region.
103
104 The arguments correspond to the attributes of the <A> tag with
105 the same names. The default implementation maintains a list of
106 hyperlinks (defined by the HREF attribute for <A> tags) within
107 the document. The list of hyperlinks is available as the data
108 attribute anchorlist.
109
110 """
Guido van Rossumf54d9671995-08-07 20:07:44 +0000111 self.anchor = href
112 if self.anchor:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000113 self.anchorlist.append(href)
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000114
115 def anchor_end(self):
Raymond Hettingeraef22fb2002-05-29 16:18:42 +0000116 """This method is called at the end of an anchor region.
117
118 The default implementation adds a textual footnote marker using an
119 index into the list of hyperlinks created by the anchor_bgn()method.
120
121 """
Guido van Rossumf54d9671995-08-07 20:07:44 +0000122 if self.anchor:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000123 self.handle_data("[%d]" % len(self.anchorlist))
124 self.anchor = None
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000125
Guido van Rossumf54d9671995-08-07 20:07:44 +0000126 # --- Hook for images; should probably be overridden
127
Guido van Rossum453534a1995-09-22 00:55:50 +0000128 def handle_image(self, src, alt, *args):
Raymond Hettingeraef22fb2002-05-29 16:18:42 +0000129 """This method is called to handle images.
130
131 The default implementation simply passes the alt value to the
132 handle_data() method.
133
134 """
Guido van Rossumf54d9671995-08-07 20:07:44 +0000135 self.handle_data(alt)
136
Guido van Rossumf54d9671995-08-07 20:07:44 +0000137 # --------- Top level elememts
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000138
139 def start_html(self, attrs): pass
140 def end_html(self): pass
141
142 def start_head(self, attrs): pass
143 def end_head(self): pass
144
145 def start_body(self, attrs): pass
146 def end_body(self): pass
147
Guido van Rossumf54d9671995-08-07 20:07:44 +0000148 # ------ Head elements
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000149
150 def start_title(self, attrs):
Guido van Rossumf54d9671995-08-07 20:07:44 +0000151 self.save_bgn()
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000152
153 def end_title(self):
Guido van Rossumf54d9671995-08-07 20:07:44 +0000154 self.title = self.save_end()
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000155
Guido van Rossumf54d9671995-08-07 20:07:44 +0000156 def do_base(self, attrs):
157 for a, v in attrs:
158 if a == 'href':
159 self.base = v
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000160
Guido van Rossumf54d9671995-08-07 20:07:44 +0000161 def do_isindex(self, attrs):
162 self.isindex = 1
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000163
Guido van Rossumf54d9671995-08-07 20:07:44 +0000164 def do_link(self, attrs):
165 pass
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000166
Guido van Rossumf54d9671995-08-07 20:07:44 +0000167 def do_meta(self, attrs):
168 pass
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000169
Guido van Rossumf54d9671995-08-07 20:07:44 +0000170 def do_nextid(self, attrs): # Deprecated
171 pass
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000172
Guido van Rossumf54d9671995-08-07 20:07:44 +0000173 # ------ Body elements
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000174
Guido van Rossumf54d9671995-08-07 20:07:44 +0000175 # --- Headings
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000176
177 def start_h1(self, attrs):
Guido van Rossumf54d9671995-08-07 20:07:44 +0000178 self.formatter.end_paragraph(1)
179 self.formatter.push_font(('h1', 0, 1, 0))
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000180
181 def end_h1(self):
Guido van Rossumf54d9671995-08-07 20:07:44 +0000182 self.formatter.end_paragraph(1)
183 self.formatter.pop_font()
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000184
Guido van Rossumf54d9671995-08-07 20:07:44 +0000185 def start_h2(self, attrs):
186 self.formatter.end_paragraph(1)
187 self.formatter.push_font(('h2', 0, 1, 0))
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000188
Guido van Rossumf54d9671995-08-07 20:07:44 +0000189 def end_h2(self):
190 self.formatter.end_paragraph(1)
191 self.formatter.pop_font()
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000192
Guido van Rossumf54d9671995-08-07 20:07:44 +0000193 def start_h3(self, attrs):
194 self.formatter.end_paragraph(1)
195 self.formatter.push_font(('h3', 0, 1, 0))
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000196
Guido van Rossumf54d9671995-08-07 20:07:44 +0000197 def end_h3(self):
198 self.formatter.end_paragraph(1)
199 self.formatter.pop_font()
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000200
Guido van Rossumf54d9671995-08-07 20:07:44 +0000201 def start_h4(self, attrs):
202 self.formatter.end_paragraph(1)
203 self.formatter.push_font(('h4', 0, 1, 0))
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000204
Guido van Rossumf54d9671995-08-07 20:07:44 +0000205 def end_h4(self):
206 self.formatter.end_paragraph(1)
207 self.formatter.pop_font()
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000208
Guido van Rossumf54d9671995-08-07 20:07:44 +0000209 def start_h5(self, attrs):
210 self.formatter.end_paragraph(1)
211 self.formatter.push_font(('h5', 0, 1, 0))
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000212
Guido van Rossumf54d9671995-08-07 20:07:44 +0000213 def end_h5(self):
214 self.formatter.end_paragraph(1)
215 self.formatter.pop_font()
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000216
Guido van Rossumf54d9671995-08-07 20:07:44 +0000217 def start_h6(self, attrs):
218 self.formatter.end_paragraph(1)
219 self.formatter.push_font(('h6', 0, 1, 0))
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000220
Guido van Rossumf54d9671995-08-07 20:07:44 +0000221 def end_h6(self):
222 self.formatter.end_paragraph(1)
223 self.formatter.pop_font()
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000224
Guido van Rossumf54d9671995-08-07 20:07:44 +0000225 # --- Block Structuring Elements
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000226
Guido van Rossumf54d9671995-08-07 20:07:44 +0000227 def do_p(self, attrs):
228 self.formatter.end_paragraph(1)
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000229
230 def start_pre(self, attrs):
Guido van Rossumf54d9671995-08-07 20:07:44 +0000231 self.formatter.end_paragraph(1)
232 self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
233 self.nofill = self.nofill + 1
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000234
235 def end_pre(self):
Guido van Rossumf54d9671995-08-07 20:07:44 +0000236 self.formatter.end_paragraph(1)
237 self.formatter.pop_font()
238 self.nofill = max(0, self.nofill - 1)
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000239
Guido van Rossumf54d9671995-08-07 20:07:44 +0000240 def start_xmp(self, attrs):
241 self.start_pre(attrs)
242 self.setliteral('xmp') # Tell SGML parser
243
244 def end_xmp(self):
245 self.end_pre()
246
247 def start_listing(self, attrs):
248 self.start_pre(attrs)
249 self.setliteral('listing') # Tell SGML parser
250
251 def end_listing(self):
252 self.end_pre()
253
254 def start_address(self, attrs):
255 self.formatter.end_paragraph(0)
256 self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
257
258 def end_address(self):
259 self.formatter.end_paragraph(0)
260 self.formatter.pop_font()
261
262 def start_blockquote(self, attrs):
263 self.formatter.end_paragraph(1)
264 self.formatter.push_margin('blockquote')
265
266 def end_blockquote(self):
Guido van Rossumaa763441995-08-09 02:31:00 +0000267 self.formatter.end_paragraph(1)
Guido van Rossumf54d9671995-08-07 20:07:44 +0000268 self.formatter.pop_margin()
269
270 # --- List Elements
271
272 def start_ul(self, attrs):
273 self.formatter.end_paragraph(not self.list_stack)
274 self.formatter.push_margin('ul')
275 self.list_stack.append(['ul', '*', 0])
276
277 def end_ul(self):
278 if self.list_stack: del self.list_stack[-1]
279 self.formatter.end_paragraph(not self.list_stack)
280 self.formatter.pop_margin()
281
282 def do_li(self, attrs):
283 self.formatter.end_paragraph(0)
284 if self.list_stack:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000285 [dummy, label, counter] = top = self.list_stack[-1]
286 top[2] = counter = counter+1
Guido van Rossumf54d9671995-08-07 20:07:44 +0000287 else:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000288 label, counter = '*', 0
Guido van Rossumf54d9671995-08-07 20:07:44 +0000289 self.formatter.add_label_data(label, counter)
290
291 def start_ol(self, attrs):
292 self.formatter.end_paragraph(not self.list_stack)
293 self.formatter.push_margin('ol')
294 label = '1.'
295 for a, v in attrs:
296 if a == 'type':
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000297 if len(v) == 1: v = v + '.'
298 label = v
Guido van Rossumf54d9671995-08-07 20:07:44 +0000299 self.list_stack.append(['ol', label, 0])
300
301 def end_ol(self):
302 if self.list_stack: del self.list_stack[-1]
303 self.formatter.end_paragraph(not self.list_stack)
304 self.formatter.pop_margin()
305
306 def start_menu(self, attrs):
307 self.start_ul(attrs)
308
309 def end_menu(self):
310 self.end_ul()
311
312 def start_dir(self, attrs):
313 self.start_ul(attrs)
314
315 def end_dir(self):
316 self.end_ul()
317
318 def start_dl(self, attrs):
Guido van Rossumaa763441995-08-09 02:31:00 +0000319 self.formatter.end_paragraph(1)
Guido van Rossumf54d9671995-08-07 20:07:44 +0000320 self.list_stack.append(['dl', '', 0])
321
322 def end_dl(self):
Guido van Rossumaa763441995-08-09 02:31:00 +0000323 self.ddpop(1)
Guido van Rossumf54d9671995-08-07 20:07:44 +0000324 if self.list_stack: del self.list_stack[-1]
325
326 def do_dt(self, attrs):
327 self.ddpop()
328
329 def do_dd(self, attrs):
330 self.ddpop()
331 self.formatter.push_margin('dd')
332 self.list_stack.append(['dd', '', 0])
333
Guido van Rossumaa763441995-08-09 02:31:00 +0000334 def ddpop(self, bl=0):
335 self.formatter.end_paragraph(bl)
Guido van Rossumf54d9671995-08-07 20:07:44 +0000336 if self.list_stack:
337 if self.list_stack[-1][0] == 'dd':
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000338 del self.list_stack[-1]
339 self.formatter.pop_margin()
Guido van Rossumf54d9671995-08-07 20:07:44 +0000340
341 # --- Phrase Markup
342
343 # Idiomatic Elements
344
345 def start_cite(self, attrs): self.start_i(attrs)
346 def end_cite(self): self.end_i()
347
348 def start_code(self, attrs): self.start_tt(attrs)
349 def end_code(self): self.end_tt()
350
351 def start_em(self, attrs): self.start_i(attrs)
352 def end_em(self): self.end_i()
353
354 def start_kbd(self, attrs): self.start_tt(attrs)
355 def end_kbd(self): self.end_tt()
356
357 def start_samp(self, attrs): self.start_tt(attrs)
358 def end_samp(self): self.end_tt()
359
Guido van Rossumaa763441995-08-09 02:31:00 +0000360 def start_strong(self, attrs): self.start_b(attrs)
361 def end_strong(self): self.end_b()
Guido van Rossumf54d9671995-08-07 20:07:44 +0000362
363 def start_var(self, attrs): self.start_i(attrs)
Guido van Rossumaa763441995-08-09 02:31:00 +0000364 def end_var(self): self.end_i()
Guido van Rossumf54d9671995-08-07 20:07:44 +0000365
366 # Typographic Elements
367
368 def start_i(self, attrs):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000369 self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
Guido van Rossumf54d9671995-08-07 20:07:44 +0000370 def end_i(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000371 self.formatter.pop_font()
Guido van Rossumf54d9671995-08-07 20:07:44 +0000372
373 def start_b(self, attrs):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000374 self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
Guido van Rossumf54d9671995-08-07 20:07:44 +0000375 def end_b(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000376 self.formatter.pop_font()
Guido van Rossumf54d9671995-08-07 20:07:44 +0000377
378 def start_tt(self, attrs):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000379 self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
Guido van Rossumf54d9671995-08-07 20:07:44 +0000380 def end_tt(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000381 self.formatter.pop_font()
Guido van Rossumf54d9671995-08-07 20:07:44 +0000382
383 def start_a(self, attrs):
384 href = ''
385 name = ''
386 type = ''
387 for attrname, value in attrs:
Eric S. Raymond373c55e2001-02-09 08:25:29 +0000388 value = value.strip()
Guido van Rossumf54d9671995-08-07 20:07:44 +0000389 if attrname == 'href':
390 href = value
391 if attrname == 'name':
392 name = value
393 if attrname == 'type':
Eric S. Raymond373c55e2001-02-09 08:25:29 +0000394 type = value.lower()
Guido van Rossumf54d9671995-08-07 20:07:44 +0000395 self.anchor_bgn(href, name, type)
396
397 def end_a(self):
398 self.anchor_end()
399
400 # --- Line Break
401
402 def do_br(self, attrs):
403 self.formatter.add_line_break()
404
405 # --- Horizontal Rule
406
407 def do_hr(self, attrs):
408 self.formatter.add_hor_rule()
409
410 # --- Image
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000411
412 def do_img(self, attrs):
Guido van Rossumf54d9671995-08-07 20:07:44 +0000413 align = ''
414 alt = '(image)'
415 ismap = ''
416 src = ''
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000417 width = 0
418 height = 0
Guido van Rossumf54d9671995-08-07 20:07:44 +0000419 for attrname, value in attrs:
420 if attrname == 'align':
421 align = value
422 if attrname == 'alt':
423 alt = value
424 if attrname == 'ismap':
425 ismap = value
426 if attrname == 'src':
427 src = value
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000428 if attrname == 'width':
Eric S. Raymond373c55e2001-02-09 08:25:29 +0000429 try: width = int(value)
Fred Drake1b7e0792001-05-11 18:45:52 +0000430 except ValueError: pass
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000431 if attrname == 'height':
Eric S. Raymond373c55e2001-02-09 08:25:29 +0000432 try: height = int(value)
Fred Drake1b7e0792001-05-11 18:45:52 +0000433 except ValueError: pass
Guido van Rossum453534a1995-09-22 00:55:50 +0000434 self.handle_image(src, alt, ismap, align, width, height)
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000435
Guido van Rossumf54d9671995-08-07 20:07:44 +0000436 # --- Really Old Unofficial Deprecated Stuff
437
438 def do_plaintext(self, attrs):
439 self.start_pre(attrs)
440 self.setnomoretags() # Tell SGML parser
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000441
442 # --- Unhandled tags
443
444 def unknown_starttag(self, tag, attrs):
Guido van Rossumf54d9671995-08-07 20:07:44 +0000445 pass
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000446
447 def unknown_endtag(self, tag):
Guido van Rossumf54d9671995-08-07 20:07:44 +0000448 pass
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000449
Guido van Rossum7c750e11995-02-27 13:16:55 +0000450
def test(args = None):
    """Command-line driver: parse one HTML file and render it as text.

    args is the argument list; when it is None or empty, sys.argv[1:] is
    used instead.  An optional leading '-s' selects a NullFormatter, so
    the document is parsed without producing output.  The next argument
    names the input file ('test.html' by default); '-' means standard
    input.

    If the file cannot be opened, "<file> : <error>" is printed to
    standard output and the process exits with status 1.
    """
    import sys, formatter

    if not args:
        args = sys.argv[1:]
    # Work on a private copy so that consuming '-s' below never mutates a
    # list owned by the caller.
    args = list(args)

    silent = args and args[0] == '-s'
    if silent:
        del args[0]

    if args:
        path = args[0]
    else:
        path = 'test.html'

    if path == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(path, 'r')
        except IOError as msg:
            # Same text as the statement form ``print path, ":", msg``.
            sys.stdout.write("%s : %s\n" % (path, msg))
            sys.exit(1)
        try:
            data = f.read()
        finally:
            # Close the file even when reading fails.
            f.close()

    if silent:
        fmt = formatter.NullFormatter()
    else:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())

    p = HTMLParser(fmt)
    p.feed(data)
    p.close()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000488
489
# When executed as a script, run the command-line driver on sys.argv.
if __name__ == "__main__":
    test()