blob: 94492a6da2417be91b79a8d04f6d73f6b567c29f [file] [log] [blame]
"""HTML 2.0 parser.

See the HTML 2.0 specification:
http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
"""
Guido van Rossum7c750e11995-02-27 13:16:55 +00006
7
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +00008from sgmllib import SGMLParser
Guido van Rossumf54d9671995-08-07 20:07:44 +00009from formatter import AS_IS
Guido van Rossum7c750e11995-02-27 13:16:55 +000010
Skip Montanaro2dd42762001-01-23 15:35:05 +000011__all__ = ["HTMLParser"]
Guido van Rossum7c750e11995-02-27 13:16:55 +000012
class HTMLParser(SGMLParser):
    """This is the basic HTML parser class.

    It supports all entity names required by the XHTML 1.0 Recommendation.
    It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
    elements.

    Character data and formatting events are forwarded to the formatter
    object supplied to the constructor.  Per-document state (title, base
    URL, list of anchors, ...) is (re)initialised by reset().

    """

    # Entity name table, exposed as a class attribute.
    # NOTE(review): presumably consulted by SGMLParser when resolving
    # entity references -- confirm against sgmllib.
    from htmlentitydefs import entitydefs

    def __init__(self, formatter, verbose=0):
        """Creates an instance of the HTMLParser class.

        The formatter parameter is the formatter instance associated with
        the parser.  The verbose parameter is passed through unchanged to
        SGMLParser.__init__().

        """
        SGMLParser.__init__(self, verbose)
        self.formatter = formatter

    def reset(self):
        """Resets the SGML parser and all HTML-specific document state."""
        SGMLParser.reset(self)
        self.savedata = None    # buffer used by save_bgn() / save_end()
        self.isindex = 0        # set to 1 once <ISINDEX> has been seen
        self.title = None       # text collected from <TITLE>
        self.base = None        # HREF of the last <BASE> element
        self.anchor = None      # HREF of the anchor currently open
        self.anchorlist = []    # every non-empty HREF seen so far
        self.nofill = 0         # nesting depth of literal (<PRE>-like) text
        self.list_stack = []    # open lists, as [kind, label, counter]

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden

    def handle_data(self, data):
        """Routes character data to the save buffer or to the formatter.

        While a save_bgn() / save_end() pair is active the data is
        appended to the buffer.  Otherwise it is handed to the formatter,
        as literal data when nofill is set and as flowing data when not.

        """
        if self.savedata is not None:
            self.savedata = self.savedata + data
        elif self.nofill:
            self.formatter.add_literal_data(data)
        else:
            self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Begins saving character data in a buffer instead of sending it
        to the formatter object.

        Retrieve the stored data via the save_end() method.  Use of the
        save_bgn() / save_end() pair may not be nested.

        """
        self.savedata = ''

    def save_end(self):
        """Ends buffering character data and returns all data saved since
        the preceding call to the save_bgn() method.

        If the nofill flag is false, whitespace is collapsed to single
        spaces.  A call to this method without a preceding call to the
        save_bgn() method will raise a TypeError exception.

        """
        data = self.savedata
        if data is None:
            # Honour the documented contract explicitly.  Relying on
            # None.split() raised AttributeError instead, and nothing at
            # all was raised when nofill was set.
            raise TypeError(
                "save_end() called without a preceding save_bgn()")
        self.savedata = None
        if not self.nofill:
            data = ' '.join(data.split())
        return data

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """This method is called at the start of an anchor region.

        The arguments correspond to the attributes of the <A> tag with
        the same names.  The default implementation maintains a list of
        hyperlinks (defined by the HREF attribute for <A> tags) within
        the document.  The list of hyperlinks is available as the data
        attribute anchorlist.  The name and type arguments are ignored
        by the default implementation.

        """
        self.anchor = href
        if href:
            self.anchorlist.append(href)

    def anchor_end(self):
        """This method is called at the end of an anchor region.

        The default implementation adds a textual footnote marker using an
        index into the list of hyperlinks created by the anchor_bgn()
        method.  Nothing is emitted for anchors without an HREF.

        """
        if self.anchor:
            # The marker is the 1-based position of the most recently
            # recorded hyperlink in anchorlist.
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """This method is called to handle images.

        The default implementation simply passes the alt value to the
        handle_data() method.  do_img() calls this hook with the extra
        positional arguments ismap, align, width and height, which the
        default implementation ignores.

        """
        self.handle_data(alt)

    # --------- Top level elements

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    # ------ Head elements

    def start_title(self, attrs):
        """Starts buffering the document title."""
        self.save_bgn()

    def end_title(self):
        """Stores the buffered title text in the title attribute."""
        self.title = self.save_end()

    def do_base(self, attrs):
        """Records the HREF attribute of <BASE> in the base attribute."""
        for attrname, value in attrs:
            if attrname == 'href':
                self.base = value

    def do_isindex(self, attrs):
        """Flags the document as searchable (sets isindex to 1)."""
        self.isindex = 1

    def do_link(self, attrs):
        pass

    def do_meta(self, attrs):
        pass

    def do_nextid(self, attrs): # Deprecated
        pass

    # ------ Body elements

    # --- Headings
    #
    # Font specifications pushed on the formatter are 4-tuples.  As used
    # in this class, slot 0 carries the heading name, slot 1 is set by
    # italic elements, slot 2 by bold elements and slot 3 by teletype
    # elements; AS_IS leaves a slot unchanged.

    def _heading_bgn(self, tag):
        """Ends the current paragraph and pushes the bold font for tag."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((tag, 0, 1, 0))

    def _heading_end(self):
        """Ends the heading paragraph and restores the previous font."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h1(self, attrs): self._heading_bgn('h1')
    def end_h1(self): self._heading_end()

    def start_h2(self, attrs): self._heading_bgn('h2')
    def end_h2(self): self._heading_end()

    def start_h3(self, attrs): self._heading_bgn('h3')
    def end_h3(self): self._heading_end()

    def start_h4(self, attrs): self._heading_bgn('h4')
    def end_h4(self): self._heading_end()

    def start_h5(self, attrs): self._heading_bgn('h5')
    def end_h5(self): self._heading_end()

    def start_h6(self, attrs): self._heading_bgn('h6')
    def end_h6(self): self._heading_end()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        """Ends the current paragraph, leaving one blank line."""
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """Enters literal mode: teletype font, data passed through as-is."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill = self.nofill + 1

    def end_pre(self):
        """Leaves one level of literal mode."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        # Clamp at zero so nofill can never go negative.
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        self.start_pre(attrs)
        self.setliteral('xmp') # Tell SGML parser

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        self.start_pre(attrs)
        self.setliteral('listing') # Tell SGML parser

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        """Starts an address block, rendered in italics."""
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        """Starts a block quotation by pushing a 'blockquote' margin."""
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements

    def start_ul(self, attrs):
        """Opens an unordered list whose items are labelled '*'."""
        # A blank line is requested only for an outermost list.
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        """Closes the innermost open list."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """Emits the label for a list item and advances the item counter.

        Outside of any list the item is labelled '*' with counter 0.

        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            entry = self.list_stack[-1]
            entry[2] = entry[2] + 1
            label, counter = entry[1], entry[2]
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """Opens an ordered list.

        The label format defaults to '1.'; a TYPE attribute overrides it,
        with a '.' appended when the value is a single character.  If the
        attribute is repeated the last occurrence wins.

        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for attrname, value in attrs:
            if attrname == 'type':
                if len(value) == 1:
                    value = value + '.'
                label = value
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        """Closes the innermost open list."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        """Opens a definition list."""
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        """Closes a definition list, first closing any open <DD>."""
        self.ddpop(1)
        if self.list_stack:
            del self.list_stack[-1]

    def do_dt(self, attrs):
        """Starts a definition term, closing any open <DD>."""
        self.ddpop()

    def do_dd(self, attrs):
        """Starts a definition description, indented by a 'dd' margin."""
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """Ends the paragraph and closes an open <DD>, if there is one.

        The bl argument is passed to the formatter's end_paragraph().

        """
        self.formatter.end_paragraph(bl)
        if self.list_stack and self.list_stack[-1][0] == 'dd':
            del self.list_stack[-1]
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs): self.start_i(attrs)
    def end_cite(self): self.end_i()

    def start_code(self, attrs): self.start_tt(attrs)
    def end_code(self): self.end_tt()

    def start_em(self, attrs): self.start_i(attrs)
    def end_em(self): self.end_i()

    def start_kbd(self, attrs): self.start_tt(attrs)
    def end_kbd(self): self.end_tt()

    def start_samp(self, attrs): self.start_tt(attrs)
    def end_samp(self): self.end_tt()

    def start_strong(self, attrs): self.start_b(attrs)
    def end_strong(self): self.end_b()

    def start_var(self, attrs): self.start_i(attrs)
    def end_var(self): self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Extracts HREF, NAME and TYPE from <A> and calls anchor_bgn().

        Attribute values are stripped of surrounding whitespace; TYPE is
        also lower-cased.  Missing attributes are passed as ''.

        """
        href = ''
        name = ''
        anchor_type = ''
        for attrname, value in attrs:
            # Every value is stripped, not only the ones used below.
            value = value.strip()
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                anchor_type = value.lower()
        self.anchor_bgn(href, name, anchor_type)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """Collects the <IMG> attributes and calls handle_image().

        Defaults are '' for align, ismap and src, '(image)' for alt and 0
        for width and height.  A width or height that is not a valid
        integer is ignored, leaving the default in place.

        """
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            elif attrname == 'alt':
                alt = value
            elif attrname == 'ismap':
                ismap = value
            elif attrname == 'src':
                src = value
            elif attrname == 'width':
                try:
                    width = int(value)
                except ValueError:
                    pass
            elif attrname == 'height':
                try:
                    height = int(value)
                except ValueError:
                    pass
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        self.start_pre(attrs)
        self.setnomoretags() # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000436
Guido van Rossum7c750e11995-02-27 13:16:55 +0000437
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000438def test(args = None):
439 import sys, formatter
440
441 if not args:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000442 args = sys.argv[1:]
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000443
444 silent = args and args[0] == '-s'
445 if silent:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000446 del args[0]
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000447
448 if args:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000449 file = args[0]
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000450 else:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000451 file = 'test.html'
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000452
453 if file == '-':
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000454 f = sys.stdin
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000455 else:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000456 try:
457 f = open(file, 'r')
458 except IOError, msg:
459 print file, ":", msg
460 sys.exit(1)
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000461
462 data = f.read()
463
464 if f is not sys.stdin:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000465 f.close()
Tim Peters07e99cb2001-01-14 23:47:14 +0000466
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000467 if silent:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000468 f = formatter.NullFormatter()
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000469 else:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000470 f = formatter.AbstractFormatter(formatter.DumbWriter())
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000471
Guido van Rossumf54d9671995-08-07 20:07:44 +0000472 p = HTMLParser(f)
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000473 p.feed(data)
474 p.close()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000475
476
477if __name__ == '__main__':
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000478 test()