blob: 6219bf06aa01d9105662b388f3f5aeef98120c39 [file] [log] [blame]
"""HTML 2.0 parser.

See the HTML 2.0 specification:
http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
"""
Guido van Rossum7c750e11995-02-27 13:16:55 +00006
7
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +00008from sgmllib import SGMLParser
Guido van Rossumf54d9671995-08-07 20:07:44 +00009from formatter import AS_IS
Guido van Rossum7c750e11995-02-27 13:16:55 +000010
Skip Montanaro2dd42762001-01-23 15:35:05 +000011__all__ = ["HTMLParser"]
Guido van Rossum7c750e11995-02-27 13:16:55 +000012
class HTMLParser(SGMLParser):
    """This is the basic HTML parser class.

    It supports all entity names required by the HTML 2.0 specification
    RFC 1866.  It also defines handlers for all HTML 2.0 and many HTML 3.0
    and 3.2 elements.

    Instance attributes maintained while parsing:

    formatter  -- the formatter instance passed to the constructor
    savedata   -- None, or the string buffer filled between save_bgn()
                  and save_end()
    isindex    -- set to 1 once an <ISINDEX> tag has been seen
    title      -- text captured between <TITLE> and </TITLE>
    base       -- HREF value of the last <BASE> tag seen
    anchor     -- HREF recorded by anchor_bgn(); reset to None by
                  anchor_end() after the footnote marker is emitted
    anchorlist -- list of the non-empty HREFs seen so far
    nofill     -- nesting depth of preformatted regions (<PRE> etc.)
    list_stack -- stack of [kind, label, counter] entries, one per open
                  list ('ul', 'ol', 'dl') or definition body ('dd')

    """

    from htmlentitydefs import entitydefs

    def __init__(self, formatter, verbose=0):
        """Creates an instance of the HTMLParser class.

        The formatter parameter is the formatter instance associated with
        the parser.  The verbose parameter is passed through unchanged to
        SGMLParser.__init__().

        """
        SGMLParser.__init__(self, verbose)
        self.formatter = formatter
        self.savedata = None
        self.isindex = 0
        self.title = None
        self.base = None
        self.anchor = None
        self.anchorlist = []
        self.nofill = 0
        self.list_stack = []

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden

    def handle_data(self, data):
        """Route character data to the save buffer or to the formatter.

        While a save_bgn()/save_end() capture is active the data is
        appended to the buffer.  Otherwise it is sent to the formatter as
        literal data when nofill is set, and as flowing data when it is
        not.

        """
        if self.savedata is not None:
            self.savedata = self.savedata + data
        elif self.nofill:
            self.formatter.add_literal_data(data)
        else:
            self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Begins saving character data in a buffer instead of sending it
        to the formatter object.

        Retrieve the stored data via the save_end() method.  Use of the
        save_bgn() / save_end() pair may not be nested.

        """
        self.savedata = ''

    def save_end(self):
        """Ends buffering character data and returns all data saved since
        the preceding call to the save_bgn() method.

        If the nofill flag is false, whitespace is collapsed to single
        spaces.  A call to this method without a preceding call to the
        save_bgn() method will raise a TypeError exception.

        """
        data = self.savedata
        if data is None:
            # Enforce the documented contract explicitly; otherwise the
            # failure would surface as an AttributeError from None.split()
            # or, with nofill set, as a silently returned None.
            raise TypeError(
                "save_end() called without a preceding save_bgn()")
        self.savedata = None
        if not self.nofill:
            data = ' '.join(data.split())
        return data

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """This method is called at the start of an anchor region.

        The arguments correspond to the attributes of the <A> tag with
        the same names.  The default implementation maintains a list of
        hyperlinks (defined by the HREF attribute for <A> tags) within
        the document.  The list of hyperlinks is available as the data
        attribute anchorlist.

        """
        self.anchor = href
        if self.anchor:
            self.anchorlist.append(href)

    def anchor_end(self):
        """This method is called at the end of an anchor region.

        The default implementation adds a textual footnote marker using an
        index into the list of hyperlinks created by the anchor_bgn()
        method.

        """
        if self.anchor:
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """This method is called to handle images.

        The default implementation simply passes the alt value to the
        handle_data() method.  The src value and any extra positional
        arguments are ignored here.

        """
        self.handle_data(alt)

    # --------- Top level elements

    def start_html(self, attrs):
        pass

    def end_html(self):
        pass

    def start_head(self, attrs):
        pass

    def end_head(self):
        pass

    def start_body(self, attrs):
        pass

    def end_body(self):
        pass

    # ------ Head elements

    def start_title(self, attrs):
        """Start capturing the document title text."""
        self.save_bgn()

    def end_title(self):
        """Store the captured title text in the title attribute."""
        self.title = self.save_end()

    def do_base(self, attrs):
        """Record the HREF attribute of a <BASE> tag in the base attribute."""
        for a, v in attrs:
            if a == 'href':
                self.base = v

    def do_isindex(self, attrs):
        """Note that the document contains an <ISINDEX> tag."""
        self.isindex = 1

    def do_link(self, attrs):
        pass

    def do_meta(self, attrs):
        pass

    def do_nextid(self, attrs):  # Deprecated
        pass

    # ------ Body elements

    # --- Headings

    def _heading_bgn(self, name):
        """Close the current paragraph and push the font for heading name."""
        self.formatter.end_paragraph(1)
        # The heading name goes in the first slot of the font tuple; the
        # third slot is the one start_b() sets, so headings are bold.
        self.formatter.push_font((name, 0, 1, 0))

    def _heading_end(self):
        """Close the heading paragraph and restore the previous font."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h1(self, attrs):
        self._heading_bgn('h1')

    def end_h1(self):
        self._heading_end()

    def start_h2(self, attrs):
        self._heading_bgn('h2')

    def end_h2(self):
        self._heading_end()

    def start_h3(self, attrs):
        self._heading_bgn('h3')

    def end_h3(self):
        self._heading_end()

    def start_h4(self, attrs):
        self._heading_bgn('h4')

    def end_h4(self):
        self._heading_end()

    def start_h5(self, attrs):
        self._heading_bgn('h5')

    def end_h5(self):
        self._heading_end()

    def start_h6(self, attrs):
        self._heading_bgn('h6')

    def end_h6(self):
        self._heading_end()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """Enter a preformatted region: data is passed through literally."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill += 1

    def end_pre(self):
        """Leave a preformatted region."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        # Clamp at zero so the counter never goes negative.
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        self.start_pre(attrs)
        self.setliteral('xmp')  # Tell SGML parser

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        self.start_pre(attrs)
        self.setliteral('listing')  # Tell SGML parser

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements

    def start_ul(self, attrs):
        """Open an unordered list whose items are labelled with '*'."""
        # Only an outermost list (empty stack) asks for a blank line.
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        """Close the innermost open list."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """Emit the label for a list item.

        Inside a list the innermost entry's counter is incremented and
        passed along with its label; outside any list the label '*' with
        counter 0 is used.

        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            top = self.list_stack[-1]
            top[2] = top[2] + 1
            label, counter = top[1], top[2]
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """Open an ordered list.

        The label format defaults to '1.' and may be overridden with the
        TYPE attribute; a one-character TYPE value gets a trailing '.'.

        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for a, v in attrs:
            if a == 'type':
                if len(v) == 1:
                    v = v + '.'
                label = v
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        """Close the innermost open list."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        self.ddpop(1)
        if self.list_stack:
            del self.list_stack[-1]

    def do_dt(self, attrs):
        self.ddpop()

    def do_dd(self, attrs):
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """End the paragraph and close a pending <DD> body, if any.

        bl is handed to the formatter's end_paragraph().  When the top of
        list_stack is a 'dd' entry it is removed and its margin popped.

        """
        self.formatter.end_paragraph(bl)
        if self.list_stack and self.list_stack[-1][0] == 'dd':
            del self.list_stack[-1]
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs):
        self.start_i(attrs)

    def end_cite(self):
        self.end_i()

    def start_code(self, attrs):
        self.start_tt(attrs)

    def end_code(self):
        self.end_tt()

    def start_em(self, attrs):
        self.start_i(attrs)

    def end_em(self):
        self.end_i()

    def start_kbd(self, attrs):
        self.start_tt(attrs)

    def end_kbd(self):
        self.end_tt()

    def start_samp(self, attrs):
        self.start_tt(attrs)

    def end_samp(self):
        self.end_tt()

    def start_strong(self, attrs):
        self.start_b(attrs)

    def end_strong(self):
        self.end_b()

    def start_var(self, attrs):
        self.start_i(attrs)

    def end_var(self):
        self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))

    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))

    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Collect the HREF, NAME and TYPE attributes and call anchor_bgn().

        Attribute values are stripped of surrounding whitespace; the TYPE
        value is additionally lower-cased.  Missing attributes are passed
        as empty strings.

        """
        href = ''
        name = ''
        type = ''
        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                type = value.lower()
        self.anchor_bgn(href, name, type)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """Collect the <IMG> attributes and call handle_image().

        Defaults are '' for src, ismap and align, '(image)' for alt and 0
        for width and height.  A width or height that is not a valid
        integer keeps the default 0.

        """
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            elif attrname == 'alt':
                alt = value
            elif attrname == 'ismap':
                ismap = value
            elif attrname == 'src':
                src = value
            elif attrname == 'width':
                try:
                    width = int(value)
                except ValueError:
                    pass
            elif attrname == 'height':
                try:
                    height = int(value)
                except ValueError:
                    pass
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        self.start_pre(attrs)
        self.setnomoretags()  # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000433
Guido van Rossum7c750e11995-02-27 13:16:55 +0000434
def test(args=None):
    """Parse one HTML file and render it through a formatter.

    args is an optional sequence of command-line style arguments; when it
    is empty or omitted, sys.argv[1:] is used instead.  The sequence is
    copied, so the caller's object is never modified.

    Recognised arguments, in order:

    -s    (optional) use formatter.NullFormatter instead of an
          AbstractFormatter wrapped around a DumbWriter
    file  name of the HTML file to parse; '-' reads standard input and
          the default is 'test.html'

    If the file cannot be opened, a message is printed and the function
    exits through sys.exit(1).

    """
    import sys
    import formatter

    # Copy before consuming the '-s' flag so a caller-supplied list is
    # left untouched.
    args = list(args or sys.argv[1:])

    silent = args and args[0] == '-s'
    if silent:
        del args[0]

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        data = sys.stdin.read()
    else:
        try:
            source = open(filename, 'r')
        except IOError:
            # sys.exc_info() fetches the exception without relying on a
            # version-specific spelling of the except clause.
            msg = sys.exc_info()[1]
            sys.stdout.write('%s : %s\n' % (filename, msg))
            sys.exit(1)
        try:
            data = source.read()
        finally:
            # Close the file even when read() fails.
            source.close()

    if silent:
        fmt = formatter.NullFormatter()
    else:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())

    parser = HTMLParser(fmt)
    parser.feed(data)
    parser.close()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000472
473
# Run the command-line self-test when this module is executed as a script.
if __name__ == '__main__':
    test()