blob: 24a2e2f3c71ff45f9649519a81cf46cc99162028 [file] [log] [blame]
"""HTML 2.0 parser.

See the HTML 2.0 specification:
http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
"""
Guido van Rossum7c750e11995-02-27 13:16:55 +00006
Fred Drake583359e2004-09-09 02:24:13 +00007import sgmllib
Guido van Rossum7c750e11995-02-27 13:16:55 +00008
Guido van Rossumf54d9671995-08-07 20:07:44 +00009from formatter import AS_IS
Guido van Rossum7c750e11995-02-27 13:16:55 +000010
# Public API: the names exported by a star-import of this module.
__all__ = ["HTMLParser", "HTMLParseError"]
Guido van Rossum7c750e11995-02-27 13:16:55 +000012
Fred Drake583359e2004-09-09 02:24:13 +000013
class HTMLParseError(sgmllib.SGMLParseError):
    """Error raised when an HTML document can't be parsed.

    HTMLParser.error() raises this exception, passing along the message
    it was given.
    """
16
17
class HTMLParser(sgmllib.SGMLParser):
    """This is the basic HTML parser class.

    It supports all entity names required by the XHTML 1.0 Recommendation.
    It also defines handlers for all HTML 2.0 and many HTML 3.0 and 3.2
    elements.

    Text and layout events are forwarded to the formatter object passed
    to the constructor.  reset() (re)initialises these instance attributes:

    savedata   -- buffer filled between save_bgn() and save_end(), else None
    isindex    -- set to 1 once an <ISINDEX> tag has been seen
    title      -- text collected from the <TITLE> element, or None
    base       -- HREF of the most recent <BASE> tag, or None
    anchor     -- HREF passed to the last anchor_bgn(), None after anchor_end()
    anchorlist -- list of the non-empty anchor HREFs seen so far
    nofill     -- nesting depth of literal (<PRE>-like) sections
    list_stack -- stack of [kind, label, counter] entries for open lists

    NOTE(review): these attributes are assumed to exist after construction
    because sgmllib.SGMLParser.__init__ calls reset() -- confirm against
    sgmllib.

    """

    # Entity definitions, made available as the class attribute 'entitydefs'.
    from htmlentitydefs import entitydefs

    def __init__(self, formatter, verbose=0):
        """Creates an instance of the HTMLParser class.

        The formatter parameter is the formatter instance associated with
        the parser.  The verbose parameter is handed on unchanged to
        sgmllib.SGMLParser.

        """
        sgmllib.SGMLParser.__init__(self, verbose)
        self.formatter = formatter

    def error(self, message):
        """Report a parse error by raising HTMLParseError(message)."""
        raise HTMLParseError(message)

    def reset(self):
        """Reset the base parser and all HTML-specific state."""
        sgmllib.SGMLParser.reset(self)
        self.savedata = None
        self.isindex = 0
        self.title = None
        self.base = None
        self.anchor = None
        self.anchorlist = []
        self.nofill = 0
        self.list_stack = []

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden

    def handle_data(self, data):
        """Route character data to the save buffer or to the formatter.

        While a save_bgn()/save_end() pair is open the data is appended to
        the buffer.  Otherwise it goes to the formatter, as literal data
        when nofill is set and as flowing data when it is not.

        """
        if self.savedata is not None:
            self.savedata = self.savedata + data
        elif self.nofill:
            self.formatter.add_literal_data(data)
        else:
            self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Begins saving character data in a buffer instead of sending it
        to the formatter object.

        Retrieve the stored data via the save_end() method.  Use of the
        save_bgn() / save_end() pair may not be nested.

        """
        self.savedata = ''

    def save_end(self):
        """Ends buffering character data and returns all data saved since
        the preceding call to the save_bgn() method.

        If the nofill flag is false, whitespace is collapsed to single
        spaces.  A call to this method without a preceding call to the
        save_bgn() method will raise a TypeError exception.

        """
        data = self.savedata
        if data is None:
            # Enforce the documented contract explicitly.  Without this
            # check the failure was an AttributeError from None.split(),
            # or a silent None return while nofill was set.
            raise TypeError("save_end() called without a preceding "
                            "save_bgn()")
        self.savedata = None
        if not self.nofill:
            data = ' '.join(data.split())
        return data

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """This method is called at the start of an anchor region.

        The arguments correspond to the attributes of the <A> tag with
        the same names.  The default implementation maintains a list of
        hyperlinks (defined by the HREF attribute for <A> tags) within
        the document.  The list of hyperlinks is available as the data
        attribute anchorlist.

        """
        self.anchor = href
        if self.anchor:
            self.anchorlist.append(href)

    def anchor_end(self):
        """This method is called at the end of an anchor region.

        The default implementation adds a textual footnote marker using an
        index into the list of hyperlinks created by the anchor_bgn() method.

        """
        if self.anchor:
            # The marker is the 1-based position of the newest hyperlink.
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """This method is called to handle images.

        The default implementation simply passes the alt value to the
        handle_data() method.  do_img() also supplies ismap, align, width
        and height as extra positional arguments; they are ignored here.

        """
        self.handle_data(alt)

    # --------- Top level elements

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    # ------ Head elements

    def start_title(self, attrs):
        """Start collecting the document title in the save buffer."""
        self.save_bgn()

    def end_title(self):
        """Store the collected title text in the title attribute."""
        self.title = self.save_end()

    def do_base(self, attrs):
        """Record the HREF attribute of a <BASE> tag in self.base."""
        for a, v in attrs:
            if a == 'href':
                self.base = v

    def do_isindex(self, attrs):
        """Note that the document contains an <ISINDEX> tag."""
        self.isindex = 1

    def do_link(self, attrs):
        pass

    def do_meta(self, attrs):
        pass

    def do_nextid(self, attrs): # Deprecated
        pass

    # ------ Body elements

    # --- Headings

    def _heading_bgn(self, tag):
        # Shared by start_h1..start_h6: end the paragraph with a blank
        # line, then push a font spec whose first item is the heading tag.
        self.formatter.end_paragraph(1)
        self.formatter.push_font((tag, 0, 1, 0))

    def _heading_end(self):
        # Shared by end_h1..end_h6: end the paragraph and restore the font.
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h1(self, attrs):
        self._heading_bgn('h1')

    def end_h1(self):
        self._heading_end()

    def start_h2(self, attrs):
        self._heading_bgn('h2')

    def end_h2(self):
        self._heading_end()

    def start_h3(self, attrs):
        self._heading_bgn('h3')

    def end_h3(self):
        self._heading_end()

    def start_h4(self, attrs):
        self._heading_bgn('h4')

    def end_h4(self):
        self._heading_end()

    def start_h5(self, attrs):
        self._heading_bgn('h5')

    def end_h5(self):
        self._heading_end()

    def start_h6(self, attrs):
        self._heading_bgn('h6')

    def end_h6(self):
        self._heading_end()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """Enter a literal section: change font and raise the nofill depth."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill = self.nofill + 1

    def end_pre(self):
        """Leave a literal section; the nofill depth never drops below 0."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        self.start_pre(attrs)
        self.setliteral('xmp') # Tell SGML parser

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        self.start_pre(attrs)
        self.setliteral('listing') # Tell SGML parser

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements

    def start_ul(self, attrs):
        """Open an unordered list; items are labelled with '*'."""
        # A blank line is requested only for an outermost list.
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        """Close the innermost list and restore the margin."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """Emit the label for a list item.

        Inside a list the innermost entry's counter is incremented and
        passed on with that entry's label; outside any list the label
        is '*' with counter 0.

        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            top = self.list_stack[-1]
            top[2] = top[2] + 1
            label, counter = top[1], top[2]
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """Open an ordered list.

        The label format defaults to '1.'.  A TYPE attribute replaces it;
        a single-character value gets a trailing '.' appended.

        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for a, v in attrs:
            if a == 'type':
                if len(v) == 1:
                    v = v + '.'
                label = v
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        """Close the innermost list and restore the margin."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        self.ddpop(1)
        if self.list_stack:
            del self.list_stack[-1]

    def do_dt(self, attrs):
        self.ddpop()

    def do_dd(self, attrs):
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """End the paragraph and close a pending <DD> section, if any.

        bl is passed to the formatter's end_paragraph().  When the
        innermost list_stack entry is a 'dd', it is removed and its
        margin is popped.

        """
        self.formatter.end_paragraph(bl)
        if self.list_stack and self.list_stack[-1][0] == 'dd':
            del self.list_stack[-1]
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs): self.start_i(attrs)
    def end_cite(self): self.end_i()

    def start_code(self, attrs): self.start_tt(attrs)
    def end_code(self): self.end_tt()

    def start_em(self, attrs): self.start_i(attrs)
    def end_em(self): self.end_i()

    def start_kbd(self, attrs): self.start_tt(attrs)
    def end_kbd(self): self.end_tt()

    def start_samp(self, attrs): self.start_tt(attrs)
    def end_samp(self): self.end_tt()

    def start_strong(self, attrs): self.start_b(attrs)
    def end_strong(self): self.end_b()

    def start_var(self, attrs): self.start_i(attrs)
    def end_var(self): self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Extract HREF, NAME and TYPE from an <A> tag; call anchor_bgn().

        Every attribute value is stripped of surrounding whitespace; the
        TYPE value is additionally lower-cased.  Attributes that are
        absent are passed as ''.

        """
        href = ''
        name = ''
        linktype = ''
        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                linktype = value.lower()
        self.anchor_bgn(href, name, linktype)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """Collect the attributes of an <IMG> tag and call handle_image().

        Defaults are alt='(image)', width=0, height=0 and '' for align,
        ismap and src.  WIDTH and HEIGHT values that are not valid
        integers are ignored, leaving the default in place.

        """
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            elif attrname == 'alt':
                alt = value
            elif attrname == 'ismap':
                ismap = value
            elif attrname == 'src':
                src = value
            elif attrname == 'width':
                try:
                    width = int(value)
                except ValueError:
                    pass
            elif attrname == 'height':
                try:
                    height = int(value)
                except ValueError:
                    pass
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        self.start_pre(attrs)
        self.setnomoretags() # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000444
Guido van Rossum7c750e11995-02-27 13:16:55 +0000445
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000446def test(args = None):
447 import sys, formatter
448
449 if not args:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000450 args = sys.argv[1:]
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000451
452 silent = args and args[0] == '-s'
453 if silent:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000454 del args[0]
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000455
456 if args:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000457 file = args[0]
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000458 else:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000459 file = 'test.html'
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000460
461 if file == '-':
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000462 f = sys.stdin
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000463 else:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000464 try:
465 f = open(file, 'r')
466 except IOError, msg:
467 print file, ":", msg
468 sys.exit(1)
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000469
470 data = f.read()
471
472 if f is not sys.stdin:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000473 f.close()
Tim Peters07e99cb2001-01-14 23:47:14 +0000474
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000475 if silent:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000476 f = formatter.NullFormatter()
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000477 else:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000478 f = formatter.AbstractFormatter(formatter.DumbWriter())
Guido van Rossuma98b0a11996-02-13 00:02:10 +0000479
Guido van Rossumf54d9671995-08-07 20:07:44 +0000480 p = HTMLParser(f)
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000481 p.feed(data)
482 p.close()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000483
484
# Run the self-test when this module is executed as a script.
if __name__ == '__main__':
    test()