blob: 73af9f6734c09a47282ce6fe715df402da25209a [file] [log] [blame]
"""HTML 2.0 parser.

See the HTML 2.0 specification:
http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
"""
Guido van Rossum7c750e11995-02-27 13:16:55 +00006
7
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +00008from sgmllib import SGMLParser
Guido van Rossumf54d9671995-08-07 20:07:44 +00009from formatter import AS_IS
Guido van Rossum7c750e11995-02-27 13:16:55 +000010
Skip Montanaro2dd42762001-01-23 15:35:05 +000011__all__ = ["HTMLParser"]
Guido van Rossum7c750e11995-02-27 13:16:55 +000012
class HTMLParser(SGMLParser):
    """HTML 2.0 parser that renders a document through a formatter object.

    Tag handlers follow the SGMLParser naming scheme: ``start_TAG`` /
    ``end_TAG`` for container elements and ``do_TAG`` for empty elements.
    Character data and layout requests are forwarded to ``self.formatter``.
    Font specifications pushed on the formatter are 4-tuples; see the
    ``formatter`` module documentation for the meaning of each slot.

    Instance attributes:
        formatter   -- the formatter object all output is sent to
        savedata    -- None, or the string buffer collecting character data
                       while a save_bgn()/save_end() pair is active
        isindex     -- set to 1 once an <ISINDEX> element has been seen
        title       -- text collected between <TITLE> and </TITLE>
        base        -- value of the HREF attribute of the last <BASE>
        anchor      -- href of the anchor currently open, or None
        anchorlist  -- list of every non-empty href seen by anchor_bgn()
        nofill      -- nesting depth of literal (<PRE>-like) sections
        list_stack  -- stack of [kind, label, counter] entries, one per
                       open list element
    """

    from htmlentitydefs import entitydefs

    def __init__(self, formatter, verbose=0):
        """Create a parser that writes to *formatter*.

        *verbose* is passed through unchanged to SGMLParser.__init__().
        """
        SGMLParser.__init__(self, verbose)
        self.formatter = formatter
        self.savedata = None
        self.isindex = 0
        self.title = None
        self.base = None
        self.anchor = None
        self.anchorlist = []
        self.nofill = 0
        self.list_stack = []

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden

    def handle_data(self, data):
        """Route character data to the save buffer or to the formatter.

        While saving is active the data is appended to the buffer.
        Otherwise it is sent as literal data inside a no-fill section and
        as flowing data everywhere else.
        """
        if self.savedata is not None:
            self.savedata += data
        elif self.nofill:
            self.formatter.add_literal_data(data)
        else:
            self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Start collecting character data in a buffer."""
        self.savedata = ''

    def save_end(self):
        """Stop collecting character data and return what was collected.

        Outside a no-fill section, runs of whitespace are collapsed to a
        single space and leading/trailing whitespace is dropped.  Returns
        None when no save was in progress (for instance when two nested
        elements both call save_end()), instead of failing on the missing
        buffer.
        """
        data = self.savedata
        self.savedata = None
        if data is not None and not self.nofill:
            data = ' '.join(data.split())
        return data

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """Record the start of an anchor.

        A non-empty *href* is remembered as the current anchor and appended
        to ``anchorlist``.  *name* and *type* are accepted for subclasses
        and are not used here.
        """
        self.anchor = href
        if self.anchor:
            self.anchorlist.append(href)

    def anchor_end(self):
        """Close the current anchor.

        If the anchor had an href, emit a "[N]" marker where N is the
        number of hrefs recorded so far.
        """
        if self.anchor:
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """Render an image by emitting its *alt* text as character data.

        *src* and the extra positional arguments supplied by do_img() are
        ignored by this default implementation.
        """
        self.handle_data(alt)

    # --------- Top level elements

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    # ------ Head elements

    def start_title(self, attrs):
        """Begin saving the document title."""
        self.save_bgn()

    def end_title(self):
        """Store the saved title text in ``self.title``."""
        self.title = self.save_end()

    def do_base(self, attrs):
        """Remember the HREF attribute of a <BASE> element in ``self.base``."""
        for a, v in attrs:
            if a == 'href':
                self.base = v

    def do_isindex(self, attrs):
        """Flag that the document contains an <ISINDEX> element."""
        self.isindex = 1

    def do_link(self, attrs):
        pass

    def do_meta(self, attrs):
        pass

    def do_nextid(self, attrs): # Deprecated
        pass

    # ------ Body elements

    # --- Headings

    def _heading_bgn(self, tag):
        """Shared start handler for <H1>..<H6>; *tag* is the font size slot."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((tag, 0, 1, 0))

    def _heading_end(self):
        """Shared end handler for <H1>..<H6>."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h1(self, attrs): self._heading_bgn('h1')
    def end_h1(self): self._heading_end()

    def start_h2(self, attrs): self._heading_bgn('h2')
    def end_h2(self): self._heading_end()

    def start_h3(self, attrs): self._heading_bgn('h3')
    def end_h3(self): self._heading_end()

    def start_h4(self, attrs): self._heading_bgn('h4')
    def end_h4(self): self._heading_end()

    def start_h5(self, attrs): self._heading_bgn('h5')
    def end_h5(self): self._heading_end()

    def start_h6(self, attrs): self._heading_bgn('h6')
    def end_h6(self): self._heading_end()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        """Handle <P> by ending the current paragraph."""
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """Enter a no-fill section: data is passed on as literal data."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill = self.nofill + 1

    def end_pre(self):
        """Leave a no-fill section.

        The depth counter is clamped at zero so that an unbalanced end tag
        cannot drive it negative.
        """
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        self.start_pre(attrs)
        self.setliteral('xmp') # Tell SGML parser

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        self.start_pre(attrs)
        self.setliteral('listing') # Tell SGML parser

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements

    def start_ul(self, attrs):
        """Open an unordered list; items are labelled '*'."""
        # end_paragraph gets a true value only for an outermost list.
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        """Close the innermost list entry and restore the margin."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """Start a list item and emit its label.

        Inside a list the counter of the innermost entry is incremented and
        passed along with that entry's label.  Outside any list the label is
        '*' with counter 0.
        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            top = self.list_stack[-1]
            top[2] = top[2] + 1
            label, counter = top[1], top[2]
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """Open an ordered list.

        The label format defaults to '1.'.  A TYPE attribute overrides it;
        a one-character value gets a trailing '.' appended.
        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for a, v in attrs:
            if a == 'type':
                if len(v) == 1:
                    v = v + '.'
                label = v
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        """Close the innermost list entry and restore the margin."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        """Open a definition list."""
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        """Close a definition list, first closing any open <DD>."""
        self.ddpop(1)
        if self.list_stack:
            del self.list_stack[-1]

    def do_dt(self, attrs):
        """Start a definition term, closing any open <DD>."""
        self.ddpop()

    def do_dd(self, attrs):
        """Start a definition body: close any open <DD>, then push a margin."""
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """End the paragraph and, if a <DD> is innermost, close it.

        *bl* is passed on to the formatter's end_paragraph().
        """
        self.formatter.end_paragraph(bl)
        if self.list_stack and self.list_stack[-1][0] == 'dd':
            del self.list_stack[-1]
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs): self.start_i(attrs)
    def end_cite(self): self.end_i()

    def start_code(self, attrs): self.start_tt(attrs)
    def end_code(self): self.end_tt()

    def start_em(self, attrs): self.start_i(attrs)
    def end_em(self): self.end_i()

    def start_kbd(self, attrs): self.start_tt(attrs)
    def end_kbd(self): self.end_tt()

    def start_samp(self, attrs): self.start_tt(attrs)
    def end_samp(self): self.end_tt()

    def start_strong(self, attrs): self.start_b(attrs)
    def end_strong(self): self.end_b()

    def start_var(self, attrs): self.start_i(attrs)
    def end_var(self): self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Collect HREF, NAME and TYPE from an <A> tag and call anchor_bgn().

        Attribute values are stripped of surrounding whitespace; TYPE is
        additionally lower-cased.  Missing attributes default to ''.
        """
        href = ''
        name = ''
        atype = ''  # not called 'type' so the builtin is not shadowed
        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                atype = value.lower()
        self.anchor_bgn(href, name, atype)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """Collect the attributes of an <IMG> tag and call handle_image().

        Defaults: ALT is '(image)', ALIGN, ISMAP and SRC are '', WIDTH and
        HEIGHT are 0.  A WIDTH or HEIGHT that is not a valid integer is
        ignored and the default is kept.
        """
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            elif attrname == 'alt':
                alt = value
            elif attrname == 'ismap':
                ismap = value
            elif attrname == 'src':
                src = value
            elif attrname == 'width':
                # Only a malformed number is tolerated; anything else
                # (e.g. KeyboardInterrupt) must propagate.
                try:
                    width = int(value)
                except ValueError:
                    pass
            elif attrname == 'height':
                try:
                    height = int(value)
                except ValueError:
                    pass
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        self.start_pre(attrs)
        self.setnomoretags() # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000384
Guido van Rossum7c750e11995-02-27 13:16:55 +0000385
def test(args = None):
    """Parse an HTML file and render it as plain text on standard output.

    *args* is a list of command-line style arguments, ``[-s] [file]``;
    when it is empty or None, ``sys.argv[1:]`` is used instead.

      -s     parse silently (output goes to a NullFormatter)
      file   file to parse; '-' means standard input; the default is
             'test.html'

    If the file cannot be opened, a message is printed and the process
    exits with status 1.  The caller's *args* list is never modified.
    """
    import sys, formatter

    if not args:
        args = sys.argv[1:]

    silent = args and args[0] == '-s'
    if silent:
        # Slice rather than 'del args[0]' so the caller's list is untouched.
        args = args[1:]

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError:
            # sys.exc_info() avoids version-specific 'except' syntax.
            msg = sys.exc_info()[1]
            print("%s : %s" % (filename, msg))
            sys.exit(1)
        # Close the file even if reading fails.
        try:
            data = f.read()
        finally:
            f.close()

    if silent:
        fmt = formatter.NullFormatter()
    else:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())

    parser = HTMLParser(fmt)
    parser.feed(data)
    parser.close()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000423
424
# Run the self-test driver when this module is executed as a script.
if __name__ == "__main__":
    test()