blob: 38312c6670850d6fc1dce0eba4b71c4b069630df [file] [log] [blame]
"""HTML 2.0 parser.

See the HTML 2.0 specification:
http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
"""
Guido van Rossum7c750e11995-02-27 13:16:55 +00006
7
Guido van Rossum7c750e11995-02-27 13:16:55 +00008import sys
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +00009import regsub
Guido van Rossum7c750e11995-02-27 13:16:55 +000010import string
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +000011from sgmllib import SGMLParser
Guido van Rossumf54d9671995-08-07 20:07:44 +000012from formatter import AS_IS
Guido van Rossum7c750e11995-02-27 13:16:55 +000013
Guido van Rossum7c750e11995-02-27 13:16:55 +000014
class HTMLParser(SGMLParser):
    """HTML 2.0 parser that renders a document through a formatter.

    Each handler is named after the tag it serves: ``start_TAG`` and
    ``end_TAG`` for container elements, ``do_TAG`` for empty elements.
    All visible output goes through the formatter object handed to the
    constructor.  The ``*_bgn``/``*_end``/``handle_*`` hooks for anchors,
    images and forms have simple textual defaults and are the intended
    points for subclasses to override.

    Font tuples pushed on the formatter have four slots; as used in this
    class they are (size/name, italic, bold, teletype), with ``AS_IS``
    meaning "leave unchanged".
    """

    def __init__(self, formatter):
        """Initialise parser state; *formatter* receives all output."""
        SGMLParser.__init__(self)
        self.formatter = formatter
        self.savedata = None    # None, or text collected since save_bgn()
        self.isindex = 0        # set to 1 by do_isindex()
        self.title = None       # set by end_title()
        self.base = None        # HREF of <BASE>, set by do_base()
        self.anchor = None      # HREF of the anchor currently open, if any
        self.anchorlist = []    # every non-empty anchor HREF, in order
        self.nofill = 0         # nesting depth of start_pre() calls
        self.list_stack = []    # open lists, each [kind, label, counter]

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden

    def handle_data(self, data):
        """Send *data* to the save buffer if active, else to the formatter.

        Outside save mode, data is added literally while inside a
        preformatted section (``nofill`` non-zero) and as flowing text
        otherwise.
        """
        if self.savedata is not None:
            self.savedata = self.savedata + data
        elif self.nofill:
            self.formatter.add_literal_data(data)
        else:
            self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Start collecting character data instead of formatting it."""
        self.savedata = ''

    def save_end(self):
        """Stop collecting and return the text with whitespace normalised.

        Runs of whitespace are collapsed to single spaces and leading and
        trailing whitespace is dropped.  Returns '' when no save was in
        progress, so an unmatched end tag cannot crash the parser.
        """
        data = self.savedata
        self.savedata = None
        if data is None:
            return ''
        return string.join(string.split(data))

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """Record the start of an anchor; non-empty HREFs are listed."""
        self.anchor = href
        if self.anchor:
            self.anchorlist.append(href)

    def anchor_end(self):
        """Close the current anchor, emitting its "[N]" reference mark.

        N is the current length of ``anchorlist``, i.e. the 1-based index
        of the most recently recorded HREF.
        """
        if self.anchor:
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt):
        """Render an image; the default just emits its ALT text."""
        self.handle_data(alt)

    # --- Hooks for forms; should probably be overridden

    def form_bgn(self, action, method, enctype):
        """Start a form: paragraph break followed by a "<FORM>" marker."""
        self.do_p([])
        self.handle_data("<FORM>")

    def form_end(self):
        """End a form: "</FORM>" marker followed by a paragraph break."""
        self.handle_data("</FORM>")
        self.do_p([])

    def handle_input(self, type, options):
        """Render an input field; the default emits an "<INPUT>" marker."""
        self.handle_data("<INPUT>")

    def select_bgn(self, name, size, multiple):
        """Start a selection list; the default emits "<SELECT>"."""
        self.handle_data("<SELECT>")

    def select_end(self):
        """End a selection list; the default emits "</SELECT>"."""
        self.handle_data("</SELECT>")

    def handle_option(self, value, selected):
        """Render one option; the default emits an "<OPTION>" marker."""
        self.handle_data("<OPTION>")

    def textarea_bgn(self, name, rows, cols):
        """Start a text area: "<TEXTAREA>" marker, then preformatted mode."""
        self.handle_data("<TEXTAREA>")
        self.start_pre([])

    def textarea_end(self):
        """End a text area: leave preformatted mode, emit "</TEXTAREA>"."""
        self.end_pre()
        self.handle_data("</TEXTAREA>")

    # --------- Top level elements

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    # ------ Head elements

    def start_title(self, attrs):
        """Begin collecting the document title."""
        self.save_bgn()

    def end_title(self):
        """Store the collected, whitespace-normalised title."""
        self.title = self.save_end()

    def do_base(self, attrs):
        """Remember the HREF of a <BASE> element in ``self.base``."""
        for a, v in attrs:
            if a == 'href':
                self.base = v

    def do_isindex(self, attrs):
        """Flag that the document contains <ISINDEX>."""
        self.isindex = 1

    def do_link(self, attrs):
        pass

    def do_meta(self, attrs):
        pass

    def do_nextid(self, attrs): # Deprecated
        pass

    # ------ Body elements

    # --- Headings

    def _start_heading(self, fontname):
        """Shared heading start: paragraph break, then bold *fontname*."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((fontname, 0, 1, 0))

    def _end_heading(self):
        """Shared heading end: paragraph break, then restore the font."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h1(self, attrs): self._start_heading('h1')
    def end_h1(self): self._end_heading()

    def start_h2(self, attrs): self._start_heading('h2')
    def end_h2(self): self._end_heading()

    def start_h3(self, attrs): self._start_heading('h3')
    def end_h3(self): self._end_heading()

    def start_h4(self, attrs): self._start_heading('h4')
    def end_h4(self): self._end_heading()

    def start_h5(self, attrs): self._start_heading('h5')
    def end_h5(self): self._end_heading()

    def start_h6(self, attrs): self._start_heading('h6')
    def end_h6(self): self._end_heading()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        """Paragraph break."""
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """Enter preformatted mode: teletype font, data added literally."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill = self.nofill + 1

    def end_pre(self):
        """Leave one level of preformatted mode."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        # Clamp at zero so an unmatched end tag cannot drive it negative.
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        self.start_pre(attrs)
        self.setliteral('xmp') # Tell SGML parser

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        self.start_pre(attrs)
        self.setliteral('listing') # Tell SGML parser

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        """Start an address block, rendered in italics."""
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        """Start a block quotation by pushing a 'blockquote' margin."""
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_margin()

    # --- List Elements

    def start_ul(self, attrs):
        """Open an unordered list whose items are labelled '*'."""
        # The argument to end_paragraph is true only for an outermost list.
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        """Close the innermost list and pop its margin."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """Emit the label for a list item.

        Inside a list, the innermost list's counter is incremented and
        passed with its label; outside any list, label '*' with counter
        0 is used.
        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            entry = self.list_stack[-1]
            label = entry[1]
            counter = entry[2] + 1
            entry[2] = counter
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """Open an ordered list.

        The label format defaults to '1.'; a TYPE attribute replaces it,
        with a '.' appended when the value is a single character.
        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for a, v in attrs:
            if a == 'type':
                if len(v) == 1:
                    v = v + '.'
                label = v
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        """Close the innermost list and pop its margin."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        """Open a definition list."""
        self.formatter.end_paragraph(0)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        """Close a definition list, first closing any open <DD>."""
        self.ddpop()
        if self.list_stack:
            del self.list_stack[-1]

    def do_dt(self, attrs):
        """Start a term; this closes any open <DD>."""
        self.ddpop()

    def do_dd(self, attrs):
        """Start a definition, indented with a 'dd' margin."""
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self):
        """End the paragraph and close an open <DD>, if one is innermost."""
        self.formatter.end_paragraph(0)
        if self.list_stack and self.list_stack[-1][0] == 'dd':
            del self.list_stack[-1]
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs): self.start_i(attrs)
    def end_cite(self): self.end_i()

    def start_code(self, attrs): self.start_tt(attrs)
    def end_code(self): self.end_tt()

    def start_em(self, attrs): self.start_i(attrs)
    def end_em(self): self.end_i()

    def start_kbd(self, attrs): self.start_tt(attrs)
    def end_kbd(self): self.end_tt()

    def start_samp(self, attrs): self.start_tt(attrs)
    def end_samp(self): self.end_tt()

    # STRONG is rendered like B.
    def start_strong(self, attrs): self.start_b(attrs)
    def end_strong(self): self.end_b()

    # VAR is rendered like I.
    def start_var(self, attrs): self.start_i(attrs)
    def end_var(self): self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Extract HREF, NAME and (lower-cased) TYPE; call anchor_bgn()."""
        href = ''
        name = ''
        atype = ''
        for attrname, value in attrs:
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                atype = string.lower(value)
        self.anchor_bgn(href, name, atype)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """Extract SRC and ALT and call handle_image().

        ALT defaults to '(image)' and SRC to ''.  Other attributes are
        ignored because handle_image() has no parameter for them.
        """
        alt = '(image)'
        src = ''
        for attrname, value in attrs:
            if attrname == 'alt':
                alt = value
            elif attrname == 'src':
                src = value
        self.handle_image(src, alt)

    # ------ Forms

    def start_form(self, attrs):
        """Extract ACTION, METHOD and ENCTYPE and call form_bgn()."""
        action = ''
        method = ''
        enctype = ''
        for a, v in attrs:
            if a == 'action':
                action = v
            elif a == 'method':
                method = v
            elif a == 'enctype':
                enctype = v
        self.form_bgn(action, method, enctype)

    def end_form(self):
        self.form_end()

    def do_input(self, attrs):
        """Call handle_input() with the lower-cased TYPE and other attrs.

        Every attribute other than TYPE is passed in the *options*
        dictionary, keyed by attribute name.
        """
        input_type = ''
        options = {}
        for a, v in attrs:
            if a == 'type':
                input_type = string.lower(v)
            else:
                options[a] = v
        self.handle_input(input_type, options)

    def start_select(self, attrs):
        """Extract NAME, SIZE and MULTIPLE and call select_bgn().

        SIZE is passed as an integer and stays 0 when absent or not
        numeric; MULTIPLE is 1 when the attribute is present, else 0.
        """
        name = ''
        size = 0
        multiple = 0
        for a, v in attrs:
            if a == 'multiple':
                multiple = 1
            elif a == 'name':
                name = v
            elif a == 'size':
                try:
                    size = string.atoi(v)
                except string.atoi_error:
                    pass    # non-numeric SIZE: keep the default
        self.select_bgn(name, size, multiple)

    def end_select(self):
        self.select_end()

    def do_option(self, attrs):
        """Extract VALUE and SELECTED and call handle_option().

        *selected* is 1 only when the SELECTED attribute is present.
        """
        value = ''
        selected = 0
        for a, v in attrs:
            if a == 'value':
                value = v
            elif a == 'selected':
                selected = 1
        self.handle_option(value, selected)

    def start_textarea(self, attrs):
        """Extract NAME, ROWS and COLS and call textarea_bgn().

        ROWS and COLS are passed as integers and stay 0 when absent or
        not numeric.
        """
        name = ''
        rows = 0
        cols = 0
        for a, v in attrs:
            if a == 'name':
                name = v
            elif a == 'rows':
                try:
                    rows = string.atoi(v)
                except string.atoi_error:
                    pass    # non-numeric ROWS: keep the default
            elif a == 'cols':
                try:
                    cols = string.atoi(v)
                except string.atoi_error:
                    pass    # non-numeric COLS: keep the default
        self.textarea_bgn(name, rows, cols)

    def end_textarea(self):
        self.textarea_end()

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        self.start_pre(attrs)
        self.setnomoretags() # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000466
Guido van Rossum7c750e11995-02-27 13:16:55 +0000467
def test():
    """Parse an HTML file and render it through a DumbWriter.

    The file named by the first command-line argument is used, or
    'test.html' when no argument is given.
    """
    import sys
    filename = 'test.html'
    if sys.argv[1:]:
        filename = sys.argv[1]
    fp = open(filename, 'r')
    try:
        data = fp.read()
    finally:
        # Close the file even if reading fails.
        fp.close()
    from formatter import DumbWriter, AbstractFormatter
    w = DumbWriter()
    f = AbstractFormatter(w)
    p = HTMLParser(f)
    p.feed(data)
    p.close()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000481
482
# Run the demonstration when this module is executed as a script.
if __name__ == '__main__':
    test()