blob: c5c6f09e8b432742eb28b6a85f491ad4f8b563bb [file] [log] [blame]
"""HTML 2.0 parser.

See the HTML 2.0 specification:
http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
"""
Guido van Rossum7c750e11995-02-27 13:16:55 +00006
7
Guido van Rossum7c750e11995-02-27 13:16:55 +00008import sys
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +00009import regsub
Guido van Rossum7c750e11995-02-27 13:16:55 +000010import string
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +000011from sgmllib import SGMLParser
Guido van Rossumf54d9671995-08-07 20:07:44 +000012from formatter import AS_IS
Guido van Rossum7c750e11995-02-27 13:16:55 +000013
Guido van Rossum7c750e11995-02-27 13:16:55 +000014
class HTMLParser(SGMLParser):
    """SGML parser subclass that feeds HTML 2.0 documents to a formatter.

    Tag handlers follow the SGMLParser naming scheme used throughout this
    class: start_TAG(attrs) / end_TAG() for container elements and
    do_TAG(attrs) for empty elements.  Handlers translate markup into calls
    on the formatter object passed to the constructor.

    Instance attributes set by __init__:

    formatter  -- the formatter object receiving all output
    savedata   -- None, or the text collected since save_bgn()
    isindex    -- set to 1 once an <ISINDEX> tag has been seen
    title      -- text of the <TITLE> element, or None
    base       -- HREF value of the last <BASE> tag, or None
    anchor     -- HREF of the anchor currently open, or None
    anchorlist -- list of every non-empty anchor HREF seen so far
    nofill     -- nesting depth of literal-text (e.g. <PRE>) elements
    list_stack -- stack of [kind, label, counter] lists, one per open list
    """

    from htmlentitydefs import entitydefs

    def __init__(self, formatter, verbose=0):
        """Create a parser writing to formatter.

        verbose is passed unchanged to SGMLParser.__init__.
        """
        SGMLParser.__init__(self, verbose)
        self.formatter = formatter
        self.savedata = None
        self.isindex = 0
        self.title = None
        self.base = None
        self.anchor = None
        self.anchorlist = []
        self.nofill = 0
        self.list_stack = []

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden

    def handle_data(self, data):
        """Route character data to the save buffer or to the formatter.

        While saving (between save_bgn() and save_end()) data is only
        accumulated.  Otherwise it goes to the formatter as literal data
        when nofill is non-zero and as flowing data when it is zero.
        """
        if self.savedata is not None:
            self.savedata = self.savedata + data
        elif self.nofill:
            self.formatter.add_literal_data(data)
        else:
            self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Start collecting character data instead of formatting it."""
        self.savedata = ''

    def save_end(self):
        """Stop collecting character data and return what was collected.

        Unless nofill is non-zero, runs of whitespace in the collected text
        are collapsed to single spaces and leading/trailing whitespace is
        removed.  Returns None if save_bgn() was not called first.
        """
        data = self.savedata
        self.savedata = None
        # savedata is None whenever no save is in progress; splitting None
        # would raise, so only normalize text that was really collected.
        if data is not None and not self.nofill:
            data = string.join(string.split(data))
        return data

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """Called for <A>; records href and appends it to anchorlist.

        name and type are accepted for subclasses and not used here.
        """
        self.anchor = href
        if self.anchor:
            self.anchorlist.append(href)

    def anchor_end(self):
        """Called for </A>; emits a "[N]" marker for anchors with an HREF.

        N is the current length of anchorlist.
        """
        if self.anchor:
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """Called for <IMG>; this default just emits the ALT text.

        do_img() passes ismap, align, width and height as the extra
        positional arguments.
        """
        self.handle_data(alt)

    # --------- Top level elements

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    # ------ Head elements

    def start_title(self, attrs):
        """Begin capturing the document title."""
        self.save_bgn()

    def end_title(self):
        """Store the captured title text in self.title."""
        self.title = self.save_end()

    def do_base(self, attrs):
        """Remember the HREF attribute of <BASE> in self.base."""
        for a, v in attrs:
            if a == 'href':
                self.base = v

    def do_isindex(self, attrs):
        """Flag that the document contains <ISINDEX>."""
        self.isindex = 1

    def do_link(self, attrs):
        pass

    def do_meta(self, attrs):
        pass

    def do_nextid(self, attrs): # Deprecated
        pass

    # ------ Body elements

    # --- Headings

    def _heading_bgn(self, size):
        """Shared start handler for <H1>..<H6>; size is the font size name."""
        self.formatter.end_paragraph(1)
        # Font tuples in this class appear to be (size, italic, bold,
        # teletype), judging by start_i / start_b / start_tt below.
        self.formatter.push_font((size, 0, 1, 0))

    def _heading_end(self):
        """Shared end handler for </H1>..</H6>."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h1(self, attrs): self._heading_bgn('h1')
    def end_h1(self): self._heading_end()

    def start_h2(self, attrs): self._heading_bgn('h2')
    def end_h2(self): self._heading_end()

    def start_h3(self, attrs): self._heading_bgn('h3')
    def end_h3(self): self._heading_end()

    def start_h4(self, attrs): self._heading_bgn('h4')
    def end_h4(self): self._heading_end()

    def start_h5(self, attrs): self._heading_bgn('h5')
    def end_h5(self): self._heading_end()

    def start_h6(self, attrs): self._heading_bgn('h6')
    def end_h6(self): self._heading_end()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        """<P>: end the current paragraph."""
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """<PRE>: switch to the fourth font flag and literal (nofill) mode."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill = self.nofill + 1

    def end_pre(self):
        """</PRE>: undo start_pre(); nofill never drops below zero."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        """<XMP>: like <PRE>, with the SGML parser put in literal mode."""
        self.start_pre(attrs)
        self.setliteral('xmp') # Tell SGML parser

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        """<LISTING>: like <PRE>, with the SGML parser put in literal mode."""
        self.start_pre(attrs)
        self.setliteral('listing') # Tell SGML parser

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        """<ADDRESS>: new paragraph, same font change as start_i()."""
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        """<BLOCKQUOTE>: new paragraph inside a 'blockquote' margin."""
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements

    def start_ul(self, attrs):
        """<UL>: open a bulleted list whose items are labelled '*'."""
        # The argument to end_paragraph is true only for an outermost list.
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        """</UL>: close the innermost list, if any."""
        if self.list_stack: del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """<LI>: emit the label of the innermost list and bump its counter.

        Outside of any list the label is '*' and the counter is 0.
        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            top = self.list_stack[-1]
            label = top[1]
            # Store the incremented counter back into the stack entry so
            # the next <LI> continues the numbering.
            counter = top[2] + 1
            top[2] = counter
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """<OL>: open a numbered list.

        The label format defaults to '1.'; a TYPE attribute replaces it,
        and a one-character TYPE value gets a '.' appended.
        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for a, v in attrs:
            if a == 'type':
                if len(v) == 1: v = v + '.'
                label = v
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        """</OL>: close the innermost list, if any."""
        if self.list_stack: del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        """<DL>: open a definition list (no margin is pushed here)."""
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        """</DL>: close any open <DD>, then the definition list itself."""
        self.ddpop(1)
        if self.list_stack: del self.list_stack[-1]

    def do_dt(self, attrs):
        """<DT>: close any open <DD>."""
        self.ddpop()

    def do_dd(self, attrs):
        """<DD>: close any open <DD> and open a new one in a 'dd' margin."""
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """End the paragraph and pop a 'dd' entry if one is on top.

        bl is passed through to the formatter's end_paragraph().
        """
        self.formatter.end_paragraph(bl)
        if self.list_stack and self.list_stack[-1][0] == 'dd':
            del self.list_stack[-1]
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs): self.start_i(attrs)
    def end_cite(self): self.end_i()

    def start_code(self, attrs): self.start_tt(attrs)
    def end_code(self): self.end_tt()

    def start_em(self, attrs): self.start_i(attrs)
    def end_em(self): self.end_i()

    def start_kbd(self, attrs): self.start_tt(attrs)
    def end_kbd(self): self.end_tt()

    def start_samp(self, attrs): self.start_tt(attrs)
    def end_samp(self): self.end_tt()

    def start_strong(self, attrs): self.start_b(attrs)
    def end_strong(self): self.end_b()

    def start_var(self, attrs): self.start_i(attrs)
    def end_var(self): self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """<A>: extract HREF, NAME and TYPE and call anchor_bgn().

        Missing attributes are passed as ''; TYPE is lower-cased.
        """
        href = ''
        name = ''
        anchor_type = ''
        for attrname, value in attrs:
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                anchor_type = string.lower(value)
        self.anchor_bgn(href, name, anchor_type)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """<IMG>: collect the image attributes and call handle_image().

        Defaults: alt '(image)'; align, ismap and src ''; width and
        height 0.  WIDTH and HEIGHT values that are not decimal integers
        are ignored.
        """
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            elif attrname == 'alt':
                alt = value
            elif attrname == 'ismap':
                ismap = value
            elif attrname == 'src':
                src = value
            elif attrname == 'width':
                # Only a malformed number is expected here; anything else
                # should propagate instead of being silently swallowed.
                try: width = string.atoi(value)
                except ValueError: pass
            elif attrname == 'height':
                try: height = string.atoi(value)
                except ValueError: pass
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        """<PLAINTEXT>: enter <PRE> mode and stop tag recognition."""
        self.start_pre(attrs)
        self.setnomoretags() # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000385
Guido van Rossum7c750e11995-02-27 13:16:55 +0000386
def test():
    """Parse an HTML file and render it with a DumbWriter.

    The file name is taken from the first command line argument and
    defaults to 'test.html'.
    """
    import sys
    filename = 'test.html'
    if sys.argv[1:]: filename = sys.argv[1]
    fp = open(filename, 'r')
    # Close the file even when reading fails.
    try:
        data = fp.read()
    finally:
        fp.close()
    from formatter import DumbWriter, AbstractFormatter
    w = DumbWriter()
    f = AbstractFormatter(w)
    p = HTMLParser(f)
    p.feed(data)
    p.close()


if __name__ == '__main__':
    test()