blob: 3288ce8860fed62d8fcc96aeb5caca6fa7925afd [file] [log] [blame]
"""HTML 2.0 parser.

See the HTML 2.0 specification:
http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
"""
Guido van Rossum7c750e11995-02-27 13:16:55 +00006
7
Guido van Rossum7c750e11995-02-27 13:16:55 +00008import sys
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +00009import regsub
Guido van Rossum7c750e11995-02-27 13:16:55 +000010import string
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +000011from sgmllib import SGMLParser
Guido van Rossumf54d9671995-08-07 20:07:44 +000012from formatter import AS_IS
Guido van Rossum7c750e11995-02-27 13:16:55 +000013
Guido van Rossum7c750e11995-02-27 13:16:55 +000014
class HTMLParser(SGMLParser):
    """HTML 2.0 parser that forwards document content to a formatter.

    Tag handlers follow the SGMLParser naming convention used throughout
    this class: start_<tag>(attrs) / end_<tag>() for container elements
    and do_<tag>(attrs) for elements without an end tag.  `attrs` is
    iterated as a sequence of (name, value) pairs.

    Instance attributes set up by __init__:

    formatter  -- object receiving all output calls
    savedata   -- None, or the text collected since save_bgn()
    isindex    -- set to 1 once an <isindex> tag has been seen
    title      -- text of the <title> element, or None
    base       -- href of the last <base> tag, or None
    anchor     -- href of the anchor currently open, or None
    anchorlist -- hrefs of all anchors seen so far
    nofill     -- nesting depth of literal (<pre>-like) elements
    list_stack -- stack of [kind, label, counter] lists, one per open list
    """

    def __init__(self, formatter, verbose=0):
        """Initialize the parser state.

        formatter -- output object; stored as self.formatter
        verbose   -- passed through to SGMLParser.__init__
        """
        SGMLParser.__init__(self, verbose)
        self.formatter = formatter
        self.savedata = None
        self.isindex = 0
        self.title = None
        self.base = None
        self.anchor = None
        self.anchorlist = []
        self.nofill = 0
        self.list_stack = []

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden

    def handle_data(self, data):
        """Route character data to the save buffer or to the formatter.

        While a save is in progress the text is only accumulated.
        Otherwise it is emitted as literal data inside literal
        elements (nofill > 0) and as flowing data everywhere else.
        """
        if self.savedata is not None:
            self.savedata = self.savedata + data
        elif self.nofill:
            self.formatter.add_literal_data(data)
        else:
            self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Start collecting character data instead of formatting it."""
        self.savedata = ''

    def save_end(self):
        """Stop collecting character data and return what was saved.

        Outside literal elements, runs of whitespace in the saved text
        are collapsed to single spaces and the ends are stripped.
        """
        data = self.savedata
        self.savedata = None
        if not self.nofill:
            data = string.join(string.split(data))
        return data

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """Record the start of an anchor.

        Only `href` is used here: it becomes the current anchor and,
        when non-empty, is appended to self.anchorlist.
        """
        self.anchor = href
        if self.anchor:
            self.anchorlist.append(href)

    def anchor_end(self):
        """Close the current anchor.

        For an anchor with an href, emit a "[N]" marker where N is the
        number of hrefs collected so far, then clear the current anchor.
        """
        if self.anchor:
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """Represent an image by its alternate text.

        do_img() also passes ismap, align, width and height; they are
        accepted through *args and ignored here.
        """
        self.handle_data(alt)

    # --------- Top level elements

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    # ------ Head elements

    def start_title(self, attrs):
        """Begin saving the title text."""
        self.save_bgn()

    def end_title(self):
        """Store the saved text as self.title."""
        self.title = self.save_end()

    def do_base(self, attrs):
        """Remember the href attribute of <base> in self.base."""
        for a, v in attrs:
            if a == 'href':
                self.base = v

    def do_isindex(self, attrs):
        """Flag that the document contains <isindex>."""
        self.isindex = 1

    def do_link(self, attrs):
        """Ignore <link>."""
        pass

    def do_meta(self, attrs):
        """Ignore <meta>."""
        pass

    def do_nextid(self, attrs): # Deprecated
        """Ignore <nextid>."""
        pass

    # ------ Body elements

    # --- Headings
    #
    # Every heading ends the current paragraph and pushes a font tuple
    # whose first item is the heading tag and whose third item is 1.
    # NOTE(review): the tuple layout looks like (size, italic, bold,
    # teletype), judging by start_i/start_b/start_tt -- confirm against
    # the formatter module.

    def start_h1(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h1', 0, 1, 0))

    def end_h1(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h2(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h2', 0, 1, 0))

    def end_h2(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h3(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h3', 0, 1, 0))

    def end_h3(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h4(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h4', 0, 1, 0))

    def end_h4(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h5(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h5', 0, 1, 0))

    def end_h5(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h6(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h6', 0, 1, 0))

    def end_h6(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        """End the current paragraph."""
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """Enter a literal section: push a font and bump nofill."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill = self.nofill + 1

    def end_pre(self):
        """Leave a literal section; nofill never drops below zero."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        """Treat <xmp> like <pre> and switch the parser to literal mode."""
        self.start_pre(attrs)
        self.setliteral('xmp') # Tell SGML parser

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        """Treat <listing> like <pre> and switch the parser to literal mode."""
        self.start_pre(attrs)
        self.setliteral('listing') # Tell SGML parser

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        """Push the same font tuple that start_i() uses."""
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        """Open a 'blockquote' margin."""
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements

    def start_ul(self, attrs):
        """Open an unordered list labelled with '*'."""
        # The end_paragraph argument is true only for an outermost list.
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        """Close the innermost list."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """Emit the label for a list item.

        Inside a list the innermost entry's counter is incremented and
        stored back; outside any list the label is '*' with counter 0.
        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            top = self.list_stack[-1]
            label = top[1]
            counter = top[2] + 1
            top[2] = counter
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """Open an ordered list.

        The label defaults to '1.'; a 'type' attribute replaces it, and
        a one-character type gets a trailing '.' appended.
        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for a, v in attrs:
            if a == 'type':
                if len(v) == 1:
                    v = v + '.'
                label = v
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        """Close the innermost list."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        """Open a definition list (no margin is pushed for <dl> itself)."""
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        """Close a pending <dd>, then drop the top list entry."""
        self.ddpop(1)
        if self.list_stack:
            del self.list_stack[-1]

    def do_dt(self, attrs):
        """A new term closes any pending <dd>."""
        self.ddpop()

    def do_dd(self, attrs):
        """Close any pending <dd> and open a new 'dd' margin."""
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """End the paragraph and close the innermost <dd>, if one is open.

        bl -- passed to formatter.end_paragraph()
        """
        self.formatter.end_paragraph(bl)
        if self.list_stack and self.list_stack[-1][0] == 'dd':
            del self.list_stack[-1]
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs): self.start_i(attrs)
    def end_cite(self): self.end_i()

    def start_code(self, attrs): self.start_tt(attrs)
    def end_code(self): self.end_tt()

    def start_em(self, attrs): self.start_i(attrs)
    def end_em(self): self.end_i()

    def start_kbd(self, attrs): self.start_tt(attrs)
    def end_kbd(self): self.end_tt()

    def start_samp(self, attrs): self.start_tt(attrs)
    def end_samp(self): self.end_tt()

    def start_strong(self, attrs): self.start_b(attrs)
    def end_strong(self): self.end_b()

    def start_var(self, attrs): self.start_i(attrs)
    def end_var(self): self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))

    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))

    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Collect href, name and type and hand them to anchor_bgn().

        Missing attributes are passed as ''; the type is lower-cased.
        """
        href = ''
        name = ''
        atype = ''
        for attrname, value in attrs:
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                atype = string.lower(value)
        self.anchor_bgn(href, name, atype)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """Collect the <img> attributes and call handle_image().

        Defaults: alt '(image)', width and height 0, everything else ''.
        A width or height that cannot be converted to an integer is
        ignored and the default (or an earlier valid value) is kept.
        """
        # Only conversion failures are tolerated; anything else (for
        # example KeyboardInterrupt) is allowed to propagate.
        bad_number = (string.atoi_error, TypeError, OverflowError)
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            elif attrname == 'alt':
                alt = value
            elif attrname == 'ismap':
                ismap = value
            elif attrname == 'src':
                src = value
            elif attrname == 'width':
                try:
                    width = string.atoi(value)
                except bad_number:
                    pass
            elif attrname == 'height':
                try:
                    height = string.atoi(value)
                except bad_number:
                    pass
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        """Enter literal mode and stop tag processing for the rest of the input."""
        self.start_pre(attrs)
        self.setnomoretags() # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        """Silently ignore start tags without a handler."""
        pass

    def unknown_endtag(self, tag):
        """Silently ignore end tags without a handler."""
        pass
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000383
Guido van Rossum7c750e11995-02-27 13:16:55 +0000384
def test():
    """Parse an HTML file and render it through a DumbWriter.

    The file name is taken from the first command-line argument and
    defaults to 'test.html'.
    """
    import sys
    filename = 'test.html'
    if sys.argv[1:]:
        filename = sys.argv[1]
    fp = open(filename, 'r')
    try:
        data = fp.read()
    finally:
        # Release the file handle even when reading fails.
        fp.close()
    from formatter import DumbWriter, AbstractFormatter
    writer = DumbWriter()
    fmt = AbstractFormatter(writer)
    parser = HTMLParser(fmt)
    parser.feed(data)
    parser.close()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000398
399
# Run the self-test only when executed as a script, not on import.
if __name__ == '__main__':
    test()