blob: 77be4709ce54a9ea6d78e9390c8c10a2965ecf24 [file] [log] [blame]
"""HTML 2.0 parser.

See the HTML 2.0 specification:
http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
"""
Guido van Rossum7c750e11995-02-27 13:16:55 +00006
7
import string
import sys
from formatter import AS_IS
from sgmllib import SGMLParser
Guido van Rossum7c750e11995-02-27 13:16:55 +000012
Guido van Rossum7c750e11995-02-27 13:16:55 +000013
class HTMLParser(SGMLParser):
    """HTML 2.0 parser that renders a document through a formatter object.

    Every tag handler below translates one HTML element into calls on the
    formatter passed to the constructor (end_paragraph, push_font,
    push_margin, add_flowing_data, ...).  Handlers are named start_<tag>,
    end_<tag> or do_<tag>; presumably the SGMLParser base class looks them
    up by name while parsing -- confirm against sgmllib.

    Font tuples given to push_font use the positions that start_i, start_b
    and start_tt set individually: (size, italic, bold, teletype).  AS_IS
    leaves a position unchanged.

    Instance attributes:
      formatter  -- the formatter object supplied by the caller
      savedata   -- None, or the text collected since save_bgn()
      isindex    -- set to 1 once an <isindex> tag has been seen
      title      -- text collected between <title> and </title>, or None
      base       -- value of the href attribute of <base>, or None
      anchor     -- href of the anchor currently open, or None
      anchorlist -- every non-empty anchor href seen so far, in order
      nofill     -- nesting depth of preformatted elements (0 = flowing text)
      list_stack -- stack of [kind, label, counter] entries for open lists
    """

    from htmlentitydefs import entitydefs

    def __init__(self, formatter, verbose=0):
        """Create a parser writing to `formatter`.

        `verbose` is passed straight through to SGMLParser.__init__.
        """
        SGMLParser.__init__(self, verbose)
        self.formatter = formatter
        self.savedata = None
        self.isindex = 0
        self.title = None
        self.base = None
        self.anchor = None
        self.anchorlist = []
        self.nofill = 0
        self.list_stack = []

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden

    def handle_data(self, data):
        """Route character data to the save buffer or to the formatter.

        While saving (savedata is not None) the text is only accumulated.
        Otherwise it goes to the formatter as literal data inside
        preformatted elements (nofill != 0) and as flowing data elsewhere.
        """
        if self.savedata is not None:
            self.savedata = self.savedata + data
        else:
            if self.nofill:
                self.formatter.add_literal_data(data)
            else:
                self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Start collecting character data instead of formatting it."""
        self.savedata = ''

    def save_end(self):
        """Stop collecting and return the text gathered since save_bgn().

        Outside preformatted text, runs of whitespace are collapsed to
        single spaces and leading/trailing whitespace is dropped.  Must be
        preceded by save_bgn(); otherwise savedata is None and the
        whitespace normalisation fails.
        """
        data = self.savedata
        self.savedata = None
        if not self.nofill:
            data = string.join(string.split(data))
        return data

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """Note the start of an anchor; remember its href if it has one.

        `name` and `type` are accepted for the benefit of overriding
        subclasses; this implementation does not use them.
        """
        self.anchor = href
        if self.anchor:
            self.anchorlist.append(href)

    def anchor_end(self):
        """Close the current anchor, emitting a "[n]" footnote marker.

        n is the 1-based position of the anchor's href in anchorlist.
        Anchors without an href produce no marker.
        """
        if self.anchor:
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """Render an image by emitting its alternate text as data.

        do_img passes ismap, align, width and height as the extra
        positional arguments; they are ignored here.
        """
        self.handle_data(alt)

    # --------- Top level elements

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    # ------ Head elements

    def start_title(self, attrs):
        """Begin collecting the document title."""
        self.save_bgn()

    def end_title(self):
        """Store the collected title text in self.title."""
        self.title = self.save_end()

    def do_base(self, attrs):
        """Record the href attribute of <base> in self.base."""
        for a, v in attrs:
            if a == 'href':
                self.base = v

    def do_isindex(self, attrs):
        """Flag that the document contains an <isindex> tag."""
        self.isindex = 1

    def do_link(self, attrs):
        pass

    def do_meta(self, attrs):
        pass

    def do_nextid(self, attrs): # Deprecated
        pass

    # ------ Body elements

    # --- Headings
    # Each heading ends the current paragraph with one blank line and
    # pushes a font whose size is the tag name, with bold on and italic
    # and teletype off; the end tag undoes both.

    def start_h1(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h1', 0, 1, 0))

    def end_h1(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h2(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h2', 0, 1, 0))

    def end_h2(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h3(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h3', 0, 1, 0))

    def end_h3(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h4(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h4', 0, 1, 0))

    def end_h4(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h5(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h5', 0, 1, 0))

    def end_h5(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h6(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h6', 0, 1, 0))

    def end_h6(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        """End the current paragraph, leaving one blank line."""
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """Enter preformatted text: teletype font, no filling."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill = self.nofill + 1

    def end_pre(self):
        """Leave preformatted text; nofill never drops below zero."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        self.start_pre(attrs)
        self.setliteral('xmp') # Tell SGML parser

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        self.start_pre(attrs)
        self.setliteral('listing') # Tell SGML parser

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        """Render <address> in italics, without a blank line before it."""
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        """Render <blockquote> inside its own 'blockquote' margin."""
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements

    def start_ul(self, attrs):
        """Open an unordered list labelled with '*'.

        A blank line is requested only for an outermost list, i.e. when
        no other list is currently open.
        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        """Close the innermost list and restore the margin."""
        if self.list_stack: del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """Emit the label for the next item of the innermost list.

        The item counter stored in the list's stack entry is incremented
        first.  Outside any list the label is '*' with counter 0.
        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            [dummy, label, counter] = top = self.list_stack[-1]
            top[2] = counter = counter+1
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """Open an ordered list.

        The label format defaults to '1.'; a 'type' attribute overrides
        it, and a one-character type gets a trailing '.' appended.
        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for a, v in attrs:
            if a == 'type':
                if len(v) == 1: v = v + '.'
                label = v
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        """Close the innermost list and restore the margin."""
        if self.list_stack: del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        """Open a definition list."""
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        """Close a definition list, including any <dd> still open."""
        self.ddpop(1)
        if self.list_stack: del self.list_stack[-1]

    def do_dt(self, attrs):
        """Start a term: close the previous definition, if any."""
        self.ddpop()

    def do_dd(self, attrs):
        """Start a definition, indented by its own 'dd' margin."""
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """End the paragraph and close an open <dd>, if one is on top.

        `bl` is handed to the formatter's end_paragraph.  The margin is
        popped only when the top stack entry is a 'dd', so it always
        pairs with the push_margin('dd') done by do_dd.
        """
        self.formatter.end_paragraph(bl)
        if self.list_stack:
            if self.list_stack[-1][0] == 'dd':
                del self.list_stack[-1]
                self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs): self.start_i(attrs)
    def end_cite(self): self.end_i()

    def start_code(self, attrs): self.start_tt(attrs)
    def end_code(self): self.end_tt()

    def start_em(self, attrs): self.start_i(attrs)
    def end_em(self): self.end_i()

    def start_kbd(self, attrs): self.start_tt(attrs)
    def end_kbd(self): self.end_tt()

    def start_samp(self, attrs): self.start_tt(attrs)
    def end_samp(self): self.end_tt()

    def start_strong(self, attrs): self.start_b(attrs)
    def end_strong(self): self.end_b()

    def start_var(self, attrs): self.start_i(attrs)
    def end_var(self): self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Collect href, name and type of an <a> tag and call anchor_bgn.

        Attribute values are stripped of surrounding whitespace; the type
        is additionally lower-cased.  Missing attributes default to ''.
        """
        href = ''
        name = ''
        # Not called `type`, to avoid shadowing the builtin.
        anchor_type = ''
        for attrname, value in attrs:
            value = string.strip(value)
            if attrname == 'href':
                href = value
            if attrname == 'name':
                name = value
            if attrname == 'type':
                anchor_type = string.lower(value)
        self.anchor_bgn(href, name, anchor_type)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """Collect the attributes of an <img> tag and call handle_image.

        Defaults: alt is '(image)', align/ismap/src are '', width and
        height are 0.  A width or height that is not a valid integer is
        ignored and leaves the default in place.
        """
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            if attrname == 'alt':
                alt = value
            if attrname == 'ismap':
                ismap = value
            if attrname == 'src':
                src = value
            # Only conversion failures are ignored; a bare except would
            # also hide KeyboardInterrupt and genuine programming errors.
            if attrname == 'width':
                try:
                    width = string.atoi(value)
                except (TypeError, ValueError):
                    pass
            if attrname == 'height':
                try:
                    height = string.atoi(value)
                except (TypeError, ValueError):
                    pass
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        self.start_pre(attrs)
        self.setnomoretags() # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +0000385
Guido van Rossum7c750e11995-02-27 13:16:55 +0000386
def test(args=None):
    """Parse one HTML file and render it as plain text (self-test driver).

    args -- list of command-line style arguments; when empty or None,
            sys.argv[1:] is used.  An optional leading '-s' selects a
            NullFormatter instead of AbstractFormatter(DumbWriter()).
            The next argument is the file to read; '-' means standard
            input and the default is 'test.html'.

    If the file cannot be opened, "<file> : <error>" is written to
    standard output and the process exits with status 1.
    """
    import sys
    import formatter

    if not args:
        args = sys.argv[1:]

    silent = args and args[0] == '-s'
    if silent:
        # Slice rather than `del args[0]` so that a list handed in by the
        # caller is not modified behind its back.
        args = args[1:]

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError:
            # sys.exc_info() and an explicit write keep this free of
            # version-specific `except` / `print` syntax; the text written
            # is the same as `print filename, ":", msg`.
            msg = sys.exc_info()[1]
            sys.stdout.write('%s : %s\n' % (filename, msg))
            sys.exit(1)
        try:
            data = f.read()
        finally:
            # Close the file even when read() fails.
            f.close()

    if silent:
        fmt = formatter.NullFormatter()
    else:
        fmt = formatter.AbstractFormatter(formatter.DumbWriter())

    parser = HTMLParser(fmt)
    parser.feed(data)
    parser.close()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000424
425
# Run the self-test driver when the module is executed as a script.
if __name__ == '__main__':
    test()