blob: 4af446a57d3876dbc4d9a610f311150b935378f8 [file] [log] [blame]
# New HTML class

# XXX Check against HTML 2.0 spec

# XXX reorder methods according to hierarchy
# - html structure: head, body, title, isindex
# - headers
# - lists, items
# - paragraph styles
# - forms
# - character styles
# - images
# - bookkeeping
# - output generation
Guido van Rossum7c750e11995-02-27 13:16:55 +000015
16
Guido van Rossum7c750e11995-02-27 13:16:55 +000017import sys
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +000018import regsub
Guido van Rossum7c750e11995-02-27 13:16:55 +000019import string
Guido van Rossum7ff5d7f1995-08-04 04:23:30 +000020from sgmllib import SGMLParser
Guido van Rossum7c750e11995-02-27 13:16:55 +000021
Guido van Rossum7c750e11995-02-27 13:16:55 +000022
# Character style codes; the physical-style tag handlers (TT, B, I, U, R)
# push one of these on the parser's style stack.
ROMAN, ITALIC, BOLD, FIXED = 0, 1, 2, 3
27
Guido van Rossum7c750e11995-02-27 13:16:55 +000028
class HTMLParser(SGMLParser):
    """SGMLParser subclass that turns HTML into a stream of text.

    Tag handlers are named start_TAG / end_TAG / do_TAG.  Character
    data is whitespace-normalized (except inside PRE) and handed to
    write_data(), which writes to sys.stdout; between save_bgn() and
    save_end() it is collected in a buffer instead.

    Hooks meant to be overridden by subclasses: write_data(),
    new_para(), new_style(), handle_image(), anchor_bgn() and
    anchor_end().
    """

    def __init__(self):
        SGMLParser.__init__(self)
        self.savedata = None    # capture buffer, or None when not capturing
        self.isindex = 0        # set to 1 once an ISINDEX tag is seen
        self.title = ''         # text captured between TITLE tags
        self.para = None        # tag of the current paragraph style
        self.lists = []         # stack of open list tags
        self.styles = []        # stack of active character styles
        self.anchors = []       # one flag per open A tag: 1 if anchor_bgn()
                                # was called for it, else 0
        self.nofill = 0         # nesting depth of PRE; > 0 means literal text
        self.nospace = 1        # 1: drop a leading blank of the next text
        self.softspace = 0      # 1: a collapsed trailing blank is pending

    # --- Data

    def handle_image(self, src, alt):
        """Render an image; this implementation emits its ALT text."""
        self.handle_data(alt)

    def handle_data(self, data):
        """Emit character data, collapsing whitespace unless in PRE.

        Each run of blanks, tabs and newlines becomes a single blank.
        A leading blank is dropped when self.nospace is set, and a
        trailing blank is held back in self.softspace so that it is
        written only if more text follows.
        """
        if self.nofill:
            self.handle_literal(data)
            return
        data = regsub.gsub('[ \t\n\r]+', ' ', data)
        if self.nospace and data[:1] == ' ':
            data = data[1:]
        if not data:
            return
        self.nospace = 0
        if self.softspace and data[:1] != ' ':
            data = ' ' + data
        # The pending blank (if any) is part of 'data' now; clear the flag
        # so that it is not emitted again in front of every later chunk.
        self.softspace = 0
        if data[-1:] == ' ':
            data = data[:-1]
            self.softspace = 1
        self.output_data(data)

    def handle_literal(self, data):
        """Emit data verbatim and cancel any pending space handling."""
        self.nospace = 0
        self.softspace = 0
        self.output_data(data)

    def output_data(self, data):
        """Append data to the capture buffer if active, else write it."""
        if self.savedata is not None:
            self.savedata = self.savedata + data
        else:
            self.write_data(data)

    def write_data(self, data):
        """Final output sink; writes to sys.stdout."""
        sys.stdout.write(data)

    def save_bgn(self):
        """Start capturing output in a buffer instead of writing it."""
        self.savedata = ''
        self.nospace = 1
        self.softspace = 0

    def save_end(self):
        """Stop capturing and return the text collected since save_bgn()."""
        saved = self.savedata
        self.savedata = None
        self.nospace = 1
        self.softspace = 0
        return saved

    def new_para(self):
        """Hook called from para_bgn(); does nothing here."""
        pass

    def new_style(self):
        """Hook called whenever the style stack changes; does nothing here."""
        pass

    # --- Generic style changes

    def para_bgn(self, tag):
        """Break the line if needed and start a new paragraph.

        If tag is not None it becomes the current paragraph style
        (self.para); None keeps the current one.
        """
        if not self.nospace:
            self.handle_literal('\n')
        self.nospace = 1
        self.softspace = 0
        if tag is not None:
            self.para = tag
        self.new_para()

    def para_end(self):
        """End the current paragraph, resetting the style to ''."""
        self.para_bgn('')

    def push_list(self, tag):
        """Open a list of kind tag and start a new paragraph."""
        self.lists.append(tag)
        self.para_bgn(None)

    def pop_list(self):
        """Close the innermost list and end the paragraph.

        A stray end tag with no open list leaves the stack untouched
        instead of raising IndexError.
        """
        if self.lists:
            del self.lists[-1]
        self.para_end()

    def literal_bgn(self, tag, attrs):
        """Start a literal-text section (LISTING, XMP, PLAINTEXT)."""
        self.para_bgn(tag)

    def literal_end(self, tag):
        """End a literal-text section."""
        self.para_end()

    def push_style(self, tag):
        """Activate a character style and notify new_style()."""
        self.styles.append(tag)
        self.new_style()

    def pop_style(self):
        """Deactivate the innermost character style.

        A stray end tag with no active style leaves the stack untouched
        instead of raising IndexError.
        """
        if self.styles:
            del self.styles[-1]
        self.new_style()

    def anchor_bgn(self, href, name, type):
        """Start an anchor: style 'a' if it has an href, else None."""
        self.push_style(href and 'a' or None)

    def anchor_end(self):
        """End the anchor started by anchor_bgn()."""
        self.pop_style()

    # --- Top level tags

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    def do_isindex(self, attrs):
        self.isindex = 1

    def start_title(self, attrs):
        # Capture the title text instead of writing it out.
        self.save_bgn()

    def end_title(self):
        self.title = self.save_end()

    # --- Old HTML 'literal text' tags

    def start_listing(self, attrs):
        self.setliteral('listing')
        self.literal_bgn('listing', attrs)

    def end_listing(self):
        self.literal_end('listing')

    def start_xmp(self, attrs):
        self.setliteral('xmp')
        self.literal_bgn('xmp', attrs)

    def end_xmp(self):
        self.literal_end('xmp')

    def do_plaintext(self, attrs):
        self.setnomoretags()
        self.literal_bgn('plaintext', attrs)

    # --- Anchors

    def start_a(self, attrs):
        """Handle an A start tag.

        anchor_bgn() is called only when the tag carries an href or a
        name.  Whether it was called is recorded in self.anchors so
        that end_a() calls anchor_end() only for anchors that were
        really started.
        """
        href = ''
        name = ''
        atype = ''
        for attrname, value in attrs:
            if attrname == 'href':
                href = value
            if attrname == 'name':
                name = value
            if attrname == 'type':
                atype = string.lower(value)
        if href or name:
            self.anchors.append(1)
            self.anchor_bgn(href, name, atype)
        else:
            self.anchors.append(0)

    def end_a(self):
        """Handle an A end tag; see start_a()."""
        if not self.anchors:
            return              # stray end tag, nothing to close
        started = self.anchors[-1]
        del self.anchors[-1]
        if started:
            self.anchor_end()

    # --- Paragraph tags

    def do_p(self, attrs):
        self.para_bgn(None)

    def do_br(self, attrs):
        self.handle_literal('\n')
        self.nospace = 1
        self.softspace = 0

    def do_hr(self, attrs):
        # A horizontal rule is rendered as a paragraph of 40 dashes.
        self.para_bgn(None)
        self.handle_literal('-'*40)
        self.para_end()

    def start_h1(self, attrs):
        self.para_bgn('h1')

    def start_h2(self, attrs):
        self.para_bgn('h2')

    def start_h3(self, attrs):
        self.para_bgn('h3')

    def start_h4(self, attrs):
        self.para_bgn('h4')

    def start_h5(self, attrs):
        self.para_bgn('h5')

    def start_h6(self, attrs):
        self.para_bgn('h6')

    def end_h1(self):
        self.para_end()

    end_h2 = end_h1
    end_h3 = end_h2
    end_h4 = end_h3
    end_h5 = end_h4
    end_h6 = end_h5

    def start_ul(self, attrs):
        self.para_bgn(None)
        self.push_list('ul')

    def start_ol(self, attrs):
        self.para_bgn(None)
        self.push_list('ol')

    def end_ul(self):
        self.pop_list()
        self.para_end()

    def do_li(self, attrs):
        # The paragraph style encodes the list nesting depth: li1, li2, ...
        self.para_bgn('li%d' % len(self.lists))

    start_dir = start_menu = start_ul
    end_dir = end_menu = end_ol = end_ul

    def start_dl(self, attrs):
        self.para_bgn(None)
        self.push_list('dl')

    def end_dl(self):
        self.pop_list()
        self.para_end()

    def do_dt(self, attrs):
        self.para_bgn('dt%d' % len(self.lists))

    def do_dd(self, attrs):
        self.para_bgn('dd%d' % len(self.lists))

    def start_address(self, attrs):
        self.para_bgn('address')

    def end_address(self):
        self.para_end()

    def start_pre(self, attrs):
        self.para_bgn('pre')
        self.nofill = self.nofill + 1

    def end_pre(self):
        # Never go below zero: a stray end tag must not leave the parser
        # stuck in literal mode (handle_data tests nofill for truth).
        self.nofill = max(0, self.nofill - 1)
        self.para_end()

    start_typewriter = start_pre
    end_typewriter = end_pre

    def do_img(self, attrs):
        """Handle an IMG tag; ALT defaults to ' (image) '."""
        src = ''
        alt = ' (image) '
        for attrname, value in attrs:
            if attrname == 'alt':
                alt = value
            if attrname == 'src':
                src = value
        self.handle_image(src, alt)

    # --- Character tags -- physical styles

    def start_tt(self, attrs): self.push_style(FIXED)
    def end_tt(self): self.pop_style()

    def start_b(self, attrs): self.push_style(BOLD)
    def end_b(self): self.pop_style()

    def start_i(self, attrs): self.push_style(ITALIC)
    def end_i(self): self.pop_style()

    def start_u(self, attrs): self.push_style(ITALIC) # Underline???
    def end_u(self): self.pop_style()

    def start_r(self, attrs): self.push_style(ROMAN) # Not official
    def end_r(self): self.pop_style()

    # --- Character tags -- logical styles

    start_em = start_i
    end_em = end_i

    start_strong = start_b
    end_strong = end_b

    start_code = start_tt
    end_code = end_tt

    start_samp = start_tt
    end_samp = end_tt

    start_kbd = start_tt
    end_kbd = end_tt

    start_file = start_tt # unofficial
    end_file = end_tt

    start_var = start_i
    end_var = end_i

    start_dfn = start_i
    end_dfn = end_i

    start_cite = start_i
    end_cite = end_i

    start_hp1 = start_i
    end_hp1 = end_i         # must pop the style, not push another one

    start_hp2 = start_b
    end_hp2 = end_b

    # --- Form tags

    def start_form(self, attrs):
        self.para_bgn(None)

    def end_form(self):
        self.para_end()

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass
Guido van Rossum7c750e11995-02-27 13:16:55 +0000365
def test(args = None):
    """Parse an HTML file with HTMLParser, writing its text to stdout.

    args is an optional list of command line arguments; its first item
    names the file to read.  Without it, 'test.html' is used.
    """
    if args:
        filename = args[0]
    else:
        filename = 'test.html'
    f = open(filename, 'r')
    try:
        data = f.read()
    finally:
        # Close the file even when reading fails.
        f.close()
    p = HTMLParser()
    p.feed(data)
    p.close()


if __name__ == '__main__':
    test(sys.argv[1:])