blob: 10ca81063c7caa49776cfb6547f0c477799f105b [file] [log] [blame]
# A parser for HTML documents
2
3
# HTML: HyperText Markup Language; an SGML-like syntax used by WWW to
# describe hypertext documents
#
# SGML: Standard Generalized Markup Language
#
# WWW: World-Wide Web; a distributed hypertext system developed at CERN
#
# CERN: European Particle Physics Laboratory in Geneva, Switzerland
12
13
# This file is only concerned with parsing and formatting HTML
# documents, not with the other (hypertext and networking) aspects of
# the WWW project.  (It does support highlighting of anchors.)
17
18
19import os
20import sys
21import regex
22import string
23import sgmllib
24
25
class HTMLParser(sgmllib.SGMLParser):
    # SGML parser specialised for HTML: it extends the entity table and
    # supplies handlers for the tags that introduce literal text.

    # Start from a private copy of the base class entities, then add ours
    entitydefs = {}
    for key in sgmllib.SGMLParser.entitydefs.keys():
        entitydefs[key] = sgmllib.SGMLParser.entitydefs[key]
    entitydefs['bullet'] = '*'

    # Provided -- handlers for tags introducing literal text

    def _enter_literal(self, tag, attrs):
        # Shared by <LISTING> and <XMP>: switch the parser to literal
        # mode first, then tell the subclass hook that it has begun.
        self.setliteral(tag)
        self.literal_bgn(tag, attrs)

    def start_listing(self, attrs):
        self._enter_literal('listing', attrs)

    def end_listing(self):
        self.literal_end('listing')

    def start_xmp(self, attrs):
        self._enter_literal('xmp', attrs)

    def end_xmp(self):
        self.literal_end('xmp')

    def do_plaintext(self, attrs):
        # <PLAINTEXT> has no end tag here: tag recognition is turned off
        # for the remainder of the input before the hook is called.
        self.setnomoretags()
        self.literal_bgn('plaintext', attrs)

    # To be overridden -- begin/end literal mode

    def literal_bgn(self, tag, attrs):
        pass

    def literal_end(self, tag):
        pass
57
58
# Next level of sophistication -- collect anchors, title, nextid and isindex
class CollectingParser(HTMLParser):
    # Parser that records document-level information while parsing:
    # the anchors (href, name and type), the title, the attributes of
    # <NEXTID> and whether <ISINDEX> was seen.  Character data is kept
    # only while a title is being collected; everything else is dropped.

    def __init__(self):
        HTMLParser.__init__(self)
        self.savetext = None    # title text collected so far, or None
        self.nextid = []        # attribute list of the last <NEXTID>
        self.isindex = 0        # becomes 1 once <ISINDEX> is seen
        self.title = ''
        # 0 outside an anchor.  Inside a recorded anchor: its 1-based
        # position in the anchor lists, negated when it has no href.
        self.inanchor = 0
        self.anchors = []       # href per recorded anchor ('' if none)
        self.anchornames = []   # name per recorded anchor ('' if none)
        self.anchortypes = []   # lower-cased type per anchor ('' if none)

    def start_a(self, attrs):
        # Record the anchor if it carries an href and/or a name;
        # attrs is a sequence of (attribute name, value) pairs.
        self.inanchor = 0
        href = ''
        name = ''
        anchortype = ''
        for attrname, value in attrs:
            # Attribute names are compared as bare names, the same way
            # 'href' is tested here; the former 'name=' and 'type='
            # spellings could never match, so those fields stayed empty.
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                anchortype = string.lower(value)
        if not (href or name):
            return
        self.anchors.append(href)
        self.anchornames.append(name)
        self.anchortypes.append(anchortype)
        self.inanchor = len(self.anchors)
        if not href:
            self.inanchor = -self.inanchor

    def end_a(self):
        if self.inanchor > 0:
            # Don't show anchors pointing into the current document
            if self.anchors[self.inanchor-1][:1] != '#':
                self.handle_data('[' + repr(self.inanchor) + ']')
        # Reset for name-only (negative) anchors as well
        self.inanchor = 0

    # Structural tags are accepted and ignored

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    def do_nextid(self, attrs):
        self.nextid = attrs

    def do_isindex(self, attrs):
        self.isindex = 1

    def start_title(self, attrs):
        # From now on handle_data accumulates text into savetext
        self.savetext = ''

    def end_title(self):
        if self.savetext is not None:
            self.title = self.savetext
            self.savetext = None

    def handle_data(self, text):
        # Only title text is kept at this level
        if self.savetext is not None:
            self.savetext = self.savetext + text
127
128
# Formatting parser -- takes a formatter and a style sheet as arguments

# XXX The use of style sheets should change: for each tag and end tag
# there should be a style definition, and a style definition should
# encompass many more parameters: font, justification, indentation,
# vspace before, vspace after, hanging tag...
135
# Patterns used by FormattingParser.handle_data to split text into
# words and the whitespace that follows them.  Both may match the empty
# string; handle_data adds the value returned by match() to an offset.
wordprog = regex.compile('[^ \t\n]*')
spaceprog = regex.compile('[ \t\n]*')
138
139class FormattingParser(CollectingParser):
140
141 def __init__(self, formatter, stylesheet):
142 CollectingParser.__init__(self)
143 self.fmt = formatter
144 self.stl = stylesheet
145 self.savetext = None
146 self.compact = 0
147 self.nofill = 0
148 self.resetfont()
149 self.setindent(self.stl.stdindent)
150
151 def resetfont(self):
152 self.fontstack = []
153 self.stylestack = []
154 self.fontset = self.stl.stdfontset
155 self.style = ROMAN
156 self.passfont()
157
158 def passfont(self):
159 font = self.fontset[self.style]
160 self.fmt.setfont(font)
161
162 def pushstyle(self, style):
163 self.stylestack.append(self.style)
164 self.style = min(style, len(self.fontset)-1)
165 self.passfont()
166
167 def popstyle(self):
168 self.style = self.stylestack[-1]
169 del self.stylestack[-1]
170 self.passfont()
171
172 def pushfontset(self, fontset, style):
173 self.fontstack.append(self.fontset)
174 self.fontset = fontset
175 self.pushstyle(style)
176
177 def popfontset(self):
178 self.fontset = self.fontstack[-1]
179 del self.fontstack[-1]
180 self.popstyle()
181
182 def flush(self):
183 self.fmt.flush()
184
185 def setindent(self, n):
186 self.fmt.setleftindent(n)
187
188 def needvspace(self, n):
189 self.fmt.needvspace(n)
190
191 def close(self):
192 HTMLParser.close(self)
193 self.fmt.flush()
194
195 def handle_literal(self, text):
196 lines = string.splitfields(text, '\n')
197 for i in range(1, len(lines)):
198 lines[i] = string.expandtabs(lines[i], 8)
199 for line in lines[:-1]:
200 self.fmt.addword(line, 0)
201 self.fmt.flush()
202 self.fmt.nospace = 0
203 for line in lines[-1:]:
204 self.fmt.addword(line, 0)
205
206 def handle_data(self, text):
207 if self.savetext is not None:
208 self.savetext = self.savetext + text
209 return
210 if self.literal:
211 self.handle_literal(text)
212 return
213 i = 0
214 n = len(text)
215 while i < n:
216 j = i + wordprog.match(text, i)
217 word = text[i:j]
218 i = j + spaceprog.match(text, j)
219 self.fmt.addword(word, i-j)
220 if self.nofill and '\n' in text[j:i]:
221 self.fmt.flush()
222 self.fmt.nospace = 0
223 i = j+1
224 while text[i-1] <> '\n': i = i+1
225
226 def literal_bgn(self, tag, attrs):
227 if tag == 'plaintext':
228 self.flush()
229 else:
230 self.needvspace(1)
231 self.pushfontset(self.stl.stdfontset, FIXED)
232 self.setindent(self.stl.literalindent)
233
234 def literal_end(self, tag):
235 self.needvspace(1)
236 self.popfontset()
237 self.setindent(self.stl.stdindent)
238
239 def start_title(self, attrs):
240 self.flush()
241 self.savetext = ''
242 # NB end_title is unchanged
243
244 def do_p(self, attrs):
245 if self.compact:
246 self.flush()
247 else:
248 self.needvspace(1)
249
250 def start_h1(self, attrs):
251 self.needvspace(2)
252 self.setindent(self.stl.h1indent)
253 self.pushfontset(self.stl.h1fontset, BOLD)
254 self.fmt.setjust('c')
255
256 def end_h1(self):
257 self.popfontset()
258 self.needvspace(2)
259 self.setindent(self.stl.stdindent)
260 self.fmt.setjust('l')
261
262 def start_h2(self, attrs):
263 self.needvspace(1)
264 self.setindent(self.stl.h2indent)
265 self.pushfontset(self.stl.h2fontset, BOLD)
266
267 def end_h2(self):
268 self.popfontset()
269 self.needvspace(1)
270 self.setindent(self.stl.stdindent)
271
272 def start_h3(self, attrs):
273 self.needvspace(1)
274 self.setindent(self.stl.stdindent)
275 self.pushfontset(self.stl.h3fontset, BOLD)
276
277 def end_h3(self):
278 self.popfontset()
279 self.needvspace(1)
280 self.setindent(self.stl.stdindent)
281
282 def start_h4(self, attrs):
283 self.needvspace(1)
284 self.setindent(self.stl.stdindent)
285 self.pushfontset(self.stl.stdfontset, BOLD)
286
287 def end_h4(self):
288 self.popfontset()
289 self.needvspace(1)
290 self.setindent(self.stl.stdindent)
291
292 start_h5 = start_h4
293 end_h5 = end_h4
294
295 start_h6 = start_h5
296 end_h6 = end_h5
297
298 start_h7 = start_h6
299 end_h7 = end_h6
300
301 def start_ul(self, attrs):
302 self.needvspace(1)
303 for attrname, value in attrs:
304 if attrname == 'compact':
305 self.compact = 1
306 self.setindent(0)
307 break
308 else:
309 self.setindent(self.stl.ulindent)
310
311 start_dir = start_menu = start_ol = start_ul
312
313 do_li = do_p
314
315 def end_ul(self):
316 self.compact = 0
317 self.needvspace(1)
318 self.setindent(self.stl.stdindent)
319
320 end_dir = end_menu = end_ol = end_ul
321
322 def start_dl(self, attrs):
323 for attrname, value in attrs:
324 if attrname == 'compact':
325 self.compact = 1
326 self.needvspace(1)
327
328 def end_dl(self):
329 self.compact = 0
330 self.needvspace(1)
331 self.setindent(self.stl.stdindent)
332
333 def do_dt(self, attrs):
334 if self.compact:
335 self.flush()
336 else:
337 self.needvspace(1)
338 self.setindent(self.stl.stdindent)
339
340 def do_dd(self, attrs):
341 self.fmt.addword('', 1)
342 self.setindent(self.stl.ddindent)
343
344 def start_address(self, attrs):
345 self.compact = 1
346 self.needvspace(1)
347 self.fmt.setjust('r')
348
349 def end_address(self):
350 self.compact = 0
351 self.needvspace(1)
352 self.setindent(self.stl.stdindent)
353 self.fmt.setjust('l')
354
355 def start_pre(self, attrs):
356 self.needvspace(1)
357 self.nofill = self.nofill + 1
358 self.pushstyle(FIXED)
359
360 def end_pre(self):
361 self.popstyle()
362 self.nofill = self.nofill - 1
363 self.needvspace(1)
364
365 start_typewriter = start_pre
366 end_typewriter = end_pre
367
368 def do_img(self, attrs):
369 self.fmt.addword('(image)', 0)
370
371 # Physical styles
372
373 def start_tt(self, attrs): self.pushstyle(FIXED)
374 def end_tt(self): self.popstyle()
375
376 def start_b(self, attrs): self.pushstyle(BOLD)
377 def end_b(self): self.popstyle()
378
379 def start_i(self, attrs): self.pushstyle(ITALIC)
380 def end_i(self): self.popstyle()
381
382 def start_u(self, attrs): self.pushstyle(ITALIC) # Underline???
383 def end_u(self): self.popstyle()
384
385 def start_r(self, attrs): self.pushstyle(ROMAN) # Not official
386 def end_r(self): self.popstyle()
387
388 # Logical styles
389
390 start_em = start_i
391 end_em = end_i
392
393 start_strong = start_b
394 end_strong = end_b
395
396 start_code = start_tt
397 end_code = end_tt
398
399 start_samp = start_tt
400 end_samp = end_tt
401
402 start_kbd = start_tt
403 end_kbd = end_tt
404
405 start_file = start_tt # unofficial
406 end_file = end_tt
407
408 start_var = start_i
409 end_var = end_i
410
411 start_dfn = start_i
412 end_dfn = end_i
413
414 start_cite = start_i
415 end_cite = end_i
416
417 start_hp1 = start_i
418 end_hp1 = start_i
419
420 start_hp2 = start_b
421 end_hp2 = end_b
422
423 def unknown_starttag(self, tag, attrs):
424 print '*** unknown <' + tag + '>'
425
426 def unknown_endtag(self, tag):
427 print '*** unknown </' + tag + '>'
428
429
# An extension of the formatting parser which formats anchors differently.
class AnchoringParser(FormattingParser):
    # Instead of appending a '[n]' marker after an anchor, bracket the
    # anchor text with bgn_anchor/end_anchor calls on the formatter.

    def start_a(self, attrs):
        # Let the base class record the anchor and set self.inanchor
        FormattingParser.start_a(self, attrs)
        if not self.inanchor:
            return
        self.fmt.bgn_anchor(self.inanchor)

    def end_a(self):
        # Replaces the inherited end_a, so no '[n]' text is emitted
        if not self.inanchor:
            return
        self.fmt.end_anchor(self.inanchor)
        self.inanchor = 0
442
443
# Style sheet -- this is never instantiated, but the attributes
# of the class object itself are used to specify fonts to be used
# for various paragraph styles.
# A font set is a non-empty list of fonts, in the order:
# [roman, italic, bold, fixed].
# When a style is not available, the nearest lower style is used.
450
# Style numbers; each one is the index of that style within a font set
ROMAN, ITALIC, BOLD, FIXED = 0, 1, 2, 3
455
class NullStylesheet:
    # Base style sheet: defines every attribute the formatting parser
    # reads, with no real fonts.  Subclasses override what they need.

    # Indents, in whatever unit the formatter's setleftindent expects
    stdindent = 2
    ddindent = 25
    ulindent = 4
    h1indent = 0
    h2indent = 0
    literalindent = 0

    # Fonts -- none; each font set is its own one-element list
    stdfontset = [None]
    h1fontset = [None]
    h2fontset = [None]
    h3fontset = [None]
469
470
class X11Stylesheet(NullStylesheet):
    # Fonts given as X11 font name patterns: Helvetica for the roman,
    # italic and bold styles, Courier for the fixed style.

    ddindent = 40

    # Body text; the only set here that has a fixed-width entry
    stdfontset = [
        '-*-helvetica-medium-r-normal-*-*-100-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-100-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-100-100-*-*-*-*-*',
        '-*-courier-medium-r-normal-*-*-100-100-*-*-*-*-*',
        ]

    # Heading sets list roman, italic and bold only
    h1fontset = [
        '-*-helvetica-medium-r-normal-*-*-180-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-180-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-180-100-*-*-*-*-*',
        ]
    h2fontset = [
        '-*-helvetica-medium-r-normal-*-*-140-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-140-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-140-100-*-*-*-*-*',
        ]
    h3fontset = [
        '-*-helvetica-medium-r-normal-*-*-120-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-120-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-120-100-*-*-*-*-*',
        ]
494
495
class MacStylesheet(NullStylesheet):
    # Fonts given as (family, style letter, point size) tuples: Geneva
    # for the roman, italic and bold styles, Monaco for the fixed style.
    stdfontset = [
        ('Geneva', 'p', 10),
        ('Geneva', 'i', 10),
        ('Geneva', 'b', 10),
        ('Monaco', 'p', 10),
        ]
    h1fontset = [
        ('Geneva', 'p', 18),
        ('Geneva', 'i', 18),
        ('Geneva', 'b', 18),
        ('Monaco', 'p', 18),
        ]
    # The 14-point set belongs to <H2>.  It used to be assigned to
    # h3fontset, where the 12-point set below replaced it, which left
    # h2fontset at the inherited [None].
    h2fontset = [
        ('Geneva', 'p', 14),
        ('Geneva', 'i', 14),
        ('Geneva', 'b', 14),
        ('Monaco', 'p', 14),
        ]
    h3fontset = [
        ('Geneva', 'p', 12),
        ('Geneva', 'i', 12),
        ('Geneva', 'b', 12),
        ('Monaco', 'p', 12),
        ]
521
522
# Style sheet for stdwin: X11 fonts, except on the Macintosh
StdwinStylesheet = X11Stylesheet
if os.name == 'mac':
    StdwinStylesheet = MacStylesheet
527
528
class GLStylesheet(NullStylesheet):
    # Fonts given as 'name size' strings: Helvetica for the roman,
    # italic and bold styles, Courier for the fixed style.

    # Body text, 10 point
    stdfontset = [
        'Helvetica 10',
        'Helvetica-Italic 10',
        'Helvetica-Bold 10',
        'Courier 10',
        ]

    # Headings: 18, 14 and 12 point
    h1fontset = [
        'Helvetica 18',
        'Helvetica-Italic 18',
        'Helvetica-Bold 18',
        'Courier 18',
        ]
    h2fontset = [
        'Helvetica 14',
        'Helvetica-Italic 14',
        'Helvetica-Bold 14',
        'Courier 14',
        ]
    h3fontset = [
        'Helvetica 12',
        'Helvetica-Italic 12',
        'Helvetica-Bold 12',
        'Courier 12',
        ]
554
555
# Test program -- formats a document through a fmt.WritingFormatter on
# sys.stdout and reports how long parsing and formatting took; reading
# the input file is excluded from the timing
558
559def test():
560 import fmt
561 import time
562 if sys.argv[1:]: file = sys.argv[1]
563 else: file = 'test.html'
564 data = open(file, 'r').read()
565 t0 = time.time()
566 fmtr = fmt.WritingFormatter(sys.stdout, 79)
567 p = FormattingParser(fmtr, NullStylesheet)
568 p.feed(data)
569 p.close()
570 t1 = time.time()
571 print
572 print '*** Formatting time:', round(t1-t0, 3), 'seconds.'
573
574
# Test program using stdwin
576
577def testStdwin():
578 import stdwin, fmt
579 from stdwinevents import *
580 if sys.argv[1:]: file = sys.argv[1]
581 else: file = 'test.html'
582 data = open(file, 'r').read()
583 window = stdwin.open('testStdwin')
584 b = None
585 while 1:
586 etype, ewin, edetail = stdwin.getevent()
587 if etype == WE_CLOSE:
588 break
589 if etype == WE_SIZE:
590 window.setdocsize(0, 0)
591 window.setorigin(0, 0)
592 window.change((0, 0), (10000, 30000)) # XXX
593 if etype == WE_DRAW:
594 if not b:
595 b = fmt.StdwinBackEnd(window, 1)
596 f = fmt.BaseFormatter(b.d, b)
Guido van Rossum1dba24e1995-03-04 22:28:49 +0000597 p = FormattingParser(f, MacStylesheet)
Guido van Rossum7c750e11995-02-27 13:16:55 +0000598 p.feed(data)
599 p.close()
600 b.finish()
601 else:
602 b.redraw(edetail)
603 window.close()
604
605
# Test program using GL
607
def testGL():
    # Format the file named by the first command line argument (default
    # 'test.html') into a 600x600 GL window, then wait five seconds.
    import gl
    import GL
    import fmt
    import time
    if sys.argv[1:]:
        filename = sys.argv[1]
    else:
        filename = 'test.html'
    data = open(filename, 'r').read()
    width, height = 600, 600
    gl.foreground()
    gl.prefsize(width, height)
    wid = gl.winopen('testGL')
    # Bottom and top are passed swapped, as in the original call
    gl.ortho2(0, width, height, 0)
    # Clear to white, then draw in black
    gl.color(GL.WHITE)
    gl.clear()
    gl.color(GL.BLACK)
    backend = fmt.GLBackEnd(wid)
    formatter = fmt.BaseFormatter(backend.d, backend)
    parser = FormattingParser(formatter, GLStylesheet)
    parser.feed(data)
    parser.close()
    backend.finish()
    # Pause before returning
    time.sleep(5)
630
631
# Run the plain-text timing test when executed as a script
if __name__ == '__main__':
    test()