blob: 8b3e62b70854ec97e267a0ba30014596a20bec29 [file] [log] [blame]
Guido van Rossum7c750e11995-02-27 13:16:55 +00001# A parser for HTML documents
2
3
# HTML: HyperText Markup Language; an SGML-like syntax used by WWW to
# describe hypertext documents
#
# SGML: Standard Generalized Markup Language
#
# WWW: World-Wide Web; a distributed hypertext system developed at CERN
#
# CERN: European Particle Physics Laboratory in Geneva, Switzerland
12
13
14# This file is only concerned with parsing and formatting HTML
15# documents, not with the other (hypertext and networking) aspects of
16# the WWW project. (It does support highlighting of anchors.)
17
18
19import os
20import sys
21import regex
22import string
23import sgmllib
24
25
class HTMLParser(sgmllib.SGMLParser):
    # SGML parser with a few HTML specifics: one extra entity and
    # handlers for the tags that introduce literal text.

    # Entity table: every entry of the base class table, copied into a
    # dictionary of our own, plus 'bullet'.
    entitydefs = {}
    for key in sgmllib.SGMLParser.entitydefs.keys():
        entitydefs[key] = sgmllib.SGMLParser.entitydefs[key]
    entitydefs['bullet'] = '*'

    # Hooks -- called when literal text begins/ends; they do nothing
    # here and are meant to be overridden.

    def literal_bgn(self, tag, attrs):
        pass

    def literal_end(self, tag):
        pass

    # Tag handlers -- each one switches the parser mode (setliteral /
    # setnomoretags are not defined in this class; presumably they come
    # from sgmllib.SGMLParser) and then notifies the hooks above.

    # <listing> ... </listing>
    def start_listing(self, attrs):
        self.setliteral('listing')
        self.literal_bgn('listing', attrs)

    def end_listing(self):
        self.literal_end('listing')

    # <xmp> ... </xmp>
    def start_xmp(self, attrs):
        self.setliteral('xmp')
        self.literal_bgn('xmp', attrs)

    def end_xmp(self):
        self.literal_end('xmp')

    # <plaintext> -- has no end handler, so literal_end is never
    # called for it
    def do_plaintext(self, attrs):
        self.setnomoretags()
        self.literal_bgn('plaintext', attrs)
57
58
59# Next level of sophistication -- collect anchors, title, nextid and isindex
class CollectingParser(HTMLParser):
    # Parser that records document information while parsing:
    #
    #   title       -- text found between <title> and </title>
    #   nextid      -- the attribute list of the last <nextid> tag
    #   isindex     -- 1 once an <isindex> tag has been seen, else 0
    #   anchors, anchornames, anchortypes
    #               -- parallel lists with the href, name and
    #                  (lowercased) type of every <a> tag that has an
    #                  href or a name
    #   inanchor    -- 0 outside an anchor; inside one, the 1-based
    #                  position of that anchor in the lists above,
    #                  negated when the anchor has no href
    #   savetext    -- None, or the text collected so far for <title>

    def __init__(self):
        HTMLParser.__init__(self)
        self.savetext = None
        self.nextid = ''
        self.isindex = 0
        self.title = ''
        self.inanchor = 0
        self.anchors = []
        self.anchornames = []
        self.anchortypes = []

    def start_a(self, attrs):
        # attrs is a sequence of (attrname, value) pairs.  Attribute
        # names are compared without a trailing '=', like every other
        # handler in this file does.
        self.inanchor = 0
        href = ''
        name = ''
        anchortype = ''
        for attrname, value in attrs:
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                anchortype = string.lower(value)
        if not (href or name):
            # Neither a link nor a named target: nothing to record
            return
        self.anchors.append(href)
        self.anchornames.append(name)
        self.anchortypes.append(anchortype)
        self.inanchor = len(self.anchors)
        if not href:
            # Named target only: flag it with a negative number
            self.inanchor = -self.inanchor

    def end_a(self):
        if self.inanchor > 0:
            # Don't show anchors pointing into the current document
            if self.anchors[self.inanchor-1][:1] != '#':
                self.handle_data('[' + repr(self.inanchor) + ']')
        self.inanchor = 0

    # The structural tags below are accepted and ignored

    def start_header(self, attrs):
        pass

    def end_header(self):
        pass

    # (head is the same as header)
    def start_head(self, attrs):
        pass

    def end_head(self):
        pass

    def start_body(self, attrs):
        pass

    def end_body(self):
        pass

    def do_nextid(self, attrs):
        self.nextid = attrs

    def do_isindex(self, attrs):
        self.isindex = 1

    def start_title(self, attrs):
        # From here on handle_data accumulates text in savetext
        self.savetext = ''

    def end_title(self):
        if self.savetext is not None:
            self.title = self.savetext
            self.savetext = None

    def handle_data(self, text):
        # Text is only kept while a title is being collected
        if self.savetext is not None:
            self.savetext = self.savetext + text
128
129
130# Formatting parser -- takes a formatter and a style sheet as arguments
131
132# XXX The use of style sheets should change: for each tag and end tag
133# there should be a style definition, and a style definition should
134# encompass many more parameters: font, justification, indentation,
135# vspace before, vspace after, hanging tag...
136
# Patterns used by FormattingParser.handle_data to split running text:
# spaceprog matches a (possibly empty) run of blanks, tabs and newlines,
# wordprog a (possibly empty) run of any other characters.
spaceprog = regex.compile('[ \t\n]*')
wordprog = regex.compile('[^ \t\n]*')
139
class FormattingParser(CollectingParser):
    # Parser that drives a formatter object while it parses.
    #
    # Arguments to the constructor:
    #   formatter   -- object receiving the output; this class calls
    #                  its setfont, setleftindent, needvspace, setjust,
    #                  addword and flush methods and sets its nospace
    #                  attribute
    #   stylesheet  -- object (or class) whose attributes give the
    #                  font sets and indents, e.g. NullStylesheet
    #
    # State kept here:
    #   fontset, style          -- current font set and index into it
    #   fontstack, stylestack   -- saved values for nested tags
    #   compact                 -- true inside a compact list/address
    #   nofill                  -- nesting depth of <pre>-like tags

    def __init__(self, formatter, stylesheet):
        CollectingParser.__init__(self)
        self.fmt = formatter
        self.stl = stylesheet
        self.savetext = None
        self.compact = 0
        self.nofill = 0
        self.resetfont()
        self.setindent(self.stl.stdindent)

    # Font and style bookkeeping

    def resetfont(self):
        self.fontstack = []
        self.stylestack = []
        self.fontset = self.stl.stdfontset
        self.style = ROMAN
        self.passfont()

    def passfont(self):
        # Tell the formatter which font the current state selects
        self.fmt.setfont(self.fontset[self.style])

    def pushstyle(self, style):
        self.stylestack.append(self.style)
        # A font set may be shorter than four entries; fall back to
        # its last font when the requested style is not available
        self.style = min(style, len(self.fontset)-1)
        self.passfont()

    def popstyle(self):
        # An end tag without a matching start tag leaves nothing to
        # restore; keep the current style instead of failing
        if self.stylestack:
            self.style = self.stylestack[-1]
            del self.stylestack[-1]
        self.passfont()

    def pushfontset(self, fontset, style):
        self.fontstack.append(self.fontset)
        self.fontset = fontset
        self.pushstyle(style)

    def popfontset(self):
        # Same protection as popstyle against unbalanced end tags
        if not self.fontstack:
            return
        self.fontset = self.fontstack[-1]
        del self.fontstack[-1]
        self.popstyle()

    # Thin wrappers around the formatter

    def flush(self):
        self.fmt.flush()

    def setindent(self, n):
        self.fmt.setleftindent(n)

    def needvspace(self, n):
        self.fmt.needvspace(n)

    def close(self):
        CollectingParser.close(self)
        self.fmt.flush()

    # Text handling

    def handle_literal(self, text):
        # Literal text: every newline ends an output line
        lines = string.splitfields(text, '\n')
        # NOTE(review): tabs in the first piece are not expanded --
        # presumably because it continues a line already started at an
        # unknown column; confirm.
        for i in range(1, len(lines)):
            lines[i] = string.expandtabs(lines[i], 8)
        for line in lines[:-1]:
            self.fmt.addword(line, 0)
            self.fmt.flush()
            self.fmt.nospace = 0
        # The last piece is not followed by a newline (yet), so it is
        # added without flushing
        for line in lines[-1:]:
            self.fmt.addword(line, 0)

    def handle_data(self, text):
        if self.savetext is not None:
            # Collecting a title
            self.savetext = self.savetext + text
            return
        if self.literal:
            self.handle_literal(text)
            return
        i = 0
        n = len(text)
        while i < n:
            # regex match() returns the length of the match, so j is
            # the end of the word and i the end of the following space
            j = i + wordprog.match(text, i)
            word = text[i:j]
            i = j + spaceprog.match(text, j)
            self.fmt.addword(word, i-j)
            if self.nofill and '\n' in text[j:i]:
                # Inside <pre>: break the line at the first newline
                # and resume scanning right after it, so the rest of
                # the white space is handled by the next iteration
                self.fmt.flush()
                self.fmt.nospace = 0
                i = j+1
                while text[i-1] != '\n':
                    i = i+1

    # Hooks called by HTMLParser for <listing>, <xmp> and <plaintext>

    def literal_bgn(self, tag, attrs):
        if tag == 'plaintext':
            self.flush()
        else:
            self.needvspace(1)
        self.pushfontset(self.stl.stdfontset, FIXED)
        self.setindent(self.stl.literalindent)

    def literal_end(self, tag):
        self.needvspace(1)
        self.popfontset()
        self.setindent(self.stl.stdindent)

    # Block level tags

    def start_title(self, attrs):
        self.flush()
        self.savetext = ''
    # NB end_title is unchanged

    def do_p(self, attrs):
        if self.compact:
            self.flush()
        else:
            self.needvspace(1)

    def start_h1(self, attrs):
        self.needvspace(2)
        self.setindent(self.stl.h1indent)
        self.pushfontset(self.stl.h1fontset, BOLD)
        self.fmt.setjust('c')

    def end_h1(self):
        self.popfontset()
        self.needvspace(2)
        self.setindent(self.stl.stdindent)
        self.fmt.setjust('l')

    def start_h2(self, attrs):
        self.needvspace(1)
        self.setindent(self.stl.h2indent)
        self.pushfontset(self.stl.h2fontset, BOLD)

    def end_h2(self):
        self.popfontset()
        self.needvspace(1)
        self.setindent(self.stl.stdindent)

    def start_h3(self, attrs):
        self.needvspace(1)
        self.setindent(self.stl.stdindent)
        self.pushfontset(self.stl.h3fontset, BOLD)

    def start_h4(self, attrs):
        self.needvspace(1)
        self.setindent(self.stl.stdindent)
        self.pushfontset(self.stl.stdfontset, BOLD)

    # Headings 2, 3 and 4 all end the same way
    end_h3 = end_h4 = end_h2

    # Lower level headings are formatted like h4
    start_h5 = start_h4
    end_h5 = end_h4

    start_h6 = start_h5
    end_h6 = end_h5

    start_h7 = start_h6
    end_h7 = end_h6

    def start_ul(self, attrs):
        self.needvspace(1)
        for attrname, value in attrs:
            if attrname == 'compact':
                self.compact = 1
                self.setindent(0)
                break
        else:
            # No 'compact' attribute found
            self.setindent(self.stl.ulindent)

    start_dir = start_menu = start_ol = start_ul

    do_li = do_p

    def end_ul(self):
        self.compact = 0
        self.needvspace(1)
        self.setindent(self.stl.stdindent)

    end_dir = end_menu = end_ol = end_ul

    def start_dl(self, attrs):
        for attrname, value in attrs:
            if attrname == 'compact':
                self.compact = 1
        self.needvspace(1)

    def end_dl(self):
        self.compact = 0
        self.needvspace(1)
        self.setindent(self.stl.stdindent)

    def do_dt(self, attrs):
        if self.compact:
            self.flush()
        else:
            self.needvspace(1)
        self.setindent(self.stl.stdindent)

    def do_dd(self, attrs):
        self.fmt.addword('', 1)
        self.setindent(self.stl.ddindent)

    def start_address(self, attrs):
        self.compact = 1
        self.needvspace(1)
        self.fmt.setjust('r')

    def end_address(self):
        self.compact = 0
        self.needvspace(1)
        self.setindent(self.stl.stdindent)
        self.fmt.setjust('l')

    def start_pre(self, attrs):
        self.needvspace(1)
        self.nofill = self.nofill + 1
        self.pushstyle(FIXED)

    def end_pre(self):
        self.popstyle()
        # Never go below zero on an unbalanced </pre>: a negative
        # count would be true and leave no-fill mode switched on
        if self.nofill > 0:
            self.nofill = self.nofill - 1
        self.needvspace(1)

    start_typewriter = start_pre
    end_typewriter = end_pre

    def do_img(self, attrs):
        # Images are not rendered; show a placeholder word
        self.fmt.addword('(image)', 0)

    # Physical styles

    def start_tt(self, attrs):
        self.pushstyle(FIXED)

    def end_tt(self):
        self.popstyle()

    def start_b(self, attrs):
        self.pushstyle(BOLD)

    def end_b(self):
        self.popstyle()

    def start_i(self, attrs):
        self.pushstyle(ITALIC)

    def end_i(self):
        self.popstyle()

    def start_u(self, attrs):
        self.pushstyle(ITALIC) # Underline???

    def end_u(self):
        self.popstyle()

    def start_r(self, attrs):
        self.pushstyle(ROMAN) # Not official

    def end_r(self):
        self.popstyle()

    # Logical styles

    start_em = start_i
    end_em = end_i

    start_strong = start_b
    end_strong = end_b

    start_code = start_tt
    end_code = end_tt

    start_samp = start_tt
    end_samp = end_tt

    start_kbd = start_tt
    end_kbd = end_tt

    start_file = start_tt # unofficial
    end_file = end_tt

    start_var = start_i
    end_var = end_i

    start_dfn = start_i
    end_dfn = end_i

    start_cite = start_i
    end_cite = end_i

    # The end tag must pop the style pushed by the start tag
    start_hp1 = start_i
    end_hp1 = end_i

    start_hp2 = start_b
    end_hp2 = end_b

    # Tags without a handler are reported on standard output

    def unknown_starttag(self, tag, attrs):
        sys.stdout.write('*** unknown <' + tag + '>' + '\n')

    def unknown_endtag(self, tag):
        sys.stdout.write('*** unknown </' + tag + '>' + '\n')
429
430
431# An extension of the formatting parser which formats anchors differently.
class AnchoringParser(FormattingParser):
    # Instead of appending a "[n]" marker after an anchor, bracket the
    # anchor text with bgn_anchor/end_anchor calls on the formatter.
    # Both calls receive the anchor number kept in self.inanchor.

    def start_a(self, attrs):
        # Let the inherited handler record the anchor first; it leaves
        # self.inanchor at zero when there is nothing to highlight
        FormattingParser.start_a(self, attrs)
        number = self.inanchor
        if number:
            self.fmt.bgn_anchor(number)

    def end_a(self):
        number = self.inanchor
        if not number:
            return
        self.fmt.end_anchor(number)
        self.inanchor = 0
443
444
445# Style sheet -- this is never instantiated, but the attributes
446# of the class object itself are used to specify fonts to be used
447# for various paragraph styles.
448# A font set is a non-empty list of fonts, in the order:
449# [roman, italic, bold, fixed].
450# When a style is not available the nearest lower style is used
451
# Style numbers; each one is the index of that style's font in a font set
ROMAN, ITALIC, BOLD, FIXED = 0, 1, 2, 3
456
class NullStylesheet:
    # Base style sheet: no real fonts, only the indents.

    # Font sets -- a single entry holding no font at all
    stdfontset = [None]
    h1fontset = [None]
    h2fontset = [None]
    h3fontset = [None]

    # Left indents for body text, <dd> text and list items
    stdindent = 2
    ddindent = 25
    ulindent = 4

    # Left indents for headings and literal text
    h1indent = 0
    h2indent = 0
    literalindent = 0
470
471
class X11Stylesheet(NullStylesheet):
    # Style sheet using X11 font name patterns.  The heading font sets
    # have only three entries (no fixed-width font).

    # Body text
    stdfontset = [
        '-*-helvetica-medium-r-normal-*-*-100-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-100-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-100-100-*-*-*-*-*',
        '-*-courier-medium-r-normal-*-*-100-100-*-*-*-*-*',
        ]

    # Headings
    h1fontset = [
        '-*-helvetica-medium-r-normal-*-*-180-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-180-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-180-100-*-*-*-*-*',
        ]
    h2fontset = [
        '-*-helvetica-medium-r-normal-*-*-140-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-140-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-140-100-*-*-*-*-*',
        ]
    h3fontset = [
        '-*-helvetica-medium-r-normal-*-*-120-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-120-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-120-100-*-*-*-*-*',
        ]

    # Overrides the inherited <dd> indent
    ddindent = 40
495
496
class MacStylesheet(NullStylesheet):
    # Style sheet for the Macintosh.  Every font is written as a
    # three-element tuple; font sets are in the order
    # [roman, italic, bold, fixed].

    # Body text
    stdfontset = [
        ('Geneva', 'p', 10),
        ('Geneva', 'i', 10),
        ('Geneva', 'b', 10),
        ('Monaco', 'p', 10),
        ]

    # Headings: sizes 18, 14 and 12 for h1, h2 and h3
    h1fontset = [
        ('Geneva', 'p', 18),
        ('Geneva', 'i', 18),
        ('Geneva', 'b', 18),
        ('Monaco', 'p', 18),
        ]
    h2fontset = [
        ('Geneva', 'p', 14),
        ('Geneva', 'i', 14),
        ('Geneva', 'b', 14),
        ('Monaco', 'p', 14),
        ]
    h3fontset = [
        ('Geneva', 'p', 12),
        ('Geneva', 'i', 12),
        ('Geneva', 'b', 12),
        ('Monaco', 'p', 12),
        ]
522
523
# Style sheet to use with stdwin: the Macintosh one when running on a
# Mac, the X11 one everywhere else
StdwinStylesheet = X11Stylesheet
if os.name == 'mac':
    StdwinStylesheet = MacStylesheet
528
529
class GLStylesheet(NullStylesheet):
    # Style sheet for the GL back end.  Each font is a string holding
    # a font name and a size; font sets are in the order
    # [roman, italic, bold, fixed].

    # Body text
    stdfontset = [
        'Helvetica 10',
        'Helvetica-Italic 10',
        'Helvetica-Bold 10',
        'Courier 10',
        ]

    # Headings
    h1fontset = [
        'Helvetica 18',
        'Helvetica-Italic 18',
        'Helvetica-Bold 18',
        'Courier 18',
        ]
    h2fontset = [
        'Helvetica 14',
        'Helvetica-Italic 14',
        'Helvetica-Bold 14',
        'Courier 14',
        ]
    h3fontset = [
        'Helvetica 12',
        'Helvetica-Italic 12',
        'Helvetica-Bold 12',
        'Courier 12',
        ]
555
556
557# Test program -- produces no output but times how long it takes
558# to send a document to a null formatter, exclusive of I/O
559
def test():
    # Format an HTML file through fmt.WritingFormatter onto standard
    # output, using NullStylesheet, and report how long parsing and
    # formatting took.  The file is sys.argv[1] when given, else
    # 'test.html'; reading it is not included in the timing.
    import fmt
    import time
    if sys.argv[1:]:
        filename = sys.argv[1]
    else:
        filename = 'test.html'
    # Close the input explicitly instead of relying on the file object
    # being reclaimed
    fp = open(filename, 'r')
    try:
        data = fp.read()
    finally:
        fp.close()
    t0 = time.time()
    fmtr = fmt.WritingFormatter(sys.stdout, 79)
    p = FormattingParser(fmtr, NullStylesheet)
    p.feed(data)
    p.close()
    t1 = time.time()
    sys.stdout.write('\n')
    sys.stdout.write('*** Formatting time: ' + str(round(t1-t0, 3)) +
                     ' seconds.' + '\n')
574
575
576# Test program using stdwin
577
def testStdwin():
    # Show an HTML file in a stdwin window.  The file is sys.argv[1]
    # when given, else 'test.html'.  The document is formatted on the
    # first draw event; later draw events only redraw.  Returns when
    # the window is closed.
    import stdwin
    import fmt
    # Only these three event types are used below
    from stdwinevents import WE_CLOSE, WE_SIZE, WE_DRAW
    if sys.argv[1:]:
        filename = sys.argv[1]
    else:
        filename = 'test.html'
    # Close the input explicitly instead of relying on the file object
    # being reclaimed
    fp = open(filename, 'r')
    try:
        data = fp.read()
    finally:
        fp.close()
    window = stdwin.open('testStdwin')
    backend = None
    while 1:
        etype, ewin, edetail = stdwin.getevent()
        if etype == WE_CLOSE:
            break
        if etype == WE_SIZE:
            window.setdocsize(0, 0)
            window.setorigin(0, 0)
            window.change((0, 0), (10000, 30000)) # XXX
        if etype == WE_DRAW:
            if not backend:
                backend = fmt.StdwinBackEnd(window, 1)
                f = fmt.BaseFormatter(backend.d, backend)
                # StdwinStylesheet is the style sheet this module
                # selects for the current platform (MacStylesheet on
                # the Mac, X11Stylesheet elsewhere)
                p = FormattingParser(f, StdwinStylesheet)
                p.feed(data)
                p.close()
                backend.finish()
            else:
                backend.redraw(edetail)
    window.close()
606
607
608# Test program using GL
609
def testGL():
    # Format an HTML file into a 600x600 GL window using GLStylesheet,
    # then keep the window up for five seconds.  The file is
    # sys.argv[1] when given, else 'test.html'.
    import gl
    import GL
    import fmt
    import time
    if sys.argv[1:]:
        filename = sys.argv[1]
    else:
        filename = 'test.html'
    # Close the input explicitly instead of relying on the file object
    # being reclaimed
    fp = open(filename, 'r')
    try:
        data = fp.read()
    finally:
        fp.close()
    W, H = 600, 600
    gl.foreground()
    gl.prefsize(W, H)
    wid = gl.winopen('testGL')
    # Top and bottom are swapped in the projection (H first, then 0)
    gl.ortho2(0, W, H, 0)
    gl.color(GL.WHITE)
    gl.clear()
    gl.color(GL.BLACK)
    b = fmt.GLBackEnd(wid)
    f = fmt.BaseFormatter(b.d, b)
    p = FormattingParser(f, GLStylesheet)
    p.feed(data)
    p.close()
    b.finish()
    # Leave the result on screen for a while before returning
    time.sleep(5)
632
633
# When run as a script (rather than imported), run the timing test
if __name__ == '__main__':
    test()