| # A parser for HTML documents |
| |
| |
| # HTML: HyperText Markup Language; an SGML-like syntax used by WWW to |
| # describe hypertext documents |
| # |
| # SGML: Standard Generalized Markup Language |
| # |
| # WWW: World-Wide Web; a distributed hypertext system develped at CERN |
| # |
| # CERN: European Particle Physics Laboratory in Geneva, Switzerland |
| |
| |
| # This file is only concerned with parsing and formatting HTML |
| # documents, not with the other (hypertext and networking) aspects of |
| # the WWW project. (It does support highlighting of anchors.) |
| |
| |
| import os |
| import sys |
| import regex |
| import string |
| import sgmllib |
| |
| |
| class HTMLParser(sgmllib.SGMLParser): |
| |
| # Copy base class entities and add some |
| entitydefs = {} |
| for key in sgmllib.SGMLParser.entitydefs.keys(): |
| entitydefs[key] = sgmllib.SGMLParser.entitydefs[key] |
| entitydefs['bullet'] = '*' |
| |
| # Provided -- handlers for tags introducing literal text |
| |
| def start_listing(self, attrs): |
| self.setliteral('listing') |
| self.literal_bgn('listing', attrs) |
| |
| def end_listing(self): |
| self.literal_end('listing') |
| |
| def start_xmp(self, attrs): |
| self.setliteral('xmp') |
| self.literal_bgn('xmp', attrs) |
| |
| def end_xmp(self): |
| self.literal_end('xmp') |
| |
| def do_plaintext(self, attrs): |
| self.setnomoretags() |
| self.literal_bgn('plaintext', attrs) |
| |
| # To be overridden -- begin/end literal mode |
| def literal_bgn(self, tag, attrs): pass |
| def literal_end(self, tag): pass |
| |
| |
| # Next level of sophistication -- collect anchors, title, nextid and isindex |
| class CollectingParser(HTMLParser): |
| # |
| def __init__(self): |
| HTMLParser.__init__(self) |
| self.savetext = None |
| self.nextid = [] |
| self.isindex = 0 |
| self.title = '' |
| self.inanchor = 0 |
| self.anchors = [] |
| self.anchornames = [] |
| self.anchortypes = [] |
| # |
| def start_a(self, attrs): |
| self.inanchor = 0 |
| href = '' |
| name = '' |
| type = '' |
| for attrname, value in attrs: |
| if attrname == 'href': |
| href = value |
| if attrname == 'name=': |
| name = value |
| if attrname == 'type=': |
| type = string.lower(value) |
| if not (href or name): |
| return |
| self.anchors.append(href) |
| self.anchornames.append(name) |
| self.anchortypes.append(type) |
| self.inanchor = len(self.anchors) |
| if not href: |
| self.inanchor = -self.inanchor |
| # |
| def end_a(self): |
| if self.inanchor > 0: |
| # Don't show anchors pointing into the current document |
| if self.anchors[self.inanchor-1][:1] <> '#': |
| self.handle_data('[' + `self.inanchor` + ']') |
| self.inanchor = 0 |
| # |
| def start_header(self, attrs): pass |
| def end_header(self): pass |
| # |
| # (head is the same as header) |
| def start_head(self, attrs): pass |
| def end_head(self): pass |
| # |
| def start_body(self, attrs): pass |
| def end_body(self): pass |
| # |
| def do_nextid(self, attrs): |
| self.nextid = attrs |
| # |
| def do_isindex(self, attrs): |
| self.isindex = 1 |
| # |
| def start_title(self, attrs): |
| self.savetext = '' |
| # |
| def end_title(self): |
| if self.savetext <> None: |
| self.title = self.savetext |
| self.savetext = None |
| # |
| def handle_data(self, text): |
| if self.savetext is not None: |
| self.savetext = self.savetext + text |
| |
| |
| # Formatting parser -- takes a formatter and a style sheet as arguments |
| |
| # XXX The use of style sheets should change: for each tag and end tag |
| # there should be a style definition, and a style definition should |
| # encompass many more parameters: font, justification, indentation, |
| # vspace before, vspace after, hanging tag... |
| |
| wordprog = regex.compile('[^ \t\n]*') |
| spaceprog = regex.compile('[ \t\n]*') |
| |
| class FormattingParser(CollectingParser): |
| |
| def __init__(self, formatter, stylesheet): |
| CollectingParser.__init__(self) |
| self.fmt = formatter |
| self.stl = stylesheet |
| self.savetext = None |
| self.compact = 0 |
| self.nofill = 0 |
| self.resetfont() |
| self.setindent(self.stl.stdindent) |
| |
| def resetfont(self): |
| self.fontstack = [] |
| self.stylestack = [] |
| self.fontset = self.stl.stdfontset |
| self.style = ROMAN |
| self.passfont() |
| |
| def passfont(self): |
| font = self.fontset[self.style] |
| self.fmt.setfont(font) |
| |
| def pushstyle(self, style): |
| self.stylestack.append(self.style) |
| self.style = min(style, len(self.fontset)-1) |
| self.passfont() |
| |
| def popstyle(self): |
| self.style = self.stylestack[-1] |
| del self.stylestack[-1] |
| self.passfont() |
| |
| def pushfontset(self, fontset, style): |
| self.fontstack.append(self.fontset) |
| self.fontset = fontset |
| self.pushstyle(style) |
| |
| def popfontset(self): |
| self.fontset = self.fontstack[-1] |
| del self.fontstack[-1] |
| self.popstyle() |
| |
| def flush(self): |
| self.fmt.flush() |
| |
| def setindent(self, n): |
| self.fmt.setleftindent(n) |
| |
| def needvspace(self, n): |
| self.fmt.needvspace(n) |
| |
| def close(self): |
| HTMLParser.close(self) |
| self.fmt.flush() |
| |
| def handle_literal(self, text): |
| lines = string.splitfields(text, '\n') |
| for i in range(1, len(lines)): |
| lines[i] = string.expandtabs(lines[i], 8) |
| for line in lines[:-1]: |
| self.fmt.addword(line, 0) |
| self.fmt.flush() |
| self.fmt.nospace = 0 |
| for line in lines[-1:]: |
| self.fmt.addword(line, 0) |
| |
| def handle_data(self, text): |
| if self.savetext is not None: |
| self.savetext = self.savetext + text |
| return |
| if self.literal: |
| self.handle_literal(text) |
| return |
| i = 0 |
| n = len(text) |
| while i < n: |
| j = i + wordprog.match(text, i) |
| word = text[i:j] |
| i = j + spaceprog.match(text, j) |
| self.fmt.addword(word, i-j) |
| if self.nofill and '\n' in text[j:i]: |
| self.fmt.flush() |
| self.fmt.nospace = 0 |
| i = j+1 |
| while text[i-1] <> '\n': i = i+1 |
| |
| def literal_bgn(self, tag, attrs): |
| if tag == 'plaintext': |
| self.flush() |
| else: |
| self.needvspace(1) |
| self.pushfontset(self.stl.stdfontset, FIXED) |
| self.setindent(self.stl.literalindent) |
| |
| def literal_end(self, tag): |
| self.needvspace(1) |
| self.popfontset() |
| self.setindent(self.stl.stdindent) |
| |
| def start_title(self, attrs): |
| self.flush() |
| self.savetext = '' |
| # NB end_title is unchanged |
| |
| def do_p(self, attrs): |
| if self.compact: |
| self.flush() |
| else: |
| self.needvspace(1) |
| |
| def start_h1(self, attrs): |
| self.needvspace(2) |
| self.setindent(self.stl.h1indent) |
| self.pushfontset(self.stl.h1fontset, BOLD) |
| self.fmt.setjust('c') |
| |
| def end_h1(self): |
| self.popfontset() |
| self.needvspace(2) |
| self.setindent(self.stl.stdindent) |
| self.fmt.setjust('l') |
| |
| def start_h2(self, attrs): |
| self.needvspace(1) |
| self.setindent(self.stl.h2indent) |
| self.pushfontset(self.stl.h2fontset, BOLD) |
| |
| def end_h2(self): |
| self.popfontset() |
| self.needvspace(1) |
| self.setindent(self.stl.stdindent) |
| |
| def start_h3(self, attrs): |
| self.needvspace(1) |
| self.setindent(self.stl.stdindent) |
| self.pushfontset(self.stl.h3fontset, BOLD) |
| |
| def end_h3(self): |
| self.popfontset() |
| self.needvspace(1) |
| self.setindent(self.stl.stdindent) |
| |
| def start_h4(self, attrs): |
| self.needvspace(1) |
| self.setindent(self.stl.stdindent) |
| self.pushfontset(self.stl.stdfontset, BOLD) |
| |
| def end_h4(self): |
| self.popfontset() |
| self.needvspace(1) |
| self.setindent(self.stl.stdindent) |
| |
| start_h5 = start_h4 |
| end_h5 = end_h4 |
| |
| start_h6 = start_h5 |
| end_h6 = end_h5 |
| |
| start_h7 = start_h6 |
| end_h7 = end_h6 |
| |
| def start_ul(self, attrs): |
| self.needvspace(1) |
| for attrname, value in attrs: |
| if attrname == 'compact': |
| self.compact = 1 |
| self.setindent(0) |
| break |
| else: |
| self.setindent(self.stl.ulindent) |
| |
| start_dir = start_menu = start_ol = start_ul |
| |
| do_li = do_p |
| |
| def end_ul(self): |
| self.compact = 0 |
| self.needvspace(1) |
| self.setindent(self.stl.stdindent) |
| |
| end_dir = end_menu = end_ol = end_ul |
| |
| def start_dl(self, attrs): |
| for attrname, value in attrs: |
| if attrname == 'compact': |
| self.compact = 1 |
| self.needvspace(1) |
| |
| def end_dl(self): |
| self.compact = 0 |
| self.needvspace(1) |
| self.setindent(self.stl.stdindent) |
| |
| def do_dt(self, attrs): |
| if self.compact: |
| self.flush() |
| else: |
| self.needvspace(1) |
| self.setindent(self.stl.stdindent) |
| |
| def do_dd(self, attrs): |
| self.fmt.addword('', 1) |
| self.setindent(self.stl.ddindent) |
| |
| def start_address(self, attrs): |
| self.compact = 1 |
| self.needvspace(1) |
| self.fmt.setjust('r') |
| |
| def end_address(self): |
| self.compact = 0 |
| self.needvspace(1) |
| self.setindent(self.stl.stdindent) |
| self.fmt.setjust('l') |
| |
| def start_pre(self, attrs): |
| self.needvspace(1) |
| self.nofill = self.nofill + 1 |
| self.pushstyle(FIXED) |
| |
| def end_pre(self): |
| self.popstyle() |
| self.nofill = self.nofill - 1 |
| self.needvspace(1) |
| |
| start_typewriter = start_pre |
| end_typewriter = end_pre |
| |
| def do_img(self, attrs): |
| self.fmt.addword('(image)', 0) |
| |
| # Physical styles |
| |
| def start_tt(self, attrs): self.pushstyle(FIXED) |
| def end_tt(self): self.popstyle() |
| |
| def start_b(self, attrs): self.pushstyle(BOLD) |
| def end_b(self): self.popstyle() |
| |
| def start_i(self, attrs): self.pushstyle(ITALIC) |
| def end_i(self): self.popstyle() |
| |
| def start_u(self, attrs): self.pushstyle(ITALIC) # Underline??? |
| def end_u(self): self.popstyle() |
| |
| def start_r(self, attrs): self.pushstyle(ROMAN) # Not official |
| def end_r(self): self.popstyle() |
| |
| # Logical styles |
| |
| start_em = start_i |
| end_em = end_i |
| |
| start_strong = start_b |
| end_strong = end_b |
| |
| start_code = start_tt |
| end_code = end_tt |
| |
| start_samp = start_tt |
| end_samp = end_tt |
| |
| start_kbd = start_tt |
| end_kbd = end_tt |
| |
| start_file = start_tt # unofficial |
| end_file = end_tt |
| |
| start_var = start_i |
| end_var = end_i |
| |
| start_dfn = start_i |
| end_dfn = end_i |
| |
| start_cite = start_i |
| end_cite = end_i |
| |
| start_hp1 = start_i |
| end_hp1 = start_i |
| |
| start_hp2 = start_b |
| end_hp2 = end_b |
| |
| def unknown_starttag(self, tag, attrs): |
| print '*** unknown <' + tag + '>' |
| |
| def unknown_endtag(self, tag): |
| print '*** unknown </' + tag + '>' |
| |
| |
| # An extension of the formatting parser which formats anchors differently. |
| class AnchoringParser(FormattingParser): |
| |
| def start_a(self, attrs): |
| FormattingParser.start_a(self, attrs) |
| if self.inanchor: |
| self.fmt.bgn_anchor(self.inanchor) |
| |
| def end_a(self): |
| if self.inanchor: |
| self.fmt.end_anchor(self.inanchor) |
| self.inanchor = 0 |
| |
| |
| # Style sheet -- this is never instantiated, but the attributes |
| # of the class object itself are used to specify fonts to be used |
| # for various paragraph styles. |
| # A font set is a non-empty list of fonts, in the order: |
| # [roman, italic, bold, fixed]. |
| # When a style is not available the nearest lower style is used |
| |
| ROMAN = 0 |
| ITALIC = 1 |
| BOLD = 2 |
| FIXED = 3 |
| |
| class NullStylesheet: |
| # Fonts -- none |
| stdfontset = [None] |
| h1fontset = [None] |
| h2fontset = [None] |
| h3fontset = [None] |
| # Indents |
| stdindent = 2 |
| ddindent = 25 |
| ulindent = 4 |
| h1indent = 0 |
| h2indent = 0 |
| literalindent = 0 |
| |
| |
| class X11Stylesheet(NullStylesheet): |
| stdfontset = [ |
| '-*-helvetica-medium-r-normal-*-*-100-100-*-*-*-*-*', |
| '-*-helvetica-medium-o-normal-*-*-100-100-*-*-*-*-*', |
| '-*-helvetica-bold-r-normal-*-*-100-100-*-*-*-*-*', |
| '-*-courier-medium-r-normal-*-*-100-100-*-*-*-*-*', |
| ] |
| h1fontset = [ |
| '-*-helvetica-medium-r-normal-*-*-180-100-*-*-*-*-*', |
| '-*-helvetica-medium-o-normal-*-*-180-100-*-*-*-*-*', |
| '-*-helvetica-bold-r-normal-*-*-180-100-*-*-*-*-*', |
| ] |
| h2fontset = [ |
| '-*-helvetica-medium-r-normal-*-*-140-100-*-*-*-*-*', |
| '-*-helvetica-medium-o-normal-*-*-140-100-*-*-*-*-*', |
| '-*-helvetica-bold-r-normal-*-*-140-100-*-*-*-*-*', |
| ] |
| h3fontset = [ |
| '-*-helvetica-medium-r-normal-*-*-120-100-*-*-*-*-*', |
| '-*-helvetica-medium-o-normal-*-*-120-100-*-*-*-*-*', |
| '-*-helvetica-bold-r-normal-*-*-120-100-*-*-*-*-*', |
| ] |
| ddindent = 40 |
| |
| |
| class MacStylesheet(NullStylesheet): |
| stdfontset = [ |
| ('Geneva', 'p', 10), |
| ('Geneva', 'i', 10), |
| ('Geneva', 'b', 10), |
| ('Monaco', 'p', 10), |
| ] |
| h1fontset = [ |
| ('Geneva', 'p', 18), |
| ('Geneva', 'i', 18), |
| ('Geneva', 'b', 18), |
| ('Monaco', 'p', 18), |
| ] |
| h3fontset = [ |
| ('Geneva', 'p', 14), |
| ('Geneva', 'i', 14), |
| ('Geneva', 'b', 14), |
| ('Monaco', 'p', 14), |
| ] |
| h3fontset = [ |
| ('Geneva', 'p', 12), |
| ('Geneva', 'i', 12), |
| ('Geneva', 'b', 12), |
| ('Monaco', 'p', 12), |
| ] |
| |
| |
| if os.name == 'mac': |
| StdwinStylesheet = MacStylesheet |
| else: |
| StdwinStylesheet = X11Stylesheet |
| |
| |
| class GLStylesheet(NullStylesheet): |
| stdfontset = [ |
| 'Helvetica 10', |
| 'Helvetica-Italic 10', |
| 'Helvetica-Bold 10', |
| 'Courier 10', |
| ] |
| h1fontset = [ |
| 'Helvetica 18', |
| 'Helvetica-Italic 18', |
| 'Helvetica-Bold 18', |
| 'Courier 18', |
| ] |
| h2fontset = [ |
| 'Helvetica 14', |
| 'Helvetica-Italic 14', |
| 'Helvetica-Bold 14', |
| 'Courier 14', |
| ] |
| h3fontset = [ |
| 'Helvetica 12', |
| 'Helvetica-Italic 12', |
| 'Helvetica-Bold 12', |
| 'Courier 12', |
| ] |
| |
| |
| # Test program -- produces no output but times how long it takes |
| # to send a document to a null formatter, exclusive of I/O |
| |
| def test(): |
| import fmt |
| import time |
| if sys.argv[1:]: file = sys.argv[1] |
| else: file = 'test.html' |
| data = open(file, 'r').read() |
| t0 = time.time() |
| fmtr = fmt.WritingFormatter(sys.stdout, 79) |
| p = FormattingParser(fmtr, NullStylesheet) |
| p.feed(data) |
| p.close() |
| t1 = time.time() |
| print |
| print '*** Formatting time:', round(t1-t0, 3), 'seconds.' |
| |
| |
| # Test program using stdwin |
| |
| def testStdwin(): |
| import stdwin, fmt |
| from stdwinevents import * |
| if sys.argv[1:]: file = sys.argv[1] |
| else: file = 'test.html' |
| data = open(file, 'r').read() |
| window = stdwin.open('testStdwin') |
| b = None |
| while 1: |
| etype, ewin, edetail = stdwin.getevent() |
| if etype == WE_CLOSE: |
| break |
| if etype == WE_SIZE: |
| window.setdocsize(0, 0) |
| window.setorigin(0, 0) |
| window.change((0, 0), (10000, 30000)) # XXX |
| if etype == WE_DRAW: |
| if not b: |
| b = fmt.StdwinBackEnd(window, 1) |
| f = fmt.BaseFormatter(b.d, b) |
| p = FormattingParser(f, MacStylesheet) |
| p.feed(data) |
| p.close() |
| b.finish() |
| else: |
| b.redraw(edetail) |
| window.close() |
| |
| |
| # Test program using GL |
| |
| def testGL(): |
| import gl, GL, fmt |
| if sys.argv[1:]: file = sys.argv[1] |
| else: file = 'test.html' |
| data = open(file, 'r').read() |
| W, H = 600, 600 |
| gl.foreground() |
| gl.prefsize(W, H) |
| wid = gl.winopen('testGL') |
| gl.ortho2(0, W, H, 0) |
| gl.color(GL.WHITE) |
| gl.clear() |
| gl.color(GL.BLACK) |
| b = fmt.GLBackEnd(wid) |
| f = fmt.BaseFormatter(b.d, b) |
| p = FormattingParser(f, GLStylesheet) |
| p.feed(data) |
| p.close() |
| b.finish() |
| # |
| import time |
| time.sleep(5) |
| |
| |
| if __name__ == '__main__': |
| test() |