# New HTML class

# XXX Check against HTML 2.0 spec

# XXX reorder methods according to hierarchy
# - html structure: head, body, title, isindex
# - headers
# - lists, items
# - paragraph styles
# - forms
# - character styles
# - images
# - bookkeeping
# - output generation
| |
| |
import re
import regsub
import string
import sys
from sgmllib import SGMLParser
| |
| |
# Font style codes pushed on the parser's style stack by the character
# tags (<tt>, <b>, <i>, <u>, <r> and their logical aliases).
ROMAN, ITALIC, BOLD, FIXED = 0, 1, 2, 3
| |
| |
| class HTMLParser(SGMLParser): |
| |
| def __init__(self): |
| SGMLParser.__init__(self) |
| self.savedata = None |
| self.isindex = 0 |
| self.title = '' |
| self.para = None |
| self.lists = [] |
| self.styles = [] |
| self.nofill = 0 |
| self.nospace = 1 |
| self.softspace = 0 |
| |
| # --- Data |
| |
| def handle_image(self, src, alt): |
| self.handle_data(alt) |
| |
| def handle_data(self, data): |
| if self.nofill: |
| self.handle_literal(data) |
| return |
| data = regsub.gsub('[ \t\n\r]+', ' ', data) |
| if self.nospace and data[:1] == ' ': data = data[1:] |
| if not data: return |
| self.nospace = 0 |
| if self.softspace and data[:1] != ' ': data = ' ' + data |
| if data[-1:] == ' ': |
| data = data[:-1] |
| self.softspace = 1 |
| self.output_data(data) |
| |
| def handle_literal(self, data): |
| self.nospace = 0 |
| self.softspace = 0 |
| self.output_data(data) |
| |
| def output_data(self, data): |
| if self.savedata is not None: |
| self.savedata = self.savedata + data |
| else: |
| self.write_data(data) |
| |
| def write_data(self, data): |
| sys.stdout.write(data) |
| |
| def save_bgn(self): |
| self.savedata = '' |
| self.nospace = 1 |
| self.softspace = 0 |
| |
| def save_end(self): |
| saved = self.savedata |
| self.savedata = None |
| self.nospace = 1 |
| self.softspace = 0 |
| return saved |
| |
| def new_para(self): |
| pass |
| |
| def new_style(self): |
| pass |
| |
| # --- Generic style changes |
| |
| def para_bgn(self, tag): |
| if not self.nospace: |
| self.handle_literal('\n') |
| self.nospace = 1 |
| self.softspace = 0 |
| if tag is not None: |
| self.para = tag |
| self.new_para() |
| |
| def para_end(self): |
| self.para_bgn('') |
| |
| def push_list(self, tag): |
| self.lists.append(tag) |
| self.para_bgn(None) |
| |
| def pop_list(self): |
| del self.lists[-1] |
| self.para_end() |
| |
| def literal_bgn(self, tag, attrs): |
| self.para_bgn(tag) |
| |
| def literal_end(self, tag): |
| self.para_end() |
| |
| def push_style(self, tag): |
| self.styles.append(tag) |
| self.new_style() |
| |
| def pop_style(self): |
| del self.styles[-1] |
| self.new_style() |
| |
| def anchor_bgn(self, href, name, type): |
| self.push_style(href and 'a' or None) |
| |
| def anchor_end(self): |
| self.pop_style() |
| |
| # --- Top level tags |
| |
| def start_html(self, attrs): pass |
| def end_html(self): pass |
| |
| def start_head(self, attrs): pass |
| def end_head(self): pass |
| |
| def start_body(self, attrs): pass |
| def end_body(self): pass |
| |
| def do_isindex(self, attrs): |
| self.isindex = 1 |
| |
| def start_title(self, attrs): |
| self.save_bgn() |
| |
| def end_title(self): |
| self.title = self.save_end() |
| |
| # --- Old HTML 'literal text' tags |
| |
| def start_listing(self, attrs): |
| self.setliteral('listing') |
| self.literal_bgn('listing', attrs) |
| |
| def end_listing(self): |
| self.literal_end('listing') |
| |
| def start_xmp(self, attrs): |
| self.setliteral('xmp') |
| self.literal_bgn('xmp', attrs) |
| |
| def end_xmp(self): |
| self.literal_end('xmp') |
| |
| def do_plaintext(self, attrs): |
| self.setnomoretags() |
| self.literal_bgn('plaintext', attrs) |
| |
| # --- Anchors |
| |
| def start_a(self, attrs): |
| href = '' |
| name = '' |
| type = '' |
| for attrname, value in attrs: |
| if attrname == 'href': |
| href = value |
| if attrname == 'name': |
| name = value |
| if attrname == 'type': |
| type = string.lower(value) |
| if not (href or name): |
| return |
| self.anchor_bgn(href, name, type) |
| |
| def end_a(self): |
| self.anchor_end() |
| |
| # --- Paragraph tags |
| |
| def do_p(self, attrs): |
| self.para_bgn(None) |
| |
| def do_br(self, attrs): |
| self.handle_literal('\n') |
| self.nospace = 1 |
| self.softspace = 0 |
| |
| def do_hr(self, attrs): |
| self.para_bgn(None) |
| self.handle_literal('-'*40) |
| self.para_end() |
| |
| def start_h1(self, attrs): |
| self.para_bgn('h1') |
| |
| def start_h2(self, attrs): |
| self.para_bgn('h2') |
| |
| def start_h3(self, attrs): |
| self.para_bgn('h3') |
| |
| def start_h4(self, attrs): |
| self.para_bgn('h4') |
| |
| def start_h5(self, attrs): |
| self.para_bgn('h5') |
| |
| def start_h6(self, attrs): |
| self.para_bgn('h6') |
| |
| def end_h1(self): |
| self.para_end() |
| |
| end_h2 = end_h1 |
| end_h3 = end_h2 |
| end_h4 = end_h3 |
| end_h5 = end_h4 |
| end_h6 = end_h5 |
| |
| def start_ul(self, attrs): |
| self.para_bgn(None) |
| self.push_list('ul') |
| |
| def start_ol(self, attrs): |
| self.para_bgn(None) |
| self.push_list('ol') |
| |
| def end_ul(self): |
| self.pop_list() |
| self.para_end() |
| |
| def do_li(self, attrs): |
| self.para_bgn('li%d' % len(self.lists)) |
| |
| start_dir = start_menu = start_ul |
| end_dir = end_menu = end_ol = end_ul |
| |
| def start_dl(self, attrs): |
| self.para_bgn(None) |
| self.push_list('dl') |
| |
| def end_dl(self): |
| self.pop_list() |
| self.para_end() |
| |
| def do_dt(self, attrs): |
| self.para_bgn('dt%d' % len(self.lists)) |
| |
| def do_dd(self, attrs): |
| self.para_bgn('dd%d' % len(self.lists)) |
| |
| def start_address(self, attrs): |
| self.para_bgn('address') |
| |
| def end_address(self): |
| self.para_end() |
| |
| def start_pre(self, attrs): |
| self.para_bgn('pre') |
| self.nofill = self.nofill + 1 |
| |
| def end_pre(self): |
| self.nofill = self.nofill - 1 |
| self.para_end() |
| |
| start_typewriter = start_pre |
| end_typewriter = end_pre |
| |
| def do_img(self, attrs): |
| src = '' |
| alt = ' (image) ' |
| for attrname, value in attrs: |
| if attrname == 'alt': |
| alt = value |
| if attrname == 'src': |
| src = value |
| self.handle_image(src, alt) |
| |
| # --- Character tags -- physical styles |
| |
| def start_tt(self, attrs): self.push_style(FIXED) |
| def end_tt(self): self.pop_style() |
| |
| def start_b(self, attrs): self.push_style(BOLD) |
| def end_b(self): self.pop_style() |
| |
| def start_i(self, attrs): self.push_style(ITALIC) |
| def end_i(self): self.pop_style() |
| |
| def start_u(self, attrs): self.push_style(ITALIC) # Underline??? |
| def end_u(self): self.pop_style() |
| |
| def start_r(self, attrs): self.push_style(ROMAN) # Not official |
| def end_r(self): self.pop_style() |
| |
| # --- Charaacter tags -- logical styles |
| |
| start_em = start_i |
| end_em = end_i |
| |
| start_strong = start_b |
| end_strong = end_b |
| |
| start_code = start_tt |
| end_code = end_tt |
| |
| start_samp = start_tt |
| end_samp = end_tt |
| |
| start_kbd = start_tt |
| end_kbd = end_tt |
| |
| start_file = start_tt # unofficial |
| end_file = end_tt |
| |
| start_var = start_i |
| end_var = end_i |
| |
| start_dfn = start_i |
| end_dfn = end_i |
| |
| start_cite = start_i |
| end_cite = end_i |
| |
| start_hp1 = start_i |
| end_hp1 = start_i |
| |
| start_hp2 = start_b |
| end_hp2 = end_b |
| |
| # --- Form tags |
| |
| def start_form(self, attrs): |
| self.para_bgn(None) |
| |
| def end_form(self): |
| self.para_end() |
| |
| # --- Unhandled tags |
| |
| def unknown_starttag(self, tag, attrs): |
| pass |
| |
| def unknown_endtag(self, tag): |
| pass |
| |
| |
def test(filename='test.html'):
    """Parse an HTML file with HTMLParser as a simple self-test.

    filename -- path of the document to read; defaults to 'test.html'
                in the current directory.
    """
    f = open(filename, 'r')
    try:
        data = f.read()
    finally:
        # Release the file handle even when reading fails.
        f.close()
    p = HTMLParser()
    p.feed(data)
    p.close()
| |
| |
# Run the self-test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    test()