# New HTML class

# XXX Check against HTML 2.0 spec

# XXX reorder methods according to hierarchy
# - html structure: head, body, title, isindex
# - headers
# - lists, items
# - paragraph styles
# - forms
# - character styles
# - images
# - bookkeeping
# - output generation
Guido van Rossum | 7c750e1 | 1995-02-27 13:16:55 +0000 | [diff] [blame] | 15 | |
| 16 | |
Guido van Rossum | 7c750e1 | 1995-02-27 13:16:55 +0000 | [diff] [blame] | 17 | import sys |
Guido van Rossum | 7ff5d7f | 1995-08-04 04:23:30 +0000 | [diff] [blame] | 18 | import regsub |
Guido van Rossum | 7c750e1 | 1995-02-27 13:16:55 +0000 | [diff] [blame] | 19 | import string |
Guido van Rossum | 7ff5d7f | 1995-08-04 04:23:30 +0000 | [diff] [blame] | 20 | from sgmllib import SGMLParser |
Guido van Rossum | 7c750e1 | 1995-02-27 13:16:55 +0000 | [diff] [blame] | 21 | |
Guido van Rossum | 7c750e1 | 1995-02-27 13:16:55 +0000 | [diff] [blame] | 22 | |
# Font style codes pushed on HTMLParser.styles by the character-style tags.
# The values are consecutive integers starting at zero.
ROMAN, ITALIC, BOLD, FIXED = range(4)
| 27 | |
Guido van Rossum | 7c750e1 | 1995-02-27 13:16:55 +0000 | [diff] [blame] | 28 | |
class HTMLParser(SGMLParser):
    """HTML parser built on SGMLParser that emits the document as text.

    Tag handlers translate HTML markup into a small set of events:
    paragraph changes (para_bgn/para_end -> new_para), character style
    changes (push_style/pop_style -> new_style), anchors
    (anchor_bgn/anchor_end), images (handle_image) and text
    (handle_data -> output_data -> write_data).  Subclasses override
    new_para(), new_style(), write_data() and friends to render the
    result; this base class simply writes the text to sys.stdout.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        self.savedata = None     # text captured since save_bgn(), or None
        self.isindex = 0         # set to 1 when an <isindex> tag is seen
        self.title = ''          # text collected between <title> tags
        self.para = None         # tag name of the current paragraph style
        self.lists = []          # stack of currently open list tags
        self.styles = []         # stack of active character styles
        self.nofill = 0          # > 0 while inside preformatted text
        self.nospace = 1         # drop a leading space from the next data
        self.softspace = 0       # a single space is owed before next data
        self._open_anchors = []  # per open <a>: 1 if anchor_bgn() was called

    # --- Data

    def handle_image(self, src, alt):
        """Render an image; by default only its alternate text is shown."""
        self.handle_data(alt)

    def handle_data(self, data):
        """Process character data, collapsing whitespace unless in nofill.

        Runs of whitespace become one space.  A leading space is dropped
        at the start of a paragraph (nospace) and a trailing space is
        held back (softspace) so that it is only emitted if more text
        follows.
        """
        if self.nofill:
            self.handle_literal(data)
            return
        data = regsub.gsub('[ \t\n\r]+', ' ', data)
        if self.nospace and data[:1] == ' ':
            data = data[1:]
        if not data:
            return
        self.nospace = 0
        if self.softspace and data[:1] != ' ':
            data = ' ' + data
        # The pending space (if any) has now been accounted for; without
        # this reset it would be re-inserted before every later chunk.
        self.softspace = 0
        if data[-1:] == ' ':
            data = data[:-1]
            self.softspace = 1
        self.output_data(data)

    def handle_literal(self, data):
        """Output data verbatim, cancelling any pending space handling."""
        self.nospace = 0
        self.softspace = 0
        self.output_data(data)

    def output_data(self, data):
        """Append data to the save buffer if active, else write it out."""
        if self.savedata is not None:
            self.savedata = self.savedata + data
        else:
            self.write_data(data)

    def write_data(self, data):
        """Final output hook; the default writes to sys.stdout."""
        sys.stdout.write(data)

    def save_bgn(self):
        """Start capturing output text instead of writing it."""
        self.savedata = ''
        self.nospace = 1
        self.softspace = 0

    def save_end(self):
        """Stop capturing and return the text saved since save_bgn()."""
        saved = self.savedata
        self.savedata = None
        self.nospace = 1
        self.softspace = 0
        return saved

    def new_para(self):
        """Hook called after every paragraph change; default does nothing."""
        pass

    def new_style(self):
        """Hook called after every style stack change; default does nothing."""
        pass

    # --- Generic style changes

    def para_bgn(self, tag):
        """Begin a new paragraph.

        Emits a newline if any text was written since the last break.
        When tag is not None it becomes the current paragraph style;
        None keeps the current style.
        """
        if not self.nospace:
            self.handle_literal('\n')
        self.nospace = 1
        self.softspace = 0
        if tag is not None:
            self.para = tag
        self.new_para()

    def para_end(self):
        """End the current paragraph and reset the style to ''."""
        self.para_bgn('')

    def push_list(self, tag):
        """Open a list of kind tag and start a new paragraph."""
        self.lists.append(tag)
        self.para_bgn(None)

    def pop_list(self):
        """Close the innermost list and end the paragraph.

        A stray list end tag with no open list is tolerated.
        """
        if self.lists:
            del self.lists[-1]
        self.para_end()

    def literal_bgn(self, tag, attrs):
        """Begin a literal-text section (listing, xmp, plaintext)."""
        self.para_bgn(tag)

    def literal_end(self, tag):
        """End a literal-text section."""
        self.para_end()

    def push_style(self, tag):
        """Activate a character style and notify new_style()."""
        self.styles.append(tag)
        self.new_style()

    def pop_style(self):
        """Deactivate the most recent character style.

        A stray end tag with no active style is tolerated.
        """
        if self.styles:
            del self.styles[-1]
        self.new_style()

    def anchor_bgn(self, href, name, type):
        """Begin an anchor; pushes style 'a' for links, None otherwise."""
        self.push_style(href and 'a' or None)

    def anchor_end(self):
        """End an anchor by popping the style pushed by anchor_bgn()."""
        self.pop_style()

    # --- Top level tags

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    def do_isindex(self, attrs):
        self.isindex = 1

    def start_title(self, attrs):
        self.save_bgn()

    def end_title(self):
        self.title = self.save_end()

    # --- Old HTML 'literal text' tags

    def start_listing(self, attrs):
        self.setliteral('listing')
        self.literal_bgn('listing', attrs)

    def end_listing(self):
        self.literal_end('listing')

    def start_xmp(self, attrs):
        self.setliteral('xmp')
        self.literal_bgn('xmp', attrs)

    def end_xmp(self):
        self.literal_end('xmp')

    def do_plaintext(self, attrs):
        self.setnomoretags()
        self.literal_bgn('plaintext', attrs)

    # --- Anchors

    def start_a(self, attrs):
        """Handle <a>: extract href, name and type, then call anchor_bgn().

        An anchor with neither href nor name is ignored, but it is still
        recorded so that the matching </a> does not pop an unrelated
        style.
        """
        href = ''
        name = ''
        atype = ''
        for attrname, value in attrs:
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                atype = string.lower(value)
        if href or name:
            self.anchor_bgn(href, name, atype)
            self._open_anchors.append(1)
        else:
            self._open_anchors.append(0)

    def end_a(self):
        """Handle </a>: call anchor_end() only if anchor_bgn() was called."""
        if not self._open_anchors:
            return                      # stray </a>; nothing to undo
        opened = self._open_anchors[-1]
        del self._open_anchors[-1]
        if opened:
            self.anchor_end()

    # --- Paragraph tags

    def do_p(self, attrs):
        self.para_bgn(None)

    def do_br(self, attrs):
        self.handle_literal('\n')
        self.nospace = 1
        self.softspace = 0

    def do_hr(self, attrs):
        # A horizontal rule is rendered as a paragraph of 40 dashes.
        self.para_bgn(None)
        self.handle_literal('-'*40)
        self.para_end()

    def start_h1(self, attrs):
        self.para_bgn('h1')

    def start_h2(self, attrs):
        self.para_bgn('h2')

    def start_h3(self, attrs):
        self.para_bgn('h3')

    def start_h4(self, attrs):
        self.para_bgn('h4')

    def start_h5(self, attrs):
        self.para_bgn('h5')

    def start_h6(self, attrs):
        self.para_bgn('h6')

    def end_h1(self):
        self.para_end()

    end_h2 = end_h1
    end_h3 = end_h2
    end_h4 = end_h3
    end_h5 = end_h4
    end_h6 = end_h5

    def start_ul(self, attrs):
        self.para_bgn(None)
        self.push_list('ul')

    def start_ol(self, attrs):
        self.para_bgn(None)
        self.push_list('ol')

    def end_ul(self):
        self.pop_list()
        self.para_end()

    def do_li(self, attrs):
        # The paragraph style encodes the list nesting depth, e.g. 'li2'.
        self.para_bgn('li%d' % len(self.lists))

    start_dir = start_menu = start_ul
    end_dir = end_menu = end_ol = end_ul

    def start_dl(self, attrs):
        self.para_bgn(None)
        self.push_list('dl')

    def end_dl(self):
        self.pop_list()
        self.para_end()

    def do_dt(self, attrs):
        self.para_bgn('dt%d' % len(self.lists))

    def do_dd(self, attrs):
        self.para_bgn('dd%d' % len(self.lists))

    def start_address(self, attrs):
        self.para_bgn('address')

    def end_address(self):
        self.para_end()

    def start_pre(self, attrs):
        self.para_bgn('pre')
        self.nofill = self.nofill + 1

    def end_pre(self):
        # Never let the counter go negative: a negative value is true
        # and would leave the parser in nofill mode after a stray </pre>.
        if self.nofill > 0:
            self.nofill = self.nofill - 1
        self.para_end()

    start_typewriter = start_pre
    end_typewriter = end_pre

    def do_img(self, attrs):
        """Handle <img>: pass src and alt text on to handle_image()."""
        src = ''
        alt = ' (image) '
        for attrname, value in attrs:
            if attrname == 'alt':
                alt = value
            elif attrname == 'src':
                src = value
        self.handle_image(src, alt)

    # --- Character tags -- physical styles

    def start_tt(self, attrs): self.push_style(FIXED)
    def end_tt(self): self.pop_style()

    def start_b(self, attrs): self.push_style(BOLD)
    def end_b(self): self.pop_style()

    def start_i(self, attrs): self.push_style(ITALIC)
    def end_i(self): self.pop_style()

    def start_u(self, attrs): self.push_style(ITALIC) # Underline???
    def end_u(self): self.pop_style()

    def start_r(self, attrs): self.push_style(ROMAN) # Not official
    def end_r(self): self.pop_style()

    # --- Character tags -- logical styles

    start_em = start_i
    end_em = end_i

    start_strong = start_b
    end_strong = end_b

    start_code = start_tt
    end_code = end_tt

    start_samp = start_tt
    end_samp = end_tt

    start_kbd = start_tt
    end_kbd = end_tt

    start_file = start_tt # unofficial
    end_file = end_tt

    start_var = start_i
    end_var = end_i

    start_dfn = start_i
    end_dfn = end_i

    start_cite = start_i
    end_cite = end_i

    start_hp1 = start_i
    end_hp1 = end_i     # the end tag must pop the style, not push one

    start_hp2 = start_b
    end_hp2 = end_b

    # --- Form tags

    def start_form(self, attrs):
        self.para_bgn(None)

    def end_form(self):
        self.para_end()

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass
| 364 | |
Guido van Rossum | 7c750e1 | 1995-02-27 13:16:55 +0000 | [diff] [blame] | 365 | |
def test(filename='test.html'):
    """Parse an HTML file with HTMLParser, writing the text to stdout.

    filename is the file to read; it defaults to 'test.html' in the
    current directory.  Errors from open() or read() propagate to the
    caller.
    """
    f = open(filename, 'r')
    try:
        data = f.read()
    finally:
        # Release the file even when read() fails.
        f.close()
    p = HTMLParser()
    p.feed(data)
    p.close()
Guido van Rossum | 7c750e1 | 1995-02-27 13:16:55 +0000 | [diff] [blame] | 374 | |
| 375 | |
# Run the self-test only when this file is executed as a script.
if __name__ == '__main__': test()