blob: df8383ecb17312e14a4344d44e9800aa88db56d9 [file] [log] [blame]
"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
9
10
import re
import string

# Regular expressions used for parsing

# What goahead() searches for: '&' or '<' in normal mode; in CDATA mode
# only a '<' that is followed by '/' or by the end of the buffer.
interesting_normal = re.compile('[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')
# Start of something that could still become an entity or char reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume the single character that follows
# the reference (normally ';'); callers back up by one when it is not ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piopen = re.compile(r'<\?')
piclose = re.compile('>')
endtagopen = re.compile('</')
declopen = re.compile('<!')
special = re.compile('<![^<>]*>')
commentopen = re.compile('<!--')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# group 1: attribute name; group 2: "= value" part (optional);
# group 3: the value itself, still carrying its quotes if it had any.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')

# Matches a start tag up to, but not including, its closing '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endstarttag = re.compile(r"\s*/?>")
endendtag = re.compile('>')
# Raw string: '\s' is a regex escape, not a string-literal escape.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

# Tokens inside '<!...>' declarations; both swallow trailing whitespace.
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
56
57
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg    -- the error description passed to the constructor
        lineno -- first item of the position pair, or None
        offset -- second item of the position pair, or None
    """

    def __init__(self, msg, position=(None, None)):
        """Store the message and the (lineno, offset) pair in 'position'."""
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by whatever position is known."""
        text = self.msg
        if self.lineno is not None:
            text = text + (", at line %d" % self.lineno)
        if self.offset is not None:
            # The stored offset is reported one higher, as a column number.
            text = text + (", column %d" % (self.offset + 1))
        return text
74
75
class HTMLParser:
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.

    The internal parse_*() methods share one convention: they receive
    an index into self.rawdata and return the index just past the
    construct they parsed, or a negative value when the buffer ends
    before the construct is complete.
    """

    # Start tags with these names switch the parser into CDATA mode.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
        self.lineno = 1
        self.offset = 0
        self.interesting = interesting_normal

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include newlines).
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Account for rawdata[i:j] in lineno/offset and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Make goahead() stop only at '</' or a '<' ending the buffer."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Make goahead() stop at every '&' and '<' again."""
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Parse as much of self.rawdata as possible.

        Whatever could not be consumed is kept in self.rawdata.  When
        'end' is true an unfinished construct raises HTMLParseError and
        any other leftover text is reported through handle_data().
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif endtagopen.match(rawdata, i):  # </
                    k = self.parse_endtag(i)
                    if k >= 0:
                        self.clear_cdata_mode()
                elif commentopen.match(rawdata, i):  # <!--
                    k = self.parse_comment(i)
                elif piopen.match(rawdata, i):  # <?
                    k = self.parse_pi(i)
                elif declopen.match(rawdata, i):  # <!
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    # a '<' that opens nothing we know is plain data
                    self.handle_data("<")
                    k = i + 1
                else:
                    # lone '<' at the buffer end; wait for more input
                    break
                if k < 0:
                    if end:
                        raise HTMLParseError("EOF in middle of construct",
                                             self.getpos())
                    break
                i = self.updatepos(i, k)
            elif rawdata[i:i+2] == "&#":
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # the pattern swallowed one trailing character; give
                    # it back unless it was the terminating ';'
                    if rawdata[k-1] != ';':
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    break
            elif rawdata[i] == '&':
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if rawdata[k-1] != ';':
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    rest = rawdata[i:]
                    if end and match.group() == rest:
                        raise HTMLParseError(
                            "EOF in middle of entity or char ref",
                            self.getpos())
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse comment, return end or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the comment starting at i.

        The comment text is passed to handle_comment() unless 'report'
        is false.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        if report:
            j = match.start()
            self.handle_comment(rawdata[i+4: j])
        j = match.end()
        return j

    # Internal -- parse declaration.
    def parse_declaration(self, i):
        """Parse the '<!...>' declaration starting at i.

        Only DOCTYPE declarations are accepted; their text (without
        the '<!' and '>') is passed to handle_decl().  Raises
        HTMLParseError for any other declaration name or for an
        unexpected character.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
        decltype, j = self.scan_name(j, i)
        if j < 0:
            return j
        if decltype.lower() != "doctype":
            raise HTMLParseError("unknown declaration: '%s'" % decltype,
                                 self.getpos())
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                self.handle_decl(data)
                return j + 1
            if c in "\"'":
                m = declstringlit.match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self.scan_name(j, i)
            elif c == "[" and decltype == "doctype":
                j = self.parse_doctype_subset(j + 1, i)
            else:
                raise HTMLParseError(
                    "unexpected char in declaration: %s" % repr(rawdata[j]),
                    self.getpos())
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def parse_doctype_subset(self, i, declstartpos):
        """Scan the internal subset that starts at i (just after '[').

        'declstartpos' is the index of the enclosing declaration and is
        only used to compute the position reported in errors.  Returns
        the index of the '>' that follows the closing ']', or -1 if the
        buffer ends first.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    raise HTMLParseError("unexpected char in internal subset",
                                         self.getpos())
                if (j + 4) > n:
                    # end of buffer; incomplete (this also covers a
                    # buffer that ends right after the '<!')
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self.scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    raise HTMLParseError(
                        "unknown declaration %s in internal subset"
                        % repr(name),
                        self.getpos())
                # handle the individual names
                meth = getattr(self, "parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self.scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    raise HTMLParseError(
                        "unexpected char after internal subset",
                        self.getpos())
                else:
                    return -1
            elif c in string.whitespace:
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                raise HTMLParseError(
                    "unexpected char %s in internal subset" % repr(c),
                    self.getpos())
        # end of buffer reached
        return -1

    def parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose name starts at i."""
        rawdata = self.rawdata
        name, j = self.scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        close = rawdata.find(">", j)
        if close >= 0:
            return close + 1
        return -1

    def parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose element name starts at i."""
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self.scan_name(i, declstartpos)
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self.scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                # Bound the scan by the buffer length: an empty slice is
                # "in" every string, so testing rawdata[j:j+1] against
                # string.whitespace would never stop at the buffer end.
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self.scan_name(j, declstartpos)
                if j < 0:
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = declstringlit.match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                name, j = self.scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    def parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at i."""
        name, j = self.scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = declstringlit.match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self.scan_name(j, declstartpos)
                if j < 0:
                    return j

    def parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose body starts at i."""
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            # parameter entity: skip the whitespace between '%' and name
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c in string.whitespace:
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self.scan_name(j, declstartpos)
        if j < 0:
            return j
        while 1:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = declstringlit.match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self.scan_name(j, declstartpos)
                if j < 0:
                    return j

    def scan_name(self, i, declstartpos):
        """Scan the declaration name token starting at i.

        Returns (lowercased name, index after the name and any
        whitespace following it), or (None, -1) when the token runs up
        to the end of the buffer and so might be incomplete.  Raises
        HTMLParseError if no name starts at i.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = declname.match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            raise HTMLParseError("expected name token", self.getpos())

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the '<?...>' at i and pass its text to handle_pi()."""
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at i and dispatch it to a handler.

        Tag and attribute names are lowercased.  Quoted attribute
        values lose their quotes and are unescaped; an attribute
        without '=' gets the value None.  Raises HTMLParseError when
        unparsable text precedes the closing '>' or '/>'.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # report the position of the end of the tag text
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            raise HTMLParseError("junk characters in start tag: %s"
                                 % repr(rawdata[k:endpos][:20]),
                                 (lineno, offset))
        if end[-2:] == '/>':
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag at i, or -1.

        Raises HTMLParseError when the text after the tag name and
        attributes cannot be the end of a start tag.
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                s = rawdata[j:j+2]
                if s == "/>":
                    return j + 2
                if s == "/":
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                raise HTMLParseError("malformed empty start tag",
                                     self.getpos())
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            raise HTMLParseError("malformed start tag", self.getpos())
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag at i; pass its lowercased name to handle_endtag().

        Raises HTMLParseError if the text up to the next '>' is not a
        well-formed end tag.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            raise HTMLParseError("bad end tag: %s" % repr(rawdata[i:j]),
                                 self.getpos())
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Replace the five predefined entity references in s."""
        if '&' not in s:
            return s
        s = s.replace("&lt;", "<")
        s = s.replace("&gt;", ">")
        s = s.replace("&apos;", "'")
        s = s.replace("&quot;", '"')
        s = s.replace("&amp;", "&")  # Must be last
        return s