blob: 584046d81ec648c4879968986bf9cfc2b99def73 [file] [log] [blame]
"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
9
10
import re
import string
13
# Regular expressions used for parsing.
#
# Every pattern is written as a raw string so that regex escapes such as
# \s and \Z reach the re module untouched instead of being interpreted
# (or warned about) as string-literal escapes.

# What goahead() searches for to find the next piece of markup.
interesting_normal = re.compile(r'[&<]')        # normal mode: '&' or '<'
interesting_cdata = re.compile(r'<(/|\Z)')      # CDATA mode: '</' or '<' at end
incomplete = re.compile(r'&[a-zA-Z#]')          # possible start of a reference

# Both reference patterns also consume the single character that follows
# the reference; callers back up by one when it is not ';'.
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile(r'<[a-zA-Z]')
piopen = re.compile(r'<\?')
piclose = re.compile(r'>')
endtagopen = re.compile(r'</')
declopen = re.compile(r'<!')
special = re.compile(r'<![^<>]*>')
commentopen = re.compile(r'<!--')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile(r'[a-zA-Z][-.a-zA-Z0-9:_]*')

# One attribute: group 1 is the name, group 2 the optional "= value" part,
# group 3 the value itself (still carrying its quotes, if any).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')

# Matches a start tag up to, but not including, its closing '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endstarttag = re.compile(r"\s*/?>")
endendtag = re.compile(r'>')
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

# Tokens inside <!...> declarations; both swallow trailing whitespace.
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
56
57
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg    -- the error message (must be non-empty)
        lineno -- line number taken from position[0], or None
        offset -- zero-based column taken from position[1], or None
    """

    def __init__(self, msg, position=(None, None)):
        """Store the message and the (lineno, offset) pair it refers to."""
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by whatever position is known.

        The column is reported one-based, i.e. as offset + 1.
        """
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            result = result + ", column %d" % (self.offset + 1)
        return result
74
75
class HTMLParser:
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.  A <!DOCTYPE ...> declaration is passed
    to self.handle_decl(); any other <!...> declaration is passed to
    self.unknown_decl().
    """

    # Elements after whose start tag the parser switches to CDATA mode.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
        self.lineno = 1
        self.offset = 0
        self.interesting = interesting_normal

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include newlines).
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance the position over rawdata[i:j] and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Make goahead() stop only at '</' (or a '<' ending the buffer)."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Make goahead() stop at every '<' and '&' again."""
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Parse as much of self.rawdata as possible.

        Whatever could not be processed yet is kept in self.rawdata.
        Raises HTMLParseError if 'end' is true and the buffer stops in
        the middle of a construct or of an entity/character reference.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif endtagopen.match(rawdata, i):  # </
                    k = self.parse_endtag(i)
                    if k >= 0:
                        self.clear_cdata_mode()
                elif commentopen.match(rawdata, i):  # <!--
                    k = self.parse_comment(i)
                elif piopen.match(rawdata, i):  # <?
                    k = self.parse_pi(i)
                elif declopen.match(rawdata, i):  # <!
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    # a '<' that does not start any markup is plain data
                    self.handle_data("<")
                    k = i + 1
                else:
                    # lone '<' at the end of the buffer; wait for more
                    break
                if k < 0:
                    if end:
                        raise HTMLParseError("EOF in middle of construct",
                                             self.getpos())
                    break
                i = self.updatepos(i, k)
            elif rawdata[i:i+2] == "&#":
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # the pattern consumed one terminating character;
                    # give it back unless it was the ';'
                    if rawdata[k-1] != ';':
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    break
            elif rawdata[i] == '&':
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if rawdata[k-1] != ';':
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    rest = rawdata[i:]
                    if end and match.group() == rest:
                        raise HTMLParseError(
                            "EOF in middle of entity or char ref",
                            self.getpos())
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse comment, return end or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the comment starting at rawdata[i].

        Calls handle_comment() with the comment text unless 'report'
        is false.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
        match = commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            self.handle_comment(rawdata[i+4:match.start()])
        return match.end()

    # Internal -- parse declaration.
    def parse_declaration(self, i):
        """Parse the <!...> declaration starting at rawdata[i].

        Returns the index just past the closing '>', or -1 if the
        declaration is not complete yet.  A declaration whose first
        name is "doctype" goes to handle_decl(); any other goes to
        unknown_decl().  Raises HTMLParseError on an unexpected
        character.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
        decltype = None
        extrachars = ""
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = declstringlit.match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                m = declname.match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
                if decltype is None:
                    # the first name decides what kind of declaration it is
                    decltype = m.group(0).rstrip().lower()
                    if decltype != "doctype":
                        extrachars = "="
            elif c == "[" and decltype == "doctype":
                j = self.parse_doctype_subset(j + 1, i)
                if j < 0:
                    return j
            elif c in extrachars:
                j = j + 1
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j == n:
                    # end of buffer while in declaration
                    return -1
            else:
                raise HTMLParseError(
                    "unexpected char in declaration: %s" % repr(rawdata[j]),
                    self.getpos())
            # once any token has been consumed, later names no longer
            # set the declaration type
            decltype = decltype or ''
        return -1  # incomplete

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index of the '>' that follows the trailing ']' and any
    # whitespace after it, or -1 if the subset is not complete yet.
    def parse_doctype_subset(self, i, declstartpos):
        """Scan the internal subset that starts at rawdata[i].

        'declstartpos' is the index of the enclosing declaration; it is
        only used to compute the position reported in errors.  Raises
        HTMLParseError on malformed input.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    raise HTMLParseError("unexpected char in internal subset",
                                         self.getpos())
                if (j + 4) > n:
                    # end of buffer; incomplete
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self.scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    raise HTMLParseError(
                        "unknown declaration %s in internal subset"
                        % repr(name),
                        self.getpos())
                # handle the individual names
                meth = getattr(self, "parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                m = declname.match(rawdata, j + 1)
                if not m:
                    # '%' not followed by a name
                    self.updatepos(declstartpos, j + 1)
                    raise HTMLParseError("expected name token",
                                         self.getpos())
                s = m.group()
                if s == rawdata[j+1:]:
                    # name runs to the end of the buffer; incomplete
                    return -1
                j = j + 1 + len(s.rstrip())
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    raise HTMLParseError(
                        "unexpected char after internal subset",
                        self.getpos())
                else:
                    return -1
            elif c in string.whitespace:
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                raise HTMLParseError("unexpected char in internal subset",
                                     self.getpos())
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT ...>; 'i' is just past the keyword.
    # Returns the index after the closing '>' or -1 if incomplete.
    def parse_doctype_element(self, i, declstartpos):
        rawdata = self.rawdata
        name, j = self.scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST ...>; 'i' is just past the keyword.
    # Returns the index after the closing '>' or -1 if incomplete.
    def parse_doctype_attlist(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self.scan_name(i, declstartpos)
        if j < 0:
            return -1
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self.scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                # The bound on j is required: at the end of the buffer
                # rawdata[j:j+1] is "" and '"" in string.whitespace' is
                # true, which would otherwise loop forever.
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j == n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self.scan_name(j, declstartpos)
                if j < 0:
                    return -1
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = declstringlit.match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                name, j = self.scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION ...>; 'i' is just past the keyword.
    # Returns the index after the closing '>' or -1 if incomplete.
    def parse_doctype_notation(self, i, declstartpos):
        name, j = self.scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = declstringlit.match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self.scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY ...>; 'i' is just past the keyword.
    # Returns the index after the closing '>' or -1 if incomplete.
    def parse_doctype_entity(self, i, declstartpos):
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            # parameter entity: skip the '%' and the whitespace after it
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c in string.whitespace:
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self.scan_name(j, declstartpos)
        if j < 0:
            return j
        while 1:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = declstringlit.match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self.scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token and the whitespace after it.  Returns
    # (lowercased name, index after the whitespace), or (None, -1) when the
    # token reaches the end of the buffer and so may be incomplete.
    def scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = declname.match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            raise HTMLParseError("expected name token", self.getpos())

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i + 2)  # >
        if not match:
            return -1
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at rawdata[i] and call its handler.

        The tag name and attribute names are lowercased.  An attribute
        without '=' gets the value None; a quoted value has its quotes
        removed and is passed through unescape().  Raises
        HTMLParseError if unparsable text remains before the closing
        '>' or '/>'.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i + 1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # report the position of the end of the start tag
            lineno, offset = self.getpos()
            starttag_text = self.__starttag_text
            if "\n" in starttag_text:
                lineno = lineno + starttag_text.count("\n")
                offset = len(starttag_text) - starttag_text.rfind("\n")
            else:
                offset = offset + len(starttag_text)
            raise HTMLParseError("junk characters in start tag: %s"
                                 % repr(rawdata[k:endpos][:20]),
                                 (lineno, offset))
        if end[-2:] == '/>':
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                s = rawdata[j:j+2]
                if s == "/>":
                    return j + 2
                if s == "/":
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                raise HTMLParseError("malformed empty start tag",
                                     self.getpos())
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            raise HTMLParseError("malformed start tag", self.getpos())
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i + 1)  # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            raise HTMLParseError("bad end tag: %s" % repr(rawdata[i:j]),
                                 self.getpos())
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    # Overridable -- handle a <!...> declaration that is not a DOCTYPE.
    # parse_declaration() calls this; the default rejects the input as a
    # parse error.
    def unknown_decl(self, data):
        raise HTMLParseError("unknown declaration: %s" % repr(data),
                             self.getpos())

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Return s with &lt; &gt; &apos; &quot; and &amp; replaced."""
        if '&' not in s:
            return s
        s = s.replace("&lt;", "<")
        s = s.replace("&gt;", ">")
        s = s.replace("&apos;", "'")
        s = s.replace("&quot;", '"')
        s = s.replace("&amp;", "&")  # Must be last
        return s
688 return s