blob: 5c4a7ef70872e370cd0df83a8ea26378e0f07232 [file] [log] [blame]
Fred Drake1d4601d2001-08-03 19:50:59 +00001"""A parser for HTML and XHTML."""
Guido van Rossum8846d712001-05-18 14:50:52 +00002
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
Fred Drakecb5c80f2007-12-07 11:10:11 +000011import _markupbase
Guido van Rossum8846d712001-05-18 14:50:52 +000012import re
Guido van Rossum8846d712001-05-18 14:50:52 +000013
14# Regular expressions used for parsing
15
# Characters that interrupt plain character data: '&' starts a reference,
# '<' starts markup.
interesting_normal = re.compile('[&<]')
# An '&' that could be the beginning of an entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Named entity reference; the final character class consumes the terminator
# (';' or whatever non-alphanumeric character follows the name).
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Decimal or hexadecimal character reference plus its terminator.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# Note, the strict one of this pair isn't really strict, but we can't
# make it correctly strict without breaking backward compatibility.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s*                             # optional whitespace before attribute name
    (?:(?<=['"\s])[^\s/>][^\s/=>]*   # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?\s*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
# (raw string: '\s' is a regex escape, not a valid string-literal escape)
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
67
Guido van Rossum8846d712001-05-18 14:50:52 +000068
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    The instance exposes ``msg`` plus ``lineno`` and ``offset``, which are
    the two items of the *position* pair (both None when no position was
    supplied).
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        line, column = position[0], position[1]
        self.lineno = line
        self.offset = column

    def __str__(self):
        # Start from the bare message and append whichever location parts
        # are known; the stored offset is zero-based, so one is added.
        pieces = [self.msg]
        if self.lineno is not None:
            pieces.append(", at line %d" % self.lineno)
        if self.offset is not None:
            pieces.append(", column %d" % (self.offset + 1))
        return "".join(pieces)
85
86
Fred Drakecb5c80f2007-12-07 11:10:11 +000087class HTMLParser(_markupbase.ParserBase):
Fred Drake1d4601d2001-08-03 19:50:59 +000088 """Find tags and other markup and call handler functions.
89
90 Usage:
91 p = HTMLParser()
92 p.feed(data)
93 ...
94 p.close()
95
96 Start tags are handled by calling self.handle_starttag() or
97 self.handle_startendtag(); end tags by self.handle_endtag(). The
98 data between tags is passed from the parser to the derived class
99 by calling self.handle_data() with the data as argument (the data
100 may be split up in arbitrary chunks). Entity references are
101 passed by calling self.handle_entityref() with the entity
102 reference as the argument. Numeric character references are
103 passed to self.handle_charref() with the string containing the
104 reference as the argument.
105 """
Guido van Rossum8846d712001-05-18 14:50:52 +0000106
107 CDATA_CONTENT_ELEMENTS = ("script", "style")
108
R. David Murrayb579dba2010-12-03 04:06:39 +0000109 def __init__(self, strict=True):
110 """Initialize and reset this instance.
Guido van Rossum8846d712001-05-18 14:50:52 +0000111
R. David Murrayb579dba2010-12-03 04:06:39 +0000112 If strict is set to True (the default), errors are raised when invalid
113 HTML is encountered. If set to False, an attempt is instead made to
114 continue parsing, making "best guesses" about the intended meaning, in
115 a fashion similar to what browsers typically do.
116 """
117 self.strict = strict
Guido van Rossum8846d712001-05-18 14:50:52 +0000118 self.reset()
119
Guido van Rossum8846d712001-05-18 14:50:52 +0000120 def reset(self):
Fred Drake1d4601d2001-08-03 19:50:59 +0000121 """Reset this instance. Loses all unprocessed data."""
Guido van Rossum8846d712001-05-18 14:50:52 +0000122 self.rawdata = ''
Guido van Rossum8846d712001-05-18 14:50:52 +0000123 self.lasttag = '???'
Guido van Rossum8846d712001-05-18 14:50:52 +0000124 self.interesting = interesting_normal
Ezio Melotti7de56f62011-11-01 14:12:22 +0200125 self.cdata_elem = None
Fred Drakecb5c80f2007-12-07 11:10:11 +0000126 _markupbase.ParserBase.reset(self)
Guido van Rossum8846d712001-05-18 14:50:52 +0000127
Guido van Rossum8846d712001-05-18 14:50:52 +0000128 def feed(self, data):
Éric Araujo39f180b2011-05-04 15:55:47 +0200129 r"""Feed data to the parser.
Fred Drake1d4601d2001-08-03 19:50:59 +0000130
131 Call this as often as you want, with as little or as much text
132 as you want (may include '\n').
133 """
Guido van Rossum8846d712001-05-18 14:50:52 +0000134 self.rawdata = self.rawdata + data
135 self.goahead(0)
136
Guido van Rossum8846d712001-05-18 14:50:52 +0000137 def close(self):
Fred Drake1d4601d2001-08-03 19:50:59 +0000138 """Handle any buffered data."""
Guido van Rossum8846d712001-05-18 14:50:52 +0000139 self.goahead(1)
140
Fred Drakebfc8fea2001-09-24 20:10:28 +0000141 def error(self, message):
142 raise HTMLParseError(message, self.getpos())
Guido van Rossum8846d712001-05-18 14:50:52 +0000143
144 __starttag_text = None
145
Guido van Rossum8846d712001-05-18 14:50:52 +0000146 def get_starttag_text(self):
Fred Drake1d4601d2001-08-03 19:50:59 +0000147 """Return full source of start tag: '<...>'."""
Guido van Rossum8846d712001-05-18 14:50:52 +0000148 return self.__starttag_text
149
Ezio Melotti7de56f62011-11-01 14:12:22 +0200150 def set_cdata_mode(self, elem):
Ezio Melotti7de56f62011-11-01 14:12:22 +0200151 self.cdata_elem = elem.lower()
Ezio Melotti15cb4892011-11-18 18:01:49 +0200152 self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
Guido van Rossum8846d712001-05-18 14:50:52 +0000153
154 def clear_cdata_mode(self):
155 self.interesting = interesting_normal
Ezio Melotti7de56f62011-11-01 14:12:22 +0200156 self.cdata_elem = None
Guido van Rossum8846d712001-05-18 14:50:52 +0000157
158 # Internal -- handle data as far as reasonable. May leave state
159 # and data to be processed by a subsequent call. If 'end' is
160 # true, force handling all data as if followed by EOF marker.
161 def goahead(self, end):
162 rawdata = self.rawdata
163 i = 0
164 n = len(rawdata)
165 while i < n:
166 match = self.interesting.search(rawdata, i) # < or &
167 if match:
168 j = match.start()
169 else:
Ezio Melotti15cb4892011-11-18 18:01:49 +0200170 if self.cdata_elem:
171 break
Guido van Rossum8846d712001-05-18 14:50:52 +0000172 j = n
173 if i < j: self.handle_data(rawdata[i:j])
174 i = self.updatepos(i, j)
175 if i == n: break
Fred Drake248b0432001-12-03 17:09:50 +0000176 startswith = rawdata.startswith
177 if startswith('<', i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000178 if starttagopen.match(rawdata, i): # < + letter
179 k = self.parse_starttag(i)
Fred Drake248b0432001-12-03 17:09:50 +0000180 elif startswith("</", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000181 k = self.parse_endtag(i)
Fred Drake248b0432001-12-03 17:09:50 +0000182 elif startswith("<!--", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000183 k = self.parse_comment(i)
Fred Drake248b0432001-12-03 17:09:50 +0000184 elif startswith("<?", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000185 k = self.parse_pi(i)
Fred Drake248b0432001-12-03 17:09:50 +0000186 elif startswith("<!", i):
Ezio Melottifa3702d2012-02-10 10:45:44 +0200187 # this might fail with things like <! not a comment > or
188 # <! -- space before '--' -->. When strict is True an
189 # error is raised, when it's False they will be considered
190 # as bogus comments and parsed (see parse_bogus_comment).
191 if self.strict:
192 k = self.parse_declaration(i)
193 else:
194 try:
195 k = self.parse_declaration(i)
196 except HTMLParseError:
197 k = self.parse_bogus_comment(i)
Fred Drake68eac2b2001-09-04 15:10:16 +0000198 elif (i + 1) < n:
Fred Drake029acfb2001-08-20 21:24:19 +0000199 self.handle_data("<")
200 k = i + 1
Fred Drake68eac2b2001-09-04 15:10:16 +0000201 else:
202 break
Guido van Rossum8846d712001-05-18 14:50:52 +0000203 if k < 0:
R. David Murrayb579dba2010-12-03 04:06:39 +0000204 if not end:
205 break
206 if self.strict:
Fred Drakebfc8fea2001-09-24 20:10:28 +0000207 self.error("EOF in middle of construct")
R. David Murrayb579dba2010-12-03 04:06:39 +0000208 k = rawdata.find('>', i + 1)
209 if k < 0:
210 k = rawdata.find('<', i + 1)
211 if k < 0:
212 k = i + 1
213 else:
214 k += 1
215 self.handle_data(rawdata[i:k])
Guido van Rossum8846d712001-05-18 14:50:52 +0000216 i = self.updatepos(i, k)
Fred Drake248b0432001-12-03 17:09:50 +0000217 elif startswith("&#", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000218 match = charref.match(rawdata, i)
219 if match:
Fred Drake1d4601d2001-08-03 19:50:59 +0000220 name = match.group()[2:-1]
Guido van Rossum8846d712001-05-18 14:50:52 +0000221 self.handle_charref(name)
222 k = match.end()
Fred Drake248b0432001-12-03 17:09:50 +0000223 if not startswith(';', k-1):
Fred Drake029acfb2001-08-20 21:24:19 +0000224 k = k - 1
Guido van Rossum8846d712001-05-18 14:50:52 +0000225 i = self.updatepos(i, k)
226 continue
Fred Drake68eac2b2001-09-04 15:10:16 +0000227 else:
Victor Stinnere021f4b2010-05-24 21:46:25 +0000228 if ";" in rawdata[i:]: #bail by consuming &#
229 self.handle_data(rawdata[0:2])
230 i = self.updatepos(i, 2)
Fred Drake68eac2b2001-09-04 15:10:16 +0000231 break
Fred Drake248b0432001-12-03 17:09:50 +0000232 elif startswith('&', i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000233 match = entityref.match(rawdata, i)
234 if match:
235 name = match.group(1)
236 self.handle_entityref(name)
237 k = match.end()
Fred Drake248b0432001-12-03 17:09:50 +0000238 if not startswith(';', k-1):
Fred Drake029acfb2001-08-20 21:24:19 +0000239 k = k - 1
Guido van Rossum8846d712001-05-18 14:50:52 +0000240 i = self.updatepos(i, k)
241 continue
Fred Drake029acfb2001-08-20 21:24:19 +0000242 match = incomplete.match(rawdata, i)
243 if match:
Fred Drake68eac2b2001-09-04 15:10:16 +0000244 # match.group() will contain at least 2 chars
Fred Drake248b0432001-12-03 17:09:50 +0000245 if end and match.group() == rawdata[i:]:
R. David Murrayb579dba2010-12-03 04:06:39 +0000246 if self.strict:
247 self.error("EOF in middle of entity or char ref")
248 else:
249 if k <= i:
250 k = n
251 i = self.updatepos(i, i + 1)
Fred Drake68eac2b2001-09-04 15:10:16 +0000252 # incomplete
253 break
254 elif (i + 1) < n:
255 # not the end of the buffer, and can't be confused
256 # with some other construct
257 self.handle_data("&")
258 i = self.updatepos(i, i + 1)
259 else:
260 break
Guido van Rossum8846d712001-05-18 14:50:52 +0000261 else:
262 assert 0, "interesting.search() lied"
263 # end while
Ezio Melotti15cb4892011-11-18 18:01:49 +0200264 if end and i < n and not self.cdata_elem:
Guido van Rossum8846d712001-05-18 14:50:52 +0000265 self.handle_data(rawdata[i:n])
266 i = self.updatepos(i, n)
267 self.rawdata = rawdata[i:]
268
    # Internal -- parse bogus comment, return end or -1 if not terminated
270 # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
271 def parse_bogus_comment(self, i, report=1):
272 rawdata = self.rawdata
273 if rawdata[i:i+2] != '<!':
274 self.error('unexpected call to parse_comment()')
275 pos = rawdata.find('>', i+2)
276 if pos == -1:
277 return -1
278 if report:
279 self.handle_comment(rawdata[i+2:pos])
280 return pos + 1
281
Guido van Rossum8846d712001-05-18 14:50:52 +0000282 # Internal -- parse processing instr, return end or -1 if not terminated
283 def parse_pi(self, i):
284 rawdata = self.rawdata
285 assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
286 match = piclose.search(rawdata, i+2) # >
287 if not match:
288 return -1
289 j = match.start()
290 self.handle_pi(rawdata[i+2: j])
291 j = match.end()
292 return j
293
294 # Internal -- handle starttag, return end or -1 if not terminated
295 def parse_starttag(self, i):
296 self.__starttag_text = None
297 endpos = self.check_for_whole_start_tag(i)
298 if endpos < 0:
299 return endpos
300 rawdata = self.rawdata
301 self.__starttag_text = rawdata[i:endpos]
302
303 # Now parse the data between i+1 and j into a tag and attrs
304 attrs = []
305 match = tagfind.match(rawdata, i+1)
306 assert match, 'unexpected call to parse_starttag()'
307 k = match.end()
Fred Drake248b0432001-12-03 17:09:50 +0000308 self.lasttag = tag = rawdata[i+1:k].lower()
Guido van Rossum8846d712001-05-18 14:50:52 +0000309 while k < endpos:
R. David Murrayb579dba2010-12-03 04:06:39 +0000310 if self.strict:
311 m = attrfind.match(rawdata, k)
312 else:
Ezio Melottif50ffa92011-10-28 13:21:09 +0300313 m = attrfind_tolerant.match(rawdata, k)
Guido van Rossum8846d712001-05-18 14:50:52 +0000314 if not m:
315 break
316 attrname, rest, attrvalue = m.group(1, 2, 3)
317 if not rest:
318 attrvalue = None
319 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
320 attrvalue[:1] == '"' == attrvalue[-1:]:
321 attrvalue = attrvalue[1:-1]
Ezio Melottic2fe5772011-11-14 18:53:33 +0200322 if attrvalue:
Guido van Rossum8846d712001-05-18 14:50:52 +0000323 attrvalue = self.unescape(attrvalue)
Fred Drake248b0432001-12-03 17:09:50 +0000324 attrs.append((attrname.lower(), attrvalue))
Guido van Rossum8846d712001-05-18 14:50:52 +0000325 k = m.end()
326
Fred Drake248b0432001-12-03 17:09:50 +0000327 end = rawdata[k:endpos].strip()
Guido van Rossum8846d712001-05-18 14:50:52 +0000328 if end not in (">", "/>"):
329 lineno, offset = self.getpos()
330 if "\n" in self.__starttag_text:
Fred Drake248b0432001-12-03 17:09:50 +0000331 lineno = lineno + self.__starttag_text.count("\n")
Guido van Rossum8846d712001-05-18 14:50:52 +0000332 offset = len(self.__starttag_text) \
Fred Drake248b0432001-12-03 17:09:50 +0000333 - self.__starttag_text.rfind("\n")
Guido van Rossum8846d712001-05-18 14:50:52 +0000334 else:
335 offset = offset + len(self.__starttag_text)
R. David Murrayb579dba2010-12-03 04:06:39 +0000336 if self.strict:
337 self.error("junk characters in start tag: %r"
338 % (rawdata[k:endpos][:20],))
339 self.handle_data(rawdata[i:endpos])
340 return endpos
Fred Drake248b0432001-12-03 17:09:50 +0000341 if end.endswith('/>'):
Guido van Rossum8846d712001-05-18 14:50:52 +0000342 # XHTML-style empty tag: <span attr="value" />
343 self.handle_startendtag(tag, attrs)
344 else:
345 self.handle_starttag(tag, attrs)
346 if tag in self.CDATA_CONTENT_ELEMENTS:
Ezio Melotti7de56f62011-11-01 14:12:22 +0200347 self.set_cdata_mode(tag)
Guido van Rossum8846d712001-05-18 14:50:52 +0000348 return endpos
349
350 # Internal -- check to see if we have a complete starttag; return end
351 # or -1 if incomplete.
352 def check_for_whole_start_tag(self, i):
353 rawdata = self.rawdata
R. David Murrayb579dba2010-12-03 04:06:39 +0000354 if self.strict:
355 m = locatestarttagend.match(rawdata, i)
356 else:
357 m = locatestarttagend_tolerant.match(rawdata, i)
Guido van Rossum8846d712001-05-18 14:50:52 +0000358 if m:
359 j = m.end()
360 next = rawdata[j:j+1]
361 if next == ">":
362 return j + 1
363 if next == "/":
Fred Drake248b0432001-12-03 17:09:50 +0000364 if rawdata.startswith("/>", j):
Guido van Rossum8846d712001-05-18 14:50:52 +0000365 return j + 2
Fred Drake248b0432001-12-03 17:09:50 +0000366 if rawdata.startswith("/", j):
Guido van Rossum8846d712001-05-18 14:50:52 +0000367 # buffer boundary
368 return -1
369 # else bogus input
R. David Murrayb579dba2010-12-03 04:06:39 +0000370 if self.strict:
371 self.updatepos(i, j + 1)
372 self.error("malformed empty start tag")
373 if j > i:
374 return j
375 else:
376 return i + 1
Guido van Rossum8846d712001-05-18 14:50:52 +0000377 if next == "":
378 # end of input
379 return -1
380 if next in ("abcdefghijklmnopqrstuvwxyz=/"
381 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
382 # end of input in or before attribute value, or we have the
383 # '/' from a '/>' ending
384 return -1
R. David Murrayb579dba2010-12-03 04:06:39 +0000385 if self.strict:
386 self.updatepos(i, j)
387 self.error("malformed start tag")
388 if j > i:
389 return j
390 else:
391 return i + 1
Fred Drakebfc8fea2001-09-24 20:10:28 +0000392 raise AssertionError("we should not get here!")
Guido van Rossum8846d712001-05-18 14:50:52 +0000393
394 # Internal -- parse endtag, return end or -1 if incomplete
395 def parse_endtag(self, i):
396 rawdata = self.rawdata
397 assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
398 match = endendtag.search(rawdata, i+1) # >
399 if not match:
400 return -1
401 j = match.end()
402 match = endtagfind.match(rawdata, i) # </ + tag + >
403 if not match:
Ezio Melotti7de56f62011-11-01 14:12:22 +0200404 if self.cdata_elem is not None:
405 self.handle_data(rawdata[i:j])
406 return j
R. David Murrayb579dba2010-12-03 04:06:39 +0000407 if self.strict:
408 self.error("bad end tag: %r" % (rawdata[i:j],))
409 k = rawdata.find('<', i + 1, j)
410 if k > i:
411 j = k
412 if j <= i:
413 j = i + 1
414 self.handle_data(rawdata[i:j])
415 return j
Ezio Melotti7de56f62011-11-01 14:12:22 +0200416
417 elem = match.group(1).lower() # script or style
418 if self.cdata_elem is not None:
419 if elem != self.cdata_elem:
420 self.handle_data(rawdata[i:j])
421 return j
422
423 self.handle_endtag(elem.lower())
Fred Drake30d59ba2002-05-14 15:50:11 +0000424 self.clear_cdata_mode()
Guido van Rossum8846d712001-05-18 14:50:52 +0000425 return j
426
427 # Overridable -- finish processing of start+end tag: <tag.../>
428 def handle_startendtag(self, tag, attrs):
429 self.handle_starttag(tag, attrs)
430 self.handle_endtag(tag)
431
432 # Overridable -- handle start tag
433 def handle_starttag(self, tag, attrs):
434 pass
435
436 # Overridable -- handle end tag
437 def handle_endtag(self, tag):
438 pass
439
440 # Overridable -- handle character reference
441 def handle_charref(self, name):
442 pass
443
444 # Overridable -- handle entity reference
445 def handle_entityref(self, name):
446 pass
447
448 # Overridable -- handle data
449 def handle_data(self, data):
450 pass
451
452 # Overridable -- handle comment
453 def handle_comment(self, data):
454 pass
455
456 # Overridable -- handle declaration
457 def handle_decl(self, decl):
458 pass
459
460 # Overridable -- handle processing instruction
461 def handle_pi(self, data):
462 pass
463
Fred Drakebfc8fea2001-09-24 20:10:28 +0000464 def unknown_decl(self, data):
R. David Murrayb579dba2010-12-03 04:06:39 +0000465 if self.strict:
466 self.error("unknown declaration: %r" % (data,))
Fred Drakebfc8fea2001-09-24 20:10:28 +0000467
Guido van Rossum8846d712001-05-18 14:50:52 +0000468 # Internal -- helper to remove special character quoting
Guido van Rossumd8faa362007-04-27 19:54:29 +0000469 entitydefs = None
Guido van Rossum8846d712001-05-18 14:50:52 +0000470 def unescape(self, s):
471 if '&' not in s:
472 return s
Guido van Rossumd8faa362007-04-27 19:54:29 +0000473 def replaceEntities(s):
474 s = s.groups()[0]
Senthil Kumaran164540f2010-12-28 15:55:16 +0000475 try:
476 if s[0] == "#":
477 s = s[1:]
478 if s[0] in ['x','X']:
479 c = int(s[1:], 16)
480 else:
481 c = int(s)
482 return chr(c)
483 except ValueError:
484 return '&#'+ s +';'
Guido van Rossumd8faa362007-04-27 19:54:29 +0000485 else:
Fred Drake3c50ea42008-05-17 22:02:32 +0000486 # Cannot use name2codepoint directly, because HTMLParser
487 # supports apos, which is not part of HTML 4
488 import html.entities
Guido van Rossumd8faa362007-04-27 19:54:29 +0000489 if HTMLParser.entitydefs is None:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000490 entitydefs = HTMLParser.entitydefs = {'apos':"'"}
Fred Drake3c50ea42008-05-17 22:02:32 +0000491 for k, v in html.entities.name2codepoint.items():
Mark Dickinsonf64dcf32008-05-21 13:51:18 +0000492 entitydefs[k] = chr(v)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000493 try:
494 return self.entitydefs[s]
495 except KeyError:
496 return '&'+s+';'
497
Fred Drake3c50ea42008-05-17 22:02:32 +0000498 return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
Ezio Melottid9e0b062011-09-05 17:11:06 +0300499 replaceEntities, s, flags=re.ASCII)