blob: dd9c2e14862eae4a3df3ec4df9c22586ba9b49dc [file] [log] [blame]
Fred Drake1d4601d2001-08-03 19:50:59 +00001"""A parser for HTML and XHTML."""
Guido van Rossum8846d712001-05-18 14:50:52 +00002
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
Fred Drakecb5c80f2007-12-07 11:10:11 +000011import _markupbase
Guido van Rossum8846d712001-05-18 14:50:52 +000012import re
Guido van Rossum8846d712001-05-18 14:50:52 +000013
# Regular expressions used for parsing
#
# Every pattern is written as a raw string so that regex escapes such as
# ``\s`` reach the regex engine untouched instead of being treated as
# (invalid) Python string escapes.

# Characters that interrupt plain character data: markup or a reference.
interesting_normal = re.compile(r'[&<]')
# Start of something that may turn out to be an entity or char reference.
incomplete = re.compile(r'&[a-zA-Z#]')

# Both reference patterns also consume the single character that follows
# the reference (the terminating ';' or whatever else ends the name).
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile(r'<[a-zA-Z]')
piclose = re.compile(r'>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile(r'[a-zA-Z][-.a-zA-Z0-9:_]*')
# Note, the strict one of this pair isn't really strict, but we can't
# make it correctly strict without breaking backward compatibility.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s*                             # optional whitespace before attribute name
    (?:(?<=['"\s])[^\s/>][^\s/=>]*   # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?\s*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile(r'>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
67
Guido van Rossum8846d712001-05-18 14:50:52 +000068
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    The instance exposes three attributes:

    msg    -- the error description passed to the constructor
    lineno -- first item of *position* (None when unknown)
    offset -- second item of *position* (None when unknown); it is
              rendered as ``offset + 1`` by ``str()``
    """

    def __init__(self, msg, position=(None, None)):
        # An empty message is treated as a programming error.
        assert msg
        self.msg = msg
        self.lineno, self.offset = position[0], position[1]

    def __str__(self):
        """Return the message, followed by line and column when known."""
        text = self.msg
        if self.lineno is not None:
            text += ", at line %d" % self.lineno
        if self.offset is not None:
            text += ", column %d" % (self.offset + 1)
        return text
85
86
class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements whose content is scanned only for the matching end tag.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, strict=True):
        """Initialize and reset this instance.

        If strict is set to True (the default), errors are raised when invalid
        HTML is encountered.  If set to False, an attempt is instead made to
        continue parsing, making "best guesses" about the intended meaning, in
        a fashion similar to what browsers typically do.
        """
        self.strict = strict
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current position."""
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Scan only for the end tag of *elem* until it is seen."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Return to scanning for '<' and '&'."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        startswith = rawdata.startswith
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # Inside script/style: keep buffering until the
                    # closing tag shows up.
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # The construct is not terminated in the buffer.
                    if not end:
                        break
                    if self.strict:
                        self.error("EOF in middle of construct")
                    # Tolerant mode: emit the text up to and including the
                    # next '>', else up to the next '<', else one character.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # charref also matched the character after the
                    # reference; give it back unless it was the ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                if ";" in rawdata[i:]:  # bail by consuming &#
                    # Emit and skip the two characters at the current
                    # position, not the first two of the buffer.
                    self.handle_data(rawdata[i:i + 2])
                    i = self.updatepos(i, i + 2)
                break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        if self.strict:
                            self.error("EOF in middle of entity or char ref")
                        else:
                            # Step past the '&'; whatever follows it is
                            # flushed as data by the EOF handling below.
                            i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                raise AssertionError("interesting.search() lied")
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        attr_pattern = attrfind if self.strict else attrfind_tolerant
        while k < endpos:
            m = attr_pattern.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without '=value'.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            # Tolerant mode: pass the whole malformed tag through as text.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        if self.strict:
            m = locatestarttagend.match(rawdata, i)
        else:
            m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            next_char = rawdata[j:j+1]
            if next_char == ">":
                return j + 1
            if next_char == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                # NOTE(review): next_char == "/" makes this test always
                # true, so the "bogus input" code below is never reached.
                # Kept as is to preserve existing behavior.
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                if self.strict:
                    self.updatepos(i, j + 1)
                    self.error("malformed empty start tag")
                if j > i:
                    return j
                else:
                    return i + 1
            if next_char == "":
                # end of input
                return -1
            # Substring test against the concatenated literal; next_char
            # is exactly one character at this point.
            if next_char in ("abcdefghijklmnopqrstuvwxyz=/"
                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if self.strict:
                self.updatepos(i, j)
                self.error("malformed start tag")
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # Inside script/style: anything that is not a well-formed
                # end tag is plain content.
                self.handle_data(rawdata[i:j])
                return j
            if self.strict:
                self.error("bad end tag: %r" % (rawdata[i:j],))
            # Tolerant mode: emit the text up to the next '<' found before
            # the closing '>' (if any), otherwise up to and including '>'.
            k = rawdata.find('<', i + 1, j)
            if k > i:
                j = k
            if j <= i:
                j = i + 1
            self.handle_data(rawdata[i:j])
            return j

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # End tag of some other element: still content.
                self.handle_data(rawdata[i:j])
                return j

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        if self.strict:
            self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    # Lazily built name -> character table, cached on the class.
    entitydefs = None

    def unescape(self, s):
        """Return *s* with entity and character references replaced.

        References that cannot be resolved (unknown entity names, numeric
        references that are malformed or outside the range chr() accepts)
        are left in the text unchanged.
        """
        if '&' not in s:
            return s

        def replaceEntities(match):
            ref = match.groups()[0]
            try:
                if ref[0] == "#":
                    ref = ref[1:]
                    if ref[0] in ['x', 'X']:
                        c = int(ref[1:], 16)
                    else:
                        c = int(ref)
                    return chr(c)
            except (ValueError, OverflowError):
                # int() rejected the digits, or chr() rejected the code
                # point (it raises OverflowError for very large ints).
                return '&#' + ref + ';'
            else:
                # Cannot use name2codepoint directly, because HTMLParser
                # supports apos, which is not part of HTML 4
                import html.entities
                if HTMLParser.entitydefs is None:
                    entitydefs = {'apos': "'"}
                    for k, v in html.entities.name2codepoint.items():
                        entitydefs[k] = chr(v)
                    # Publish only once the table is complete.
                    HTMLParser.entitydefs = entitydefs
                try:
                    return self.entitydefs[ref]
                except KeyError:
                    return '&' + ref + ';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
                      replaceEntities, s, flags=re.ASCII)