"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).


import _markupbase
import re

# Regular expressions used for parsing

# What goahead() scans for in normal mode: the start of markup or of a
# character/entity reference.
interesting_normal = re.compile('[&<]')
# What goahead() scans for inside CDATA content elements: only a
# potential end tag ("</") or a "<" that is the last character buffered.
interesting_cdata = re.compile(r'<(/|\Z)')
# Start of something that could still become a character/entity reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns consume one terminating character (normally
# ';'); the caller steps back by one when that character is not ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# Note, the strict one of this pair isn't really strict, but we can't
# make it correctly strict without breaking backward compatibility.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r',?\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s*                             # optional whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
# (raw string: a plain literal containing "\s" is an invalid escape sequence)
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')


class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: description of the problem; must be truthy.
        lineno: first element of *position*, or None when unknown.
        offset: second element of *position*, or None when unknown.
            __str__ reports it as a column number equal to offset + 1.
    """

    def __init__(self, msg, position=(None, None)):
        """Store the message and the (lineno, offset) pair in *position*."""
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message, followed by line and column when known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            result = result + ", column %d" % (self.offset + 1)
        return result


class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements after whose start tag the parser switches to CDATA mode
    # (see parse_starttag / set_cdata_mode).
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, strict=True):
        """Initialize and reset this instance.

        If strict is set to True (the default), errors are raised when invalid
        HTML is encountered.  If set to False, an attempt is instead made to
        continue parsing, making "best guesses" about the intended meaning, in
        a fashion similar to what browsers typically do.
        """
        self.strict = strict
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the position from getpos()."""
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Switch to CDATA scanning; remember the lower-cased element name."""
        self.interesting = interesting_cdata
        self.cdata_elem = elem.lower()

    def clear_cdata_mode(self):
        """Switch back to normal scanning and forget the CDATA element."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Process self.rawdata and dispatch to the handler methods.

        Whatever could not be processed is kept in self.rawdata for the
        next call.  When *end* is true, leftover text is flushed through
        handle_data(); in strict mode an unterminated construct raises
        via self.error() instead.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    # "<" followed by something that cannot start markup
                    self.handle_data("<")
                    k = i + 1
                else:
                    # lone "<" at the end of the buffer; wait for more data
                    break
                if k < 0:
                    # the construct is not terminated in the buffered data
                    if not end:
                        break
                    if self.strict:
                        self.error("EOF in middle of construct")
                    # Tolerant mode: emit the text up to and including the
                    # next ">", else up to the next "<", else just the "<".
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # charref consumed one terminating character; give it
                    # back unless it was the ';'
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        # Use positions relative to i: the "&#" being
                        # skipped is at rawdata[i:i+2], not at the start
                        # of the buffer.
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i + 2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # same give-back of the terminator as for charrefs
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        if self.strict:
                            self.error("EOF in middle of entity or char ref")
                        # Tolerant mode: leave i where it is so that the
                        # flush after the loop emits the unfinished
                        # reference as plain text.
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the '<?...>' starting at *i* and call handle_pi().

        Returns the index just past the closing '>', or -1 when no '>'
        has been buffered yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at *i* and call the start-tag handlers.

        Returns the index just past the tag, or a negative value when
        check_for_whole_start_tag() reports the tag as incomplete.  In
        strict mode junk inside the tag raises via self.error(); in
        tolerant mode the tag text is passed to handle_data() instead.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        attr_pattern = attrfind if self.strict else attrfind_tolerant
        while k < endpos:
            m = attr_pattern.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # attribute without "=value"
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # quoted value: strip the quotes and resolve references
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag at *i*, or -1.

        -1 means more data is needed.  Malformed tags raise via
        self.error() in strict mode; in tolerant mode an index greater
        than *i* is returned so that the caller makes progress.
        """
        rawdata = self.rawdata
        if self.strict:
            m = locatestarttagend.match(rawdata, i)
        else:
            m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            next_char = rawdata[j:j+1]
            if next_char == ">":
                return j + 1
            if next_char == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                # NOTE(review): rawdata[j] is "/" here, so this test is
                # always true and the "bogus input" code below cannot be
                # reached; kept as-is to preserve existing behaviour.
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                if self.strict:
                    self.updatepos(i, j + 1)
                    self.error("malformed empty start tag")
                if j > i:
                    return j
                else:
                    return i + 1
            if next_char == "":
                # end of input
                return -1
            if next_char in ("abcdefghijklmnopqrstuvwxyz=/"
                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if self.strict:
                self.updatepos(i, j)
                self.error("malformed start tag")
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag at *i* and call handle_endtag().

        Returns the index to continue from, or -1 when no '>' has been
        buffered yet.  While in CDATA mode, anything other than the end
        tag of the current CDATA element is passed to handle_data().
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:j])
                return j
            if self.strict:
                self.error("bad end tag: %r" % (rawdata[i:j],))
            # Tolerant mode: emit the text up to the next "<" if there is
            # one before the ">", otherwise up to and including the ">".
            k = rawdata.find('<', i + 1, j)
            if k > i:
                j = k
            if j <= i:
                j = i + 1
            self.handle_data(rawdata[i:j])
            return j

        elem = match.group(1).lower()  # lower-cased end tag name
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # some other end tag inside CDATA content: plain data
                self.handle_data(rawdata[i:j])
                return j

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        """Raise via self.error() in strict mode; otherwise do nothing."""
        if self.strict:
            self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    # Name -> character table shared by all instances; built on first use.
    entitydefs = None

    def unescape(self, s):
        """Return *s* with character and entity references replaced.

        Only references terminated by ';' are considered.  Numeric
        references that do not denote a valid code point and unknown
        entity names are left in the text unchanged.
        """
        if '&' not in s:
            return s

        def replaceEntities(match):
            ref = match.groups()[0]
            if ref[0] == "#":
                ref = ref[1:]
                try:
                    if ref[0] in ['x', 'X']:
                        c = int(ref[1:], 16)
                    else:
                        c = int(ref)
                    return chr(c)
                except (ValueError, OverflowError):
                    # int() rejects malformed digits (ValueError); chr()
                    # rejects out-of-range values (ValueError, or
                    # OverflowError for very large numbers).
                    return '&#' + ref + ';'
            else:
                # Cannot use name2codepoint directly, because HTMLParser
                # supports apos, which is not part of HTML 4
                import html.entities
                if HTMLParser.entitydefs is None:
                    # Build the table completely before publishing it on
                    # the class.
                    entitydefs = {'apos': "'"}
                    for k, v in html.entities.name2codepoint.items():
                        entitydefs[k] = chr(v)
                    HTMLParser.entitydefs = entitydefs
                try:
                    return self.entitydefs[ref]
                except KeyError:
                    return '&' + ref + ';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
                      replaceEntities, s, flags=re.ASCII)