blob: 662e85575a4219df97e876e094adc963f9127f20 [file] [log] [blame]
Fred Drake1d4601d2001-08-03 19:50:59 +00001"""A parser for HTML and XHTML."""
Guido van Rossum8846d712001-05-18 14:50:52 +00002
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
Fred Drakecb5c80f2007-12-07 11:10:11 +000011import _markupbase
Guido van Rossum8846d712001-05-18 14:50:52 +000012import re
Guido van Rossum8846d712001-05-18 14:50:52 +000013
14# Regular expressions used for parsing
15
# Start of the next potentially special character in ordinary content.
interesting_normal = re.compile('[&<]')
# Inside CDATA content (script/style) only '</' (or a '<' at the very end
# of the buffer) is of interest.
interesting_cdata = re.compile(r'<(/|\Z)')
# An '&' that may be the beginning of an entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume the one character that follows the
# reference (';' or whatever else terminates it); callers give it back when
# it is not ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# Note, the strict one of this pair isn't really strict, but we can't
# make it correctly strict without breaking backward compatibility.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s*                             # optional whitespace before attribute name
    (?:(?<=['"\s])[^\s/>][^\s/=>]*   # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
      )?\s*
    )*
  )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
# (raw string: '\s' in a plain literal is an invalid escape sequence)
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
68
Guido van Rossum8846d712001-05-18 14:50:52 +000069
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Instances carry the message in ``msg`` and the two items of the
    *position* pair in ``lineno`` and ``offset``; either may be None.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno, self.offset = position[0], position[1]

    def __str__(self):
        # Format: "<msg>[, at line N][, column M]".  The stored offset is
        # displayed incremented by one.
        text = self.msg
        if self.lineno is not None:
            text += ", at line %d" % self.lineno
        if self.offset is not None:
            text += ", column %d" % (self.offset + 1)
        return text
86
87
Fred Drakecb5c80f2007-12-07 11:10:11 +000088class HTMLParser(_markupbase.ParserBase):
Fred Drake1d4601d2001-08-03 19:50:59 +000089 """Find tags and other markup and call handler functions.
90
91 Usage:
92 p = HTMLParser()
93 p.feed(data)
94 ...
95 p.close()
96
97 Start tags are handled by calling self.handle_starttag() or
98 self.handle_startendtag(); end tags by self.handle_endtag(). The
99 data between tags is passed from the parser to the derived class
100 by calling self.handle_data() with the data as argument (the data
101 may be split up in arbitrary chunks). Entity references are
102 passed by calling self.handle_entityref() with the entity
103 reference as the argument. Numeric character references are
104 passed to self.handle_charref() with the string containing the
105 reference as the argument.
106 """
Guido van Rossum8846d712001-05-18 14:50:52 +0000107
108 CDATA_CONTENT_ELEMENTS = ("script", "style")
109
R. David Murrayb579dba2010-12-03 04:06:39 +0000110 def __init__(self, strict=True):
111 """Initialize and reset this instance.
Guido van Rossum8846d712001-05-18 14:50:52 +0000112
R. David Murrayb579dba2010-12-03 04:06:39 +0000113 If strict is set to True (the default), errors are raised when invalid
114 HTML is encountered. If set to False, an attempt is instead made to
115 continue parsing, making "best guesses" about the intended meaning, in
116 a fashion similar to what browsers typically do.
117 """
118 self.strict = strict
Guido van Rossum8846d712001-05-18 14:50:52 +0000119 self.reset()
120
Guido van Rossum8846d712001-05-18 14:50:52 +0000121 def reset(self):
Fred Drake1d4601d2001-08-03 19:50:59 +0000122 """Reset this instance. Loses all unprocessed data."""
Guido van Rossum8846d712001-05-18 14:50:52 +0000123 self.rawdata = ''
Guido van Rossum8846d712001-05-18 14:50:52 +0000124 self.lasttag = '???'
Guido van Rossum8846d712001-05-18 14:50:52 +0000125 self.interesting = interesting_normal
Ezio Melotti7de56f62011-11-01 14:12:22 +0200126 self.cdata_elem = None
Fred Drakecb5c80f2007-12-07 11:10:11 +0000127 _markupbase.ParserBase.reset(self)
Guido van Rossum8846d712001-05-18 14:50:52 +0000128
Guido van Rossum8846d712001-05-18 14:50:52 +0000129 def feed(self, data):
Éric Araujo39f180b2011-05-04 15:55:47 +0200130 r"""Feed data to the parser.
Fred Drake1d4601d2001-08-03 19:50:59 +0000131
132 Call this as often as you want, with as little or as much text
133 as you want (may include '\n').
134 """
Guido van Rossum8846d712001-05-18 14:50:52 +0000135 self.rawdata = self.rawdata + data
136 self.goahead(0)
137
Guido van Rossum8846d712001-05-18 14:50:52 +0000138 def close(self):
Fred Drake1d4601d2001-08-03 19:50:59 +0000139 """Handle any buffered data."""
Guido van Rossum8846d712001-05-18 14:50:52 +0000140 self.goahead(1)
141
Fred Drakebfc8fea2001-09-24 20:10:28 +0000142 def error(self, message):
143 raise HTMLParseError(message, self.getpos())
Guido van Rossum8846d712001-05-18 14:50:52 +0000144
145 __starttag_text = None
146
Guido van Rossum8846d712001-05-18 14:50:52 +0000147 def get_starttag_text(self):
Fred Drake1d4601d2001-08-03 19:50:59 +0000148 """Return full source of start tag: '<...>'."""
Guido van Rossum8846d712001-05-18 14:50:52 +0000149 return self.__starttag_text
150
Ezio Melotti7de56f62011-11-01 14:12:22 +0200151 def set_cdata_mode(self, elem):
Guido van Rossum8846d712001-05-18 14:50:52 +0000152 self.interesting = interesting_cdata
Ezio Melotti7de56f62011-11-01 14:12:22 +0200153 self.cdata_elem = elem.lower()
Guido van Rossum8846d712001-05-18 14:50:52 +0000154
155 def clear_cdata_mode(self):
156 self.interesting = interesting_normal
Ezio Melotti7de56f62011-11-01 14:12:22 +0200157 self.cdata_elem = None
Guido van Rossum8846d712001-05-18 14:50:52 +0000158
159 # Internal -- handle data as far as reasonable. May leave state
160 # and data to be processed by a subsequent call. If 'end' is
161 # true, force handling all data as if followed by EOF marker.
162 def goahead(self, end):
163 rawdata = self.rawdata
164 i = 0
165 n = len(rawdata)
166 while i < n:
167 match = self.interesting.search(rawdata, i) # < or &
168 if match:
169 j = match.start()
170 else:
171 j = n
172 if i < j: self.handle_data(rawdata[i:j])
173 i = self.updatepos(i, j)
174 if i == n: break
Fred Drake248b0432001-12-03 17:09:50 +0000175 startswith = rawdata.startswith
176 if startswith('<', i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000177 if starttagopen.match(rawdata, i): # < + letter
178 k = self.parse_starttag(i)
Fred Drake248b0432001-12-03 17:09:50 +0000179 elif startswith("</", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000180 k = self.parse_endtag(i)
Fred Drake248b0432001-12-03 17:09:50 +0000181 elif startswith("<!--", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000182 k = self.parse_comment(i)
Fred Drake248b0432001-12-03 17:09:50 +0000183 elif startswith("<?", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000184 k = self.parse_pi(i)
Fred Drake248b0432001-12-03 17:09:50 +0000185 elif startswith("<!", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000186 k = self.parse_declaration(i)
Fred Drake68eac2b2001-09-04 15:10:16 +0000187 elif (i + 1) < n:
Fred Drake029acfb2001-08-20 21:24:19 +0000188 self.handle_data("<")
189 k = i + 1
Fred Drake68eac2b2001-09-04 15:10:16 +0000190 else:
191 break
Guido van Rossum8846d712001-05-18 14:50:52 +0000192 if k < 0:
R. David Murrayb579dba2010-12-03 04:06:39 +0000193 if not end:
194 break
195 if self.strict:
Fred Drakebfc8fea2001-09-24 20:10:28 +0000196 self.error("EOF in middle of construct")
R. David Murrayb579dba2010-12-03 04:06:39 +0000197 k = rawdata.find('>', i + 1)
198 if k < 0:
199 k = rawdata.find('<', i + 1)
200 if k < 0:
201 k = i + 1
202 else:
203 k += 1
204 self.handle_data(rawdata[i:k])
Guido van Rossum8846d712001-05-18 14:50:52 +0000205 i = self.updatepos(i, k)
Fred Drake248b0432001-12-03 17:09:50 +0000206 elif startswith("&#", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000207 match = charref.match(rawdata, i)
208 if match:
Fred Drake1d4601d2001-08-03 19:50:59 +0000209 name = match.group()[2:-1]
Guido van Rossum8846d712001-05-18 14:50:52 +0000210 self.handle_charref(name)
211 k = match.end()
Fred Drake248b0432001-12-03 17:09:50 +0000212 if not startswith(';', k-1):
Fred Drake029acfb2001-08-20 21:24:19 +0000213 k = k - 1
Guido van Rossum8846d712001-05-18 14:50:52 +0000214 i = self.updatepos(i, k)
215 continue
Fred Drake68eac2b2001-09-04 15:10:16 +0000216 else:
Victor Stinnere021f4b2010-05-24 21:46:25 +0000217 if ";" in rawdata[i:]: #bail by consuming &#
218 self.handle_data(rawdata[0:2])
219 i = self.updatepos(i, 2)
Fred Drake68eac2b2001-09-04 15:10:16 +0000220 break
Fred Drake248b0432001-12-03 17:09:50 +0000221 elif startswith('&', i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000222 match = entityref.match(rawdata, i)
223 if match:
224 name = match.group(1)
225 self.handle_entityref(name)
226 k = match.end()
Fred Drake248b0432001-12-03 17:09:50 +0000227 if not startswith(';', k-1):
Fred Drake029acfb2001-08-20 21:24:19 +0000228 k = k - 1
Guido van Rossum8846d712001-05-18 14:50:52 +0000229 i = self.updatepos(i, k)
230 continue
Fred Drake029acfb2001-08-20 21:24:19 +0000231 match = incomplete.match(rawdata, i)
232 if match:
Fred Drake68eac2b2001-09-04 15:10:16 +0000233 # match.group() will contain at least 2 chars
Fred Drake248b0432001-12-03 17:09:50 +0000234 if end and match.group() == rawdata[i:]:
R. David Murrayb579dba2010-12-03 04:06:39 +0000235 if self.strict:
236 self.error("EOF in middle of entity or char ref")
237 else:
238 if k <= i:
239 k = n
240 i = self.updatepos(i, i + 1)
Fred Drake68eac2b2001-09-04 15:10:16 +0000241 # incomplete
242 break
243 elif (i + 1) < n:
244 # not the end of the buffer, and can't be confused
245 # with some other construct
246 self.handle_data("&")
247 i = self.updatepos(i, i + 1)
248 else:
249 break
Guido van Rossum8846d712001-05-18 14:50:52 +0000250 else:
251 assert 0, "interesting.search() lied"
252 # end while
253 if end and i < n:
254 self.handle_data(rawdata[i:n])
255 i = self.updatepos(i, n)
256 self.rawdata = rawdata[i:]
257
Guido van Rossum8846d712001-05-18 14:50:52 +0000258 # Internal -- parse processing instr, return end or -1 if not terminated
259 def parse_pi(self, i):
260 rawdata = self.rawdata
261 assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
262 match = piclose.search(rawdata, i+2) # >
263 if not match:
264 return -1
265 j = match.start()
266 self.handle_pi(rawdata[i+2: j])
267 j = match.end()
268 return j
269
270 # Internal -- handle starttag, return end or -1 if not terminated
271 def parse_starttag(self, i):
272 self.__starttag_text = None
273 endpos = self.check_for_whole_start_tag(i)
274 if endpos < 0:
275 return endpos
276 rawdata = self.rawdata
277 self.__starttag_text = rawdata[i:endpos]
278
279 # Now parse the data between i+1 and j into a tag and attrs
280 attrs = []
281 match = tagfind.match(rawdata, i+1)
282 assert match, 'unexpected call to parse_starttag()'
283 k = match.end()
Fred Drake248b0432001-12-03 17:09:50 +0000284 self.lasttag = tag = rawdata[i+1:k].lower()
Guido van Rossum8846d712001-05-18 14:50:52 +0000285 while k < endpos:
R. David Murrayb579dba2010-12-03 04:06:39 +0000286 if self.strict:
287 m = attrfind.match(rawdata, k)
288 else:
Ezio Melottif50ffa92011-10-28 13:21:09 +0300289 m = attrfind_tolerant.match(rawdata, k)
Guido van Rossum8846d712001-05-18 14:50:52 +0000290 if not m:
291 break
292 attrname, rest, attrvalue = m.group(1, 2, 3)
293 if not rest:
294 attrvalue = None
295 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
296 attrvalue[:1] == '"' == attrvalue[-1:]:
297 attrvalue = attrvalue[1:-1]
Ezio Melottic2fe5772011-11-14 18:53:33 +0200298 if attrvalue:
Guido van Rossum8846d712001-05-18 14:50:52 +0000299 attrvalue = self.unescape(attrvalue)
Fred Drake248b0432001-12-03 17:09:50 +0000300 attrs.append((attrname.lower(), attrvalue))
Guido van Rossum8846d712001-05-18 14:50:52 +0000301 k = m.end()
302
Fred Drake248b0432001-12-03 17:09:50 +0000303 end = rawdata[k:endpos].strip()
Guido van Rossum8846d712001-05-18 14:50:52 +0000304 if end not in (">", "/>"):
305 lineno, offset = self.getpos()
306 if "\n" in self.__starttag_text:
Fred Drake248b0432001-12-03 17:09:50 +0000307 lineno = lineno + self.__starttag_text.count("\n")
Guido van Rossum8846d712001-05-18 14:50:52 +0000308 offset = len(self.__starttag_text) \
Fred Drake248b0432001-12-03 17:09:50 +0000309 - self.__starttag_text.rfind("\n")
Guido van Rossum8846d712001-05-18 14:50:52 +0000310 else:
311 offset = offset + len(self.__starttag_text)
R. David Murrayb579dba2010-12-03 04:06:39 +0000312 if self.strict:
313 self.error("junk characters in start tag: %r"
314 % (rawdata[k:endpos][:20],))
315 self.handle_data(rawdata[i:endpos])
316 return endpos
Fred Drake248b0432001-12-03 17:09:50 +0000317 if end.endswith('/>'):
Guido van Rossum8846d712001-05-18 14:50:52 +0000318 # XHTML-style empty tag: <span attr="value" />
319 self.handle_startendtag(tag, attrs)
320 else:
321 self.handle_starttag(tag, attrs)
322 if tag in self.CDATA_CONTENT_ELEMENTS:
Ezio Melotti7de56f62011-11-01 14:12:22 +0200323 self.set_cdata_mode(tag)
Guido van Rossum8846d712001-05-18 14:50:52 +0000324 return endpos
325
326 # Internal -- check to see if we have a complete starttag; return end
327 # or -1 if incomplete.
328 def check_for_whole_start_tag(self, i):
329 rawdata = self.rawdata
R. David Murrayb579dba2010-12-03 04:06:39 +0000330 if self.strict:
331 m = locatestarttagend.match(rawdata, i)
332 else:
333 m = locatestarttagend_tolerant.match(rawdata, i)
Guido van Rossum8846d712001-05-18 14:50:52 +0000334 if m:
335 j = m.end()
336 next = rawdata[j:j+1]
337 if next == ">":
338 return j + 1
339 if next == "/":
Fred Drake248b0432001-12-03 17:09:50 +0000340 if rawdata.startswith("/>", j):
Guido van Rossum8846d712001-05-18 14:50:52 +0000341 return j + 2
Fred Drake248b0432001-12-03 17:09:50 +0000342 if rawdata.startswith("/", j):
Guido van Rossum8846d712001-05-18 14:50:52 +0000343 # buffer boundary
344 return -1
345 # else bogus input
R. David Murrayb579dba2010-12-03 04:06:39 +0000346 if self.strict:
347 self.updatepos(i, j + 1)
348 self.error("malformed empty start tag")
349 if j > i:
350 return j
351 else:
352 return i + 1
Guido van Rossum8846d712001-05-18 14:50:52 +0000353 if next == "":
354 # end of input
355 return -1
356 if next in ("abcdefghijklmnopqrstuvwxyz=/"
357 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
358 # end of input in or before attribute value, or we have the
359 # '/' from a '/>' ending
360 return -1
R. David Murrayb579dba2010-12-03 04:06:39 +0000361 if self.strict:
362 self.updatepos(i, j)
363 self.error("malformed start tag")
364 if j > i:
365 return j
366 else:
367 return i + 1
Fred Drakebfc8fea2001-09-24 20:10:28 +0000368 raise AssertionError("we should not get here!")
Guido van Rossum8846d712001-05-18 14:50:52 +0000369
370 # Internal -- parse endtag, return end or -1 if incomplete
371 def parse_endtag(self, i):
372 rawdata = self.rawdata
373 assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
374 match = endendtag.search(rawdata, i+1) # >
375 if not match:
376 return -1
377 j = match.end()
378 match = endtagfind.match(rawdata, i) # </ + tag + >
379 if not match:
Ezio Melotti7de56f62011-11-01 14:12:22 +0200380 if self.cdata_elem is not None:
381 self.handle_data(rawdata[i:j])
382 return j
R. David Murrayb579dba2010-12-03 04:06:39 +0000383 if self.strict:
384 self.error("bad end tag: %r" % (rawdata[i:j],))
385 k = rawdata.find('<', i + 1, j)
386 if k > i:
387 j = k
388 if j <= i:
389 j = i + 1
390 self.handle_data(rawdata[i:j])
391 return j
Ezio Melotti7de56f62011-11-01 14:12:22 +0200392
393 elem = match.group(1).lower() # script or style
394 if self.cdata_elem is not None:
395 if elem != self.cdata_elem:
396 self.handle_data(rawdata[i:j])
397 return j
398
399 self.handle_endtag(elem.lower())
Fred Drake30d59ba2002-05-14 15:50:11 +0000400 self.clear_cdata_mode()
Guido van Rossum8846d712001-05-18 14:50:52 +0000401 return j
402
403 # Overridable -- finish processing of start+end tag: <tag.../>
404 def handle_startendtag(self, tag, attrs):
405 self.handle_starttag(tag, attrs)
406 self.handle_endtag(tag)
407
408 # Overridable -- handle start tag
409 def handle_starttag(self, tag, attrs):
410 pass
411
412 # Overridable -- handle end tag
413 def handle_endtag(self, tag):
414 pass
415
416 # Overridable -- handle character reference
417 def handle_charref(self, name):
418 pass
419
420 # Overridable -- handle entity reference
421 def handle_entityref(self, name):
422 pass
423
424 # Overridable -- handle data
425 def handle_data(self, data):
426 pass
427
428 # Overridable -- handle comment
429 def handle_comment(self, data):
430 pass
431
432 # Overridable -- handle declaration
433 def handle_decl(self, decl):
434 pass
435
436 # Overridable -- handle processing instruction
437 def handle_pi(self, data):
438 pass
439
Fred Drakebfc8fea2001-09-24 20:10:28 +0000440 def unknown_decl(self, data):
R. David Murrayb579dba2010-12-03 04:06:39 +0000441 if self.strict:
442 self.error("unknown declaration: %r" % (data,))
Fred Drakebfc8fea2001-09-24 20:10:28 +0000443
Guido van Rossum8846d712001-05-18 14:50:52 +0000444 # Internal -- helper to remove special character quoting
Guido van Rossumd8faa362007-04-27 19:54:29 +0000445 entitydefs = None
Guido van Rossum8846d712001-05-18 14:50:52 +0000446 def unescape(self, s):
447 if '&' not in s:
448 return s
Guido van Rossumd8faa362007-04-27 19:54:29 +0000449 def replaceEntities(s):
450 s = s.groups()[0]
Senthil Kumaran164540f2010-12-28 15:55:16 +0000451 try:
452 if s[0] == "#":
453 s = s[1:]
454 if s[0] in ['x','X']:
455 c = int(s[1:], 16)
456 else:
457 c = int(s)
458 return chr(c)
459 except ValueError:
460 return '&#'+ s +';'
Guido van Rossumd8faa362007-04-27 19:54:29 +0000461 else:
Fred Drake3c50ea42008-05-17 22:02:32 +0000462 # Cannot use name2codepoint directly, because HTMLParser
463 # supports apos, which is not part of HTML 4
464 import html.entities
Guido van Rossumd8faa362007-04-27 19:54:29 +0000465 if HTMLParser.entitydefs is None:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000466 entitydefs = HTMLParser.entitydefs = {'apos':"'"}
Fred Drake3c50ea42008-05-17 22:02:32 +0000467 for k, v in html.entities.name2codepoint.items():
Mark Dickinsonf64dcf32008-05-21 13:51:18 +0000468 entitydefs[k] = chr(v)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000469 try:
470 return self.entitydefs[s]
471 except KeyError:
472 return '&'+s+';'
473
Fred Drake3c50ea42008-05-17 22:02:32 +0000474 return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
Ezio Melottid9e0b062011-09-05 17:11:06 +0300475 replaceEntities, s, flags=re.ASCII)