blob: a65478058f440eecba90bd092cfae8fb9a81f611 [file] [log] [blame]
Fred Drake1d4601d2001-08-03 19:50:59 +00001"""A parser for HTML and XHTML."""
Guido van Rossum8846d712001-05-18 14:50:52 +00002
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
Fred Drakecb5c80f2007-12-07 11:10:11 +000011import _markupbase
Guido van Rossum8846d712001-05-18 14:50:52 +000012import re
Guido van Rossum8846d712001-05-18 14:50:52 +000013
# Regular expressions used for parsing

# Characters that interrupt plain character data: start of markup or of a
# character/entity reference.
interesting_normal = re.compile('[&<]')
# A reference that has started but may still be continued by later input.
incomplete = re.compile('&[a-zA-Z#]')

# Both patterns also consume the single character that terminates the
# reference (';' or anything else); the caller gives it back unless it is ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
# Note, the strict one of this pair isn't really strict, but we can't
# make it correctly strict without breaking backward compatibility.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s*                             # optional whitespace before attribute name
    (?:(?<=['"\s])[^\s/>][^\s/=>]*   # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?\s*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
# (raw string: '\s' in a plain string literal is an invalid escape sequence)
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
70
Guido van Rossum8846d712001-05-18 14:50:52 +000071
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: human-readable description of the problem.
        lineno: first item of *position*, or None when unknown.
        offset: second item of *position* (0-based column), or None when
            unknown.
    """

    def __init__(self, msg, position=(None, None)):
        """Store the message and the (lineno, offset) position pair."""
        assert msg
        # Hand the constructor arguments to Exception so that ``args`` is
        # populated; copy/pickle rebuild the instance by calling the class
        # with ``args`` and would otherwise fail on the missing ``msg``.
        super().__init__(msg, position)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message, followed by the position when it is known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            # offset is 0-based; report a 1-based column
            result = result + ", column %d" % (self.offset + 1)
        return result
88
89
class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements whose content is scanned only for their own end tag.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, strict=True):
        """Initialize and reset this instance.

        If strict is set to True (the default), errors are raised when invalid
        HTML is encountered.  If set to False, an attempt is instead made to
        continue parsing, making "best guesses" about the intended meaning, in
        a fashion similar to what browsers typically do.
        """
        self.strict = strict
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current position."""
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Scan only for the end tag of *elem* until it is found."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Return to normal scanning for '<' and '&'."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # keep buffering until the closing tag shows up
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    # this might fail with things like <! not a comment > or
                    # <! -- space before '--' -->.  When strict is True an
                    # error is raised, when it's False they will be considered
                    # as bogus comments and parsed (see parse_bogus_comment).
                    if self.strict:
                        k = self.parse_declaration(i)
                    else:
                        try:
                            k = self.parse_declaration(i)
                        except HTMLParseError:
                            k = self.parse_bogus_comment(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # construct is not terminated in the buffered data
                    if not end:
                        break
                    if self.strict:
                        self.error("EOF in middle of construct")
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # the pattern swallows one terminating character;
                    # give it back unless it is the ';'
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]: # bail by consuming &#
                        # Emit the two characters found at the current
                        # offset and step over them (not the first two
                        # characters of the buffer).
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i + 2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        if self.strict:
                            self.error("EOF in middle of entity or char ref")
                        else:
                            # Nothing more will arrive: the unfinished
                            # reference is ordinary text.
                            self.handle_data(rawdata[i:])
                            i = self.updatepos(i, n)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        if rawdata[i:i+2] not in ('<!', '</'):
            self.error('unexpected call to parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        attr_pattern = attrfind if self.strict else attrfind_tolerant
        while k < endpos:
            m = attr_pattern.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # attribute given without any '=value' part
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            # tolerant mode: pass the whole malformed tag through as text
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        if self.strict:
            m = locatestarttagend.match(rawdata, i)
        else:
            m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            next_char = rawdata[j:j+1]
            if next_char == ">":
                return j + 1
            if next_char == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                # NOTE(review): the test just above is always true when
                # next_char is "/", so the rest of this branch cannot run;
                # kept unchanged to preserve existing behaviour.
                if self.strict:
                    self.updatepos(i, j + 1)
                    self.error("malformed empty start tag")
                if j > i:
                    return j
                else:
                    return i + 1
            if next_char == "":
                # end of input
                return -1
            if next_char in ("abcdefghijklmnopqrstuvwxyz=/"
                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if self.strict:
                self.updatepos(i, j)
                self.error("malformed start tag")
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # inside script/style: anything else is plain data
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            if self.strict:
                self.error("bad end tag: %r" % (rawdata[i:gtpos],))
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group().lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower() # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        """Raise a parse error for an unknown declaration in strict mode."""
        if self.strict:
            self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    entitydefs = None   # name -> character table, built on first use

    def unescape(self, s):
        """Return *s* with ';'-terminated references replaced.

        Numeric references become the character with that code point and
        named references are looked up in ``self.entitydefs``.  A reference
        that cannot be resolved (unknown name, malformed or out-of-range
        number) is left in the text unchanged.
        """
        if '&' not in s:
            return s

        def replace_entity(match):
            ref = match.group(1)
            if ref[0] == "#":
                ref = ref[1:]
                try:
                    if ref[0] in ('x', 'X'):
                        code = int(ref[1:], 16)
                    else:
                        code = int(ref)
                    return chr(code)
                except (ValueError, OverflowError):
                    # ValueError: not a number, or beyond the Unicode range;
                    # OverflowError: chr() rejects very large integers.
                    return '&#' + ref + ';'
            # Cannot use name2codepoint directly, because HTMLParser
            # supports apos, which is not part of HTML 4
            import html.entities
            if HTMLParser.entitydefs is None:
                table = HTMLParser.entitydefs = {'apos': "'"}
                for name, codepoint in html.entities.name2codepoint.items():
                    table[name] = chr(codepoint)
            try:
                return self.entitydefs[ref]
            except KeyError:
                return '&' + ref + ';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
                      replace_entity, s, flags=re.ASCII)