blob: de504ab54409314352461fa39517e7ab411a0be6 [file] [log] [blame]
Fred Drake1d4601d2001-08-03 19:50:59 +00001"""A parser for HTML and XHTML."""
Guido van Rossum8846d712001-05-18 14:50:52 +00002
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
Fred Drakecb5c80f2007-12-07 11:10:11 +000011import _markupbase
Guido van Rossum8846d712001-05-18 14:50:52 +000012import re
Guido van Rossum8846d712001-05-18 14:50:52 +000013
14# Regular expressions used for parsing
15
# Characters that interrupt plain character data: the start of a reference
# ('&') or the start of markup ('<').
interesting_normal = re.compile('[&<]')
# Beginning of something that may turn out to be an entity or char reference.
incomplete = re.compile('&[a-zA-Z#]')

# Named entity reference plus the single character that terminates it.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Decimal or hexadecimal character reference plus its terminating character.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Raw string: '\s' is not a valid escape in an ordinary string literal.
tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
# Note:
#   1) the strict attrfind isn't really strict, but we can't make it
#      correctly strict without breaking backward compatibility;
#   2) if you change attrfind remember to update locatestarttagend too;
#   3) if you change attrfind and/or locatestarttagend the parser will
#      explode, so don't do it.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
# Raw string: '\s' is not a valid escape in an ordinary string literal.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
74
Guido van Rossum8846d712001-05-18 14:50:52 +000075
class HTMLParseError(Exception):
    """Error type used for every failure reported by the parser.

    Attributes:
        msg: the description of the problem (must be truthy).
        lineno: first item of *position*, or None when unknown.
        offset: second item of *position* (zero-based column), or None.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        line, column = position[0], position[1]
        self.msg = msg
        self.lineno = line
        self.offset = column

    def __str__(self):
        """Return the message, followed by whatever location parts are known."""
        text = self.msg
        line, column = self.lineno, self.offset
        if line is not None:
            text += ", at line %d" % line
        if column is not None:
            # The stored offset is zero-based; the displayed column is one-based.
            text += ", column %d" % (column + 1)
        return text
92
93
class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements whose start tag switches the parser into CDATA mode.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, strict=True):
        """Initialize and reset this instance.

        If strict is set to True (the default), errors are raised when invalid
        HTML is encountered.  If set to False, an attempt is instead made to
        continue parsing, making "best guesses" about the intended meaning, in
        a fashion similar to what browsers typically do.
        """
        self.strict = strict
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current position."""
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Enter CDATA mode: only the end tag of *elem* is "interesting"."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and look for '&' and '<' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # In CDATA mode keep buffering until the end tag shows up.
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    if self.strict:
                        k = self.parse_declaration(i)
                    else:
                        k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # The construct is not terminated in the buffer.
                    if not end:
                        break
                    if self.strict:
                        self.error("EOF in middle of construct")
                    # Tolerant mode at EOF: emit the broken construct as data,
                    # up to and including the next '>' if there is one,
                    # otherwise up to the next '<', otherwise one character.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # charref also matched the terminating character; give
                    # it back unless it was the ';' that closes the reference.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                if ";" in rawdata[i:]:  # bail by consuming &#
                    # Report the two characters at the current position (not
                    # the first two characters of the buffer) and step past
                    # them.
                    self.handle_data(rawdata[i:i+2])
                    i = self.updatepos(i, i + 2)
                break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # Same give-back of the terminating character as above.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        if self.strict:
                            self.error("EOF in middle of entity or char ref")
                        else:
                            # Step over the '&'.  No other local is consulted
                            # here: nothing else is guaranteed to be bound
                            # when this is the first construct in the buffer.
                            i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<!':
            self.error('unexpected call to parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        if rawdata[i:i+2] not in ('<!', '</'):
            self.error('unexpected call to parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without '=value'.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching pair of quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            # Tolerant mode: pass the malformed tag through as plain data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        if self.strict:
            m = locatestarttagend.match(rawdata, i)
        else:
            m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # NOTE(review): the check above always succeeds when
                # nextchar == "/", so the statements below in this branch
                # are never reached; kept unchanged to preserve behaviour.
                # else bogus input
                if self.strict:
                    self.updatepos(i, j + 1)
                    self.error("malformed empty start tag")
                if j > i:
                    return j
                else:
                    return i + 1
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if self.strict:
                self.updatepos(i, j)
                self.error("malformed start tag")
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # Inside CDATA content a malformed end tag is just text.
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            if self.strict:
                self.error("bad end tag: %r" % (rawdata[i:gtpos],))
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group().lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # An end tag for some other element is text in CDATA mode.
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        """Raise a parse error for an unknown declaration in strict mode."""
        if self.strict:
            self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    # Name -> character table, built on first use and stored on HTMLParser.
    entitydefs = None
    def unescape(self, s):
        """Return *s* with entity and character references replaced.

        Numeric references that do not denote a valid code point and
        unknown entity names are left in the text unchanged.
        """
        if '&' not in s:
            return s

        def replace_entity(match):
            ref = match.group(1)
            if ref[0] == "#":
                digits = ref[1:]
                try:
                    if digits[0] in ('x', 'X'):
                        codepoint = int(digits[1:], 16)
                    else:
                        codepoint = int(digits)
                    return chr(codepoint)
                except (ValueError, OverflowError):
                    # ValueError: not a number, or outside the range chr()
                    # accepts.  OverflowError: chr() raises it for integers
                    # too large to convert.  Leave the reference as written.
                    return '&#' + digits + ';'
            if HTMLParser.entitydefs is None:
                # Cannot use name2codepoint directly, because HTMLParser
                # supports apos, which is not part of HTML 4
                import html.entities
                entitydefs = HTMLParser.entitydefs = {'apos': "'"}
                for name, codepoint in html.entities.name2codepoint.items():
                    entitydefs[name] = chr(codepoint)
            try:
                return self.entitydefs[ref]
            except KeyError:
                return '&' + ref + ';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
                      replace_entity, s, flags=re.ASCII)