blob: 60a322a9494d9df2c3fb0122c8ac63dfaa1159a3 [file] [log] [blame]
"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
9
10
Fred Drakecb5c80f2007-12-07 11:10:11 +000011import _markupbase
Guido van Rossum8846d712001-05-18 14:50:52 +000012import re
Ezio Melotti3861d8b2012-06-23 15:27:51 +020013import warnings
Guido van Rossum8846d712001-05-18 14:50:52 +000014
15# Regular expressions used for parsing
16
# Regular expressions used for parsing.
#
# Every pattern that contains a regex escape such as ``\s`` is written as a
# raw string so that Python does not try to interpret the backslash itself.

# Characters that interrupt plain character data: start of a reference or tag.
interesting_normal = re.compile('[&<]')
# An "&" followed by something that could still grow into a reference.
incomplete = re.compile('&[a-zA-Z#]')

# Named reference plus the single character that terminates it.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Decimal or hexadecimal numeric reference plus its terminating character.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
# (kept as a non-raw literal: \t, \n, \r, \f and \x00 are real characters
# inside the character class)
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
# Note:
#  1) the strict attrfind isn't really strict, but we can't make it
#     correctly strict without breaking backward compatibility;
#  2) if you change attrfind remember to update locatestarttagend too;
#  3) if you change attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
75
Guido van Rossum8846d712001-05-18 14:50:52 +000076
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: the error message (must be truthy).
        lineno: first item of *position*, or None when unknown.
        offset: second item of *position* (0-based column), or None when
            unknown.
    """

    def __init__(self, msg, position=(None, None)):
        """Store the message and the (lineno, offset) pair in *position*."""
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by the line and column, when known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            # The stored offset is 0-based; it is displayed 1-based.
            result = result + ", column %d" % (self.offset + 1)
        return result
93
94
class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements whose start tag switches the parser into CDATA mode (see
    # parse_starttag() and set_cdata_mode()).
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, strict=False):
        """Initialize and reset this instance.

        If strict is set to False (the default) the parser will parse invalid
        markup, otherwise it will raise an error.  Note that the strict mode
        is deprecated.
        """
        if strict:
            warnings.warn("The strict mode is deprecated.",
                          DeprecationWarning, stacklevel=2)
        self.strict = strict
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current position."""
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Enter CDATA mode: only the end tag of *elem* is "interesting"."""
        self.cdata_elem = elem.lower()
        # Escape the name: '.' is legal in tag names (see tagfind) and must
        # not act as a regex wildcard when looking for the closing tag.
        self.interesting = re.compile(
            r'</\s*%s\s*>' % re.escape(self.cdata_elem), re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and look for '&' and '<' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # In CDATA mode nothing is reported until the closing
                    # tag of the element has been seen.
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    if self.strict:
                        k = self.parse_declaration(i)
                    else:
                        k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # The construct is not terminated in the buffer.
                    if not end:
                        break
                    if self.strict:
                        self.error("EOF in middle of construct")
                    # Tolerant mode at EOF: report the text as data, up to
                    # and including the next '>', else up to the next '<',
                    # else just the single '<'.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # charref also matches the terminating character; give
                    # it back unless it is the ';' that closes the reference.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        # Report the "&#" found at position i (not the first
                        # two characters of the buffer) and step past it.
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i + 2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # Same give-back of the terminating character as above.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        if self.strict:
                            self.error("EOF in middle of entity or char ref")
                        else:
                            # NOTE(review): this steps over the '&' without
                            # passing it to handle_data(); only the text after
                            # it is reported by the flush below.  Kept as is.
                            i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # Keep whatever was not consumed for the next call.
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    # If 'report' is false the text is consumed without calling
    # handle_comment().
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute without "=value".
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching pair of quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Something other than the tag terminator is left over.
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        if self.strict:
            m = locatestarttagend.match(rawdata, i)
        else:
            m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                # Any other "/" here is reported as an incomplete tag
                # (treated like a tag cut off at the buffer boundary).
                return -1
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if self.strict:
                self.updatepos(i, j)
                self.error("malformed start tag")
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # Inside script/style a malformed end tag is plain data.
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            if self.strict:
                self.error("bad end tag: %r" % (rawdata[i:gtpos],))
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group().lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # An end tag for some other element is data in CDATA mode.
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        """Raise a parse error in strict mode; otherwise do nothing."""
        if self.strict:
            self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Return *s* with character and entity references replaced.

        References that cannot be resolved are left in the text unchanged.
        """
        if '&' not in s:
            return s

        def replaceEntities(match):
            ref = match.group(1)
            if ref[0] == "#":
                # Numeric reference, decimal or hexadecimal.
                ref = ref[1:]
                try:
                    if ref[0] in ('x', 'X'):
                        codepoint = int(ref[1:].rstrip(';'), 16)
                    else:
                        codepoint = int(ref.rstrip(';'))
                    return chr(codepoint)
                except (ValueError, OverflowError):
                    # Not a number, or not a valid code point (chr() may
                    # raise OverflowError for very large values): keep the
                    # original text.
                    return '&#' + ref
            from html.entities import html5
            if ref in html5:
                return html5[ref]
            if ref.endswith(';'):
                return '&' + ref
            # No terminating ';': use the first prefix (two characters or
            # more) that names an entity and keep the rest as text.
            for x in range(2, len(ref)):
                if ref[:x] in html5:
                    return html5[ref[:x]] + ref[x:]
            return '&' + ref

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
                      replaceEntities, s, flags=re.ASCII)