blob: 22498db4a0d450972f425342655f64a72983de8d [file] [log] [blame]
Fred Drake1d4601d2001-08-03 19:50:59 +00001"""A parser for HTML and XHTML."""
Guido van Rossum8846d712001-05-18 14:50:52 +00002
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
Fred Drakecb5c80f2007-12-07 11:10:11 +000011import _markupbase
Guido van Rossum8846d712001-05-18 14:50:52 +000012import re
Ezio Melotti3861d8b2012-06-23 15:27:51 +020013import warnings
Guido van Rossum8846d712001-05-18 14:50:52 +000014
Ezio Melotti1698bab2013-05-01 16:09:34 +030015__all__ = ['HTMLParser']
16
Guido van Rossum8846d712001-05-18 14:50:52 +000017# Regular expressions used for parsing
18
# All patterns below are compiled once at import time and shared by every
# parser instance.  Patterns containing regex escapes are written as raw
# strings so that sequences such as ``\s`` reach the ``re`` module verbatim
# instead of being treated as (invalid) Python string escapes.

# Characters that end a run of plain character data in normal mode.
interesting_normal = re.compile('[&<]')
# Start of something that may turn out to be an entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# A named entity reference followed by one terminating character.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# A decimal or hexadecimal character reference followed by one terminating
# character.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
#  1) the strict attrfind isn't really strict, but we can't make it
#     correctly strict without breaking backward compatibility;
#  2) if you change tagfind/attrfind remember to update locatestarttagend too;
#  3) if you change tagfind/attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
77
Guido van Rossum8846d712001-05-18 14:50:52 +000078
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: the error description.
        lineno: first item of *position*, or None when unknown.
        offset: second item of *position* (0-based), or None when unknown.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno, self.offset = position[0], position[1]

    def __str__(self):
        # Build the message from optional pieces; the column is reported
        # 1-based, hence the +1 on the stored offset.
        pieces = [self.msg]
        if self.lineno is not None:
            pieces.append(", at line %d" % self.lineno)
        if self.offset is not None:
            pieces.append(", column %d" % (self.offset + 1))
        return "".join(pieces)
95
96
# Unique default for the deprecated ``strict`` argument: it lets __init__
# distinguish "argument not passed" from an explicit ``strict=False`` so that
# any explicit use can emit a DeprecationWarning.
_strict_sentinel = object()


class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements whose content is treated as raw text: after one of these start
    # tags only the matching end tag is recognized as markup.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, strict=_strict_sentinel):
        """Initialize and reset this instance.

        If strict is set to False (the default) the parser will parse invalid
        markup, otherwise it will raise an error.  Note that the strict mode
        and argument are deprecated.
        """
        if strict is not _strict_sentinel:
            warnings.warn("The strict argument and mode are deprecated.",
                          DeprecationWarning, stacklevel=2)
        else:
            strict = False  # default
        self.strict = strict
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current position.

        Deprecated: a DeprecationWarning is issued before raising.
        """
        warnings.warn("The 'error' method is deprecated.",
                      DeprecationWarning, stacklevel=2)
        raise HTMLParseError(message, self.getpos())

    # Text of the most recently parsed start tag; None until one is parsed.
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Enter raw-text mode: only the end tag of *elem* is interesting.

        NOTE: *elem* is interpolated into the pattern without escaping.
        """
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Leave raw-text mode and look for '<' and '&' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Process the buffered data and dispatch to the handle_* methods.

        Whatever cannot be processed yet (an unterminated construct) is kept
        in self.rawdata for the next call.  When *end* is true, remaining
        data is flushed as if an EOF marker followed it.
        """
        rawdata = self.rawdata
        startswith = rawdata.startswith
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # in raw-text mode wait for the closing tag to arrive
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    if self.strict:
                        k = self.parse_declaration(i)
                    else:
                        k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    # lone '<' at the very end of the buffer: wait for more
                    break
                if k < 0:
                    # the construct is not terminated in the buffer
                    if not end:
                        break
                    if self.strict:
                        self.error("EOF in middle of construct")
                    # at EOF: emit the broken construct as plain data, up to
                    # and including the next '>' or up to the next '<'
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # the pattern consumes one terminating character; give
                    # it back unless it is the ';' that closes the reference
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if rawdata.find(";", i) >= 0:  # bail by consuming &#
                        # Emit the "&#" found at the *current* position and
                        # step over it (not the first two buffer characters).
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        if self.strict:
                            self.error("EOF in middle of entity or char ref")
                        else:
                            i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        """Parse the '<!...' construct at index *i*.

        Dispatches to comment, marked-section, doctype or bogus-comment
        handling.  Returns the index just past the construct, or -1 if it
        is not terminated yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        """Consume a bogus comment ('<!...>' or '</...>') starting at *i*.

        The text between the two-character opener and the next '>' is passed
        to handle_comment() when *report* is true.  Returns the index just
        past the '>', or -1 if there is no '>' yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the processing instruction at *i* and call handle_pi().

        Returns the index just past the closing '>', or -1 if not terminated.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at *i* and dispatch to the tag handlers.

        Returns the index just past the tag, or a negative value if the tag
        is not complete yet.  Tag and attribute names are lower-cased;
        quoted attribute values are unquoted and unescaped.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        if self.strict:
            tag_re, attr_re = tagfind, attrfind
        else:
            tag_re, attr_re = tagfind_tolerant, attrfind_tolerant
        attrs = []
        match = tag_re.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = attr_re.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # attribute without '=value'
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # junk between the last attribute and the end of the tag
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the end index of the start tag at *i*, or -1 if incomplete.

        In strict mode malformed tags are reported through error(); otherwise
        an index past the malformed part is returned so parsing can go on.
        """
        rawdata = self.rawdata
        if self.strict:
            m = locatestarttagend.match(rawdata, i)
        else:
            m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    # NOTE(review): always true here since nextchar == "/",
                    # so the code below in this branch looks unreachable.
                    return -1
                # else bogus input
                if self.strict:
                    self.updatepos(i, j + 1)
                    self.error("malformed empty start tag")
                if j > i:
                    return j
                else:
                    return i + 1
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if self.strict:
                self.updatepos(i, j)
                self.error("malformed start tag")
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag at *i* and call handle_endtag().

        Returns the index just past the tag, or -1 if no '>' has been seen
        yet.  In raw-text mode, end tags other than the one that closes the
        current element are passed to handle_data() instead.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            if self.strict:
                self.error("bad end tag: %r" % (rawdata[i:gtpos],))
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle '<tag .../>'; by default a start tag followed by an end tag."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called with the lower-cased tag name and a list of (name, value)."""
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called with the lower-cased name of an end tag."""
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called with the text between '&#' and the terminating character."""
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called with the name of a '&name;' entity reference."""
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Called with a chunk of character data."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Called with the content of a (possibly bogus) comment."""
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Called with the declaration text, e.g. 'DOCTYPE html'."""
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called with the text between '<?' and the closing '>'."""
        pass

    def unknown_decl(self, data):
        """Report an unknown declaration; an error only in strict mode."""
        if self.strict:
            self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Return *s* with character and entity references replaced.

        References that cannot be resolved (unknown names, invalid or
        out-of-range code points) are left in the text unchanged.
        """
        if '&' not in s:
            return s
        # imported once per call rather than once per reference
        from html.entities import html5

        def replaceEntities(match):
            ref = match.group(1)
            if ref[0] == "#":
                ref = ref[1:]
                try:
                    if ref[0] in ('x', 'X'):
                        c = int(ref[1:].rstrip(';'), 16)
                    else:
                        c = int(ref.rstrip(';'))
                    return chr(c)
                except (ValueError, OverflowError):
                    # not a number, or a number chr() rejects (too large
                    # values raise OverflowError): keep the reference as-is
                    return '&#' + ref
            if ref in html5:
                return html5[ref]
            if ref.endswith(';'):
                return '&' + ref
            # no trailing ';': try the longest-last prefix that is a known
            # entity name and keep the remaining characters as text
            for x in range(2, len(ref)):
                if ref[:x] in html5:
                    return html5[ref[:x]] + ref[x:]
            return '&' + ref

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
                      replaceEntities, s, flags=re.ASCII)