blob: e793c37cd8009190c85d8b8a6bf122f6adcca8bb [file] [log] [blame]
Fred Drake1d4601d2001-08-03 19:50:59 +00001"""A parser for HTML and XHTML."""
Guido van Rossum8846d712001-05-18 14:50:52 +00002
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
11import re
Ezio Melotti3861d8b2012-06-23 15:27:51 +020012import warnings
Ezio Melotti4a9ee262013-11-19 20:28:45 +020013import _markupbase
14
15from html import unescape
16
Guido van Rossum8846d712001-05-18 14:50:52 +000017
Ezio Melotti1698bab2013-05-01 16:09:34 +030018__all__ = ['HTMLParser']
19
# Regular expressions used for parsing.
#
# Every pattern is written as a raw string so that regex escapes such as
# \s or \t reach the re module untouched instead of being processed (or
# rejected as invalid escapes) by the Python string-literal parser.

interesting_normal = re.compile(r'[&<]')
incomplete = re.compile(r'&[a-zA-Z#]')

entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile(r'<[a-zA-Z]')
piclose = re.compile(r'>')
commentclose = re.compile(r'--\s*>')
# Note:
#  1) the strict attrfind isn't really strict, but we can't make it
#     correctly strict without breaking backward compatibility;
#  2) if you change tagfind/attrfind remember to update locatestarttagend too;
#  3) if you change tagfind/attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile(r'>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
80
Guido van Rossum8846d712001-05-18 14:50:52 +000081
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: the error message (must be truthy).
        lineno: first item of *position*; None when unknown.
        offset: second item of *position*; None when unknown.  It is
            shown incremented by one in the string form.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno, self.offset = position[0], position[1]

    def __str__(self):
        # Build "<msg>[, at line N][, column M]" from the known parts.
        pieces = [self.msg]
        if self.lineno is not None:
            pieces.append(", at line %d" % self.lineno)
        if self.offset is not None:
            pieces.append(", column %d" % (self.offset + 1))
        return "".join(pieces)
98
99
Ezio Melotti88ebfb12013-11-02 17:08:24 +0200100_strict_sentinel = object()
101
Fred Drakecb5c80f2007-12-07 11:10:11 +0000102class HTMLParser(_markupbase.ParserBase):
Fred Drake1d4601d2001-08-03 19:50:59 +0000103 """Find tags and other markup and call handler functions.
104
105 Usage:
106 p = HTMLParser()
107 p.feed(data)
108 ...
109 p.close()
110
111 Start tags are handled by calling self.handle_starttag() or
112 self.handle_startendtag(); end tags by self.handle_endtag(). The
113 data between tags is passed from the parser to the derived class
114 by calling self.handle_data() with the data as argument (the data
115 may be split up in arbitrary chunks). Entity references are
116 passed by calling self.handle_entityref() with the entity
117 reference as the argument. Numeric character references are
118 passed to self.handle_charref() with the string containing the
119 reference as the argument.
120 """
Guido van Rossum8846d712001-05-18 14:50:52 +0000121
122 CDATA_CONTENT_ELEMENTS = ("script", "style")
123
def __init__(self, strict=_strict_sentinel):
    """Initialize and reset this instance.

    The *strict* argument and the strict mode are deprecated.  When the
    argument is omitted, ``self.strict`` is set to False and the parser
    accepts invalid markup.  Passing any explicit value emits a
    DeprecationWarning and stores that value in ``self.strict``; a true
    value makes the parser raise an error on invalid markup.
    """
    if strict is _strict_sentinel:
        strict = False  # default
    else:
        warnings.warn("The strict argument and mode are deprecated.",
                      DeprecationWarning, stacklevel=2)
    self.strict = strict
    self.reset()
138
def reset(self):
    """Reset this instance.  Loses all unprocessed data.

    Clears the input buffer, leaves CDATA mode, restores the default
    "interesting" pattern and the placeholder last-tag name, then resets
    the _markupbase.ParserBase state.
    """
    self.cdata_elem = None
    self.interesting = interesting_normal
    self.lasttag = '???'
    self.rawdata = ''
    _markupbase.ParserBase.reset(self)
Guido van Rossum8846d712001-05-18 14:50:52 +0000146
def feed(self, data):
    r"""Feed data to the parser.

    May be called any number of times, with as little or as much text
    per call as desired (the text may include '\n').  The text is
    appended to ``self.rawdata`` and goahead() is then run without
    forcing end-of-input handling.
    """
    self.rawdata += data
    self.goahead(0)
155
def close(self):
    """Handle any buffered data.

    Runs goahead() with a true *end* flag, i.e. asking it to treat the
    buffer as complete.
    """
    self.goahead(1)
159
def error(self, message):
    """Raise HTMLParseError for *message* at the current position.

    This method is deprecated: a DeprecationWarning is issued before
    the exception is raised.
    """
    warnings.warn("The 'error' method is deprecated.",
                  DeprecationWarning, stacklevel=2)
    position = self.getpos()
    raise HTMLParseError(message, position)
Guido van Rossum8846d712001-05-18 14:50:52 +0000164
165 __starttag_text = None
166
def get_starttag_text(self):
    """Return the stored full source of a start tag ('<...>').

    The stored value is None until a start tag has been recorded.
    """
    text = self.__starttag_text
    return text
170
def set_cdata_mode(self, elem):
    """Enter CDATA mode for the element named *elem*.

    The lower-cased name is stored in ``self.cdata_elem`` and
    ``self.interesting`` becomes a case-insensitive pattern that matches
    only the end tag of that element.
    """
    name = elem.lower()
    self.cdata_elem = name
    self.interesting = re.compile(r'</\s*%s\s*>' % name, re.I)
Guido van Rossum8846d712001-05-18 14:50:52 +0000174
def clear_cdata_mode(self):
    """Leave CDATA mode.

    Forgets the current CDATA element and restores the default pattern
    used to look for the next piece of markup.
    """
    self.cdata_elem = None
    self.interesting = interesting_normal
Guido van Rossum8846d712001-05-18 14:50:52 +0000178
179 # Internal -- handle data as far as reasonable. May leave state
180 # and data to be processed by a subsequent call. If 'end' is
181 # true, force handling all data as if followed by EOF marker.
def goahead(self, end):
    """Handle buffered data as far as reasonable.

    Scans ``self.rawdata`` for markup ('<' or '&', or the closing tag of
    the current CDATA element), reports plain text through
    handle_data() and dispatches each construct to the matching
    parse_*/handle_* method.  Whatever could not be consumed is stored
    back in ``self.rawdata`` for a subsequent call.

    If *end* is true, handling of all data is forced as if it were
    followed by an EOF marker.
    """
    rawdata = self.rawdata
    i = 0
    n = len(rawdata)
    while i < n:
        match = self.interesting.search(rawdata, i)  # < or &
        if match:
            j = match.start()
        else:
            if self.cdata_elem:
                # inside <script>/<style>: wait for the closing tag
                break
            j = n
        if i < j:
            self.handle_data(rawdata[i:j])
        i = self.updatepos(i, j)
        if i == n:
            break
        startswith = rawdata.startswith
        if startswith('<', i):
            if starttagopen.match(rawdata, i):  # < + letter
                k = self.parse_starttag(i)
            elif startswith("</", i):
                k = self.parse_endtag(i)
            elif startswith("<!--", i):
                k = self.parse_comment(i)
            elif startswith("<?", i):
                k = self.parse_pi(i)
            elif startswith("<!", i):
                if self.strict:
                    k = self.parse_declaration(i)
                else:
                    k = self.parse_html_declaration(i)
            elif (i + 1) < n:
                self.handle_data("<")
                k = i + 1
            else:
                break
            if k < 0:
                # the construct is not terminated in the buffer
                if not end:
                    break
                if self.strict:
                    self.error("EOF in middle of construct")
                # at EOF: emit the broken construct as plain data, up to
                # and including the next '>' or up to the next '<'
                k = rawdata.find('>', i + 1)
                if k < 0:
                    k = rawdata.find('<', i + 1)
                    if k < 0:
                        k = i + 1
                else:
                    k += 1
                self.handle_data(rawdata[i:k])
            i = self.updatepos(i, k)
        elif startswith("&#", i):
            match = charref.match(rawdata, i)
            if match:
                name = match.group()[2:-1]
                self.handle_charref(name)
                k = match.end()
                if not startswith(';', k-1):
                    # the terminator was not ';': leave it in the buffer
                    k = k - 1
                i = self.updatepos(i, k)
                continue
            else:
                if ";" in rawdata[i:]:  # bail by consuming &#
                    # Use offsets relative to i: the '&#' being skipped
                    # is at the current position, not at buffer start.
                    self.handle_data(rawdata[i:i+2])
                    i = self.updatepos(i, i+2)
                break
        elif startswith('&', i):
            match = entityref.match(rawdata, i)
            if match:
                name = match.group(1)
                self.handle_entityref(name)
                k = match.end()
                if not startswith(';', k-1):
                    k = k - 1
                i = self.updatepos(i, k)
                continue
            match = incomplete.match(rawdata, i)
            if match:
                # match.group() will contain at least 2 chars
                if end and match.group() == rawdata[i:]:
                    if self.strict:
                        self.error("EOF in middle of entity or char ref")
                    else:
                        # NOTE(review): this steps over the '&', so the
                        # final flush below emits the text without it;
                        # kept as is -- confirm whether that is intended.
                        i = self.updatepos(i, i + 1)
                # incomplete
                break
            elif (i + 1) < n:
                # not the end of the buffer, and can't be confused
                # with some other construct
                self.handle_data("&")
                i = self.updatepos(i, i + 1)
            else:
                break
        else:
            assert 0, "interesting.search() lied"
    # end while
    if end and i < n and not self.cdata_elem:
        self.handle_data(rawdata[i:n])
        i = self.updatepos(i, n)
    self.rawdata = rawdata[i:]
283
Ezio Melottif4ab4912012-02-13 15:50:37 +0200284 # Internal -- parse html declarations, return length or -1 if not terminated
285 # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
286 # See also parse_declaration in _markupbase
def parse_html_declaration(self, i):
    """Parse the '<!...' construct that starts at index *i* of the buffer.

    Comments, marked sections ('<![') and anything unrecognised are
    delegated to parse_comment(), parse_marked_section() and
    parse_bogus_comment() respectively, and their result is returned.
    A doctype is handled here: the text between '<!' and the closing
    '>' goes to handle_decl() and the index just past the '>' is
    returned, or -1 if no '>' is in the buffer yet.
    """
    rawdata = self.rawdata
    assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                    'parse_html_declaration()')
    if rawdata[i:i+4] == '<!--':
        # this case is actually already handled in goahead()
        return self.parse_comment(i)
    if rawdata[i:i+3] == '<![':
        return self.parse_marked_section(i)
    if rawdata[i:i+9].lower() != '<!doctype':
        return self.parse_bogus_comment(i)
    # doctype: find the closing >
    gtpos = rawdata.find('>', i+9)
    if gtpos == -1:
        return -1
    self.handle_decl(rawdata[i+2:gtpos])
    return gtpos+1
305
Ezio Melottifa3702d2012-02-10 10:45:44 +0200306 # Internal -- parse bogus comment, return length or -1 if not terminated
307 # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
def parse_bogus_comment(self, i, report=1):
    """Parse a bogus comment that starts at index *i* of the buffer.

    The buffer must hold '<!' or '</' at *i*.  Everything up to the next
    '>' is treated as the comment; if *report* is true, the text between
    the two-character opener and that '>' is passed to handle_comment().

    Returns the index just past the '>', or -1 if the buffer holds no
    '>' after the opener.

    see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    """
    rawdata = self.rawdata
    # The message names this method (it used to say parse_comment()).
    assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                            'parse_bogus_comment()')
    pos = rawdata.find('>', i+2)
    if pos == -1:
        return -1
    if report:
        self.handle_comment(rawdata[i+2:pos])
    return pos + 1
318
Guido van Rossum8846d712001-05-18 14:50:52 +0000319 # Internal -- parse processing instr, return end or -1 if not terminated
def parse_pi(self, i):
    """Parse the processing instruction that starts at index *i*.

    The buffer must hold '<?' at *i*.  The text between '<?' and the
    next '>' is passed to handle_pi().

    Returns the index just past that '>', or -1 if the instruction is
    not terminated in the buffer.
    """
    rawdata = self.rawdata
    assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
    closer = piclose.search(rawdata, i+2)  # >
    if closer is None:
        return -1
    self.handle_pi(rawdata[i+2:closer.start()])
    return closer.end()
330
331 # Internal -- handle starttag, return end or -1 if not terminated
def parse_starttag(self, i):
    """Handle the start tag that begins at index *i* of the buffer.

    The raw tag text is recorded for get_starttag_text(), the lower-cased
    tag name is stored in ``self.lasttag`` and the attributes are
    collected as (lower-cased name, value) pairs.  A value is None when
    the attribute has no '=value' part; surrounding quotes are removed
    and non-empty values are passed through html.unescape().

    Depending on how the tag ends, handle_startendtag() ('/>') or
    handle_starttag() ('>') is called; after a plain start tag whose
    name is in CDATA_CONTENT_ELEMENTS, CDATA mode is entered.  If
    anything else is left before the end of the tag, the whole tag is
    reported through handle_data() (strict mode calls error() first).

    Returns the end index of the tag, or the negative value returned by
    check_for_whole_start_tag() when the tag is not complete.
    """
    self.__starttag_text = None
    endpos = self.check_for_whole_start_tag(i)
    if endpos < 0:
        return endpos
    rawdata = self.rawdata
    self.__starttag_text = rawdata[i:endpos]

    # Now parse the data between i+1 and endpos into a tag and attrs
    attrs = []
    if self.strict:
        match = tagfind.match(rawdata, i+1)
    else:
        match = tagfind_tolerant.match(rawdata, i+1)
    assert match, 'unexpected call to parse_starttag()'
    k = match.end()
    self.lasttag = tag = match.group(1).lower()
    while k < endpos:
        if self.strict:
            m = attrfind.match(rawdata, k)
        else:
            m = attrfind_tolerant.match(rawdata, k)
        if not m:
            break
        attrname, rest, attrvalue = m.group(1, 2, 3)
        if not rest:
            # attribute without '=value'
            attrvalue = None
        elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
             attrvalue[:1] == '"' == attrvalue[-1:]:
            attrvalue = attrvalue[1:-1]
        if attrvalue:
            attrvalue = unescape(attrvalue)
        attrs.append((attrname.lower(), attrvalue))
        k = m.end()

    end = rawdata[k:endpos].strip()
    if end not in (">", "/>"):
        # Junk before the end of the tag.  The line/column bookkeeping
        # that used to be computed here was never used and is gone.
        if self.strict:
            self.error("junk characters in start tag: %r"
                       % (rawdata[k:endpos][:20],))
        self.handle_data(rawdata[i:endpos])
        return endpos
    if end.endswith('/>'):
        # XHTML-style empty tag: <span attr="value" />
        self.handle_startendtag(tag, attrs)
    else:
        self.handle_starttag(tag, attrs)
        if tag in self.CDATA_CONTENT_ELEMENTS:
            self.set_cdata_mode(tag)
    return endpos
389
390 # Internal -- check to see if we have a complete starttag; return end
391 # or -1 if incomplete.
def check_for_whole_start_tag(self, i):
    """Check whether the buffer holds a complete start tag at index *i*.

    Returns the index just past the tag's closing '>' (or '/>'), or -1
    if the tag is incomplete.  For a malformed tag, strict mode calls
    error(); otherwise the index where matching stopped is returned, or
    i + 1 if that index is not past *i*.
    """
    rawdata = self.rawdata
    if self.strict:
        located = locatestarttagend.match(rawdata, i)
    else:
        located = locatestarttagend_tolerant.match(rawdata, i)
    if not located:
        raise AssertionError("we should not get here!")

    j = located.end()
    following = rawdata[j:j+1]
    if following == ">":
        return j + 1
    if following == "/":
        if rawdata.startswith("/>", j):
            return j + 2
        if rawdata.startswith("/", j):
            # buffer boundary
            return -1
        # else bogus input
        # NOTE(review): this looks unreachable -- the character at j is
        # '/', so the test just above always succeeds.
        if self.strict:
            self.updatepos(i, j + 1)
            self.error("malformed empty start tag")
        return j if j > i else i + 1
    if following == "":
        # end of input
        return -1
    if following in ("abcdefghijklmnopqrstuvwxyz=/"
                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
        # end of input in or before attribute value, or we have the
        # '/' from a '/>' ending
        return -1
    if self.strict:
        self.updatepos(i, j)
        self.error("malformed start tag")
    return j if j > i else i + 1
Guido van Rossum8846d712001-05-18 14:50:52 +0000433
434 # Internal -- parse endtag, return end or -1 if incomplete
def parse_endtag(self, i):
    """Parse the end tag that starts at index *i* of the buffer.

    The buffer must hold '</' at *i*.  Returns the index just past the
    construct that was consumed, or -1 if no '>' follows yet.

    A well-formed '</name>' is reported through handle_endtag() with the
    lower-cased name and ends CDATA mode, unless a CDATA element is
    active and the name is not that element, in which case the tag text
    is reported as data.  A malformed end tag is reported as data while
    in CDATA mode and raises through error() in strict mode; otherwise
    '</>' is skipped, a tag with no recognisable name is handled as a
    bogus comment, and anything else is reported by name with the text
    up to the next '>' ignored.
    """
    rawdata = self.rawdata
    assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
    closer = endendtag.search(rawdata, i+1)  # >
    if not closer:
        return -1
    gtpos = closer.end()

    wellformed = endtagfind.match(rawdata, i)  # </ + tag + >
    if wellformed:
        elem = wellformed.group(1).lower()  # script or style
        if self.cdata_elem is not None and elem != self.cdata_elem:
            self.handle_data(rawdata[i:gtpos])
            return gtpos
        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # From here on the end tag is malformed.
    if self.cdata_elem is not None:
        self.handle_data(rawdata[i:gtpos])
        return gtpos
    if self.strict:
        self.error("bad end tag: %r" % (rawdata[i:gtpos],))
    # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
    namematch = tagfind_tolerant.match(rawdata, i+2)
    if not namematch:
        # w3.org/TR/html5/tokenization.html#end-tag-open-state
        if rawdata[i:i+3] == '</>':
            return i+3
        return self.parse_bogus_comment(i)
    tagname = namematch.group(1).lower()
    # consume and ignore other stuff between the name and the >
    # Note: this is not 100% correct, since we might have things like
    # </tag attr=">">, but looking for > after the name should cover
    # most of the cases and is much simpler
    gtpos = rawdata.find('>', namematch.end())
    self.handle_endtag(tagname)
    return gtpos+1
Guido van Rossum8846d712001-05-18 14:50:52 +0000475
476 # Overridable -- finish processing of start+end tag: <tag.../>
def handle_startendtag(self, tag, attrs):
    """Finish processing of a start+end tag: <tag.../>.

    The default implementation reports it as a start tag followed by an
    end tag for the same name.
    """
    self.handle_starttag(tag, attrs)
    self.handle_endtag(tag)
480
481 # Overridable -- handle start tag
def handle_starttag(self, tag, attrs):
    """Hook called for a start tag; does nothing unless overridden."""
484
485 # Overridable -- handle end tag
def handle_endtag(self, tag):
    """Hook called for an end tag; does nothing unless overridden."""
488
489 # Overridable -- handle character reference
def handle_charref(self, name):
    """Hook called for a numeric character reference; no-op by default."""
492
493 # Overridable -- handle entity reference
def handle_entityref(self, name):
    """Hook called for an entity reference; no-op by default."""
496
497 # Overridable -- handle data
def handle_data(self, data):
    """Hook called for character data; does nothing unless overridden."""
500
501 # Overridable -- handle comment
def handle_comment(self, data):
    """Hook called for a comment; does nothing unless overridden."""
504
505 # Overridable -- handle declaration
def handle_decl(self, decl):
    """Hook called for a declaration; does nothing unless overridden."""
508
509 # Overridable -- handle processing instruction
def handle_pi(self, data):
    """Hook called for a processing instruction; no-op by default."""
512
def unknown_decl(self, data):
    """Handle an unknown declaration.

    In strict mode this is reported through error(); otherwise the
    declaration is silently ignored.
    """
    if not self.strict:
        return
    self.error("unknown declaration: %r" % (data,))