blob: 2d3bef351b0ad0fe2285166908dec2ba4f9070ca [file] [log] [blame]
Fred Drake1d4601d2001-08-03 19:50:59 +00001"""A parser for HTML and XHTML."""
Guido van Rossum8846d712001-05-18 14:50:52 +00002
# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
9
10
Fred Drakecb5c80f2007-12-07 11:10:11 +000011import _markupbase
Guido van Rossum8846d712001-05-18 14:50:52 +000012import re
Ezio Melotti3861d8b2012-06-23 15:27:51 +020013import warnings
Guido van Rossum8846d712001-05-18 14:50:52 +000014
# Regular expressions used for parsing.
#
# Every pattern that contains a regex escape is written as a raw string so
# that sequences such as \s reach the re module untouched instead of being
# (invalid) Python string escapes.

# Next character that may start markup in normal (non-CDATA) mode.
interesting_normal = re.compile('[&<]')
# Possible start of an entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume the single character that follows
# the reference (a ';' or any other terminator).
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
#  1) the strict attrfind isn't really strict, but we can't make it
#     correctly strict without breaking backward compatibility;
#  2) if you change tagfind/attrfind remember to update locatestarttagend too;
#  3) if you change tagfind/attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
75
Guido van Rossum8846d712001-05-18 14:50:52 +000076
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: the error message.
        lineno: first item of *position* (None when not supplied).
        offset: second item of *position* (None when not supplied);
            __str__ reports it as a column number, i.e. offset + 1.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        # Hand the arguments to Exception so that ``args`` is populated;
        # otherwise the exception cannot be rebuilt from ``args`` (which is
        # what copying and pickling do).
        super().__init__(msg, position)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by the line/column when known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            result = result + ", column %d" % (self.offset + 1)
        return result
93
94
Fred Drakecb5c80f2007-12-07 11:10:11 +000095class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """
Guido van Rossum8846d712001-05-18 14:50:52 +0000114
115 CDATA_CONTENT_ELEMENTS = ("script", "style")
116
Ezio Melotti3861d8b2012-06-23 15:27:51 +0200117 def __init__(self, strict=False):
R. David Murrayb579dba2010-12-03 04:06:39 +0000118 """Initialize and reset this instance.
Guido van Rossum8846d712001-05-18 14:50:52 +0000119
Ezio Melotti3861d8b2012-06-23 15:27:51 +0200120 If strict is set to False (the default) the parser will parse invalid
121 markup, otherwise it will raise an error. Note that the strict mode
122 is deprecated.
R. David Murrayb579dba2010-12-03 04:06:39 +0000123 """
Ezio Melotti3861d8b2012-06-23 15:27:51 +0200124 if strict:
125 warnings.warn("The strict mode is deprecated.",
126 DeprecationWarning, stacklevel=2)
R. David Murrayb579dba2010-12-03 04:06:39 +0000127 self.strict = strict
Guido van Rossum8846d712001-05-18 14:50:52 +0000128 self.reset()
129
Guido van Rossum8846d712001-05-18 14:50:52 +0000130 def reset(self):
Fred Drake1d4601d2001-08-03 19:50:59 +0000131 """Reset this instance. Loses all unprocessed data."""
Guido van Rossum8846d712001-05-18 14:50:52 +0000132 self.rawdata = ''
Guido van Rossum8846d712001-05-18 14:50:52 +0000133 self.lasttag = '???'
Guido van Rossum8846d712001-05-18 14:50:52 +0000134 self.interesting = interesting_normal
Ezio Melotti7de56f62011-11-01 14:12:22 +0200135 self.cdata_elem = None
Fred Drakecb5c80f2007-12-07 11:10:11 +0000136 _markupbase.ParserBase.reset(self)
Guido van Rossum8846d712001-05-18 14:50:52 +0000137
Guido van Rossum8846d712001-05-18 14:50:52 +0000138 def feed(self, data):
Éric Araujo39f180b2011-05-04 15:55:47 +0200139 r"""Feed data to the parser.
Fred Drake1d4601d2001-08-03 19:50:59 +0000140
141 Call this as often as you want, with as little or as much text
142 as you want (may include '\n').
143 """
Guido van Rossum8846d712001-05-18 14:50:52 +0000144 self.rawdata = self.rawdata + data
145 self.goahead(0)
146
Guido van Rossum8846d712001-05-18 14:50:52 +0000147 def close(self):
Fred Drake1d4601d2001-08-03 19:50:59 +0000148 """Handle any buffered data."""
Guido van Rossum8846d712001-05-18 14:50:52 +0000149 self.goahead(1)
150
Fred Drakebfc8fea2001-09-24 20:10:28 +0000151 def error(self, message):
152 raise HTMLParseError(message, self.getpos())
Guido van Rossum8846d712001-05-18 14:50:52 +0000153
154 __starttag_text = None
155
Guido van Rossum8846d712001-05-18 14:50:52 +0000156 def get_starttag_text(self):
Fred Drake1d4601d2001-08-03 19:50:59 +0000157 """Return full source of start tag: '<...>'."""
Guido van Rossum8846d712001-05-18 14:50:52 +0000158 return self.__starttag_text
159
Ezio Melotti7de56f62011-11-01 14:12:22 +0200160 def set_cdata_mode(self, elem):
Ezio Melotti7de56f62011-11-01 14:12:22 +0200161 self.cdata_elem = elem.lower()
Ezio Melotti15cb4892011-11-18 18:01:49 +0200162 self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
Guido van Rossum8846d712001-05-18 14:50:52 +0000163
164 def clear_cdata_mode(self):
165 self.interesting = interesting_normal
Ezio Melotti7de56f62011-11-01 14:12:22 +0200166 self.cdata_elem = None
Guido van Rossum8846d712001-05-18 14:50:52 +0000167
168 # Internal -- handle data as far as reasonable. May leave state
169 # and data to be processed by a subsequent call. If 'end' is
170 # true, force handling all data as if followed by EOF marker.
171 def goahead(self, end):
172 rawdata = self.rawdata
173 i = 0
174 n = len(rawdata)
175 while i < n:
176 match = self.interesting.search(rawdata, i) # < or &
177 if match:
178 j = match.start()
179 else:
Ezio Melotti15cb4892011-11-18 18:01:49 +0200180 if self.cdata_elem:
181 break
Guido van Rossum8846d712001-05-18 14:50:52 +0000182 j = n
183 if i < j: self.handle_data(rawdata[i:j])
184 i = self.updatepos(i, j)
185 if i == n: break
Fred Drake248b0432001-12-03 17:09:50 +0000186 startswith = rawdata.startswith
187 if startswith('<', i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000188 if starttagopen.match(rawdata, i): # < + letter
189 k = self.parse_starttag(i)
Fred Drake248b0432001-12-03 17:09:50 +0000190 elif startswith("</", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000191 k = self.parse_endtag(i)
Fred Drake248b0432001-12-03 17:09:50 +0000192 elif startswith("<!--", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000193 k = self.parse_comment(i)
Fred Drake248b0432001-12-03 17:09:50 +0000194 elif startswith("<?", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000195 k = self.parse_pi(i)
Fred Drake248b0432001-12-03 17:09:50 +0000196 elif startswith("<!", i):
Ezio Melottifa3702d2012-02-10 10:45:44 +0200197 if self.strict:
198 k = self.parse_declaration(i)
199 else:
Ezio Melottif4ab4912012-02-13 15:50:37 +0200200 k = self.parse_html_declaration(i)
Fred Drake68eac2b2001-09-04 15:10:16 +0000201 elif (i + 1) < n:
Fred Drake029acfb2001-08-20 21:24:19 +0000202 self.handle_data("<")
203 k = i + 1
Fred Drake68eac2b2001-09-04 15:10:16 +0000204 else:
205 break
Guido van Rossum8846d712001-05-18 14:50:52 +0000206 if k < 0:
R. David Murrayb579dba2010-12-03 04:06:39 +0000207 if not end:
208 break
209 if self.strict:
Fred Drakebfc8fea2001-09-24 20:10:28 +0000210 self.error("EOF in middle of construct")
R. David Murrayb579dba2010-12-03 04:06:39 +0000211 k = rawdata.find('>', i + 1)
212 if k < 0:
213 k = rawdata.find('<', i + 1)
214 if k < 0:
215 k = i + 1
216 else:
217 k += 1
218 self.handle_data(rawdata[i:k])
Guido van Rossum8846d712001-05-18 14:50:52 +0000219 i = self.updatepos(i, k)
Fred Drake248b0432001-12-03 17:09:50 +0000220 elif startswith("&#", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000221 match = charref.match(rawdata, i)
222 if match:
Fred Drake1d4601d2001-08-03 19:50:59 +0000223 name = match.group()[2:-1]
Guido van Rossum8846d712001-05-18 14:50:52 +0000224 self.handle_charref(name)
225 k = match.end()
Fred Drake248b0432001-12-03 17:09:50 +0000226 if not startswith(';', k-1):
Fred Drake029acfb2001-08-20 21:24:19 +0000227 k = k - 1
Guido van Rossum8846d712001-05-18 14:50:52 +0000228 i = self.updatepos(i, k)
229 continue
Fred Drake68eac2b2001-09-04 15:10:16 +0000230 else:
Victor Stinnere021f4b2010-05-24 21:46:25 +0000231 if ";" in rawdata[i:]: #bail by consuming &#
232 self.handle_data(rawdata[0:2])
233 i = self.updatepos(i, 2)
Fred Drake68eac2b2001-09-04 15:10:16 +0000234 break
Fred Drake248b0432001-12-03 17:09:50 +0000235 elif startswith('&', i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000236 match = entityref.match(rawdata, i)
237 if match:
238 name = match.group(1)
239 self.handle_entityref(name)
240 k = match.end()
Fred Drake248b0432001-12-03 17:09:50 +0000241 if not startswith(';', k-1):
Fred Drake029acfb2001-08-20 21:24:19 +0000242 k = k - 1
Guido van Rossum8846d712001-05-18 14:50:52 +0000243 i = self.updatepos(i, k)
244 continue
Fred Drake029acfb2001-08-20 21:24:19 +0000245 match = incomplete.match(rawdata, i)
246 if match:
Fred Drake68eac2b2001-09-04 15:10:16 +0000247 # match.group() will contain at least 2 chars
Fred Drake248b0432001-12-03 17:09:50 +0000248 if end and match.group() == rawdata[i:]:
R. David Murrayb579dba2010-12-03 04:06:39 +0000249 if self.strict:
250 self.error("EOF in middle of entity or char ref")
251 else:
Ezio Melotti8e596a72013-05-01 16:18:25 +0300252 k = match.end()
R. David Murrayb579dba2010-12-03 04:06:39 +0000253 if k <= i:
254 k = n
255 i = self.updatepos(i, i + 1)
Fred Drake68eac2b2001-09-04 15:10:16 +0000256 # incomplete
257 break
258 elif (i + 1) < n:
259 # not the end of the buffer, and can't be confused
260 # with some other construct
261 self.handle_data("&")
262 i = self.updatepos(i, i + 1)
263 else:
264 break
Guido van Rossum8846d712001-05-18 14:50:52 +0000265 else:
266 assert 0, "interesting.search() lied"
267 # end while
Ezio Melotti15cb4892011-11-18 18:01:49 +0200268 if end and i < n and not self.cdata_elem:
Guido van Rossum8846d712001-05-18 14:50:52 +0000269 self.handle_data(rawdata[i:n])
270 i = self.updatepos(i, n)
271 self.rawdata = rawdata[i:]
272
Ezio Melottif4ab4912012-02-13 15:50:37 +0200273 # Internal -- parse html declarations, return length or -1 if not terminated
274 # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
275 # See also parse_declaration in _markupbase
276 def parse_html_declaration(self, i):
277 rawdata = self.rawdata
Ezio Melotti3861d8b2012-06-23 15:27:51 +0200278 assert rawdata[i:i+2] == '<!', ('unexpected call to '
279 'parse_html_declaration()')
Ezio Melottif4ab4912012-02-13 15:50:37 +0200280 if rawdata[i:i+4] == '<!--':
Ezio Melottie31dded2012-02-13 20:20:00 +0200281 # this case is actually already handled in goahead()
Ezio Melottif4ab4912012-02-13 15:50:37 +0200282 return self.parse_comment(i)
283 elif rawdata[i:i+3] == '<![':
284 return self.parse_marked_section(i)
285 elif rawdata[i:i+9].lower() == '<!doctype':
286 # find the closing >
Ezio Melottie31dded2012-02-13 20:20:00 +0200287 gtpos = rawdata.find('>', i+9)
Ezio Melottif4ab4912012-02-13 15:50:37 +0200288 if gtpos == -1:
289 return -1
290 self.handle_decl(rawdata[i+2:gtpos])
291 return gtpos+1
292 else:
293 return self.parse_bogus_comment(i)
294
Ezio Melottifa3702d2012-02-10 10:45:44 +0200295 # Internal -- parse bogus comment, return length or -1 if not terminated
296 # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
297 def parse_bogus_comment(self, i, report=1):
298 rawdata = self.rawdata
Ezio Melotti3861d8b2012-06-23 15:27:51 +0200299 assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
300 'parse_comment()')
Ezio Melottifa3702d2012-02-10 10:45:44 +0200301 pos = rawdata.find('>', i+2)
302 if pos == -1:
303 return -1
304 if report:
305 self.handle_comment(rawdata[i+2:pos])
306 return pos + 1
307
Guido van Rossum8846d712001-05-18 14:50:52 +0000308 # Internal -- parse processing instr, return end or -1 if not terminated
309 def parse_pi(self, i):
310 rawdata = self.rawdata
311 assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
312 match = piclose.search(rawdata, i+2) # >
313 if not match:
314 return -1
315 j = match.start()
316 self.handle_pi(rawdata[i+2: j])
317 j = match.end()
318 return j
319
320 # Internal -- handle starttag, return end or -1 if not terminated
321 def parse_starttag(self, i):
322 self.__starttag_text = None
323 endpos = self.check_for_whole_start_tag(i)
324 if endpos < 0:
325 return endpos
326 rawdata = self.rawdata
327 self.__starttag_text = rawdata[i:endpos]
328
329 # Now parse the data between i+1 and j into a tag and attrs
330 attrs = []
Ezio Melotti7165d8b2013-11-07 18:33:24 +0200331 if self.strict:
332 match = tagfind.match(rawdata, i+1)
333 else:
334 match = tagfind_tolerant.match(rawdata, i+1)
Guido van Rossum8846d712001-05-18 14:50:52 +0000335 assert match, 'unexpected call to parse_starttag()'
336 k = match.end()
Ezio Melotti0780b6b2012-04-18 19:18:22 -0600337 self.lasttag = tag = match.group(1).lower()
Guido van Rossum8846d712001-05-18 14:50:52 +0000338 while k < endpos:
R. David Murrayb579dba2010-12-03 04:06:39 +0000339 if self.strict:
340 m = attrfind.match(rawdata, k)
341 else:
Ezio Melottif50ffa92011-10-28 13:21:09 +0300342 m = attrfind_tolerant.match(rawdata, k)
Guido van Rossum8846d712001-05-18 14:50:52 +0000343 if not m:
344 break
345 attrname, rest, attrvalue = m.group(1, 2, 3)
346 if not rest:
347 attrvalue = None
348 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
349 attrvalue[:1] == '"' == attrvalue[-1:]:
350 attrvalue = attrvalue[1:-1]
Ezio Melottic2fe5772011-11-14 18:53:33 +0200351 if attrvalue:
Guido van Rossum8846d712001-05-18 14:50:52 +0000352 attrvalue = self.unescape(attrvalue)
Fred Drake248b0432001-12-03 17:09:50 +0000353 attrs.append((attrname.lower(), attrvalue))
Guido van Rossum8846d712001-05-18 14:50:52 +0000354 k = m.end()
355
Fred Drake248b0432001-12-03 17:09:50 +0000356 end = rawdata[k:endpos].strip()
Guido van Rossum8846d712001-05-18 14:50:52 +0000357 if end not in (">", "/>"):
358 lineno, offset = self.getpos()
359 if "\n" in self.__starttag_text:
Fred Drake248b0432001-12-03 17:09:50 +0000360 lineno = lineno + self.__starttag_text.count("\n")
Guido van Rossum8846d712001-05-18 14:50:52 +0000361 offset = len(self.__starttag_text) \
Fred Drake248b0432001-12-03 17:09:50 +0000362 - self.__starttag_text.rfind("\n")
Guido van Rossum8846d712001-05-18 14:50:52 +0000363 else:
364 offset = offset + len(self.__starttag_text)
R. David Murrayb579dba2010-12-03 04:06:39 +0000365 if self.strict:
366 self.error("junk characters in start tag: %r"
367 % (rawdata[k:endpos][:20],))
368 self.handle_data(rawdata[i:endpos])
369 return endpos
Fred Drake248b0432001-12-03 17:09:50 +0000370 if end.endswith('/>'):
Guido van Rossum8846d712001-05-18 14:50:52 +0000371 # XHTML-style empty tag: <span attr="value" />
372 self.handle_startendtag(tag, attrs)
373 else:
374 self.handle_starttag(tag, attrs)
375 if tag in self.CDATA_CONTENT_ELEMENTS:
Ezio Melotti7de56f62011-11-01 14:12:22 +0200376 self.set_cdata_mode(tag)
Guido van Rossum8846d712001-05-18 14:50:52 +0000377 return endpos
378
379 # Internal -- check to see if we have a complete starttag; return end
380 # or -1 if incomplete.
381 def check_for_whole_start_tag(self, i):
382 rawdata = self.rawdata
R. David Murrayb579dba2010-12-03 04:06:39 +0000383 if self.strict:
384 m = locatestarttagend.match(rawdata, i)
385 else:
386 m = locatestarttagend_tolerant.match(rawdata, i)
Guido van Rossum8846d712001-05-18 14:50:52 +0000387 if m:
388 j = m.end()
389 next = rawdata[j:j+1]
390 if next == ">":
391 return j + 1
392 if next == "/":
Fred Drake248b0432001-12-03 17:09:50 +0000393 if rawdata.startswith("/>", j):
Guido van Rossum8846d712001-05-18 14:50:52 +0000394 return j + 2
Fred Drake248b0432001-12-03 17:09:50 +0000395 if rawdata.startswith("/", j):
Guido van Rossum8846d712001-05-18 14:50:52 +0000396 # buffer boundary
397 return -1
398 # else bogus input
R. David Murrayb579dba2010-12-03 04:06:39 +0000399 if self.strict:
400 self.updatepos(i, j + 1)
401 self.error("malformed empty start tag")
402 if j > i:
403 return j
404 else:
405 return i + 1
Guido van Rossum8846d712001-05-18 14:50:52 +0000406 if next == "":
407 # end of input
408 return -1
409 if next in ("abcdefghijklmnopqrstuvwxyz=/"
410 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
411 # end of input in or before attribute value, or we have the
412 # '/' from a '/>' ending
413 return -1
R. David Murrayb579dba2010-12-03 04:06:39 +0000414 if self.strict:
415 self.updatepos(i, j)
416 self.error("malformed start tag")
417 if j > i:
418 return j
419 else:
420 return i + 1
Fred Drakebfc8fea2001-09-24 20:10:28 +0000421 raise AssertionError("we should not get here!")
Guido van Rossum8846d712001-05-18 14:50:52 +0000422
423 # Internal -- parse endtag, return end or -1 if incomplete
424 def parse_endtag(self, i):
425 rawdata = self.rawdata
426 assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
427 match = endendtag.search(rawdata, i+1) # >
428 if not match:
429 return -1
Ezio Melotti5211ffe2012-02-13 11:24:50 +0200430 gtpos = match.end()
Guido van Rossum8846d712001-05-18 14:50:52 +0000431 match = endtagfind.match(rawdata, i) # </ + tag + >
432 if not match:
Ezio Melotti7de56f62011-11-01 14:12:22 +0200433 if self.cdata_elem is not None:
Ezio Melotti5211ffe2012-02-13 11:24:50 +0200434 self.handle_data(rawdata[i:gtpos])
435 return gtpos
R. David Murrayb579dba2010-12-03 04:06:39 +0000436 if self.strict:
Ezio Melotti5211ffe2012-02-13 11:24:50 +0200437 self.error("bad end tag: %r" % (rawdata[i:gtpos],))
438 # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
439 namematch = tagfind_tolerant.match(rawdata, i+2)
440 if not namematch:
441 # w3.org/TR/html5/tokenization.html#end-tag-open-state
442 if rawdata[i:i+3] == '</>':
443 return i+3
444 else:
445 return self.parse_bogus_comment(i)
Ezio Melotti7165d8b2013-11-07 18:33:24 +0200446 tagname = namematch.group(1).lower()
Ezio Melotti5211ffe2012-02-13 11:24:50 +0200447 # consume and ignore other stuff between the name and the >
448 # Note: this is not 100% correct, since we might have things like
449 # </tag attr=">">, but looking for > after tha name should cover
450 # most of the cases and is much simpler
451 gtpos = rawdata.find('>', namematch.end())
452 self.handle_endtag(tagname)
453 return gtpos+1
Ezio Melotti7de56f62011-11-01 14:12:22 +0200454
455 elem = match.group(1).lower() # script or style
456 if self.cdata_elem is not None:
457 if elem != self.cdata_elem:
Ezio Melotti5211ffe2012-02-13 11:24:50 +0200458 self.handle_data(rawdata[i:gtpos])
459 return gtpos
Ezio Melotti7de56f62011-11-01 14:12:22 +0200460
461 self.handle_endtag(elem.lower())
Fred Drake30d59ba2002-05-14 15:50:11 +0000462 self.clear_cdata_mode()
Ezio Melotti5211ffe2012-02-13 11:24:50 +0200463 return gtpos
Guido van Rossum8846d712001-05-18 14:50:52 +0000464
465 # Overridable -- finish processing of start+end tag: <tag.../>
466 def handle_startendtag(self, tag, attrs):
467 self.handle_starttag(tag, attrs)
468 self.handle_endtag(tag)
469
470 # Overridable -- handle start tag
471 def handle_starttag(self, tag, attrs):
472 pass
473
474 # Overridable -- handle end tag
475 def handle_endtag(self, tag):
476 pass
477
478 # Overridable -- handle character reference
479 def handle_charref(self, name):
480 pass
481
482 # Overridable -- handle entity reference
483 def handle_entityref(self, name):
484 pass
485
486 # Overridable -- handle data
487 def handle_data(self, data):
488 pass
489
490 # Overridable -- handle comment
491 def handle_comment(self, data):
492 pass
493
494 # Overridable -- handle declaration
495 def handle_decl(self, decl):
496 pass
497
498 # Overridable -- handle processing instruction
499 def handle_pi(self, data):
500 pass
501
Fred Drakebfc8fea2001-09-24 20:10:28 +0000502 def unknown_decl(self, data):
R. David Murrayb579dba2010-12-03 04:06:39 +0000503 if self.strict:
504 self.error("unknown declaration: %r" % (data,))
Fred Drakebfc8fea2001-09-24 20:10:28 +0000505
Guido van Rossum8846d712001-05-18 14:50:52 +0000506 # Internal -- helper to remove special character quoting
507 def unescape(self, s):
508 if '&' not in s:
509 return s
Guido van Rossumd8faa362007-04-27 19:54:29 +0000510 def replaceEntities(s):
511 s = s.groups()[0]
Senthil Kumaran164540f2010-12-28 15:55:16 +0000512 try:
513 if s[0] == "#":
514 s = s[1:]
515 if s[0] in ['x','X']:
Ezio Melotti46495182012-06-24 22:02:56 +0200516 c = int(s[1:].rstrip(';'), 16)
Senthil Kumaran164540f2010-12-28 15:55:16 +0000517 else:
Ezio Melotti46495182012-06-24 22:02:56 +0200518 c = int(s.rstrip(';'))
Senthil Kumaran164540f2010-12-28 15:55:16 +0000519 return chr(c)
520 except ValueError:
Ezio Melotti46495182012-06-24 22:02:56 +0200521 return '&#' + s
Guido van Rossumd8faa362007-04-27 19:54:29 +0000522 else:
Ezio Melotti46495182012-06-24 22:02:56 +0200523 from html.entities import html5
524 if s in html5:
525 return html5[s]
526 elif s.endswith(';'):
527 return '&' + s
528 for x in range(2, len(s)):
529 if s[:x] in html5:
530 return html5[s[:x]] + s[x:]
531 else:
532 return '&' + s
Guido van Rossumd8faa362007-04-27 19:54:29 +0000533
Ezio Melotti46495182012-06-24 22:02:56 +0200534 return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
Ezio Melottid9e0b062011-09-05 17:11:06 +0300535 replaceEntities, s, flags=re.ASCII)