blob: f8ac82834a3e200fe707e818b1260e2417bcf660 [file] [log] [blame]
Fred Drake1d4601d2001-08-03 19:50:59 +00001"""A parser for HTML and XHTML."""
Guido van Rossum8846d712001-05-18 14:50:52 +00002
# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
9
10
Fred Drakecb5c80f2007-12-07 11:10:11 +000011import _markupbase
Guido van Rossum8846d712001-05-18 14:50:52 +000012import re
Ezio Melotti3861d8b2012-06-23 15:27:51 +020013import warnings
Guido van Rossum8846d712001-05-18 14:50:52 +000014
# Regular expressions used for parsing
#
# Every pattern that contains a regex escape such as ``\s`` is written as a
# raw string so that Python never tries to interpret the backslash itself
# (an unrecognised escape in a normal string literal is deprecated).

# Characters that interrupt plain character data.
interesting_normal = re.compile('[&<]')
# Start of something that may still turn into an entity or char reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both patterns also consume the single character following the reference
# (normally ';'); callers check whether that character really was ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
# Note:
#  1) the strict attrfind isn't really strict, but we can't make it
#     correctly strict without breaking backward compatibility;
#  2) if you change attrfind remember to update locatestarttagend too;
#  3) if you change attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
75
Guido van Rossum8846d712001-05-18 14:50:52 +000076
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: the error description (must be truthy).
        lineno: first item of *position*, or None when unknown.
        offset: second item of *position* (zero-based column), or None
            when unknown.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        line, column = position[0], position[1]
        self.msg = msg
        self.lineno = line
        self.offset = column

    def __str__(self):
        # Collect the message plus whichever location parts are known and
        # glue them together in one go.
        pieces = [self.msg]
        if self.lineno is not None:
            pieces.append(", at line %d" % self.lineno)
        if self.offset is not None:
            # The stored offset is zero-based; report a one-based column.
            pieces.append(", column %d" % (self.offset + 1))
        return "".join(pieces)
93
94
class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements whose content is treated as raw text up to the matching
    # end tag (see set_cdata_mode()).
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, strict=False):
        """Initialize and reset this instance.

        If strict is set to False (the default) the parser will parse invalid
        markup, otherwise it will raise an error.  Note that the strict mode
        is deprecated.
        """
        if strict:
            warnings.warn("The strict mode is deprecated.",
                          DeprecationWarning, stacklevel=2)
        self.strict = strict
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the position from getpos()."""
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Treat everything up to the end tag of *elem* as plain data.

        *elem* is lower-cased and remembered in self.cdata_elem, and the
        "interesting" pattern is narrowed to that element's end tag
        (matched case-insensitively).
        """
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and look for '<' and '&' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Consume as much of self.rawdata as possible, calling handlers.

        Whatever cannot be handled yet (an unterminated construct at the
        end of the buffer) is kept in self.rawdata for the next call,
        unless *end* is true, in which case the remainder is flushed.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # In CDATA mode nothing is emitted until the closing
                    # tag of the element shows up.
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    if self.strict:
                        k = self.parse_declaration(i)
                    else:
                        k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # The construct is not terminated in the buffer.
                    if not end:
                        break
                    if self.strict:
                        self.error("EOF in middle of construct")
                    # Tolerant mode at EOF: emit the broken construct as
                    # data, up to and including the next '>' if there is
                    # one, otherwise up to the next '<', otherwise one char.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # charref also consumed the char after the digits;
                    # give it back unless it was the terminating ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        # Positions must be relative to i: slicing from
                        # the start of the buffer would re-emit old data
                        # and rewind the cursor.
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i + 2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # Same give-back rule as for character references.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        if self.strict:
                            self.error("EOF in middle of entity or char ref")
                        else:
                            # Step over the '&'; the rest of the buffer is
                            # flushed as data after the loop.
                            i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return end or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        """Parse the '<!...' construct at index *i* (tolerant mode).

        Returns the index just past the construct, or -1 when a doctype
        has no closing '>' yet.  Comments, marked sections and bogus
        comments return whatever the method they are delegated to returns.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return end or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        """Parse a '<!...>' or '</...>' construct as a comment.

        The text between the two-character opener and the next '>' is
        passed to handle_comment() when *report* is true.  Returns the
        index just past the '>', or -1 if there is no '>'.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the processing instruction ('<?...>') at index *i*.

        The text between '<?' and the next '>' goes to handle_pi().
        Returns the index just past the '>', or -1 if there is no '>'.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at index *i* and dispatch to the handlers.

        Returns the index just past the tag, or a negative value when the
        tag is not complete yet.  Attribute names are lower-cased; quoted
        values lose their quotes and non-empty values are passed through
        unescape().  Attributes without '=' get the value None.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        attr_pattern = attrfind if self.strict else attrfind_tolerant
        while k < endpos:
            m = attr_pattern.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Something other than the closing bracket is left over.
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the end index of the start tag at *i*, or -1 if incomplete.

        In strict mode malformed tags are reported through error(); in
        tolerant mode the index where matching stopped is returned (at
        least i + 1, so the caller always makes progress).
        """
        rawdata = self.rawdata
        if self.strict:
            m = locatestarttagend.match(rawdata, i)
        else:
            m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            following = rawdata[j:j+1]
            if following == ">":
                return j + 1
            if following == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                # A '/' that is not (yet) followed by '>' is treated as a
                # buffer boundary.  The original code had a "bogus input"
                # branch after this point, but it could never be reached
                # because the character at j is known to be '/'.
                return -1
            if following == "":
                # end of input
                return -1
            if following in ("abcdefghijklmnopqrstuvwxyz=/"
                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if self.strict:
                self.updatepos(i, j)
                self.error("malformed start tag")
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag ('</...') at index *i*.

        Returns the index just past the tag, or -1 when no '>' follows.
        While in CDATA mode, any end tag other than the one that closes
        the current CDATA element is passed to handle_data() unchanged.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            if self.strict:
                self.error("bad end tag: %r" % (rawdata[i:gtpos],))
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group().lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower() # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle '<tag .../>' as a start tag directly followed by an end tag."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called with the lower-cased tag name and a list of (name, value)."""
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called with the lower-cased name of an end tag."""
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called with the text between '&#' and the terminator, e.g. 'x41'."""
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called with the entity name, e.g. 'amp' for '&amp;'."""
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Called with a chunk of character data."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Called with the text of a comment."""
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Called with the text of a declaration, e.g. 'DOCTYPE html'."""
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called with the text between '<?' and '>'."""
        pass

    def unknown_decl(self, data):
        """Report an unknown declaration; an error only in strict mode."""
        if self.strict:
            self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Return *s* with character and entity references replaced.

        Numeric references that are malformed or outside the range
        accepted by chr() are left in the text unchanged, as are unknown
        named references.  A named reference without ';' is replaced when
        a prefix of it is a known HTML5 entity.
        """
        if '&' not in s:
            return s

        def replace_entity(match):
            ref = match.group(1)
            if ref[0] == "#":
                digits = ref[1:]
                try:
                    if digits[0] in ('x', 'X'):
                        code = int(digits[1:].rstrip(';'), 16)
                    else:
                        code = int(digits.rstrip(';'))
                    return chr(code)
                except (ValueError, OverflowError):
                    # int() rejects bad digits (ValueError); chr() rejects
                    # out-of-range code points with ValueError, or with
                    # OverflowError when the number is very large.
                    return '&#' + digits
            from html.entities import html5
            if ref in html5:
                return html5[ref]
            if ref.endswith(';'):
                return '&' + ref
            # No ';': try the known entity names that are a prefix of ref,
            # shortest first, keeping the remaining characters as text.
            for cut in range(2, len(ref)):
                if ref[:cut] in html5:
                    return html5[ref[:cut]] + ref[cut:]
            return '&' + ref

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
                      replace_entity, s, flags=re.ASCII)