blob: 18f31152a3469eaf520e1afcd1f15fa00e73e070 [file] [log] [blame]
Fred Drake1d4601d2001-08-03 19:50:59 +00001"""A parser for HTML and XHTML."""
Guido van Rossum8846d712001-05-18 14:50:52 +00002
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
Fred Drakecb5c80f2007-12-07 11:10:11 +000011import _markupbase
Guido van Rossum8846d712001-05-18 14:50:52 +000012import re
Ezio Melotti3861d8b2012-06-23 15:27:51 +020013import warnings
Guido van Rossum8846d712001-05-18 14:50:52 +000014
Ezio Melotti1698bab2013-05-01 16:09:34 +030015__all__ = ['HTMLParser']
16
# Regular expressions used for parsing

# In normal (non-CDATA) mode only '&' and '<' can start markup.
interesting_normal = re.compile('[&<]')
# An '&' that could still turn into an entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume the single character that
# terminates the reference (a ';' or anything else that cannot be part
# of it); callers inspect that character to decide whether to keep it.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Raw string: '\s' is a regex escape, not a Python string escape.
tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
# Note:
#  1) the strict attrfind isn't really strict, but we can't make it
#     correctly strict without breaking backward compatibility;
#  2) if you change attrfind remember to update locatestarttagend too;
#  3) if you change attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
77
Guido van Rossum8846d712001-05-18 14:50:52 +000078
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: the error description (must be truthy).
        lineno: first item of *position*, or None when unknown.
        offset: second item of *position* (zero-based), or None.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno, self.offset = position[0], position[1]

    def __str__(self):
        # Build "<msg>[, at line N][, column M]"; the column is shown
        # one-based while the stored offset is kept as given.
        pieces = [self.msg]
        if self.lineno is not None:
            pieces.append(", at line %d" % self.lineno)
        if self.offset is not None:
            pieces.append(", column %d" % (self.offset + 1))
        return "".join(pieces)
95
96
class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Start tags that switch the parser into CDATA mode (see
    # set_cdata_mode()): until the matching end tag, everything is
    # reported as data.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, strict=False):
        """Initialize and reset this instance.

        If strict is set to False (the default) the parser will parse invalid
        markup, otherwise it will raise an error.  Note that the strict mode
        is deprecated.
        """
        if strict:
            warnings.warn("The strict mode is deprecated.",
                          DeprecationWarning, stacklevel=2)
        self.strict = strict
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current position."""
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Enter CDATA mode: only the end tag of *elem* is interesting."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and look for '&' and '<' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # In CDATA mode keep buffering until the end tag shows up.
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    if self.strict:
                        k = self.parse_declaration(i)
                    else:
                        k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    # A lone '<' at the end of the buffer: wait for more.
                    break
                if k < 0:
                    # The construct is not terminated yet.
                    if not end:
                        break
                    if self.strict:
                        self.error("EOF in middle of construct")
                    # At EOF, report the unterminated construct as data:
                    # up to and including the next '>', else up to the
                    # next '<', else just the single '<'.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # charref also matched the terminating character;
                    # give it back unless it is the ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        # Report the "&#" found at the current position
                        # (not the first two characters of the buffer)
                        # and step over exactly those two characters.
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i + 2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # Same as for charrefs: keep only a terminating ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        if self.strict:
                            self.error("EOF in middle of entity or char ref")
                        else:
                            # Step over the '&' without reporting it; the
                            # rest of the buffer is flushed as data by the
                            # end-of-input handling after the loop.
                            i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                # Explicit raise so the check is not stripped under -O.
                raise AssertionError("interesting.search() lied")
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            # Report everything between '<!' and '>'.
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    # If 'report' is false the comment is consumed without calling
    # handle_comment().
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        # Report the text between '<?' and the closing '>'.
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        attr_pattern = attrfind if self.strict else attrfind_tolerant
        while k < endpos:
            m = attr_pattern.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute without '=value' is reported with value None.
                attrvalue = None
            elif (attrvalue[:1] == '\'' == attrvalue[-1:] or
                  attrvalue[:1] == '"' == attrvalue[-1:]):
                # Strip the matching surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Something other than the tag terminator is left over.
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            # Tolerant mode: report the whole would-be tag as text.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        if self.strict:
            m = locatestarttagend.match(rawdata, i)
        else:
            m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            # The character right after the matched tag prefix
            # ('' at the end of the buffer).
            next_char = rawdata[j:j+1]
            if next_char == ">":
                return j + 1
            if next_char == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                if self.strict:
                    self.updatepos(i, j + 1)
                    self.error("malformed empty start tag")
                if j > i:
                    return j
                else:
                    return i + 1
            if next_char == "":
                # end of input
                return -1
            if next_char in ("abcdefghijklmnopqrstuvwxyz=/"
                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if self.strict:
                self.updatepos(i, j)
                self.error("malformed start tag")
            # Tolerant mode: always make progress by at least one char.
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # Inside script/style a malformed end tag is just text.
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            if self.strict:
                self.error("bad end tag: %r" % (rawdata[i:gtpos],))
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group().lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # Some other end tag inside CDATA content: plain text.
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    # Raises an error for unknown declarations in strict mode only;
    # does nothing otherwise.
    def unknown_decl(self, data):
        if self.strict:
            self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    # Replaces entity and numeric character references in 's' with the
    # characters they stand for; references that cannot be resolved are
    # left in the text unchanged.
    def unescape(self, s):
        if '&' not in s:
            return s

        def replace_entity(match):
            ref = match.group(1)
            if ref[0] == "#":
                # Numeric reference: decimal, or hexadecimal after x/X.
                digits = ref[1:]
                try:
                    if digits[0] in ('x', 'X'):
                        codepoint = int(digits[1:].rstrip(';'), 16)
                    else:
                        codepoint = int(digits.rstrip(';'))
                    return chr(codepoint)
                except (ValueError, OverflowError):
                    # Not a number, or outside the range chr() accepts
                    # (very large values raise OverflowError rather
                    # than ValueError): keep the reference as written.
                    return '&#' + digits

            # Named reference.
            from html.entities import html5
            if ref in html5:
                return html5[ref]
            elif ref.endswith(';'):
                return '&' + ref
            # No terminating ';': replace the shortest known prefix of
            # at least two characters and keep the remainder as text.
            for cut in range(2, len(ref)):
                if ref[:cut] in html5:
                    return html5[ref[:cut]] + ref[cut:]
            return '&' + ref

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
                      replace_entity, s, flags=re.ASCII)