blob: 9db8ab582bec32d58d0dc54c189f44c1992b3f3c [file] [log] [blame]
Fred Drake1d4601d2001-08-03 19:50:59 +00001"""A parser for HTML and XHTML."""
Guido van Rossum8846d712001-05-18 14:50:52 +00002
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
Fred Drakecb5c80f2007-12-07 11:10:11 +000011import _markupbase
Guido van Rossum8846d712001-05-18 14:50:52 +000012import re
Guido van Rossum8846d712001-05-18 14:50:52 +000013
14# Regular expressions used for parsing
15
# Regular expressions used for parsing.  All of them are compiled once at
# import time and shared by every parser instance.

# Characters that end a run of plain character data.
interesting_normal = re.compile('[&<]')
# Start of something that might become an entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns consume ONE character after the reference (the
# terminator); callers give it back when it is not ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
# Note, the strict one of this pair isn't really strict, but we can't
# make it correctly strict without breaking backward compatibility.
# Groups: 1 = attribute name, 2 = "=value" part (optional), 3 = raw value
# including any surrounding quotes.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
# Matches a start tag up to, but not including, its closing '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s*                             # optional whitespace before attribute name
    (?:(?<=['"\s])[^\s/>][^\s/=>]*   # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?\s*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
# (raw string: '\s' is not a valid escape in an ordinary string literal)
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
70
Guido van Rossum8846d712001-05-18 14:50:52 +000071
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: human-readable description of the problem.
        lineno: line number taken from position[0], or None if unknown.
        offset: column offset taken from position[1], or None if unknown;
            str() reports it as offset + 1.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        # Forward the message to Exception so that ``args`` is populated;
        # without this repr() hides the message and copy/pickle cannot
        # rebuild the instance (they call ``cls(*args)``).
        super().__init__(msg)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by the position, when known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            result = result + ", column %d" % (self.offset + 1)
        return result
88
89
class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements whose content is treated as CDATA: after their start tag
    # only the matching end tag is recognised as markup.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, strict=True):
        """Initialize and reset this instance.

        If strict is set to True (the default), errors are raised when invalid
        HTML is encountered.  If set to False, an attempt is instead made to
        continue parsing, making "best guesses" about the intended meaning, in
        a fashion similar to what browsers typically do.
        """
        self.strict = strict
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current position."""
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Enter CDATA mode: only the end tag of *elem* is interesting."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and look for '&' and '<' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # still inside <script>/<style>; wait for the end tag
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    if self.strict:
                        k = self.parse_declaration(i)
                    else:
                        k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    # lone '<' at the end of the buffer: need more input
                    break
                if k < 0:
                    # the construct is not terminated in the buffer
                    if not end:
                        break
                    if self.strict:
                        self.error("EOF in middle of construct")
                    # tolerant mode: emit the unterminated construct as data,
                    # up to and including the next '>', or up to the next '<'
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # charref also matched the terminating character; give
                    # it back unless it is the ';' that closes the reference
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]: # bail by consuming &#
                        # The "&#" being consumed starts at i, not at the
                        # beginning of the buffer.
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i + 2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        if self.strict:
                            self.error("EOF in middle of entity or char ref")
                        else:
                            # Step over the '&'; whatever follows it is
                            # flushed as data by the end-of-input handling
                            # after the loop.  (No other local is consulted
                            # here: 'k' may not have been assigned yet.)
                            i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # keep whatever was not consumed for the next call
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<!':
            self.error('unexpected call to parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >; the search must start after this
            # declaration's own "<!doctype", i.e. relative to i
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    # If 'report' is false the comment is consumed without being passed
    # to handle_comment().
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        if rawdata[i:i+2] not in ('<!', '</'):
            self.error('unexpected call to parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # attribute without "=value"
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # strip the matching pair of quotes
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # lineno/offset are computed for historical reasons; they are
            # not used below
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            # tolerant mode: pass the malformed tag through as data
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        if self.strict:
            m = locatestarttagend.match(rawdata, i)
        else:
            m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            # character right after the matched part ('' at end of buffer)
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                if self.strict:
                    self.updatepos(i, j + 1)
                    self.error("malformed empty start tag")
                if j > i:
                    return j
                else:
                    return i + 1
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if self.strict:
                self.updatepos(i, j)
                self.error("malformed start tag")
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # inside <script>/<style> a malformed end tag is just data
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            if self.strict:
                self.error("bad end tag: %r" % (rawdata[i:gtpos],))
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group().lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower() # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # an end tag for some other element is data in CDATA mode
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem.lower())
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        """Raise a parse error for an unknown declaration in strict mode."""
        if self.strict:
            self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    # Name -> character table, built on first use and cached on the class.
    entitydefs = None
    def unescape(self, s):
        """Return *s* with ';'-terminated references replaced by characters.

        Numeric references (&#65; / &#x41;) and known named entities
        (including &apos;) are replaced; anything that cannot be resolved
        is left in the text.
        """
        if '&' not in s:
            return s

        def replaceEntities(match):
            ref = match.group(1)
            if ref[0] == "#":
                ref = ref[1:]
                try:
                    if ref[0] in ('x', 'X'):
                        code = int(ref[1:], 16)
                    else:
                        code = int(ref)
                    return chr(code)
                except (ValueError, OverflowError):
                    # not a number, or not a valid code point (chr() raises
                    # OverflowError for very large values): keep the text
                    return '&#' + ref + ';'
            # Cannot use name2codepoint directly, because HTMLParser
            # supports apos, which is not part of HTML 4
            if HTMLParser.entitydefs is None:
                import html.entities
                # build the table completely before publishing it
                table = {'apos': "'"}
                for name, codepoint in html.entities.name2codepoint.items():
                    table[name] = chr(codepoint)
                HTMLParser.entitydefs = table
            try:
                return self.entitydefs[ref]
            except KeyError:
                return '&' + ref + ';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
                      replaceEntities, s, flags=re.ASCII)