blob: aa31fbc5b1bdbf6d96c4b5cc8d1cd42275da8c05 [file] [log] [blame]
Fred Drake1d4601d2001-08-03 19:50:59 +00001"""A parser for HTML and XHTML."""
Guido van Rossum8846d712001-05-18 14:50:52 +00002
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
Fred Drakecb5c80f2007-12-07 11:10:11 +000011import _markupbase
Guido van Rossum8846d712001-05-18 14:50:52 +000012import re
Guido van Rossum8846d712001-05-18 14:50:52 +000013
14# Regular expressions used for parsing
15
# Start of the next construct of interest in ordinary text: '&' or '<'.
interesting_normal = re.compile('[&<]')
# An '&' that may still grow into an entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume the single character that ends
# the reference (normally ';').
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
# Note, the strict one of this pair isn't really strict, but we can't
# make it correctly strict without breaking backward compatibility.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s*                             # optional whitespace before attribute name
    (?:(?<=['"\s])[^\s/>][^\s/=>]*   # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?\s*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
# (raw string: '\s' is a regex escape, not a valid string escape)
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
70
Guido van Rossum8846d712001-05-18 14:50:52 +000071
class HTMLParseError(Exception):
    """Error raised for every kind of parse failure.

    The message is kept in ``msg``.  ``lineno`` and ``offset`` are taken
    from the optional ``position`` pair and stay None when not supplied.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno, self.offset = position[0], position[1]

    def __str__(self):
        # Append whichever parts of the position are known; the column
        # is shown 1-based while the stored offset is 0-based.
        text = self.msg
        if self.lineno is not None:
            text += ", at line %d" % self.lineno
        if self.offset is not None:
            text += ", column %d" % (self.offset + 1)
        return text
88
89
Fred Drakecb5c80f2007-12-07 11:10:11 +000090class HTMLParser(_markupbase.ParserBase):
Fred Drake1d4601d2001-08-03 19:50:59 +000091 """Find tags and other markup and call handler functions.
92
93 Usage:
94 p = HTMLParser()
95 p.feed(data)
96 ...
97 p.close()
98
99 Start tags are handled by calling self.handle_starttag() or
100 self.handle_startendtag(); end tags by self.handle_endtag(). The
101 data between tags is passed from the parser to the derived class
102 by calling self.handle_data() with the data as argument (the data
103 may be split up in arbitrary chunks). Entity references are
104 passed by calling self.handle_entityref() with the entity
105 reference as the argument. Numeric character references are
106 passed to self.handle_charref() with the string containing the
107 reference as the argument.
108 """
Guido van Rossum8846d712001-05-18 14:50:52 +0000109
110 CDATA_CONTENT_ELEMENTS = ("script", "style")
111
R. David Murrayb579dba2010-12-03 04:06:39 +0000112 def __init__(self, strict=True):
113 """Initialize and reset this instance.
Guido van Rossum8846d712001-05-18 14:50:52 +0000114
R. David Murrayb579dba2010-12-03 04:06:39 +0000115 If strict is set to True (the default), errors are raised when invalid
116 HTML is encountered. If set to False, an attempt is instead made to
117 continue parsing, making "best guesses" about the intended meaning, in
118 a fashion similar to what browsers typically do.
119 """
120 self.strict = strict
Guido van Rossum8846d712001-05-18 14:50:52 +0000121 self.reset()
122
Guido van Rossum8846d712001-05-18 14:50:52 +0000123 def reset(self):
Fred Drake1d4601d2001-08-03 19:50:59 +0000124 """Reset this instance. Loses all unprocessed data."""
Guido van Rossum8846d712001-05-18 14:50:52 +0000125 self.rawdata = ''
Guido van Rossum8846d712001-05-18 14:50:52 +0000126 self.lasttag = '???'
Guido van Rossum8846d712001-05-18 14:50:52 +0000127 self.interesting = interesting_normal
Ezio Melotti7de56f62011-11-01 14:12:22 +0200128 self.cdata_elem = None
Fred Drakecb5c80f2007-12-07 11:10:11 +0000129 _markupbase.ParserBase.reset(self)
Guido van Rossum8846d712001-05-18 14:50:52 +0000130
Guido van Rossum8846d712001-05-18 14:50:52 +0000131 def feed(self, data):
Éric Araujo39f180b2011-05-04 15:55:47 +0200132 r"""Feed data to the parser.
Fred Drake1d4601d2001-08-03 19:50:59 +0000133
134 Call this as often as you want, with as little or as much text
135 as you want (may include '\n').
136 """
Guido van Rossum8846d712001-05-18 14:50:52 +0000137 self.rawdata = self.rawdata + data
138 self.goahead(0)
139
Guido van Rossum8846d712001-05-18 14:50:52 +0000140 def close(self):
Fred Drake1d4601d2001-08-03 19:50:59 +0000141 """Handle any buffered data."""
Guido van Rossum8846d712001-05-18 14:50:52 +0000142 self.goahead(1)
143
Fred Drakebfc8fea2001-09-24 20:10:28 +0000144 def error(self, message):
145 raise HTMLParseError(message, self.getpos())
Guido van Rossum8846d712001-05-18 14:50:52 +0000146
147 __starttag_text = None
148
Guido van Rossum8846d712001-05-18 14:50:52 +0000149 def get_starttag_text(self):
Fred Drake1d4601d2001-08-03 19:50:59 +0000150 """Return full source of start tag: '<...>'."""
Guido van Rossum8846d712001-05-18 14:50:52 +0000151 return self.__starttag_text
152
Ezio Melotti7de56f62011-11-01 14:12:22 +0200153 def set_cdata_mode(self, elem):
Ezio Melotti7de56f62011-11-01 14:12:22 +0200154 self.cdata_elem = elem.lower()
Ezio Melotti15cb4892011-11-18 18:01:49 +0200155 self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
Guido van Rossum8846d712001-05-18 14:50:52 +0000156
157 def clear_cdata_mode(self):
158 self.interesting = interesting_normal
Ezio Melotti7de56f62011-11-01 14:12:22 +0200159 self.cdata_elem = None
Guido van Rossum8846d712001-05-18 14:50:52 +0000160
161 # Internal -- handle data as far as reasonable. May leave state
162 # and data to be processed by a subsequent call. If 'end' is
163 # true, force handling all data as if followed by EOF marker.
164 def goahead(self, end):
165 rawdata = self.rawdata
166 i = 0
167 n = len(rawdata)
168 while i < n:
169 match = self.interesting.search(rawdata, i) # < or &
170 if match:
171 j = match.start()
172 else:
Ezio Melotti15cb4892011-11-18 18:01:49 +0200173 if self.cdata_elem:
174 break
Guido van Rossum8846d712001-05-18 14:50:52 +0000175 j = n
176 if i < j: self.handle_data(rawdata[i:j])
177 i = self.updatepos(i, j)
178 if i == n: break
Fred Drake248b0432001-12-03 17:09:50 +0000179 startswith = rawdata.startswith
180 if startswith('<', i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000181 if starttagopen.match(rawdata, i): # < + letter
182 k = self.parse_starttag(i)
Fred Drake248b0432001-12-03 17:09:50 +0000183 elif startswith("</", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000184 k = self.parse_endtag(i)
Fred Drake248b0432001-12-03 17:09:50 +0000185 elif startswith("<!--", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000186 k = self.parse_comment(i)
Fred Drake248b0432001-12-03 17:09:50 +0000187 elif startswith("<?", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000188 k = self.parse_pi(i)
Fred Drake248b0432001-12-03 17:09:50 +0000189 elif startswith("<!", i):
Ezio Melottifa3702d2012-02-10 10:45:44 +0200190 if self.strict:
191 k = self.parse_declaration(i)
192 else:
Ezio Melottif4ab4912012-02-13 15:50:37 +0200193 k = self.parse_html_declaration(i)
Fred Drake68eac2b2001-09-04 15:10:16 +0000194 elif (i + 1) < n:
Fred Drake029acfb2001-08-20 21:24:19 +0000195 self.handle_data("<")
196 k = i + 1
Fred Drake68eac2b2001-09-04 15:10:16 +0000197 else:
198 break
Guido van Rossum8846d712001-05-18 14:50:52 +0000199 if k < 0:
R. David Murrayb579dba2010-12-03 04:06:39 +0000200 if not end:
201 break
202 if self.strict:
Fred Drakebfc8fea2001-09-24 20:10:28 +0000203 self.error("EOF in middle of construct")
R. David Murrayb579dba2010-12-03 04:06:39 +0000204 k = rawdata.find('>', i + 1)
205 if k < 0:
206 k = rawdata.find('<', i + 1)
207 if k < 0:
208 k = i + 1
209 else:
210 k += 1
211 self.handle_data(rawdata[i:k])
Guido van Rossum8846d712001-05-18 14:50:52 +0000212 i = self.updatepos(i, k)
Fred Drake248b0432001-12-03 17:09:50 +0000213 elif startswith("&#", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000214 match = charref.match(rawdata, i)
215 if match:
Fred Drake1d4601d2001-08-03 19:50:59 +0000216 name = match.group()[2:-1]
Guido van Rossum8846d712001-05-18 14:50:52 +0000217 self.handle_charref(name)
218 k = match.end()
Fred Drake248b0432001-12-03 17:09:50 +0000219 if not startswith(';', k-1):
Fred Drake029acfb2001-08-20 21:24:19 +0000220 k = k - 1
Guido van Rossum8846d712001-05-18 14:50:52 +0000221 i = self.updatepos(i, k)
222 continue
Fred Drake68eac2b2001-09-04 15:10:16 +0000223 else:
Victor Stinnere021f4b2010-05-24 21:46:25 +0000224 if ";" in rawdata[i:]: #bail by consuming &#
225 self.handle_data(rawdata[0:2])
226 i = self.updatepos(i, 2)
Fred Drake68eac2b2001-09-04 15:10:16 +0000227 break
Fred Drake248b0432001-12-03 17:09:50 +0000228 elif startswith('&', i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000229 match = entityref.match(rawdata, i)
230 if match:
231 name = match.group(1)
232 self.handle_entityref(name)
233 k = match.end()
Fred Drake248b0432001-12-03 17:09:50 +0000234 if not startswith(';', k-1):
Fred Drake029acfb2001-08-20 21:24:19 +0000235 k = k - 1
Guido van Rossum8846d712001-05-18 14:50:52 +0000236 i = self.updatepos(i, k)
237 continue
Fred Drake029acfb2001-08-20 21:24:19 +0000238 match = incomplete.match(rawdata, i)
239 if match:
Fred Drake68eac2b2001-09-04 15:10:16 +0000240 # match.group() will contain at least 2 chars
Fred Drake248b0432001-12-03 17:09:50 +0000241 if end and match.group() == rawdata[i:]:
R. David Murrayb579dba2010-12-03 04:06:39 +0000242 if self.strict:
243 self.error("EOF in middle of entity or char ref")
244 else:
245 if k <= i:
246 k = n
247 i = self.updatepos(i, i + 1)
Fred Drake68eac2b2001-09-04 15:10:16 +0000248 # incomplete
249 break
250 elif (i + 1) < n:
251 # not the end of the buffer, and can't be confused
252 # with some other construct
253 self.handle_data("&")
254 i = self.updatepos(i, i + 1)
255 else:
256 break
Guido van Rossum8846d712001-05-18 14:50:52 +0000257 else:
258 assert 0, "interesting.search() lied"
259 # end while
Ezio Melotti15cb4892011-11-18 18:01:49 +0200260 if end and i < n and not self.cdata_elem:
Guido van Rossum8846d712001-05-18 14:50:52 +0000261 self.handle_data(rawdata[i:n])
262 i = self.updatepos(i, n)
263 self.rawdata = rawdata[i:]
264
Ezio Melottif4ab4912012-02-13 15:50:37 +0200265 # Internal -- parse html declarations, return length or -1 if not terminated
266 # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
267 # See also parse_declaration in _markupbase
268 def parse_html_declaration(self, i):
269 rawdata = self.rawdata
270 if rawdata[i:i+2] != '<!':
271 self.error('unexpected call to parse_html_declaration()')
272 if rawdata[i:i+4] == '<!--':
Ezio Melottie31dded2012-02-13 20:20:00 +0200273 # this case is actually already handled in goahead()
Ezio Melottif4ab4912012-02-13 15:50:37 +0200274 return self.parse_comment(i)
275 elif rawdata[i:i+3] == '<![':
276 return self.parse_marked_section(i)
277 elif rawdata[i:i+9].lower() == '<!doctype':
278 # find the closing >
Ezio Melottie31dded2012-02-13 20:20:00 +0200279 gtpos = rawdata.find('>', i+9)
Ezio Melottif4ab4912012-02-13 15:50:37 +0200280 if gtpos == -1:
281 return -1
282 self.handle_decl(rawdata[i+2:gtpos])
283 return gtpos+1
284 else:
285 return self.parse_bogus_comment(i)
286
Ezio Melottifa3702d2012-02-10 10:45:44 +0200287 # Internal -- parse bogus comment, return length or -1 if not terminated
288 # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
289 def parse_bogus_comment(self, i, report=1):
290 rawdata = self.rawdata
Ezio Melotti5211ffe2012-02-13 11:24:50 +0200291 if rawdata[i:i+2] not in ('<!', '</'):
Ezio Melottifa3702d2012-02-10 10:45:44 +0200292 self.error('unexpected call to parse_comment()')
293 pos = rawdata.find('>', i+2)
294 if pos == -1:
295 return -1
296 if report:
297 self.handle_comment(rawdata[i+2:pos])
298 return pos + 1
299
Guido van Rossum8846d712001-05-18 14:50:52 +0000300 # Internal -- parse processing instr, return end or -1 if not terminated
301 def parse_pi(self, i):
302 rawdata = self.rawdata
303 assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
304 match = piclose.search(rawdata, i+2) # >
305 if not match:
306 return -1
307 j = match.start()
308 self.handle_pi(rawdata[i+2: j])
309 j = match.end()
310 return j
311
312 # Internal -- handle starttag, return end or -1 if not terminated
313 def parse_starttag(self, i):
314 self.__starttag_text = None
315 endpos = self.check_for_whole_start_tag(i)
316 if endpos < 0:
317 return endpos
318 rawdata = self.rawdata
319 self.__starttag_text = rawdata[i:endpos]
320
321 # Now parse the data between i+1 and j into a tag and attrs
322 attrs = []
323 match = tagfind.match(rawdata, i+1)
324 assert match, 'unexpected call to parse_starttag()'
325 k = match.end()
Fred Drake248b0432001-12-03 17:09:50 +0000326 self.lasttag = tag = rawdata[i+1:k].lower()
Guido van Rossum8846d712001-05-18 14:50:52 +0000327 while k < endpos:
R. David Murrayb579dba2010-12-03 04:06:39 +0000328 if self.strict:
329 m = attrfind.match(rawdata, k)
330 else:
Ezio Melottif50ffa92011-10-28 13:21:09 +0300331 m = attrfind_tolerant.match(rawdata, k)
Guido van Rossum8846d712001-05-18 14:50:52 +0000332 if not m:
333 break
334 attrname, rest, attrvalue = m.group(1, 2, 3)
335 if not rest:
336 attrvalue = None
337 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
338 attrvalue[:1] == '"' == attrvalue[-1:]:
339 attrvalue = attrvalue[1:-1]
Ezio Melottic2fe5772011-11-14 18:53:33 +0200340 if attrvalue:
Guido van Rossum8846d712001-05-18 14:50:52 +0000341 attrvalue = self.unescape(attrvalue)
Fred Drake248b0432001-12-03 17:09:50 +0000342 attrs.append((attrname.lower(), attrvalue))
Guido van Rossum8846d712001-05-18 14:50:52 +0000343 k = m.end()
344
Fred Drake248b0432001-12-03 17:09:50 +0000345 end = rawdata[k:endpos].strip()
Guido van Rossum8846d712001-05-18 14:50:52 +0000346 if end not in (">", "/>"):
347 lineno, offset = self.getpos()
348 if "\n" in self.__starttag_text:
Fred Drake248b0432001-12-03 17:09:50 +0000349 lineno = lineno + self.__starttag_text.count("\n")
Guido van Rossum8846d712001-05-18 14:50:52 +0000350 offset = len(self.__starttag_text) \
Fred Drake248b0432001-12-03 17:09:50 +0000351 - self.__starttag_text.rfind("\n")
Guido van Rossum8846d712001-05-18 14:50:52 +0000352 else:
353 offset = offset + len(self.__starttag_text)
R. David Murrayb579dba2010-12-03 04:06:39 +0000354 if self.strict:
355 self.error("junk characters in start tag: %r"
356 % (rawdata[k:endpos][:20],))
357 self.handle_data(rawdata[i:endpos])
358 return endpos
Fred Drake248b0432001-12-03 17:09:50 +0000359 if end.endswith('/>'):
Guido van Rossum8846d712001-05-18 14:50:52 +0000360 # XHTML-style empty tag: <span attr="value" />
361 self.handle_startendtag(tag, attrs)
362 else:
363 self.handle_starttag(tag, attrs)
364 if tag in self.CDATA_CONTENT_ELEMENTS:
Ezio Melotti7de56f62011-11-01 14:12:22 +0200365 self.set_cdata_mode(tag)
Guido van Rossum8846d712001-05-18 14:50:52 +0000366 return endpos
367
368 # Internal -- check to see if we have a complete starttag; return end
369 # or -1 if incomplete.
370 def check_for_whole_start_tag(self, i):
371 rawdata = self.rawdata
R. David Murrayb579dba2010-12-03 04:06:39 +0000372 if self.strict:
373 m = locatestarttagend.match(rawdata, i)
374 else:
375 m = locatestarttagend_tolerant.match(rawdata, i)
Guido van Rossum8846d712001-05-18 14:50:52 +0000376 if m:
377 j = m.end()
378 next = rawdata[j:j+1]
379 if next == ">":
380 return j + 1
381 if next == "/":
Fred Drake248b0432001-12-03 17:09:50 +0000382 if rawdata.startswith("/>", j):
Guido van Rossum8846d712001-05-18 14:50:52 +0000383 return j + 2
Fred Drake248b0432001-12-03 17:09:50 +0000384 if rawdata.startswith("/", j):
Guido van Rossum8846d712001-05-18 14:50:52 +0000385 # buffer boundary
386 return -1
387 # else bogus input
R. David Murrayb579dba2010-12-03 04:06:39 +0000388 if self.strict:
389 self.updatepos(i, j + 1)
390 self.error("malformed empty start tag")
391 if j > i:
392 return j
393 else:
394 return i + 1
Guido van Rossum8846d712001-05-18 14:50:52 +0000395 if next == "":
396 # end of input
397 return -1
398 if next in ("abcdefghijklmnopqrstuvwxyz=/"
399 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
400 # end of input in or before attribute value, or we have the
401 # '/' from a '/>' ending
402 return -1
R. David Murrayb579dba2010-12-03 04:06:39 +0000403 if self.strict:
404 self.updatepos(i, j)
405 self.error("malformed start tag")
406 if j > i:
407 return j
408 else:
409 return i + 1
Fred Drakebfc8fea2001-09-24 20:10:28 +0000410 raise AssertionError("we should not get here!")
Guido van Rossum8846d712001-05-18 14:50:52 +0000411
412 # Internal -- parse endtag, return end or -1 if incomplete
413 def parse_endtag(self, i):
414 rawdata = self.rawdata
415 assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
416 match = endendtag.search(rawdata, i+1) # >
417 if not match:
418 return -1
Ezio Melotti5211ffe2012-02-13 11:24:50 +0200419 gtpos = match.end()
Guido van Rossum8846d712001-05-18 14:50:52 +0000420 match = endtagfind.match(rawdata, i) # </ + tag + >
421 if not match:
Ezio Melotti7de56f62011-11-01 14:12:22 +0200422 if self.cdata_elem is not None:
Ezio Melotti5211ffe2012-02-13 11:24:50 +0200423 self.handle_data(rawdata[i:gtpos])
424 return gtpos
R. David Murrayb579dba2010-12-03 04:06:39 +0000425 if self.strict:
Ezio Melotti5211ffe2012-02-13 11:24:50 +0200426 self.error("bad end tag: %r" % (rawdata[i:gtpos],))
427 # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
428 namematch = tagfind_tolerant.match(rawdata, i+2)
429 if not namematch:
430 # w3.org/TR/html5/tokenization.html#end-tag-open-state
431 if rawdata[i:i+3] == '</>':
432 return i+3
433 else:
434 return self.parse_bogus_comment(i)
435 tagname = namematch.group().lower()
436 # consume and ignore other stuff between the name and the >
437 # Note: this is not 100% correct, since we might have things like
438 # </tag attr=">">, but looking for > after tha name should cover
439 # most of the cases and is much simpler
440 gtpos = rawdata.find('>', namematch.end())
441 self.handle_endtag(tagname)
442 return gtpos+1
Ezio Melotti7de56f62011-11-01 14:12:22 +0200443
444 elem = match.group(1).lower() # script or style
445 if self.cdata_elem is not None:
446 if elem != self.cdata_elem:
Ezio Melotti5211ffe2012-02-13 11:24:50 +0200447 self.handle_data(rawdata[i:gtpos])
448 return gtpos
Ezio Melotti7de56f62011-11-01 14:12:22 +0200449
450 self.handle_endtag(elem.lower())
Fred Drake30d59ba2002-05-14 15:50:11 +0000451 self.clear_cdata_mode()
Ezio Melotti5211ffe2012-02-13 11:24:50 +0200452 return gtpos
Guido van Rossum8846d712001-05-18 14:50:52 +0000453
454 # Overridable -- finish processing of start+end tag: <tag.../>
455 def handle_startendtag(self, tag, attrs):
456 self.handle_starttag(tag, attrs)
457 self.handle_endtag(tag)
458
459 # Overridable -- handle start tag
460 def handle_starttag(self, tag, attrs):
461 pass
462
463 # Overridable -- handle end tag
464 def handle_endtag(self, tag):
465 pass
466
467 # Overridable -- handle character reference
468 def handle_charref(self, name):
469 pass
470
471 # Overridable -- handle entity reference
472 def handle_entityref(self, name):
473 pass
474
475 # Overridable -- handle data
476 def handle_data(self, data):
477 pass
478
479 # Overridable -- handle comment
480 def handle_comment(self, data):
481 pass
482
483 # Overridable -- handle declaration
484 def handle_decl(self, decl):
485 pass
486
487 # Overridable -- handle processing instruction
488 def handle_pi(self, data):
489 pass
490
Fred Drakebfc8fea2001-09-24 20:10:28 +0000491 def unknown_decl(self, data):
R. David Murrayb579dba2010-12-03 04:06:39 +0000492 if self.strict:
493 self.error("unknown declaration: %r" % (data,))
Fred Drakebfc8fea2001-09-24 20:10:28 +0000494
Guido van Rossum8846d712001-05-18 14:50:52 +0000495 # Internal -- helper to remove special character quoting
Guido van Rossumd8faa362007-04-27 19:54:29 +0000496 entitydefs = None
Guido van Rossum8846d712001-05-18 14:50:52 +0000497 def unescape(self, s):
498 if '&' not in s:
499 return s
Guido van Rossumd8faa362007-04-27 19:54:29 +0000500 def replaceEntities(s):
501 s = s.groups()[0]
Senthil Kumaran164540f2010-12-28 15:55:16 +0000502 try:
503 if s[0] == "#":
504 s = s[1:]
505 if s[0] in ['x','X']:
506 c = int(s[1:], 16)
507 else:
508 c = int(s)
509 return chr(c)
510 except ValueError:
511 return '&#'+ s +';'
Guido van Rossumd8faa362007-04-27 19:54:29 +0000512 else:
Fred Drake3c50ea42008-05-17 22:02:32 +0000513 # Cannot use name2codepoint directly, because HTMLParser
514 # supports apos, which is not part of HTML 4
515 import html.entities
Guido van Rossumd8faa362007-04-27 19:54:29 +0000516 if HTMLParser.entitydefs is None:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000517 entitydefs = HTMLParser.entitydefs = {'apos':"'"}
Fred Drake3c50ea42008-05-17 22:02:32 +0000518 for k, v in html.entities.name2codepoint.items():
Mark Dickinsonf64dcf32008-05-21 13:51:18 +0000519 entitydefs[k] = chr(v)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000520 try:
521 return self.entitydefs[s]
522 except KeyError:
523 return '&'+s+';'
524
Fred Drake3c50ea42008-05-17 22:02:32 +0000525 return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
Ezio Melottid9e0b062011-09-05 17:11:06 +0300526 replaceEntities, s, flags=re.ASCII)