blob: 8d275ab315858f4c49b8ef593e9bb66dd41c0637 [file] [log] [blame]
Fred Drake1d4601d2001-08-03 19:50:59 +00001"""A parser for HTML and XHTML."""
Guido van Rossum8846d712001-05-18 14:50:52 +00002
# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
9
10
Fred Drakecb5c80f2007-12-07 11:10:11 +000011import _markupbase
Guido van Rossum8846d712001-05-18 14:50:52 +000012import re
Guido van Rossum8846d712001-05-18 14:50:52 +000013
# Regular expressions used for parsing

# What interrupts plain data: the start of markup or of a reference.
interesting_normal = re.compile('[&<]')
# What interrupts data in CDATA mode: '</', or a '<' at the very end of
# the buffer (which may turn out to be the start of '</').
interesting_cdata = re.compile(r'<(/|\Z)')
# The start of something that could still become a reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume the single character that follows
# the reference (a ';' or anything that cannot continue it).
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# Note, the strict one of this pair isn't really strict, but we can't
# make it correctly strict without breaking backward compatibility.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
attrfind_tolerant = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s*                             # optional whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# Raw string: '\s' in a plain string literal is an invalid escape sequence.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
66
Guido van Rossum8846d712001-05-18 14:50:52 +000067
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: the error message.
        lineno: first item of the position pair given at construction,
            or None.
        offset: second item of that pair, or None.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno, self.offset = position[0], position[1]

    def __str__(self):
        # Each location part is appended only when it is known; the
        # column is shown as offset + 1.
        line_part = "" if self.lineno is None else ", at line %d" % self.lineno
        column_part = ("" if self.offset is None
                       else ", column %d" % (self.offset + 1))
        return self.msg + line_part + column_part
84
85
Fred Drakecb5c80f2007-12-07 11:10:11 +000086class HTMLParser(_markupbase.ParserBase):
Fred Drake1d4601d2001-08-03 19:50:59 +000087 """Find tags and other markup and call handler functions.
88
89 Usage:
90 p = HTMLParser()
91 p.feed(data)
92 ...
93 p.close()
94
95 Start tags are handled by calling self.handle_starttag() or
96 self.handle_startendtag(); end tags by self.handle_endtag(). The
97 data between tags is passed from the parser to the derived class
98 by calling self.handle_data() with the data as argument (the data
99 may be split up in arbitrary chunks). Entity references are
100 passed by calling self.handle_entityref() with the entity
101 reference as the argument. Numeric character references are
102 passed to self.handle_charref() with the string containing the
103 reference as the argument.
104 """
Guido van Rossum8846d712001-05-18 14:50:52 +0000105
106 CDATA_CONTENT_ELEMENTS = ("script", "style")
107
R. David Murrayb579dba2010-12-03 04:06:39 +0000108 def __init__(self, strict=True):
109 """Initialize and reset this instance.
Guido van Rossum8846d712001-05-18 14:50:52 +0000110
R. David Murrayb579dba2010-12-03 04:06:39 +0000111 If strict is set to True (the default), errors are raised when invalid
112 HTML is encountered. If set to False, an attempt is instead made to
113 continue parsing, making "best guesses" about the intended meaning, in
114 a fashion similar to what browsers typically do.
115 """
116 self.strict = strict
Guido van Rossum8846d712001-05-18 14:50:52 +0000117 self.reset()
118
Guido van Rossum8846d712001-05-18 14:50:52 +0000119 def reset(self):
Fred Drake1d4601d2001-08-03 19:50:59 +0000120 """Reset this instance. Loses all unprocessed data."""
Guido van Rossum8846d712001-05-18 14:50:52 +0000121 self.rawdata = ''
Guido van Rossum8846d712001-05-18 14:50:52 +0000122 self.lasttag = '???'
Guido van Rossum8846d712001-05-18 14:50:52 +0000123 self.interesting = interesting_normal
Fred Drakecb5c80f2007-12-07 11:10:11 +0000124 _markupbase.ParserBase.reset(self)
Guido van Rossum8846d712001-05-18 14:50:52 +0000125
Guido van Rossum8846d712001-05-18 14:50:52 +0000126 def feed(self, data):
Fred Drake1d4601d2001-08-03 19:50:59 +0000127 """Feed data to the parser.
128
129 Call this as often as you want, with as little or as much text
130 as you want (may include '\n').
131 """
Guido van Rossum8846d712001-05-18 14:50:52 +0000132 self.rawdata = self.rawdata + data
133 self.goahead(0)
134
Guido van Rossum8846d712001-05-18 14:50:52 +0000135 def close(self):
Fred Drake1d4601d2001-08-03 19:50:59 +0000136 """Handle any buffered data."""
Guido van Rossum8846d712001-05-18 14:50:52 +0000137 self.goahead(1)
138
Fred Drakebfc8fea2001-09-24 20:10:28 +0000139 def error(self, message):
140 raise HTMLParseError(message, self.getpos())
Guido van Rossum8846d712001-05-18 14:50:52 +0000141
142 __starttag_text = None
143
Guido van Rossum8846d712001-05-18 14:50:52 +0000144 def get_starttag_text(self):
Fred Drake1d4601d2001-08-03 19:50:59 +0000145 """Return full source of start tag: '<...>'."""
Guido van Rossum8846d712001-05-18 14:50:52 +0000146 return self.__starttag_text
147
148 def set_cdata_mode(self):
149 self.interesting = interesting_cdata
150
151 def clear_cdata_mode(self):
152 self.interesting = interesting_normal
153
    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
157 def goahead(self, end):
158 rawdata = self.rawdata
159 i = 0
160 n = len(rawdata)
161 while i < n:
162 match = self.interesting.search(rawdata, i) # < or &
163 if match:
164 j = match.start()
165 else:
166 j = n
167 if i < j: self.handle_data(rawdata[i:j])
168 i = self.updatepos(i, j)
169 if i == n: break
Fred Drake248b0432001-12-03 17:09:50 +0000170 startswith = rawdata.startswith
171 if startswith('<', i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000172 if starttagopen.match(rawdata, i): # < + letter
173 k = self.parse_starttag(i)
Fred Drake248b0432001-12-03 17:09:50 +0000174 elif startswith("</", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000175 k = self.parse_endtag(i)
Fred Drake248b0432001-12-03 17:09:50 +0000176 elif startswith("<!--", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000177 k = self.parse_comment(i)
Fred Drake248b0432001-12-03 17:09:50 +0000178 elif startswith("<?", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000179 k = self.parse_pi(i)
Fred Drake248b0432001-12-03 17:09:50 +0000180 elif startswith("<!", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000181 k = self.parse_declaration(i)
Fred Drake68eac2b2001-09-04 15:10:16 +0000182 elif (i + 1) < n:
Fred Drake029acfb2001-08-20 21:24:19 +0000183 self.handle_data("<")
184 k = i + 1
Fred Drake68eac2b2001-09-04 15:10:16 +0000185 else:
186 break
Guido van Rossum8846d712001-05-18 14:50:52 +0000187 if k < 0:
R. David Murrayb579dba2010-12-03 04:06:39 +0000188 if not end:
189 break
190 if self.strict:
Fred Drakebfc8fea2001-09-24 20:10:28 +0000191 self.error("EOF in middle of construct")
R. David Murrayb579dba2010-12-03 04:06:39 +0000192 k = rawdata.find('>', i + 1)
193 if k < 0:
194 k = rawdata.find('<', i + 1)
195 if k < 0:
196 k = i + 1
197 else:
198 k += 1
199 self.handle_data(rawdata[i:k])
Guido van Rossum8846d712001-05-18 14:50:52 +0000200 i = self.updatepos(i, k)
Fred Drake248b0432001-12-03 17:09:50 +0000201 elif startswith("&#", i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000202 match = charref.match(rawdata, i)
203 if match:
Fred Drake1d4601d2001-08-03 19:50:59 +0000204 name = match.group()[2:-1]
Guido van Rossum8846d712001-05-18 14:50:52 +0000205 self.handle_charref(name)
206 k = match.end()
Fred Drake248b0432001-12-03 17:09:50 +0000207 if not startswith(';', k-1):
Fred Drake029acfb2001-08-20 21:24:19 +0000208 k = k - 1
Guido van Rossum8846d712001-05-18 14:50:52 +0000209 i = self.updatepos(i, k)
210 continue
Fred Drake68eac2b2001-09-04 15:10:16 +0000211 else:
Victor Stinnere021f4b2010-05-24 21:46:25 +0000212 if ";" in rawdata[i:]: #bail by consuming &#
213 self.handle_data(rawdata[0:2])
214 i = self.updatepos(i, 2)
Fred Drake68eac2b2001-09-04 15:10:16 +0000215 break
Fred Drake248b0432001-12-03 17:09:50 +0000216 elif startswith('&', i):
Guido van Rossum8846d712001-05-18 14:50:52 +0000217 match = entityref.match(rawdata, i)
218 if match:
219 name = match.group(1)
220 self.handle_entityref(name)
221 k = match.end()
Fred Drake248b0432001-12-03 17:09:50 +0000222 if not startswith(';', k-1):
Fred Drake029acfb2001-08-20 21:24:19 +0000223 k = k - 1
Guido van Rossum8846d712001-05-18 14:50:52 +0000224 i = self.updatepos(i, k)
225 continue
Fred Drake029acfb2001-08-20 21:24:19 +0000226 match = incomplete.match(rawdata, i)
227 if match:
Fred Drake68eac2b2001-09-04 15:10:16 +0000228 # match.group() will contain at least 2 chars
Fred Drake248b0432001-12-03 17:09:50 +0000229 if end and match.group() == rawdata[i:]:
R. David Murrayb579dba2010-12-03 04:06:39 +0000230 if self.strict:
231 self.error("EOF in middle of entity or char ref")
232 else:
233 if k <= i:
234 k = n
235 i = self.updatepos(i, i + 1)
Fred Drake68eac2b2001-09-04 15:10:16 +0000236 # incomplete
237 break
238 elif (i + 1) < n:
239 # not the end of the buffer, and can't be confused
240 # with some other construct
241 self.handle_data("&")
242 i = self.updatepos(i, i + 1)
243 else:
244 break
Guido van Rossum8846d712001-05-18 14:50:52 +0000245 else:
246 assert 0, "interesting.search() lied"
247 # end while
248 if end and i < n:
249 self.handle_data(rawdata[i:n])
250 i = self.updatepos(i, n)
251 self.rawdata = rawdata[i:]
252
Guido van Rossum8846d712001-05-18 14:50:52 +0000253 # Internal -- parse processing instr, return end or -1 if not terminated
254 def parse_pi(self, i):
255 rawdata = self.rawdata
256 assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
257 match = piclose.search(rawdata, i+2) # >
258 if not match:
259 return -1
260 j = match.start()
261 self.handle_pi(rawdata[i+2: j])
262 j = match.end()
263 return j
264
265 # Internal -- handle starttag, return end or -1 if not terminated
266 def parse_starttag(self, i):
267 self.__starttag_text = None
268 endpos = self.check_for_whole_start_tag(i)
269 if endpos < 0:
270 return endpos
271 rawdata = self.rawdata
272 self.__starttag_text = rawdata[i:endpos]
273
274 # Now parse the data between i+1 and j into a tag and attrs
275 attrs = []
276 match = tagfind.match(rawdata, i+1)
277 assert match, 'unexpected call to parse_starttag()'
278 k = match.end()
Fred Drake248b0432001-12-03 17:09:50 +0000279 self.lasttag = tag = rawdata[i+1:k].lower()
Guido van Rossum8846d712001-05-18 14:50:52 +0000280
281 while k < endpos:
R. David Murrayb579dba2010-12-03 04:06:39 +0000282 if self.strict:
283 m = attrfind.match(rawdata, k)
284 else:
285 m = attrfind_tolerant.search(rawdata, k)
Guido van Rossum8846d712001-05-18 14:50:52 +0000286 if not m:
287 break
288 attrname, rest, attrvalue = m.group(1, 2, 3)
289 if not rest:
290 attrvalue = None
291 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
292 attrvalue[:1] == '"' == attrvalue[-1:]:
293 attrvalue = attrvalue[1:-1]
294 attrvalue = self.unescape(attrvalue)
Fred Drake248b0432001-12-03 17:09:50 +0000295 attrs.append((attrname.lower(), attrvalue))
Guido van Rossum8846d712001-05-18 14:50:52 +0000296 k = m.end()
297
Fred Drake248b0432001-12-03 17:09:50 +0000298 end = rawdata[k:endpos].strip()
Guido van Rossum8846d712001-05-18 14:50:52 +0000299 if end not in (">", "/>"):
300 lineno, offset = self.getpos()
301 if "\n" in self.__starttag_text:
Fred Drake248b0432001-12-03 17:09:50 +0000302 lineno = lineno + self.__starttag_text.count("\n")
Guido van Rossum8846d712001-05-18 14:50:52 +0000303 offset = len(self.__starttag_text) \
Fred Drake248b0432001-12-03 17:09:50 +0000304 - self.__starttag_text.rfind("\n")
Guido van Rossum8846d712001-05-18 14:50:52 +0000305 else:
306 offset = offset + len(self.__starttag_text)
R. David Murrayb579dba2010-12-03 04:06:39 +0000307 if self.strict:
308 self.error("junk characters in start tag: %r"
309 % (rawdata[k:endpos][:20],))
310 self.handle_data(rawdata[i:endpos])
311 return endpos
Fred Drake248b0432001-12-03 17:09:50 +0000312 if end.endswith('/>'):
Guido van Rossum8846d712001-05-18 14:50:52 +0000313 # XHTML-style empty tag: <span attr="value" />
314 self.handle_startendtag(tag, attrs)
315 else:
316 self.handle_starttag(tag, attrs)
317 if tag in self.CDATA_CONTENT_ELEMENTS:
318 self.set_cdata_mode()
319 return endpos
320
    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
323 def check_for_whole_start_tag(self, i):
324 rawdata = self.rawdata
R. David Murrayb579dba2010-12-03 04:06:39 +0000325 if self.strict:
326 m = locatestarttagend.match(rawdata, i)
327 else:
328 m = locatestarttagend_tolerant.match(rawdata, i)
Guido van Rossum8846d712001-05-18 14:50:52 +0000329 if m:
330 j = m.end()
331 next = rawdata[j:j+1]
332 if next == ">":
333 return j + 1
334 if next == "/":
Fred Drake248b0432001-12-03 17:09:50 +0000335 if rawdata.startswith("/>", j):
Guido van Rossum8846d712001-05-18 14:50:52 +0000336 return j + 2
Fred Drake248b0432001-12-03 17:09:50 +0000337 if rawdata.startswith("/", j):
Guido van Rossum8846d712001-05-18 14:50:52 +0000338 # buffer boundary
339 return -1
340 # else bogus input
R. David Murrayb579dba2010-12-03 04:06:39 +0000341 if self.strict:
342 self.updatepos(i, j + 1)
343 self.error("malformed empty start tag")
344 if j > i:
345 return j
346 else:
347 return i + 1
Guido van Rossum8846d712001-05-18 14:50:52 +0000348 if next == "":
349 # end of input
350 return -1
351 if next in ("abcdefghijklmnopqrstuvwxyz=/"
352 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
353 # end of input in or before attribute value, or we have the
354 # '/' from a '/>' ending
355 return -1
R. David Murrayb579dba2010-12-03 04:06:39 +0000356 if self.strict:
357 self.updatepos(i, j)
358 self.error("malformed start tag")
359 if j > i:
360 return j
361 else:
362 return i + 1
Fred Drakebfc8fea2001-09-24 20:10:28 +0000363 raise AssertionError("we should not get here!")
Guido van Rossum8846d712001-05-18 14:50:52 +0000364
365 # Internal -- parse endtag, return end or -1 if incomplete
366 def parse_endtag(self, i):
367 rawdata = self.rawdata
368 assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
369 match = endendtag.search(rawdata, i+1) # >
370 if not match:
371 return -1
372 j = match.end()
373 match = endtagfind.match(rawdata, i) # </ + tag + >
374 if not match:
R. David Murrayb579dba2010-12-03 04:06:39 +0000375 if self.strict:
376 self.error("bad end tag: %r" % (rawdata[i:j],))
377 k = rawdata.find('<', i + 1, j)
378 if k > i:
379 j = k
380 if j <= i:
381 j = i + 1
382 self.handle_data(rawdata[i:j])
383 return j
Guido van Rossum8846d712001-05-18 14:50:52 +0000384 tag = match.group(1)
Fred Drake248b0432001-12-03 17:09:50 +0000385 self.handle_endtag(tag.lower())
Fred Drake30d59ba2002-05-14 15:50:11 +0000386 self.clear_cdata_mode()
Guido van Rossum8846d712001-05-18 14:50:52 +0000387 return j
388
389 # Overridable -- finish processing of start+end tag: <tag.../>
390 def handle_startendtag(self, tag, attrs):
391 self.handle_starttag(tag, attrs)
392 self.handle_endtag(tag)
393
394 # Overridable -- handle start tag
395 def handle_starttag(self, tag, attrs):
396 pass
397
398 # Overridable -- handle end tag
399 def handle_endtag(self, tag):
400 pass
401
402 # Overridable -- handle character reference
403 def handle_charref(self, name):
404 pass
405
406 # Overridable -- handle entity reference
407 def handle_entityref(self, name):
408 pass
409
410 # Overridable -- handle data
411 def handle_data(self, data):
412 pass
413
414 # Overridable -- handle comment
415 def handle_comment(self, data):
416 pass
417
418 # Overridable -- handle declaration
419 def handle_decl(self, decl):
420 pass
421
422 # Overridable -- handle processing instruction
423 def handle_pi(self, data):
424 pass
425
Fred Drakebfc8fea2001-09-24 20:10:28 +0000426 def unknown_decl(self, data):
R. David Murrayb579dba2010-12-03 04:06:39 +0000427 if self.strict:
428 self.error("unknown declaration: %r" % (data,))
Fred Drakebfc8fea2001-09-24 20:10:28 +0000429
Guido van Rossum8846d712001-05-18 14:50:52 +0000430 # Internal -- helper to remove special character quoting
Guido van Rossumd8faa362007-04-27 19:54:29 +0000431 entitydefs = None
Guido van Rossum8846d712001-05-18 14:50:52 +0000432 def unescape(self, s):
433 if '&' not in s:
434 return s
Guido van Rossumd8faa362007-04-27 19:54:29 +0000435 def replaceEntities(s):
436 s = s.groups()[0]
437 if s[0] == "#":
438 s = s[1:]
439 if s[0] in ['x','X']:
440 c = int(s[1:], 16)
441 else:
442 c = int(s)
Guido van Rossum84fc66d2007-05-03 17:18:26 +0000443 return chr(c)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000444 else:
Fred Drake3c50ea42008-05-17 22:02:32 +0000445 # Cannot use name2codepoint directly, because HTMLParser
446 # supports apos, which is not part of HTML 4
447 import html.entities
Guido van Rossumd8faa362007-04-27 19:54:29 +0000448 if HTMLParser.entitydefs is None:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000449 entitydefs = HTMLParser.entitydefs = {'apos':"'"}
Fred Drake3c50ea42008-05-17 22:02:32 +0000450 for k, v in html.entities.name2codepoint.items():
Mark Dickinsonf64dcf32008-05-21 13:51:18 +0000451 entitydefs[k] = chr(v)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000452 try:
453 return self.entitydefs[s]
454 except KeyError:
455 return '&'+s+';'
456
Fred Drake3c50ea42008-05-17 22:02:32 +0000457 return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
Antoine Pitroufd036452008-08-19 17:56:33 +0000458 replaceEntities, s, re.ASCII)