blob: affaf7344fa4f90d69b065efd936345bc63144f3 [file] [log] [blame]
Fred Drake1d4601d2001-08-03 19:50:59 +00001"""A parser for HTML and XHTML."""
Guido van Rossum8846d712001-05-18 14:50:52 +00002
# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
9
10
Fred Drakecb5c80f2007-12-07 11:10:11 +000011import _markupbase
Guido van Rossum8846d712001-05-18 14:50:52 +000012import re
Guido van Rossum8846d712001-05-18 14:50:52 +000013
14# Regular expressions used for parsing
15
# Patterns that locate the next "interesting" character while scanning:
# '&' or '<' in normal content, only '</' (or '<' at end of buffer) in
# CDATA content.
interesting_normal = re.compile('[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')
# Start of something that may become an entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume the single character that ends the
# reference (';' or anything else); callers give it back when it is not ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# Note, the strict one of this pair isn't really strict, but we can't
# make it correctly strict without breaking backward compatibility.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r',?\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s*                             # optional whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# Raw string: '\s' in a plain string literal is an invalid escape sequence.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
66
Guido van Rossum8846d712001-05-18 14:50:52 +000067
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: the error message (must be truthy).
        lineno: first element of the position pair, or None.
        offset: second element of the position pair, or None; it is
            shown as ``offset + 1`` in the string form.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        # Hand the message to Exception so that args, repr() and pickling
        # carry it; without this call args would stay empty.
        super().__init__(msg)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by whatever position parts are known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            result = result + ", column %d" % (self.offset + 1)
        return result
84
85
Fred Drakecb5c80f2007-12-07 11:10:11 +000086class HTMLParser(_markupbase.ParserBase):
Fred Drake1d4601d2001-08-03 19:50:59 +000087 """Find tags and other markup and call handler functions.
88
89 Usage:
90 p = HTMLParser()
91 p.feed(data)
92 ...
93 p.close()
94
95 Start tags are handled by calling self.handle_starttag() or
96 self.handle_startendtag(); end tags by self.handle_endtag(). The
97 data between tags is passed from the parser to the derived class
98 by calling self.handle_data() with the data as argument (the data
99 may be split up in arbitrary chunks). Entity references are
100 passed by calling self.handle_entityref() with the entity
101 reference as the argument. Numeric character references are
102 passed to self.handle_charref() with the string containing the
103 reference as the argument.
104 """
Guido van Rossum8846d712001-05-18 14:50:52 +0000105
106 CDATA_CONTENT_ELEMENTS = ("script", "style")
107
R. David Murrayb579dba2010-12-03 04:06:39 +0000108 def __init__(self, strict=True):
109 """Initialize and reset this instance.
Guido van Rossum8846d712001-05-18 14:50:52 +0000110
R. David Murrayb579dba2010-12-03 04:06:39 +0000111 If strict is set to True (the default), errors are raised when invalid
112 HTML is encountered. If set to False, an attempt is instead made to
113 continue parsing, making "best guesses" about the intended meaning, in
114 a fashion similar to what browsers typically do.
115 """
116 self.strict = strict
Guido van Rossum8846d712001-05-18 14:50:52 +0000117 self.reset()
118
def reset(self):
    """Return the parser to its initial state, dropping unprocessed data."""
    # The three assignments are independent of each other and of the
    # base-class reset that follows.
    self.interesting = interesting_normal
    self.lasttag = '???'
    self.rawdata = ''
    _markupbase.ParserBase.reset(self)
Guido van Rossum8846d712001-05-18 14:50:52 +0000125
Guido van Rossum8846d712001-05-18 14:50:52 +0000126 def feed(self, data):
Éric Araujo39f180b2011-05-04 15:55:47 +0200127 r"""Feed data to the parser.
Fred Drake1d4601d2001-08-03 19:50:59 +0000128
129 Call this as often as you want, with as little or as much text
130 as you want (may include '\n').
131 """
Guido van Rossum8846d712001-05-18 14:50:52 +0000132 self.rawdata = self.rawdata + data
133 self.goahead(0)
134
Guido van Rossum8846d712001-05-18 14:50:52 +0000135 def close(self):
Fred Drake1d4601d2001-08-03 19:50:59 +0000136 """Handle any buffered data."""
Guido van Rossum8846d712001-05-18 14:50:52 +0000137 self.goahead(1)
138
def error(self, message):
    """Raise HTMLParseError for message, tagged with self.getpos()."""
    position = self.getpos()
    raise HTMLParseError(message, position)
Guido van Rossum8846d712001-05-18 14:50:52 +0000141
142 __starttag_text = None
143
Guido van Rossum8846d712001-05-18 14:50:52 +0000144 def get_starttag_text(self):
Fred Drake1d4601d2001-08-03 19:50:59 +0000145 """Return full source of start tag: '<...>'."""
Guido van Rossum8846d712001-05-18 14:50:52 +0000146 return self.__starttag_text
147
def set_cdata_mode(self):
    """Make the scanner look for interesting_cdata instead of '&' and '<'."""
    self.interesting = interesting_cdata
150
def clear_cdata_mode(self):
    """Make the scanner look for the normal '&' / '<' pattern again."""
    self.interesting = interesting_normal
153
154 # Internal -- handle data as far as reasonable. May leave state
155 # and data to be processed by a subsequent call. If 'end' is
156 # true, force handling all data as if followed by EOF marker.
def goahead(self, end):
    """Process buffered input and dispatch to the handler methods.

    Scans self.rawdata from its start.  Text between constructs goes to
    handle_data(); '<' constructs go to the matching parse_*() method;
    '&' constructs go to handle_charref() / handle_entityref().  The
    part that cannot be processed yet is stored back in self.rawdata.

    If end is true, all remaining data is handled as if it were followed
    by an EOF marker: an unterminated construct raises an error in strict
    mode and is otherwise passed on as plain data.
    """
    rawdata = self.rawdata
    i = 0
    n = len(rawdata)
    while i < n:
        match = self.interesting.search(rawdata, i) # < or &
        if match:
            j = match.start()
        else:
            j = n
        if i < j:
            self.handle_data(rawdata[i:j])
        i = self.updatepos(i, j)
        if i == n:
            break
        startswith = rawdata.startswith
        if startswith('<', i):
            if starttagopen.match(rawdata, i): # < + letter
                k = self.parse_starttag(i)
            elif startswith("</", i):
                k = self.parse_endtag(i)
            elif startswith("<!--", i):
                k = self.parse_comment(i)
            elif startswith("<?", i):
                k = self.parse_pi(i)
            elif startswith("<!", i):
                k = self.parse_declaration(i)
            elif (i + 1) < n:
                # a '<' that starts no known construct is plain data
                self.handle_data("<")
                k = i + 1
            else:
                # lone '<' at the end of the buffer: wait for more input
                break
            if k < 0:
                # the construct is not terminated within the buffer
                if not end:
                    break
                if self.strict:
                    self.error("EOF in middle of construct")
                # Tolerant recovery: emit the text up to and including
                # the next '>', else up to the next '<', else a single
                # character, as plain data.
                k = rawdata.find('>', i + 1)
                if k < 0:
                    k = rawdata.find('<', i + 1)
                    if k < 0:
                        k = i + 1
                else:
                    k += 1
                self.handle_data(rawdata[i:k])
            i = self.updatepos(i, k)
        elif startswith("&#", i):
            match = charref.match(rawdata, i)
            if match:
                # strip the leading '&#' and the one terminating character
                name = match.group()[2:-1]
                self.handle_charref(name)
                k = match.end()
                if not startswith(';', k-1):
                    # the terminator was not ';', so it is not part of
                    # the reference: leave it for the next iteration
                    k = k - 1
                i = self.updatepos(i, k)
                continue
            else:
                if ";" in rawdata[i:]: # bail by consuming '&#'
                    # Offsets are relative to i; using absolute 0 and 2
                    # here emitted the wrong text and moved i backwards
                    # or left it stuck.
                    self.handle_data(rawdata[i:i+2])
                    i = self.updatepos(i, i + 2)
                break
        elif startswith('&', i):
            match = entityref.match(rawdata, i)
            if match:
                name = match.group(1)
                self.handle_entityref(name)
                k = match.end()
                if not startswith(';', k-1):
                    k = k - 1
                i = self.updatepos(i, k)
                continue
            match = incomplete.match(rawdata, i)
            if match:
                # match.group() will contain at least 2 chars
                if end and match.group() == rawdata[i:]:
                    if self.strict:
                        self.error("EOF in middle of entity or char ref")
                    else:
                        # Step over the '&'; the rest of the buffer is
                        # emitted as data after the loop.  (A stray
                        # check of 'k' here could raise
                        # UnboundLocalError and has been removed.)
                        # NOTE(review): the '&' itself is not passed to
                        # handle_data() -- confirm this is intended.
                        i = self.updatepos(i, i + 1)
                # incomplete
                break
            elif (i + 1) < n:
                # not the end of the buffer, and can't be confused
                # with some other construct
                self.handle_data("&")
                i = self.updatepos(i, i + 1)
            else:
                break
        else:
            assert 0, "interesting.search() lied"
    # end while
    if end and i < n:
        self.handle_data(rawdata[i:n])
        i = self.updatepos(i, n)
    self.rawdata = rawdata[i:]
252
Guido van Rossum8846d712001-05-18 14:50:52 +0000253 # Internal -- parse processing instr, return end or -1 if not terminated
def parse_pi(self, i):
    """Parse the processing instruction that starts at index i.

    The text between '<?' and the next '>' is passed to handle_pi().
    Returns the index just past that '>', or -1 when the buffer holds
    no '>' after the opening '<?'.
    """
    rawdata = self.rawdata
    assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
    closing = piclose.search(rawdata, i+2) # >
    if closing is None:
        return -1
    self.handle_pi(rawdata[i+2: closing.start()])
    return closing.end()
264
265 # Internal -- handle starttag, return end or -1 if not terminated
def parse_starttag(self, i):
    """Parse the start tag that begins at index i of self.rawdata.

    Calls handle_starttag(), or handle_startendtag() for a tag ending
    in '/>', with the lower-cased tag name and a list of
    (lower-cased name, value) attribute pairs.  A value is None when
    the attribute has no '=value' part; otherwise enclosing quotes are
    removed and character/entity references are replaced.

    Returns the index just past the tag, or the negative value from
    check_for_whole_start_tag() when the tag is not complete yet.  In
    strict mode, junk inside the tag raises an error through
    self.error(); in tolerant mode the whole tag text is passed to
    handle_data() instead.
    """
    self.__starttag_text = None
    endpos = self.check_for_whole_start_tag(i)
    if endpos < 0:
        return endpos
    rawdata = self.rawdata
    self.__starttag_text = rawdata[i:endpos]

    # Now parse the data between i+1 and endpos into a tag and attrs
    attrs = []
    match = tagfind.match(rawdata, i+1)
    assert match, 'unexpected call to parse_starttag()'
    k = match.end()
    self.lasttag = tag = rawdata[i+1:k].lower()
    # the choice of pattern does not change while scanning one tag
    attr_pattern = attrfind if self.strict else attrfind_tolerant
    while k < endpos:
        m = attr_pattern.match(rawdata, k)
        if not m:
            break
        attrname, rest, attrvalue = m.group(1, 2, 3)
        if not rest:
            attrvalue = None
        elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
             attrvalue[:1] == '"' == attrvalue[-1:]:
            attrvalue = attrvalue[1:-1]
        if attrvalue:
            # Replace references in quoted and unquoted values alike;
            # previously only quoted values were processed.
            attrvalue = self.unescape(attrvalue)
        attrs.append((attrname.lower(), attrvalue))
        k = m.end()

    end = rawdata[k:endpos].strip()
    if end not in (">", "/>"):
        if self.strict:
            self.error("junk characters in start tag: %r"
                       % (rawdata[k:endpos][:20],))
        self.handle_data(rawdata[i:endpos])
        return endpos
    if end.endswith('/>'):
        # XHTML-style empty tag: <span attr="value" />
        self.handle_startendtag(tag, attrs)
    else:
        self.handle_starttag(tag, attrs)
        if tag in self.CDATA_CONTENT_ELEMENTS:
            self.set_cdata_mode()
    return endpos
319
320 # Internal -- check to see if we have a complete starttag; return end
321 # or -1 if incomplete.
def check_for_whole_start_tag(self, i):
    """Return the index just past the start tag beginning at index i.

    Returns -1 when the tag looks incomplete: the buffer ends, or the
    next character is a letter, '=' or '/'.  For any other unexpected
    character, strict mode raises an error through self.error() and
    tolerant mode returns the index at which scanning stopped.
    """
    rawdata = self.rawdata
    if self.strict:
        locator = locatestarttagend
    else:
        locator = locatestarttagend_tolerant
    found = locator.match(rawdata, i)
    if not found:
        raise AssertionError("we should not get here!")

    j = found.end()
    following = rawdata[j:j+1]
    if following == ">":
        return j + 1
    if following == "/":
        if rawdata.startswith("/>", j):
            return j + 2
        # A '/' not followed by '>' is always treated as a buffer
        # boundary.  An earlier "malformed empty start tag" fallback
        # sat behind a startswith("/", j) test that is always true at
        # this point, so it could never run and is left out.
        return -1
    if following == "":
        # end of input
        return -1
    if following in ("abcdefghijklmnopqrstuvwxyz=/"
                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
        # end of input in or before attribute value, or we have the
        # '/' from a '/>' ending
        return -1
    if self.strict:
        self.updatepos(i, j)
        self.error("malformed start tag")
    return j if j > i else i + 1
Guido van Rossum8846d712001-05-18 14:50:52 +0000363
364 # Internal -- parse endtag, return end or -1 if incomplete
def parse_endtag(self, i):
    """Parse the end tag that starts at index i of self.rawdata.

    Returns -1 when the buffer holds no '>' after index i.  A
    well-formed tag is passed, lower-cased, to handle_endtag() and
    CDATA mode is cleared.  A malformed tag raises an error in strict
    mode; in tolerant mode its text, cut short at any '<' inside it,
    is passed to handle_data().  The index just past the handled text
    is returned.
    """
    rawdata = self.rawdata
    assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
    closer = endendtag.search(rawdata, i+1) # >
    if closer is None:
        return -1
    j = closer.end()
    found = endtagfind.match(rawdata, i) # </ + tag + >
    if found:
        self.handle_endtag(found.group(1).lower())
        self.clear_cdata_mode()
        return j
    if self.strict:
        self.error("bad end tag: %r" % (rawdata[i:j],))
    # stop before a '<' that appears inside the bad tag
    inner = rawdata.find('<', i + 1, j)
    if inner > i:
        j = inner
    if j <= i:
        j = i + 1
    self.handle_data(rawdata[i:j])
    return j
387
388 # Overridable -- finish processing of start+end tag: <tag.../>
389 def handle_startendtag(self, tag, attrs):
390 self.handle_starttag(tag, attrs)
391 self.handle_endtag(tag)
392
393 # Overridable -- handle start tag
394 def handle_starttag(self, tag, attrs):
395 pass
396
397 # Overridable -- handle end tag
398 def handle_endtag(self, tag):
399 pass
400
401 # Overridable -- handle character reference
402 def handle_charref(self, name):
403 pass
404
405 # Overridable -- handle entity reference
406 def handle_entityref(self, name):
407 pass
408
409 # Overridable -- handle data
410 def handle_data(self, data):
411 pass
412
413 # Overridable -- handle comment
414 def handle_comment(self, data):
415 pass
416
417 # Overridable -- handle declaration
418 def handle_decl(self, decl):
419 pass
420
421 # Overridable -- handle processing instruction
422 def handle_pi(self, data):
423 pass
424
Fred Drakebfc8fea2001-09-24 20:10:28 +0000425 def unknown_decl(self, data):
R. David Murrayb579dba2010-12-03 04:06:39 +0000426 if self.strict:
427 self.error("unknown declaration: %r" % (data,))
Fred Drakebfc8fea2001-09-24 20:10:28 +0000428
Guido van Rossum8846d712001-05-18 14:50:52 +0000429 # Internal -- helper to remove special character quoting
Guido van Rossumd8faa362007-04-27 19:54:29 +0000430 entitydefs = None
Guido van Rossum8846d712001-05-18 14:50:52 +0000431 def unescape(self, s):
432 if '&' not in s:
433 return s
Guido van Rossumd8faa362007-04-27 19:54:29 +0000434 def replaceEntities(s):
435 s = s.groups()[0]
Senthil Kumaran164540f2010-12-28 15:55:16 +0000436 try:
437 if s[0] == "#":
438 s = s[1:]
439 if s[0] in ['x','X']:
440 c = int(s[1:], 16)
441 else:
442 c = int(s)
443 return chr(c)
444 except ValueError:
445 return '&#'+ s +';'
Guido van Rossumd8faa362007-04-27 19:54:29 +0000446 else:
Fred Drake3c50ea42008-05-17 22:02:32 +0000447 # Cannot use name2codepoint directly, because HTMLParser
448 # supports apos, which is not part of HTML 4
449 import html.entities
Guido van Rossumd8faa362007-04-27 19:54:29 +0000450 if HTMLParser.entitydefs is None:
Guido van Rossumef87d6e2007-05-02 19:09:54 +0000451 entitydefs = HTMLParser.entitydefs = {'apos':"'"}
Fred Drake3c50ea42008-05-17 22:02:32 +0000452 for k, v in html.entities.name2codepoint.items():
Mark Dickinsonf64dcf32008-05-21 13:51:18 +0000453 entitydefs[k] = chr(v)
Guido van Rossumd8faa362007-04-27 19:54:29 +0000454 try:
455 return self.entitydefs[s]
456 except KeyError:
457 return '&'+s+';'
458
Fred Drake3c50ea42008-05-17 22:02:32 +0000459 return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
Ezio Melottid9e0b062011-09-05 17:11:06 +0300460 replaceEntities, s, flags=re.ASCII)