blob: c2c7f6bf5da5fcab99caf17e19a7a7daafed0c83 [file] [log] [blame]
Fred Drake1d4601d2001-08-03 19:50:59 +00001"""A parser for HTML and XHTML."""
Guido van Rossum8846d712001-05-18 14:50:52 +00002
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
Fred Drakecb5c80f2007-12-07 11:10:11 +000011import _markupbase
Guido van Rossum8846d712001-05-18 14:50:52 +000012import re
Guido van Rossum8846d712001-05-18 14:50:52 +000013
14# Regular expressions used for parsing
15
# Scanners used by HTMLParser.goahead() to find the next "interesting"
# character: '&' or '<' in normal mode; in CDATA mode only '<' followed by
# '/' or by the end of the buffer.
interesting_normal = re.compile('[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')
# '&' followed by a letter or '#': the start of something that might still
# become an entity or character reference once more data arrives.
incomplete = re.compile('&[a-zA-Z#]')

# Named entity reference: '&', the name (group 1), then ONE terminating
# character that is not alphanumeric (often, but not necessarily, ';').
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Numeric character reference (decimal or hex), likewise followed by one
# terminating character that is not a hex digit.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# One attribute: name (group 1), optional "= value" part (group 2), and the
# value itself including any surrounding quotes (group 3).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')

# Matches a start tag up to, but not including, its closing '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# Raw string: '\s' inside a plain string literal is an invalid escape
# sequence (it only works by accident and triggers a warning on modern
# Python).  The compiled pattern is unchanged.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
47
Guido van Rossum8846d712001-05-18 14:50:52 +000048
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: the error message (must be truthy).
        lineno: first element of *position*, or None if unknown.
        offset: second element of *position* (0-based column), or None.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno, self.offset = position[0], position[1]

    def __str__(self):
        # Build the message piecewise; line and column are appended only
        # when they are known.  The column is reported 1-based.
        pieces = [self.msg]
        if self.lineno is not None:
            pieces.append(", at line %d" % self.lineno)
        if self.offset is not None:
            pieces.append(", column %d" % (self.offset + 1))
        return "".join(pieces)
65
66
class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements after whose start tag the parser switches to CDATA mode
    # (only '</' or end-of-buffer is then treated as markup).
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current position."""
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Switch the scanner so that only '</' (or EOF) is markup."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Switch the scanner back to treating '&' and '<' as markup."""
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        startswith = rawdata.startswith  # loop-invariant bound method
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if startswith('<', i):
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    # A lone '<' that starts no construct is plain data.
                    self.handle_data("<")
                    k = i + 1
                else:
                    # '<' is the last buffered char; wait for more input.
                    break
                if k < 0:
                    # The sub-parser needs more data than is buffered.
                    if end:
                        self.error("EOF in middle of construct")
                    break
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # Strip the leading '&#' and the one terminator char.
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # Terminator was not ';': leave it in the stream.
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]: # bail by consuming &#
                        # Offsets must be relative to i, not to the start
                        # of the buffer; otherwise the wrong two chars are
                        # emitted and the cursor can move backwards,
                        # duplicating already-delivered data.
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            # At EOF everything still buffered is delivered as data.
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute without "= value".
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Quoted value: drop the quotes and resolve references.
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # (The previous code computed an adjusted lineno/offset here
            # but never used it; error() reports self.getpos().)
            self.error("junk characters in start tag: %r"
                       % (rawdata[k:endpos][:20],))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            next_char = rawdata[j:j+1]
            if next_char == ">":
                return j + 1
            if next_char == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if next_char == "":
                # end of input
                return -1
            if next_char in ("abcdefghijklmnopqrstuvwxyz=/"
                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            self.error("bad end tag: %r" % (rawdata[i:j],))
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        self.clear_cdata_mode()
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    entitydefs = None
    def unescape(self, s):
        """Return *s* with entity and character references replaced.

        Known named entities (HTML 4 names plus 'apos') and well-formed
        numeric references are replaced by the character they denote.
        Unknown names and malformed or out-of-range numeric references
        are left in the text unchanged.
        """
        if '&' not in s:
            return s

        def replaceEntities(match):
            ref = match.group(1)
            if ref[0] == "#":
                digits = ref[1:]
                try:
                    if digits[0] in ('x', 'X'):
                        codepoint = int(digits[1:], 16)
                    else:
                        codepoint = int(digits)
                    return chr(codepoint)
                except (ValueError, OverflowError):
                    # e.g. '&#xyz;', '&#x;' or a code point chr() rejects:
                    # keep the original text instead of crashing.
                    return match.group(0)
            # Cannot use name2codepoint directly, because HTMLParser
            # supports apos, which is not part of HTML 4
            import html.entities
            if HTMLParser.entitydefs is None:
                # Built lazily once and cached on the class.
                entitydefs = HTMLParser.entitydefs = {'apos': "'"}
                for k, v in html.entities.name2codepoint.items():
                    entitydefs[k] = chr(v)
            try:
                return self.entitydefs[ref]
            except KeyError:
                return '&' + ref + ';'

        # flags must be passed by keyword: the fourth positional parameter
        # of re.sub() is 'count', so a positional re.ASCII silently capped
        # the number of replacements at 256 and never applied the flag.
        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
                      replaceEntities, s, flags=re.ASCII)