blob: bd6a885fcb2f7e92ff19fd7255b59de7c02e4ae4 [file] [log] [blame]
Fred Drake1d4601d2001-08-03 19:50:59 +00001"""A parser for HTML and XHTML."""
Guido van Rossum8846d712001-05-18 14:50:52 +00002
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
Fred Drakebfc8fea2001-09-24 20:10:28 +000011import markupbase
Guido van Rossum8846d712001-05-18 14:50:52 +000012import re
Guido van Rossum8846d712001-05-18 14:50:52 +000013
14# Regular expressions used for parsing
15
# Scanner patterns: locate the next character that needs special handling.
# In normal mode both '&' and '<' are interesting; in CDATA mode only a '<'
# that starts an end tag ('</') or sits at the very end of the buffer is.
interesting_normal = re.compile('[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')
# Start of something that could still become an entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns consume ONE trailing terminator character (';' or
# anything else that cannot continue the reference); callers step back by one
# when that terminator is not ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# group(1): attribute name, group(2): the whole "= value" part (None when the
# attribute has no value), group(3): the value including any quotes.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')

# Matches a start tag from '<' up to (but not including) its closing '>' or
# '/>', so the caller can inspect the character that follows the match.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# Raw string: '\s' in a plain string literal is an invalid escape sequence.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
47
Guido van Rossum8846d712001-05-18 14:50:52 +000048
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg    -- the error description (must be non-empty)
        lineno -- line number from the position pair, or None
        offset -- zero-based column from the position pair, or None
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.lineno = position[0]
        self.offset = position[1]
        self.msg = msg

    def __str__(self):
        # Start from the bare message and append whichever parts of the
        # position are known.
        pieces = [self.msg]
        if self.lineno is not None:
            pieces.append(", at line %d" % self.lineno)
        if self.offset is not None:
            # Stored offset is zero-based; report a one-based column.
            pieces.append(", column %d" % (self.offset + 1))
        return "".join(pieces)
65
66
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements after whose start tag the scanner switches to CDATA mode.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for message at the current position."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the most recently parsed start tag; None until a
    # complete start tag has been seen.
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Switch the scanner to CDATA mode (see interesting_cdata)."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Switch the scanner back to normal mode (see interesting_normal)."""
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        startswith = rawdata.startswith     # loop-invariant bound method
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                j = n
            # Everything before the next interesting character is plain data.
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if startswith('<', i):
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                    if k >= 0:
                        # A complete end tag always returns to normal mode.
                        self.clear_cdata_mode()
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    # A '<' that starts no known construct is literal data.
                    self.handle_data("<")
                    k = i + 1
                else:
                    # Lone '<' at the end of the buffer: wait for more input.
                    break
                if k < 0:
                    # The construct is incomplete in the buffer.
                    if end:
                        self.error("EOF in middle of construct")
                    break
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # Strip the leading '&#' and the one terminator character.
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # The terminator is consumed only when it is ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    # NOTE(review): a '&#' that can never become a valid
                    # character reference also ends up here, so scanning
                    # stops at it on every call until close().
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # The terminator is consumed only when it is ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            # At EOF whatever could not be parsed is flushed as plain data.
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # Keep the unprocessed tail for the next call.
        self.rawdata = rawdata[i:]

    # Internal -- parse comment, return end or -1 if not terminated
    def parse_comment(self, i, report=1):
        rawdata = self.rawdata
        assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        if report:
            # Report the text between '<!--' and the closing '--'.
            j = match.start()
            self.handle_comment(rawdata[i+4: j])
        j = match.end()
        return j

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without '= value'.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Quoted value: drop the quotes and decode the entities
                # handled by unescape().  Bare values are passed as is.
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # error() reports the position obtained from getpos(), so no
            # position is computed here.
            self.error("junk characters in start tag: %s"
                       % repr(rawdata[k:endpos][:20]))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if j + 1 >= len(rawdata):
                    # buffer boundary: the '/' is the last character seen
                    # so far and a '>' may still follow
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            self.error("bad end tag: %s" % repr(rawdata[i:j]))
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    # Overridable -- the default treats an unknown declaration as an error
    def unknown_decl(self, data):
        self.error("unknown declaration: " + repr(data))

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        if '&' not in s:
            return s
        s = s.replace("&lt;", "<")
        s = s.replace("&gt;", ">")
        s = s.replace("&apos;", "'")
        s = s.replace("&quot;", '"')
        s = s.replace("&amp;", "&") # Must be last
        return s