blob: 954ce2647f570a9c9a3cd13e29cc817cf59bf536 [file] [log] [blame]
Fred Drake1d4601d2001-08-03 19:50:59 +00001"""A parser for HTML and XHTML."""
Guido van Rossum8846d712001-05-18 14:50:52 +00002
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
11import re
12import string
13
14# Regular expressions used for parsing
15
# All patterns are raw strings so that regex escapes such as \s and \Z are
# never interpreted (or warned about) as Python string escapes.

# What goahead() scans for: '&' or '<' normally; in CDATA mode only '</'
# or a '<' sitting at the very end of the buffer.
interesting_normal = re.compile(r'[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')

# Prefix of an entity/character reference.  The group is optional, so this
# matches at every '&'.
incomplete = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*'
                        r'|#([0-9]*|[xX][0-9a-fA-F]*))?')

# Complete references, including the single character that terminates them
# (which is not necessarily ';').
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# Openers and closers of the various markup constructs.
starttagopen = re.compile(r'<[a-zA-Z]')
piopen = re.compile(r'<\?')
piclose = re.compile(r'>')
endtagopen = re.compile(r'</')
declopen = re.compile(r'<!')
special = re.compile(r'<![^<>]*>')
commentopen = re.compile(r'<!--')
commentclose = re.compile(r'--\s*>')

# Tag name, and one attribute: group 1 is the name, group 2 the optional
# '= value' part, group 3 the value itself (still carrying its quotes).
tagfind = re.compile(r'[a-zA-Z][-.a-zA-Z0-9:_]*')
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')

# Matches a start tag up to, but not including, its closing '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endstarttag = re.compile(r"\s*/?>")
endendtag = re.compile(r'>')
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

# Pieces of a '<!...>' declaration: a name or a quoted string literal, each
# followed by optional whitespace.
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
57
58
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg     -- the error message (must be true in a boolean context)
        lineno  -- line number of the error, or None
        offset  -- zero-based column of the error, or None
    """

    def __init__(self, msg, position=(None, None)):
        """Store the message and the (lineno, offset) pair *position*."""
        assert msg
        self.msg = msg
        self.lineno, self.offset = position[0], position[1]

    def __str__(self):
        """Return the message followed by whatever position is known."""
        where = ""
        if self.lineno is not None:
            where = ", at line %d" % self.lineno
        if self.offset is not None:
            # The stored offset is zero-based; the column shown is one-based.
            where = where + ", column %d" % (self.offset + 1)
        if not where:
            return self.msg
        return self.msg + where
75
76
class HTMLParser:
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Start tags with these (lower-cased) names switch the parser into
    # CDATA mode; see parse_starttag().
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
        self.lineno = 1
        self.offset = 0
        self.interesting = interesting_normal

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Account for rawdata[i:j] in lineno/offset and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Source text of the most recent complete start tag, or None.
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Make goahead() scan with the CDATA pattern."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Make goahead() scan with the normal pattern again."""
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Process as much of self.rawdata as possible.

        Whatever cannot be processed yet is kept in self.rawdata for
        the next call.  Raises HTMLParseError if 'end' is true and the
        buffer stops in the middle of a construct or reference.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif endtagopen.match(rawdata, i):  # </
                    k = self.parse_endtag(i)
                    if k >= 0:
                        self.clear_cdata_mode()
                elif commentopen.match(rawdata, i):  # <!--
                    k = self.parse_comment(i)
                elif piopen.match(rawdata, i):  # <?
                    k = self.parse_pi(i)
                elif declopen.match(rawdata, i):  # <!
                    k = self.parse_declaration(i)
                elif i + 1 < n or end:
                    # '<' followed by something that cannot start markup
                    # (or by EOF): it is plain text.
                    self.handle_data("<")
                    k = i + 1
                else:
                    # A lone '<' at the end of the buffer may still turn
                    # into markup once more data arrives; keep it.
                    break
                if k < 0:
                    if end:
                        raise HTMLParseError("EOF in middle of construct",
                                             self.getpos())
                    break
                i = self.updatepos(i, k)
            elif rawdata[i] == '&':
                match = charref.match(rawdata, i)
                if match:
                    # Strip the leading '&#' and the terminating character.
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if rawdata[k-1] != ';':
                        # The terminator was not ';', so it is ordinary
                        # input that must still be processed.
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if rawdata[k-1] != ';':
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                # 'incomplete' matches at every '&'; the reference is only
                # really incomplete when the match runs to the end of the
                # buffer, i.e. more input could still extend it.
                match = incomplete.match(rawdata, i)
                if match and match.end() == n:
                    if not end:
                        # Leave the loop (not the method) so the unparsed
                        # tail is stored in self.rawdata below.
                        break
                    if match.end() - i > 1:
                        raise HTMLParseError(
                            "EOF in middle of entity or char ref",
                            self.getpos())
                # A '&' that cannot start a reference is plain text.
                self.handle_data("&")
                i = self.updatepos(i, i + 1)
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse comment, return end or -1 if not terminated
    def parse_comment(self, i):
        """Parse the comment starting at rawdata[i] ('<!--').

        Calls handle_comment() with the comment text and returns the
        index just past the comment, or -1 if it is not yet terminated.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        self.handle_comment(rawdata[i+4:match.start()])
        return match.end()

    # Internal -- parse declaration.
    def parse_declaration(self, i):
        """Parse the declaration starting at rawdata[i] ('<!').

        Calls handle_decl() with the text between '<!' and '>' and
        returns the index just past the '>', or -1 if the declaration
        is incomplete.  Raises HTMLParseError on an unexpected
        character.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+2] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.  (A '-' followed by anything
            # other than a second '-' is not a comment and is rejected
            # by the loop below instead of stalling the parser.)
            return -1
        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                self.handle_decl(rawdata[i+2:j])
                return j + 1
            if c in "\"'":
                m = declstringlit.match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                m = declname.match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            else:
                raise HTMLParseError(
                    "unexpected char in declaration: %s" % repr(rawdata[j]),
                    self.getpos())
        return -1  # incomplete

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the processing instruction starting at rawdata[i] ('<?').

        Calls handle_pi() with the text between '<?' and the next '>'
        and returns the index just past that '>', or -1 if there is
        none yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag beginning at rawdata[i].

        Calls handle_starttag() or handle_startendtag() with the
        lower-cased tag name and a list of (name, value) pairs, where
        name is lower-cased and value is None for an attribute given
        without '='.  Returns the index just past the tag, or -1 if
        the tag is incomplete.  Raises HTMLParseError for junk before
        the closing '>' or '/>'.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute without '=': there is no value at all.
                attrvalue = None
            else:
                if attrvalue[:1] == '\'' == attrvalue[-1:] or \
                   attrvalue[:1] == '"' == attrvalue[-1:]:
                    attrvalue = attrvalue[1:-1]
                # Entity references are replaced in quoted and unquoted
                # values alike.
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Report the position just past the offending start tag.
            lineno, offset = self.getpos()
            tagtext = self.__starttag_text
            if "\n" in tagtext:
                lineno = lineno + tagtext.count("\n")
                offset = len(tagtext) - tagtext.rfind("\n")
            else:
                offset = offset + len(tagtext)
            raise HTMLParseError("junk characters in start tag: %s"
                                 % repr(rawdata[k:endpos][:20]),
                                 (lineno, offset))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag at rawdata[i].

        Returns -1 if the tag may still be completed by more input.
        Raises HTMLParseError (after moving the current position up to
        the problem) if the tag is malformed.
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                s = rawdata[j:j+2]
                if s == "/>":
                    return j + 2
                if s == "/":
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                raise HTMLParseError("malformed empty start tag",
                                     self.getpos())
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            raise HTMLParseError("malformed start tag", self.getpos())
        raise AssertionError("we should not gt here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag beginning at rawdata[i] ('</').

        Calls handle_endtag() with the lower-cased tag name and returns
        the index just past the closing '>', or -1 if no '>' has been
        seen yet.  Raises HTMLParseError if the text up to that '>' is
        not a well-formed end tag.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            raise HTMLParseError("bad end tag: %s" % repr(rawdata[i:j]),
                                 self.getpos())
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle '<tag .../>' as a start tag followed by an end tag."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called with the tag name and its list of (name, value) pairs."""
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called with the name of an end tag."""
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called with the text of a character reference, minus '&#'."""
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called with the name of an entity reference."""
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Called with a chunk of text found between markup."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Called with the text found after '<!--' in a comment."""
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Called with the text between '<!' and '>' of a declaration."""
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called with the text between '<?' and the closing '>'."""
        pass

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Return s with &lt; &gt; &apos; &quot; and &amp; replaced."""
        if '&' not in s:
            return s
        s = s.replace("&lt;", "<")
        s = s.replace("&gt;", ">")
        s = s.replace("&apos;", "'")
        s = s.replace("&quot;", '"')
        s = s.replace("&amp;", "&")  # Must be last
        return s