"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
9
10
import re
import string
13
# Regular expressions used for parsing.
#
# Every pattern is written as a raw string so that regex escapes such as
# \s and \Z reach the re module untouched instead of relying on Python
# passing unknown string escapes through unchanged.

# Characters that interrupt plain character data in normal mode.
interesting_normal = re.compile(r'[&<]')
# In CDATA mode only '</' (or a '<' at the very end of the buffer) matters.
interesting_cdata = re.compile(r'<(/|\Z)')
# A '&' optionally followed by the beginning of an entity or char ref.
incomplete = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?')

# Complete references; each pattern also consumes the one character that
# terminates the reference (which may or may not be ';').
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# Openers and closers of the various markup constructs.
starttagopen = re.compile(r'<[a-zA-Z]')
piopen = re.compile(r'<\?')
piclose = re.compile(r'>')
endtagopen = re.compile(r'</')
declopen = re.compile(r'<!')
special = re.compile(r'<![^<>]*>')
commentopen = re.compile(r'<!--')
commentclose = re.compile(r'--\s*>')

# Tag names and attributes.
tagfind = re.compile(r'[a-zA-Z][-.a-zA-Z0-9:_]*')
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')

# Matches a start tag up to, but not including, its closing '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endstarttag = re.compile(r"\s*/?>")
endendtag = re.compile(r'>')
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

# Pieces of a '<!...>' declaration.
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
56
57
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg -- description of the problem (must be non-empty)
        lineno -- line number taken from position[0], or None
        offset -- zero-based column taken from position[1], or None
    """

    def __init__(self, msg, position=(None, None)):
        """Record the message and the (lineno, offset) position pair."""
        assert msg
        # Initialize the base class so that the standard 'args' attribute
        # mirrors the constructor arguments.
        Exception.__init__(self, msg, position)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by whatever position is known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            # The stored offset is zero-based; columns are shown one-based.
            result = result + ", column %d" % (self.offset + 1)
        return result
74
75
class HTMLParser:
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Start tags with one of these (lower-cased) names switch the
    # scanner into CDATA mode; see parse_starttag().
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
        self.lineno = 1
        self.offset = 0
        self.interesting = interesting_normal

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance the position over rawdata[i:j] and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Source text of the most recently parsed start tag, or None.
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Scan only for '</' (or a trailing '<') from now on."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Go back to scanning for '&' and '<'."""
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Process as much of the buffer as possible.

        Whatever could not be processed yet stays in self.rawdata.
        Raises HTMLParseError on malformed input, and also on an
        unfinished construct when 'end' is true.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif endtagopen.match(rawdata, i):  # </
                    k = self.parse_endtag(i)
                    if k >= 0:
                        self.clear_cdata_mode()
                elif commentopen.match(rawdata, i):  # <!--
                    k = self.parse_comment(i)
                elif piopen.match(rawdata, i):  # <?
                    k = self.parse_pi(i)
                elif declopen.match(rawdata, i):  # <!
                    k = self.parse_declaration(i)
                else:
                    # A lone '<' at the very end of the buffer may still
                    # turn into a valid construct; anything else is bad.
                    if i < n - 1:
                        raise HTMLParseError(
                            "invalid '<' construct: %s"
                            % repr(rawdata[i:i+2]),
                            self.getpos())
                    k = -1
                if k < 0:
                    if end:
                        raise HTMLParseError("EOF in middle of construct",
                                             self.getpos())
                    break
                i = self.updatepos(i, k)
            elif rawdata[i] == '&':
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # The pattern also consumed the terminating
                    # character; give it back unless it was the ';'.
                    if rawdata[k-1] != ';':
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if rawdata[k-1] != ';':
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                if incomplete.match(rawdata, i):
                    if end:
                        raise HTMLParseError(
                            "EOF in middle of entity or char ref",
                            self.getpos())
                    # Incomplete: leave the loop (rather than returning)
                    # so the part already handled is dropped from the
                    # buffer below and is not reported a second time by
                    # the next call.
                    break
                raise HTMLParseError("'&' not part of entity or char ref",
                                     self.getpos())
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse comment, return end or -1 if not terminated
    def parse_comment(self, i):
        """Pass the comment starting at i to handle_comment()."""
        rawdata = self.rawdata
        assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        j = match.start()
        self.handle_comment(rawdata[i+4: j])
        j = match.end()
        return j

    # Internal -- parse declaration.
    def parse_declaration(self, i):
        """Pass the '<!...>' declaration at i to handle_decl().

        Returns the index just past the closing '>', or -1 if the
        declaration is not complete yet.  Raises HTMLParseError on a
        character that cannot appear in a declaration.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                self.handle_decl(rawdata[i+2:j])
                return j + 1
            if c in "\"'":
                m = declstringlit.match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                m = declname.match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            else:
                raise HTMLParseError(
                    "unexpected char in declaration: %s" % repr(rawdata[j]),
                    self.getpos())
        return -1  # incomplete

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Pass the processing instruction starting at i to handle_pi()."""
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at i and call the matching handler.

        Tag and attribute names are lower-cased.  Attributes are passed
        as a list of (name, value) pairs; value is None for an attribute
        without '=', and quoted values have their quotes removed and are
        run through unescape().  Raises HTMLParseError for junk between
        the last attribute and the end of the tag.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Report the position just past the start tag text.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            raise HTMLParseError("junk characters in start tag: %s"
                                 % repr(rawdata[k:endpos][:20]),
                                 (lineno, offset))
        if end[-2:] == '/>':
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the end index of the start tag at i, or -1 if incomplete.

        Raises HTMLParseError when the text cannot become a valid start
        tag no matter what input follows.
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                s = rawdata[j:j+2]
                if s == "/>":
                    return j + 2
                if s == "/":
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                raise HTMLParseError("malformed empty start tag",
                                     self.getpos())
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            raise HTMLParseError("malformed start tag", self.getpos())
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag at i and call handle_endtag() with its name.

        The name is lower-cased.  Raises HTMLParseError when the text up
        to the next '>' is not a well-formed end tag.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            raise HTMLParseError("bad end tag: %s" % repr(rawdata[i:j]),
                                 self.getpos())
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Return s with the five predefined XML entities replaced."""
        if '&' not in s:
            return s
        s = s.replace("&lt;", "<")
        s = s.replace("&gt;", ">")
        s = s.replace("&apos;", "'")
        s = s.replace("&quot;", '"')
        s = s.replace("&amp;", "&")  # Must be last
        return s
440 return s