"""A parser for HTML."""
2
# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
9
10
11import re
12import string
13
# Regular expressions used for parsing

# Scanners for the next character that needs special handling: '&' or '<'
# in normal text; only '</' (or a '<' at the very end of the buffer) while
# in CDATA mode.
interesting_normal = re.compile('[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')
# Prefix of an entity or character reference.  The group is optional, so
# this matches at ANY '&'; callers must look at how far the match extends.
incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?')

# Complete references.  Both patterns consume one terminating character
# after the name/number (';' or anything else that cannot continue it).
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')

# Openers used to classify a construct that starts with '<'.
starttagopen = re.compile('<[a-zA-Z]')
piopen = re.compile(r'<\?')
piclose = re.compile('>')
endtagopen = re.compile('</')
declopen = re.compile('<!')
special = re.compile('<![^<>]*>')
commentopen = re.compile('<!--')
commentclose = re.compile(r'--\s*>')

# Tag name, and a single attribute (group 1: name, group 2: '=value' part
# if present, group 3: the value including any surrounding quotes).
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')

# Matches a start tag up to, but not including, its closing '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endstarttag = re.compile(r"\s*/?>")
endendtag = re.compile('>')
# Raw string: the pattern contains '\s', which is an invalid escape
# sequence in an ordinary string literal.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

# Pieces of a '<!...>' declaration: a name or a quoted string literal, each
# followed by optional whitespace.
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
56
57
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg    -- description of the problem (must be non-empty)
        lineno -- position[0], or None when no line is known
        offset -- position[1], or None when no column is known; str()
                  reports it as offset + 1
    """

    def __init__(self, msg, position=(None, None)):
        # Explicit raise instead of a bare ``assert`` so the check is not
        # stripped under -O; the exception type is unchanged.
        if not msg:
            raise AssertionError("HTMLParseError requires a non-empty message")
        # Initialise the base class so that ``args`` carries the message.
        Exception.__init__(self, msg)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by whatever position info is known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            result = result + ", column %d" % (self.offset + 1)
        return result
74
75
# HTML parser class -- find tags and call handler functions.
# Usage: p = HTMLParser(); p.feed(data); ...; p.close().
# Behavior is customized by deriving a class that overrides the
# handle_*() methods: handle_starttag() and handle_endtag() are called
# for <foo> and </foo>, respectively, and handle_startendtag() for the
# XHTML-style empty tag <foo/>.  (Tag names are converted to lower case
# before being passed on.)  The data between tags is passed to the
# parser by calling self.handle_data() with some data as argument (the
# data may be split up in arbitrary chunks).  Entity references are
# passed by calling self.handle_entityref() with the entity name as
# argument.
86
class HTMLParser:
    """Incremental HTML tokenizer that reports markup through handle_*().

    Text is supplied with feed() and buffered in self.rawdata; goahead()
    consumes as much of the buffer as it can and calls the overridable
    handle_*() methods for each construct it recognizes.  close() forces
    the remaining buffer to be processed as if followed by EOF.  Malformed
    input is reported by raising HTMLParseError.
    """

    # Elements after whose start tag the parser switches to CDATA mode.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    # Interface -- initialize and reset this instance
    def __init__(self):
        self.reset()

    # Interface -- reset this instance.  Loses all unprocessed data
    def reset(self):
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
        self.lineno = 1
        self.offset = 0
        self.interesting = interesting_normal

    # Interface -- feed some data to the parser.  Call this as
    # often as you want, with as little or as much text as you
    # want (may include '\n').  (This just saves the text, all the
    # processing is done by goahead().)
    def feed(self, data):
        self.rawdata = self.rawdata + data
        self.goahead(0)

    # Interface -- handle the remaining data
    def close(self):
        self.goahead(1)

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    # Returns j so callers can write ``i = self.updatepos(i, j)``.
    def updatepos(self, i, j):
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            # Offset restarts after the last newline inside [i, j).
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Interface -- return current line number and offset.
    def getpos(self):
        return self.lineno, self.offset

    # Source text of the most recently parsed start tag; None until one
    # has been parsed completely.
    __starttag_text = None

    # Interface -- return full source of start tag: "<...>"
    def get_starttag_text(self):
        return self.__starttag_text

    # Internal -- scan only for '</' (or a '<' at the end of the buffer)
    def set_cdata_mode(self):
        self.interesting = interesting_cdata

    # Internal -- go back to scanning for '&' and '<'
    def clear_cdata_mode(self):
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    # Raises HTMLParseError on malformed input, or when 'end' is true
    # and the buffer stops in the middle of a construct.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif endtagopen.match(rawdata, i):  # </
                    k = self.parse_endtag(i)
                    if k >= 0:
                        self.clear_cdata_mode()
                elif commentopen.match(rawdata, i):  # <!--
                    k = self.parse_comment(i)
                elif piopen.match(rawdata, i):  # <?
                    k = self.parse_pi(i)
                elif declopen.match(rawdata, i):  # <!
                    k = self.parse_declaration(i)
                else:
                    # A lone '<' as the last buffered character may still
                    # turn into a valid construct once more data arrives.
                    if i < n - 1:
                        raise HTMLParseError(
                            "invalid '<' construct: %s"
                            % repr(rawdata[i:i+2]),
                            self.getpos())
                    k = -1
                if k < 0:
                    if end:
                        raise HTMLParseError("EOF in middle of construct",
                                             self.getpos())
                    break
                i = self.updatepos(i, k)
            elif rawdata[i] == '&':
                match = charref.match(rawdata, i)
                if match:
                    self.handle_charref(match.group(1))
                    k = match.end()
                    # The pattern consumes one terminator character; give
                    # it back unless it is the ';' that ends the reference.
                    if rawdata[k-1] != ';':
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    self.handle_entityref(match.group(1))
                    k = match.end()
                    if rawdata[k-1] != ';':
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                # 'incomplete' matches at every '&', so the reference is
                # only really unfinished when the match runs up to the end
                # of the buffered data; otherwise the '&' is stray.
                match = incomplete.match(rawdata, i)
                if match and match.end() == n:
                    if end:
                        raise HTMLParseError(
                            "EOF in middle of entity or char ref",
                            self.getpos())
                    # Leave the loop (rather than returning) so that the
                    # consumed prefix is dropped from self.rawdata below
                    # and is not delivered again by the next feed().
                    break
                raise HTMLParseError("'&' not part of entity or char ref",
                                     self.getpos())
            else:
                # Explicit raise: a stripped ``assert`` would leave this
                # loop spinning without advancing i.
                raise AssertionError("interesting.search() lied")
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # Keep only the part that has not been processed yet.
        self.rawdata = rawdata[i:]

    # Internal -- parse comment, return end or -1 if not terminated
    def parse_comment(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        self.handle_comment(rawdata[i+4:match.start()])
        return match.end()

    # Internal -- parse declaration, return end or -1 if incomplete.
    # Raises HTMLParseError on a character that is neither a name, a
    # quoted string literal nor the closing '>'.
    def parse_declaration(self, i):
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                self.handle_decl(rawdata[i+2:j])
                return j + 1
            if c in "\"'":
                m = declstringlit.match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                m = declname.match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            else:
                raise HTMLParseError(
                    "unexpected char in declaration: %s" % repr(rawdata[j]),
                    self.getpos())
        return -1  # incomplete

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated.
    # Calls handle_startendtag() for '<tag ... />' and handle_starttag()
    # otherwise; attrs is a list of (lower-cased name, value) pairs where
    # value is None for an attribute written without '=value'.
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Quoted value: strip the quotes and expand the entities
                # known to unescape().
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Report the position of the end of the start tag text.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            raise HTMLParseError("junk characters in start tag: %s"
                                 % repr(rawdata[k:endpos][:20]),
                                 (lineno, offset))
        if end[-2:] == '/>':
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.  Raises HTMLParseError for a malformed tag.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                s = rawdata[j:j+2]
                if s == "/>":
                    return j + 2
                if s == "/":
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                raise HTMLParseError("malformed empty start tag",
                                     self.getpos())
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            raise HTMLParseError("malformed start tag", self.getpos())
        raise AssertionError("we should not gt here!")

    # Internal -- parse endtag, return end or -1 if incomplete.
    # Raises HTMLParseError when the text up to the next '>' is not a
    # well-formed end tag.
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            raise HTMLParseError("bad end tag: %s" % repr(rawdata[i:j]),
                                 self.getpos())
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        if '&' not in s:
            return s
        s = s.replace("&lt;", "<")
        s = s.replace("&gt;", ">")
        s = s.replace("&apos;", "'")
        s = s.replace("&quot;", '"')
        s = s.replace("&amp;", "&")  # Must be last
        return s