"""A parser for HTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).


import re
import string

# Regular expressions used for parsing

# Characters that interrupt a run of ordinary text in normal mode.
interesting_normal = re.compile('[&<]')
# In CDATA mode only "</" -- or a "<" that is the very last character of
# the buffer -- interrupts the text.
interesting_cdata = re.compile(r'<(/|\Z)')
# Leading part of an entity or character reference.  Everything after the
# '&' is optional, so this pattern matches at any '&'.
incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?')

# Complete references.  Both patterns also consume the single character
# that follows the name or number (which need not be ';').
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')

# Openers and closers used to classify the construct starting at a '<'.
starttagopen = re.compile('<[a-zA-Z]')
piopen = re.compile(r'<\?')
piclose = re.compile('>')
endtagopen = re.compile('</')
declopen = re.compile('<!')
special = re.compile('<![^<>]*>')
commentopen = re.compile('<!--')
commentclose = re.compile(r'--\s*>')
# Tag name, and one attribute with an optional "= value" part.
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')

# Matches a start tag up to, but not including, its closing ">" or "/>".
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endstarttag = re.compile(r"\s*/?>")
endendtag = re.compile('>')
# Raw string: the pattern contains "\s", which is not a valid escape in an
# ordinary string literal.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

# Pieces of a "<!...>" declaration: a name or a quoted string literal,
# each together with any whitespace that follows it.
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
56
57
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Instances carry the message in ``msg`` and the error position in
    ``lineno`` and ``offset``; either position field may be None when it
    is not known.
    """

    def __init__(self, msg, position=(None, None)):
        """Store *msg* and the (lineno, offset) pair given as *position*."""
        assert msg
        self.msg = msg
        self.lineno, self.offset = position[0], position[1]

    def __str__(self):
        """Return the message followed by whichever position parts are set."""
        pieces = [self.msg]
        if self.lineno is not None:
            pieces.append(", at line %d" % self.lineno)
        if self.offset is not None:
            # The stored offset counts from 0; columns are shown from 1.
            pieces.append(", column %d" % (self.offset + 1))
        return "".join(pieces)
74
75
# HTML parser class -- find tags and call handler functions.
# Usage:
#
#     p = HTMLParser(); p.feed(data); ...; p.close()

# Start tags are handled by calling self.handle_starttag() or
# self.handle_startendtag(); end tags by self.handle_endtag().  The
# data between tags is passed from the parser to the derived class by
# calling self.handle_data() with the data as argument (the data may
# be split up in arbitrary chunks).  Entity references are passed by
# calling self.handle_entityref() with the entity reference as the
# argument.  Numeric character references are passed to
# self.handle_charref() with the string containing the reference as
# the argument.

class HTMLParser:
    """Event-driven parser for HTML text supplied in chunks.

    Text is handed over with feed() and the end of input is signalled
    with close().  Each construct that is recognised results in a call to
    one of the handle_*() methods, all of which do nothing here (apart
    from handle_startendtag(), which forwards to the start and end tag
    handlers); subclasses override the ones they are interested in.
    Malformed input is reported by raising HTMLParseError.
    """

    # Once the start tag of one of these elements has been handled, only
    # "</" is treated as markup until an end tag has been parsed.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    # Interface -- initialize and reset this instance
    def __init__(self):
        """Create a parser in its initial state."""
        self.reset()

    # Interface -- reset this instance.  Loses all unprocessed data
    def reset(self):
        """Drop any buffered input and restore the initial state."""
        self.rawdata = ''           # input received but not yet consumed
        self.stack = []
        self.lasttag = '???'        # lower-cased name of the last start tag
        self.lineno = 1             # current line number
        self.offset = 0             # current offset within the line, from 0
        self.interesting = interesting_normal

    # Interface -- feed some data to the parser.  Call this as
    # often as you want, with as little or as much text as you
    # want (may include '\n').  (This just saves the text, all the
    # processing is done by goahead().)
    def feed(self, data):
        """Append *data* to the buffer and process as much as possible."""
        self.rawdata = self.rawdata + data
        self.goahead(0)

    # Interface -- handle the remaining data
    def close(self):
        """Process the buffered data as if it were followed by EOF.

        Raises HTMLParseError when the buffer ends inside a construct.
        """
        self.goahead(1)

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance the recorded position over rawdata[i:j] and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j) # Should not fail
            # Offset is counted from the character after the last newline.
            self.offset = j-(pos+1)
        else:
            self.offset = self.offset + j-i
        return j

    # Interface -- return current line number and offset.
    def getpos(self):
        """Return the current position as a (lineno, offset) tuple."""
        return self.lineno, self.offset

    # Source text of the most recent complete start tag; None until one
    # has been parsed.
    __starttag_text = None

    # Interface -- return full source of start tag: "<...>"
    def get_starttag_text(self):
        """Return the text of the last parsed start tag, or None."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Scan only for "</" (or a trailing "<") from now on."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Go back to scanning for '<' and '&'."""
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Consume the buffer, dispatching each construct to its handler.

        Whatever could not be processed yet (a construct cut off by the
        end of the buffer) is kept in self.rawdata for the next call.
        Raises HTMLParseError for malformed input and, when *end* is
        true, for a construct that is still incomplete.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif endtagopen.match(rawdata, i): # </
                    k = self.parse_endtag(i)
                    if k >= 0:
                        self.clear_cdata_mode()
                elif commentopen.match(rawdata, i): # <!--
                    k = self.parse_comment(i)
                elif piopen.match(rawdata, i): # <?
                    k = self.parse_pi(i)
                elif declopen.match(rawdata, i): # <!
                    k = self.parse_declaration(i)
                else:
                    # A '<' that is the last character may still become
                    # valid once more data arrives; anything else is bad.
                    if i < n-1:
                        raise HTMLParseError(
                            "invalid '<' construct: %s"
                            % repr(rawdata[i:i+2]),
                            self.getpos())
                    k = -1
                if k < 0:
                    if end:
                        raise HTMLParseError("EOF in middle of construct",
                                             self.getpos())
                    break
                i = self.updatepos(i, k)
            elif rawdata[i] == '&':
                match = charref.match(rawdata, i)
                if match:
                    self.handle_charref(match.group(1))
                else:
                    match = entityref.match(rawdata, i)
                    if match:
                        self.handle_entityref(match.group(1))
                if match:
                    k = match.end()
                    # Both patterns swallow the character after the
                    # reference; give it back unless it is the ';'.
                    if rawdata[k-1] != ';':
                        k = k-1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match and match.end() == n:
                    # The reference runs up to the end of the buffer, so
                    # more input may still complete it.
                    if end:
                        raise HTMLParseError(
                            "EOF in middle of entity or char ref",
                            self.getpos())
                    # Leave the loop (rather than returning) so that the
                    # consumed part of the buffer is dropped below and is
                    # not handled a second time by the next call.
                    break
                raise HTMLParseError("'&' not part of entity or char ref",
                                     self.getpos())
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse comment, return end or -1 if not terminated
    def parse_comment(self, i):
        """Handle the comment starting at index *i* of the buffer.

        Returns the index just past the comment, or -1 when its end has
        not been seen yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        self.handle_comment(rawdata[i+4:match.start()])
        return match.end()

    # Internal -- parse declaration.
    def parse_declaration(self, i):
        """Handle the "<!...>" declaration starting at index *i*.

        Returns the index just past the closing ">", or -1 when the
        declaration is incomplete.  Raises HTMLParseError on a character
        that is neither part of a name nor of a quoted string.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                self.handle_decl(rawdata[i+2:j])
                return j + 1
            if c in "\"'":
                m = declstringlit.match(rawdata, j)
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                m = declname.match(rawdata, j)
            else:
                raise HTMLParseError(
                    "unexpected char in declaration: %s" % repr(rawdata[j]),
                    self.getpos())
            if not m:
                return -1 # incomplete
            j = m.end()
        return -1 # incomplete

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Handle the processing instruction starting at index *i*.

        Returns the index just past the closing ">", or -1 when it has
        not been seen yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Handle the start tag starting at index *i* of the buffer.

        Calls handle_startendtag() for a tag closed with "/>" and
        handle_starttag() otherwise; tag and attribute names are passed
        in lower case and attributes as (name, value) pairs, the value
        being None when the attribute has no "= value" part.  Returns the
        index just past the tag, or a negative number when the tag is
        incomplete.  Raises HTMLParseError on junk inside the tag.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Only quoted values are stripped of quotes and unescaped.
                attrvalue = self.unescape(attrvalue[1:-1])
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        tail = rawdata[k:endpos].strip()
        if tail not in (">", "/>"):
            # Report the position of the end of the tag text.
            text = self.__starttag_text
            lineno, offset = self.getpos()
            if "\n" in text:
                lineno = lineno + text.count("\n")
                offset = len(text) - text.rfind("\n")
            else:
                offset = offset + len(text)
            raise HTMLParseError("junk characters in start tag: %s"
                                 % repr(rawdata[k:endpos][:20]),
                                 (lineno, offset))
        if tail[-2:] == '/>':
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag at *i*, or -1.

        -1 means the tag is not complete in the current buffer.  Raises
        HTMLParseError when the text cannot be a valid start tag.
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if not m:
            raise AssertionError("we should not get here!")
        j = m.end()
        nextchar = rawdata[j:j+1]
        if nextchar == ">":
            return j + 1
        if nextchar == "/":
            s = rawdata[j:j+2]
            if s == "/>":
                return j + 2
            if s == "/":
                # buffer boundary
                return -1
            # else bogus input
            self.updatepos(i, j + 1)
            raise HTMLParseError("malformed empty start tag",
                                 self.getpos())
        if nextchar == "":
            # end of input
            return -1
        if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
            # end of input in or before attribute value, or we have the
            # '/' from a '/>' ending
            return -1
        self.updatepos(i, j)
        raise HTMLParseError("malformed start tag", self.getpos())

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Handle the end tag starting at index *i* of the buffer.

        Calls handle_endtag() with the lower-cased tag name and returns
        the index just past the ">", or -1 when no ">" has been seen yet.
        Raises HTMLParseError when the text up to the ">" is not a
        well-formed end tag.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            raise HTMLParseError("bad end tag: %s" % repr(rawdata[i:j]),
                                 self.getpos())
        self.handle_endtag(match.group(1).lower())
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle "<tag .../>" as a start tag followed by an end tag."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called with the tag name and its list of (name, value) pairs."""
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called with the name of an end tag."""
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called with the digits of a "&#NNN" character reference."""
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called with the name of a "&name" entity reference."""
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Called with a run of text found between other constructs."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Called with the text between "<!--" and the comment close."""
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Called with the text between "<!" and ">" of a declaration."""
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called with the text between "<?" and the next ">"."""
        pass

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Return *s* with the five predefined entities replaced."""
        if '&' not in s:
            return s
        # "&amp;" must be last so that the '&' it produces is not taken
        # as the start of another entity.
        for entity, char in (("&lt;", "<"),
                             ("&gt;", ">"),
                             ("&apos;", "'"),
                             ("&quot;", '"'),
                             ("&amp;", "&")):
            s = s.replace(entity, char)
        return s