blob: 08c53b3ff22dfc3bfde7e8f77196a5426b3abb28 [file] [log] [blame]
Fred Drake1d4601d2001-08-03 19:50:59 +00001"""A parser for HTML and XHTML."""
Guido van Rossum8846d712001-05-18 14:50:52 +00002
# This file is based on sgmllib.py, but the API is slightly different.
4
# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
9
10
Fred Drakebfc8fea2001-09-24 20:10:28 +000011import markupbase
Guido van Rossum8846d712001-05-18 14:50:52 +000012import re
13import string
14
# Regular expressions used for parsing

# What the scanner looks for between constructs: '&' or '<' in normal
# mode; in CDATA mode only '</', or a '<' that is the very last character
# of the text being searched.
interesting_normal = re.compile('[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')
# An '&' followed by a character that could begin an entity or character
# reference.
incomplete = re.compile('&[a-zA-Z#]')

# Complete references.  Both patterns also consume the single character
# that follows the reference (';' or anything else that cannot continue
# it), so callers have to check what that character actually was.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
endtagopen = re.compile('</')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# One attribute: group 1 is the name, group 2 the optional '= value' part,
# group 3 the value itself (still including any surrounding quotes).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')

# Matches a start tag from '<' up to, but not including, its closing
# '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# Raw string: the pattern contains '\s', which is not a valid escape in an
# ordinary string literal.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
49
Guido van Rossum8846d712001-05-18 14:50:52 +000050
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg    -- the error message
        lineno -- line number taken from position[0], or None
        offset -- column offset taken from position[1], or None
    """

    def __init__(self, msg, position=(None, None)):
        """Store the message and the (lineno, offset) position pair."""
        assert msg
        # Hand the constructor arguments to the base class so that
        # self.args is populated; code that rebuilds an exception from
        # its args (copying, pickling) then gets a working instance.
        Exception.__init__(self, msg, position)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message, followed by the position when it is known."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            # The stored offset is shown incremented by one.
            result = result + ", column %d" % (self.offset + 1)
        return result
67
68
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Start tags with one of these (lowercased) names switch the scanner
    # into CDATA mode; see parse_starttag().
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
        self.interesting = interesting_normal
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include newlines).
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for message at the position from getpos()."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the most recently parsed start tag; None until a
    # complete start tag has been seen.
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Make the scanner stop only at what interesting_cdata matches."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Return the scanner to normal mode ('&' and '<' are special)."""
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                j = n
            # Everything before the next special character is plain data.
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif endtagopen.match(rawdata, i):  # </
                    k = self.parse_endtag(i)
                    if k >= 0:
                        # A complete end tag always leaves CDATA mode.
                        self.clear_cdata_mode()
                elif rawdata.startswith("<!--", i):  # <!--
                    k = self.parse_comment(i)
                elif rawdata.startswith("<?", i):  # <?
                    k = self.parse_pi(i)
                elif rawdata.startswith("<!", i):  # <!
                    # parse_declaration() is not defined in this class;
                    # presumably inherited from markupbase.ParserBase.
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    # A '<' that starts none of the constructs above and
                    # is not the last buffered character: plain data.
                    self.handle_data("<")
                    k = i + 1
                else:
                    # Lone '<' at the end of the buffer; wait for more.
                    break
                if k < 0:
                    # The construct is not complete in the buffer yet.
                    if end:
                        self.error("EOF in middle of construct")
                    break
                i = self.updatepos(i, k)
            elif rawdata[i:i+2] == "&#":
                match = charref.match(rawdata, i)
                if match:
                    # Drop the leading '&#' and the one terminating
                    # character that the pattern consumed.
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if rawdata[k-1] != ';':
                        # The terminator was not ';', so it is not part
                        # of the reference; leave it in the input.
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    break
            elif rawdata[i] == '&':
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if rawdata[k-1] != ';':
                        # Same as for character references: only a ';'
                        # terminator belongs to the reference.
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    rest = rawdata[i:]
                    if end and match.group() == rest:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            # Forced end of input: whatever is left goes out as data.
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # Keep only the unprocessed tail for the next call.
        self.rawdata = rawdata[i:]

    # Internal -- parse comment, return end or -1 if not terminated
    def parse_comment(self, i, report=1):
        rawdata = self.rawdata
        assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        if report:
            # Pass on the text between '<!--' and the closing '--'.
            j = match.start()
            self.handle_comment(rawdata[i+4: j])
        j = match.end()
        return j

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        # Pass on the text between '<?' and the first '>'.
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without '= value'.
                attrvalue = None
            elif (attrvalue[:1] == '\'' == attrvalue[-1:] or
                  attrvalue[:1] == '"' == attrvalue[-1:]):
                # Quoted value: strip the quotes and expand the entities
                # known to unescape().  Unquoted values are passed on
                # exactly as written.
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        # Whatever follows the last attribute must be just the tag closer.
        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            self.error("junk characters in start tag: %s"
                       % repr(rawdata[k:endpos][:20]))
        if end[-2:] == '/>':
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            # The character right after the tag name and attributes
            # ('' when the buffer ends there).
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                s = rawdata[j:j+2]
                if s == "/>":
                    return j + 2
                if s == "/":
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            self.error("bad end tag: %s" % repr(rawdata[i:j]))
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    # Reports a declaration as an error via self.error().
    def unknown_decl(self, data):
        self.error("unknown declaration: " + repr(data))

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        if '&' not in s:
            return s
        s = s.replace("&lt;", "<")
        s = s.replace("&gt;", ">")
        s = s.replace("&apos;", "'")
        s = s.replace("&quot;", '"')
        s = s.replace("&amp;", "&")  # Must be last
        return s
387 return s