blob: a620a7deb59fe5a1cc6967e5c6a6b76fa62248e9 [file] [log] [blame]
"""A parser for SGML, using the derived class as a static DTD."""
Guido van Rossum7c750e11995-02-27 13:16:55 +00002
# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
9
10
Guido van Rossum1fef1811997-10-23 19:09:21 +000011import re
Guido van Rossum7c750e11995-02-27 13:16:55 +000012import string
13
14
# Regular expressions used for parsing.
#
# All patterns are written as raw strings so that regex escapes such as
# ``\?`` reach the ``re`` module verbatim instead of being treated as
# (invalid) string-literal escapes.

# Next character that may begin markup ('<') or a reference ('&').
interesting = re.compile(r'[&<]')
# A prefix that looks like the beginning of a reference or a tag; used by
# the parser to decide whether to wait for more input.
incomplete = re.compile(r'&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        r'<([a-zA-Z][^<>]*|'
                        r'/([a-zA-Z][^<>]*)?|'
                        r'![^<>]*)?')

# Entity reference (&name) and numeric character reference (&#digits);
# both require one terminating character after the name/digits.
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#([0-9]+)[^0-9]')

# Start tags: '<' followed by a letter, or the empty tag '<>'.
starttagopen = re.compile(r'<[>a-zA-Z]')
# SGML short tags: <tag/data/
shorttagopen = re.compile(r'<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile(r'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Processing instructions: <? ... >
piopen = re.compile(r'<\?')
piclose = re.compile(r'>')
# End tags: '</' followed by a letter, '<' or '>'.
endtagopen = re.compile(r'</[<>a-zA-Z]')
# Either bracket terminates the text of a tag.
endbracket = re.compile(r'[<>]')
# Other <! ... > declarations.
special = re.compile(r'<![^<>]*>')
# Comments: <!-- ... --> (whitespace is allowed between '--' and '>').
commentopen = re.compile(r'<!--')
commentclose = re.compile(r'--[%s]*>' % string.whitespace)
# Tag name.
tagfind = re.compile(r'[a-zA-Z][-.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional '=value' part,
# group 3 the value (still including its quotes when it is quoted).
attrfind = re.compile(
    r'[%s]*([a-zA-Z_][-.a-zA-Z_0-9]*)' % string.whitespace
    + (r'([%s]*=[%s]*' % (string.whitespace, string.whitespace))
    + r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:+*%?!\(\)_#=~]*))?')
Guido van Rossum7c750e11995-02-27 13:16:55 +000041
42
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
53
54class SGMLParser:
55
Guido van Rossum48766511996-03-28 18:45:04 +000056 # Interface -- initialize and reset this instance
57 def __init__(self, verbose=0):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000058 self.verbose = verbose
59 self.reset()
Guido van Rossum7c750e11995-02-27 13:16:55 +000060
Guido van Rossum48766511996-03-28 18:45:04 +000061 # Interface -- reset this instance. Loses all unprocessed data
62 def reset(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000063 self.rawdata = ''
64 self.stack = []
65 self.lasttag = '???'
66 self.nomoretags = 0
67 self.literal = 0
Guido van Rossum7c750e11995-02-27 13:16:55 +000068
Guido van Rossum48766511996-03-28 18:45:04 +000069 # For derived classes only -- enter literal mode (CDATA) till EOF
70 def setnomoretags(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000071 self.nomoretags = self.literal = 1
Guido van Rossum7c750e11995-02-27 13:16:55 +000072
Guido van Rossum48766511996-03-28 18:45:04 +000073 # For derived classes only -- enter literal mode (CDATA)
74 def setliteral(self, *args):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000075 self.literal = 1
Guido van Rossum7c750e11995-02-27 13:16:55 +000076
Guido van Rossum48766511996-03-28 18:45:04 +000077 # Interface -- feed some data to the parser. Call this as
78 # often as you want, with as little or as much text as you
79 # want (may include '\n'). (This just saves the text, all the
80 # processing is done by goahead().)
81 def feed(self, data):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000082 self.rawdata = self.rawdata + data
83 self.goahead(0)
Guido van Rossum7c750e11995-02-27 13:16:55 +000084
Guido van Rossum48766511996-03-28 18:45:04 +000085 # Interface -- handle the remaining data
86 def close(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000087 self.goahead(1)
Guido van Rossum7c750e11995-02-27 13:16:55 +000088
Guido van Rossum48766511996-03-28 18:45:04 +000089 # Internal -- handle data as far as reasonable. May leave state
90 # and data to be processed by a subsequent call. If 'end' is
91 # true, force handling all data as if followed by EOF marker.
92 def goahead(self, end):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000093 rawdata = self.rawdata
94 i = 0
95 n = len(rawdata)
96 while i < n:
97 if self.nomoretags:
98 self.handle_data(rawdata[i:n])
99 i = n
100 break
101 match = interesting.search(rawdata, i)
102 if match: j = match.start(0)
103 else: j = n
104 if i < j: self.handle_data(rawdata[i:j])
105 i = j
106 if i == n: break
107 if rawdata[i] == '<':
108 if starttagopen.match(rawdata, i):
109 if self.literal:
110 self.handle_data(rawdata[i])
111 i = i+1
112 continue
113 k = self.parse_starttag(i)
114 if k < 0: break
115 i = k
116 continue
117 if endtagopen.match(rawdata, i):
118 k = self.parse_endtag(i)
119 if k < 0: break
120 i = k
121 self.literal = 0
122 continue
123 if commentopen.match(rawdata, i):
124 if self.literal:
125 self.handle_data(rawdata[i])
126 i = i+1
127 continue
128 k = self.parse_comment(i)
129 if k < 0: break
130 i = i+k
131 continue
Guido van Rossum1ad00711998-05-28 22:48:53 +0000132 if piopen.match(rawdata, i):
133 if self.literal:
134 self.handle_data(rawdata[i])
135 i = i+1
136 continue
137 k = self.parse_pi(i)
138 if k < 0: break
139 i = i+k
140 continue
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000141 match = special.match(rawdata, i)
142 if match:
143 if self.literal:
144 self.handle_data(rawdata[i])
145 i = i+1
146 continue
147 i = match.end(0)
148 continue
149 elif rawdata[i] == '&':
150 match = charref.match(rawdata, i)
151 if match:
152 name = match.group(1)
153 self.handle_charref(name)
154 i = match.end(0)
155 if rawdata[i-1] != ';': i = i-1
156 continue
157 match = entityref.match(rawdata, i)
158 if match:
159 name = match.group(1)
160 self.handle_entityref(name)
161 i = match.end(0)
162 if rawdata[i-1] != ';': i = i-1
163 continue
164 else:
165 raise RuntimeError, 'neither < nor & ??'
166 # We get here only if incomplete matches but
167 # nothing else
168 match = incomplete.match(rawdata, i)
169 if not match:
170 self.handle_data(rawdata[i])
171 i = i+1
172 continue
173 j = match.end(0)
174 if j == n:
175 break # Really incomplete
176 self.handle_data(rawdata[i:j])
177 i = j
178 # end while
179 if end and i < n:
180 self.handle_data(rawdata[i:n])
181 i = n
182 self.rawdata = rawdata[i:]
183 # XXX if end: check for empty stack
Guido van Rossum7c750e11995-02-27 13:16:55 +0000184
Guido van Rossum48766511996-03-28 18:45:04 +0000185 # Internal -- parse comment, return length or -1 if not terminated
186 def parse_comment(self, i):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000187 rawdata = self.rawdata
188 if rawdata[i:i+4] <> '<!--':
189 raise RuntimeError, 'unexpected call to handle_comment'
190 match = commentclose.search(rawdata, i+4)
191 if not match:
192 return -1
193 j = match.start(0)
194 self.handle_comment(rawdata[i+4: j])
195 j = match.end(0)
196 return j-i
Guido van Rossum7c750e11995-02-27 13:16:55 +0000197
Guido van Rossum1ad00711998-05-28 22:48:53 +0000198 # Internal -- parse processing instr, return length or -1 if not terminated
199 def parse_pi(self, i):
200 rawdata = self.rawdata
201 if rawdata[i:i+2] <> '<?':
202 raise RuntimeError, 'unexpected call to handle_pi'
203 match = piclose.search(rawdata, i+2)
204 if not match:
205 return -1
206 j = match.start(0)
207 self.handle_pi(rawdata[i+2: j])
208 j = match.end(0)
209 return j-i
210
Guido van Rossum48766511996-03-28 18:45:04 +0000211 # Internal -- handle starttag, return length or -1 if not terminated
212 def parse_starttag(self, i):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000213 rawdata = self.rawdata
214 if shorttagopen.match(rawdata, i):
215 # SGML shorthand: <tag/data/ == <tag>data</tag>
216 # XXX Can data contain &... (entity or char refs)?
217 # XXX Can data contain < or > (tag characters)?
218 # XXX Can there be whitespace before the first /?
219 match = shorttag.match(rawdata, i)
220 if not match:
221 return -1
222 tag, data = match.group(1, 2)
223 tag = string.lower(tag)
224 self.finish_shorttag(tag, data)
225 k = match.end(0)
226 return k
227 # XXX The following should skip matching quotes (' or ")
228 match = endbracket.search(rawdata, i+1)
229 if not match:
230 return -1
231 j = match.start(0)
232 # Now parse the data between i+1 and j into a tag and attrs
233 attrs = []
234 if rawdata[i:i+2] == '<>':
235 # SGML shorthand: <> == <last open tag seen>
236 k = j
237 tag = self.lasttag
238 else:
239 match = tagfind.match(rawdata, i+1)
240 if not match:
241 raise RuntimeError, 'unexpected call to parse_starttag'
242 k = match.end(0)
243 tag = string.lower(rawdata[i+1:k])
244 self.lasttag = tag
245 while k < j:
246 match = attrfind.match(rawdata, k)
247 if not match: break
248 attrname, rest, attrvalue = match.group(1, 2, 3)
249 if not rest:
250 attrvalue = attrname
251 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
252 attrvalue[:1] == '"' == attrvalue[-1:]:
253 attrvalue = attrvalue[1:-1]
254 attrs.append((string.lower(attrname), attrvalue))
255 k = match.end(0)
256 if rawdata[j] == '>':
257 j = j+1
258 self.finish_starttag(tag, attrs)
259 return j
Guido van Rossum48766511996-03-28 18:45:04 +0000260
261 # Internal -- parse endtag
262 def parse_endtag(self, i):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000263 rawdata = self.rawdata
264 match = endbracket.search(rawdata, i+1)
265 if not match:
266 return -1
267 j = match.start(0)
268 tag = string.lower(string.strip(rawdata[i+2:j]))
269 if rawdata[j] == '>':
270 j = j+1
271 self.finish_endtag(tag)
272 return j
Guido van Rossum48766511996-03-28 18:45:04 +0000273
274 # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
275 def finish_shorttag(self, tag, data):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000276 self.finish_starttag(tag, [])
277 self.handle_data(data)
278 self.finish_endtag(tag)
Guido van Rossum48766511996-03-28 18:45:04 +0000279
280 # Internal -- finish processing of start tag
281 # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
282 def finish_starttag(self, tag, attrs):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000283 try:
284 method = getattr(self, 'start_' + tag)
285 except AttributeError:
286 try:
287 method = getattr(self, 'do_' + tag)
288 except AttributeError:
289 self.unknown_starttag(tag, attrs)
290 return -1
291 else:
292 self.handle_starttag(tag, method, attrs)
293 return 0
294 else:
295 self.stack.append(tag)
296 self.handle_starttag(tag, method, attrs)
297 return 1
Guido van Rossum48766511996-03-28 18:45:04 +0000298
299 # Internal -- finish processing of end tag
300 def finish_endtag(self, tag):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000301 if not tag:
302 found = len(self.stack) - 1
303 if found < 0:
304 self.unknown_endtag(tag)
305 return
306 else:
307 if tag not in self.stack:
308 try:
309 method = getattr(self, 'end_' + tag)
310 except AttributeError:
311 self.unknown_endtag(tag)
Guido van Rossumb84ef9b1998-07-07 22:46:11 +0000312 else:
313 self.report_unbalanced(tag)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000314 return
315 found = len(self.stack)
316 for i in range(found):
317 if self.stack[i] == tag: found = i
318 while len(self.stack) > found:
319 tag = self.stack[-1]
320 try:
321 method = getattr(self, 'end_' + tag)
322 except AttributeError:
323 method = None
324 if method:
325 self.handle_endtag(tag, method)
326 else:
327 self.unknown_endtag(tag)
328 del self.stack[-1]
Guido van Rossum7c750e11995-02-27 13:16:55 +0000329
Guido van Rossum48766511996-03-28 18:45:04 +0000330 # Overridable -- handle start tag
331 def handle_starttag(self, tag, method, attrs):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000332 method(attrs)
Guido van Rossum7c750e11995-02-27 13:16:55 +0000333
Guido van Rossum48766511996-03-28 18:45:04 +0000334 # Overridable -- handle end tag
335 def handle_endtag(self, tag, method):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000336 method()
Guido van Rossumefe5ac41995-06-22 18:56:36 +0000337
Guido van Rossum48766511996-03-28 18:45:04 +0000338 # Example -- report an unbalanced </...> tag.
339 def report_unbalanced(self, tag):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000340 if self.verbose:
341 print '*** Unbalanced </' + tag + '>'
342 print '*** Stack:', self.stack
Guido van Rossum7c750e11995-02-27 13:16:55 +0000343
Guido van Rossum48766511996-03-28 18:45:04 +0000344 # Example -- handle character reference, no need to override
345 def handle_charref(self, name):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000346 try:
347 n = string.atoi(name)
348 except string.atoi_error:
349 self.unknown_charref(name)
350 return
351 if not 0 <= n <= 255:
352 self.unknown_charref(name)
353 return
354 self.handle_data(chr(n))
Guido van Rossum7c750e11995-02-27 13:16:55 +0000355
Guido van Rossum48766511996-03-28 18:45:04 +0000356 # Definition of entities -- derived classes may override
357 entitydefs = \
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000358 {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
Guido van Rossum7c750e11995-02-27 13:16:55 +0000359
Guido van Rossum48766511996-03-28 18:45:04 +0000360 # Example -- handle entity reference, no need to override
361 def handle_entityref(self, name):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000362 table = self.entitydefs
363 if table.has_key(name):
364 self.handle_data(table[name])
365 else:
366 self.unknown_entityref(name)
367 return
Guido van Rossum7c750e11995-02-27 13:16:55 +0000368
Guido van Rossum48766511996-03-28 18:45:04 +0000369 # Example -- handle data, should be overridden
370 def handle_data(self, data):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000371 pass
Guido van Rossum7c750e11995-02-27 13:16:55 +0000372
Guido van Rossum48766511996-03-28 18:45:04 +0000373 # Example -- handle comment, could be overridden
374 def handle_comment(self, data):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000375 pass
Guido van Rossum48766511996-03-28 18:45:04 +0000376
Guido van Rossum1ad00711998-05-28 22:48:53 +0000377 # Example -- handle processing instruction, could be overridden
378 def handle_pi(self, data):
379 pass
380
Guido van Rossum48766511996-03-28 18:45:04 +0000381 # To be overridden -- handlers for unknown objects
382 def unknown_starttag(self, tag, attrs): pass
383 def unknown_endtag(self, tag): pass
384 def unknown_charref(self, ref): pass
385 def unknown_entityref(self, ref): pass
Guido van Rossum7c750e11995-02-27 13:16:55 +0000386
387
class TestSGMLParser(SGMLParser):
    """Example parser that prints a trace of everything it sees.

    Data is accumulated in self.testdata and printed in batches; every
    other event first flushes the pending data, then prints one line.
    """

    def __init__(self, verbose=0):
        # testdata must exist before the base initializer runs.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Collect data; print it once its repr is at least 70 chars."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the collected data, if there is any."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    def handle_comment(self, data):
        """Print the comment's repr, abbreviated when longer than 68."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    def unknown_starttag(self, tag, attrs):
        """Print the start tag followed by its name="value" attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # One line, pieces separated by single spaces.
            pieces = ['start tag: <' + tag]
            for name, value in attrs:
                pieces.append(name + '=' + '"' + value + '"')
            pieces.append('>')
            print(' '.join(pieces))

    def unknown_endtag(self, tag):
        """Print the end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Report an entity reference that is not in entitydefs."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Report a character reference that could not be converted."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def close(self):
        """Finish parsing, then print whatever data is still pending."""
        SGMLParser.close(self)
        self.flush()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000437
438
def test(args = None):
    """Parse a file and, unless '-s' is given, print a trace of it.

    ``args`` is a list of command-line style arguments; when it is
    empty or None, sys.argv[1:] is used.  A leading '-s' selects the
    silent SGMLParser base class instead of TestSGMLParser.  The next
    argument names the input file ('test.html' by default, '-' for
    standard input).  If the file cannot be opened, the error is
    printed and the process exits with status 1.
    """
    import sys

    if not args:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        path = args[0]
    else:
        path = 'test.html'

    if path == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(path, 'r')
        except IOError as msg:
            print('%s : %s' % (path, msg))
            sys.exit(1)
        # Close the file even if reading fails.
        try:
            data = f.read()
        finally:
            f.close()

    x = klass()
    # Feed one character at a time so that the parser's handling of
    # incomplete input is exercised.
    for c in data:
        x.feed(c)
    x.close()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000473
474
# Run the test driver when this file is executed as a script.
if __name__ == "__main__":
    test()