blob: 681760854f96ba7a570d8b3bdf82f1e2fafe53ae [file] [log] [blame]
# A parser for SGML, using the derived class as static DTD.

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
9
10
Guido van Rossum1fef1811997-10-23 19:09:21 +000011import re
Guido van Rossum7c750e11995-02-27 13:16:55 +000012import string
13
14
15# Regular expressions used for parsing
16
# All fixed patterns are raw strings so that regex escapes such as \?
# are never interpreted (or warned about) as string-literal escapes.

# Next character that may start markup: a reference ('&') or a tag ('<').
interesting = re.compile(r'[&<]')
# A reference or tag that might still be completed by more input.
incomplete = re.compile(r'&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        r'<([a-zA-Z][^<>]*|'
                        r'/([a-zA-Z][^<>]*)?|'
                        r'![^<>]*)?')

# Group 1 is the entity name / the decimal character number; the final
# character class is the terminator (';' or any other non-name char).
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#([0-9]+)[^0-9]')

starttagopen = re.compile(r'<[>a-zA-Z]')
# SGML shorthand <tag/data/ -- group 1 is the tag, group 2 the data.
shorttagopen = re.compile(r'<[a-zA-Z][a-zA-Z0-9]*/')
shorttag = re.compile(r'<([a-zA-Z][a-zA-Z0-9]*)/([^/]*)/')
piopen = re.compile(r'<\?')
piclose = re.compile(r'>')
endtagopen = re.compile(r'</[<>a-zA-Z]')
endbracket = re.compile(r'[<>]')
special = re.compile(r'<![^<>]*>')
commentopen = re.compile(r'<!--')
# '--', optional whitespace, then '>'.
commentclose = re.compile('--[%s]*>' % string.whitespace)
tagfind = re.compile(r'[a-zA-Z][a-zA-Z0-9]*')
# Group 1: attribute name; group 2: the optional '= value' part;
# group 3: the value, still including any surrounding quotes.
attrfind = re.compile(
    '[%s]+([a-zA-Z_][-.a-zA-Z_0-9]*)' % string.whitespace
    + ('([%s]*=[%s]*' % (string.whitespace, string.whitespace))
    + r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:+*%?!\(\)_#=~]*))?')
Guido van Rossum7c750e11995-02-27 13:16:55 +000041
42
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The DTD is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
53
class SGMLParser:

    # Interface -- initialize and reset this instance.
    # 'verbose' is only consulted by report_unbalanced().
    def __init__(self, verbose=0):
        self.verbose = verbose
        self.reset()

    # Interface -- reset this instance.  Loses all unprocessed data
    def reset(self):
        self.rawdata = ''       # text fed but not yet consumed by goahead()
        self.stack = []         # open tags that have a start_<tag> handler
        self.lasttag = '???'    # last start tag seen; used by the '<>' shorthand
        self.nomoretags = 0     # true: everything up to EOF is plain data
        self.literal = 0        # true: only an end tag is treated as markup

    # For derived classes only -- enter literal mode (CDATA) till EOF
    def setnomoretags(self):
        self.nomoretags = self.literal = 1

    # For derived classes only -- enter literal mode (CDATA).
    # Extra positional arguments are accepted and ignored.
    def setliteral(self, *args):
        self.literal = 1

    # Interface -- feed some data to the parser.  Call this as
    # often as you want, with as little or as much text as you
    # want (may include '\n').  (This just saves the text, all the
    # processing is done by goahead().)
    def feed(self, data):
        self.rawdata = self.rawdata + data
        self.goahead(0)

    # Interface -- handle the remaining data
    def close(self):
        self.goahead(1)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Everything before the next '&' or '<' is plain data.
            match = interesting.search(rawdata, i)
            if match:
                j = match.start(0)
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # In literal mode a start tag is just text.
                        self.handle_data(rawdata[i])
                        i = i + 1
                        continue
                    # parse_starttag() returns an absolute index.
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if endtagopen.match(rawdata, i):
                    # parse_endtag() returns an absolute index.
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # Any end tag terminates literal mode.
                    self.literal = 0
                    continue
                if commentopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i + 1
                        continue
                    # parse_comment() returns a length, not an index.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = i + k
                    continue
                if piopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i + 1
                        continue
                    # parse_pi() returns a length, not an index.
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i + k
                    continue
                match = special.match(rawdata, i)
                if match:
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i + 1
                        continue
                    # Other <!...> declarations are skipped without a callback.
                    i = match.end(0)
                    continue
            elif rawdata[i] == '&':
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumes one terminator character; give
                    # it back unless it was the ';' closing the reference.
                    if rawdata[i - 1] != ';':
                        i = i - 1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i - 1] != ';':
                        i = i - 1
                    continue
            else:
                raise RuntimeError('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i + 1
                continue
            j = match.end(0)
            if j == n:
                break  # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        # Keep whatever could not be processed yet for the next call.
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Internal -- parse comment starting at index i; return its length,
    # or -1 if not terminated.  Raises RuntimeError if there is no
    # '<!--' at index i.
    def parse_comment(self, i):
        rawdata = self.rawdata
        if rawdata[i:i + 4] != '<!--':
            raise RuntimeError('unexpected call to parse_comment')
        match = commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        j = match.start(0)
        self.handle_comment(rawdata[i + 4:j])
        j = match.end(0)
        return j - i

    # Internal -- parse processing instruction starting at index i;
    # return its length, or -1 if not terminated.  Raises RuntimeError
    # if there is no '<?' at index i.
    def parse_pi(self, i):
        rawdata = self.rawdata
        if rawdata[i:i + 2] != '<?':
            raise RuntimeError('unexpected call to parse_pi')
        match = piclose.search(rawdata, i + 2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i + 2:j])
        j = match.end(0)
        return j - i

    # Internal -- handle start tag at index i; return the index just
    # past the tag, or -1 if not terminated
    def parse_starttag(self, i):
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            tag = tag.lower()
            self.finish_shorttag(tag, data)
            k = match.end(0)
            return k
        # XXX The following should skip matching quotes (' or ")
        match = endbracket.search(rawdata, i + 1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i + 2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i + 1)
            if not match:
                raise RuntimeError('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i + 1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # No '= value' part: the value defaults to the name.
                attrvalue = attrname
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching pair of quotes.
                attrvalue = attrvalue[1:-1]
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # Consume a closing '>'; a '<' is left for the next tag.
        if rawdata[j] == '>':
            j = j + 1
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- parse endtag at index i; return the index just past
    # it, or -1 if not terminated
    def parse_endtag(self, i):
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i + 1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i + 2:j].strip().lower()
        if rawdata[j] == '>':
            j = j + 1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            # Only tags with a start_<tag> handler go on the stack.
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag.  An empty tag ('</>')
    # closes the innermost open tag.  A named tag closes every open tag
    # down to and including its innermost occurrence on the stack.
    def finish_endtag(self, tag):
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    # Known tag, but nothing open to close.
                    self.report_unbalanced(tag)
                return
            # Find the last (innermost) occurrence of the tag.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        if self.verbose:
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: ' + str(self.stack))

    # Example -- handle character reference, no need to override.
    # Only values 0..255 are converted; anything else goes to
    # unknown_charref().
    def handle_charref(self, name):
        try:
            n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    # Example -- handle entity reference, no need to override
    def handle_entityref(self, name):
        table = self.entitydefs
        if name in table:
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)
            return

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
385
# Test parser -- prints every event it receives.  Character data is
# buffered in self.testdata and printed in chunks by flush().
class TestSGMLParser(SGMLParser):

    def __init__(self, verbose=0):
        # Must exist before SGMLParser.__init__ runs.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    # Accumulate data; print it once its repr reaches 70 characters.
    def handle_data(self, data):
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    # Print and clear any buffered data.
    def flush(self):
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    # Print the comment's repr, eliding the middle of long ones.
    def handle_comment(self, data):
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    def unknown_starttag(self, tag, attrs):
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # One output line: tag name, name="value" pairs, then '>',
            # all separated by single spaces.
            parts = ['start tag: <' + tag]
            for name, value in attrs:
                parts.append(name + '=' + '"' + value + '"')
            parts.append('>')
            print(' '.join(parts))

    def unknown_endtag(self, tag):
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    # Finish parsing, then print whatever data is still buffered.
    def close(self):
        SGMLParser.close(self)
        self.flush()
436
# Test driver -- parse a file, feeding it one character at a time.
# 'args' defaults to sys.argv[1:].  A leading '-s' selects the plain
# SGMLParser instead of TestSGMLParser.  The next argument is the file
# name (default 'test.html'); '-' means standard input.
# Exits with status 1 if the file cannot be opened.
def test(args=None):
    import sys

    if not args:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError as msg:
            print(filename + ' : ' + str(msg))
            sys.exit(1)
        # Close the file even if reading it fails.
        try:
            data = f.read()
        finally:
            f.close()

    x = klass()
    # Feeding single characters exercises the incomplete-input handling.
    for c in data:
        x.feed(c)
    x.close()
472
# Run the test driver when executed as a script.
if __name__ == "__main__":
    test()