blob: 38328affce5061751665dceca992a0fc1510321b [file] [log] [blame]
Guido van Rossuma219efa1997-11-18 15:09:54 +00001# A parser for XML, using the derived class as static DTD.
2# Author: Sjoerd Mullender
3
4import re
5import string
6
7
8# Regular expressions used for parsing
9
# Building blocks shared by the patterns below.  All patterns are raw
# strings so that backslashes reach the regex engine unchanged.
_S = r'[ \t\r\n]+'                      # mandatory white space
_opS = r'[ \t\r\n]*'                    # optional white space
_Name = r'[a-zA-Z_:][-a-zA-Z0-9._:]*'   # tag / attribute / entity name

# The next character that cannot be passed through as plain data.
interesting = re.compile(r'[&<]')
# Any prefix of a reference or of markup; used by goahead() to decide
# whether the tail of the buffer may still be completed by later input.
incomplete = re.compile(r'&(' + _Name + r'|#[0-9]*|#x[0-9a-fA-F]*)?|'
                        r'<([a-zA-Z_:][^<>]*|'
                        r'/([a-zA-Z_:][^<>]*)?|'
                        r'![^<>]*|'
                        r'\?[^<>]*)?')

# Entity or character reference inside an attribute value; the ';' is
# optional here so that its absence can be reported by the caller.
ref = re.compile(r'&(' + _Name + r'|#[0-9]+|#x[0-9a-fA-F]+);?')
# Entity reference in character data; matches one terminating character
# (normally ';') after the name.
entityref = re.compile(r'&(?P<name>' + _Name + r')[^-a-zA-Z0-9._:]')
# Character reference in character data; group 'char' includes the one
# terminating character (normally ';').
charref = re.compile(r'&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S)
newline = re.compile(r'\n')

starttagopen = re.compile(r'<' + _Name)
endtagopen = re.compile(r'</')
starttagend = re.compile(_opS + r'(?P<slash>/?)>')
endbracket = re.compile(r'>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
special = re.compile(r'<!(?P<special>[^<>]*)>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + r')' + _S)
procclose = re.compile(r'\?>')
commentopen = re.compile(r'<!--')
commentclose = re.compile(r'-->')
doubledash = re.compile(r'--')
# One attribute: white space, a name, '=', and a value that is either
# quoted (single or double) or a run of permitted unquoted characters.
attrfind = re.compile(
    _S + r'(?P<name>' + _Name + r')'
    r'(' + _opS + r'=' + _opS +
    r'''(?P<value>'[^']*'|"[^"]*"|[-a-zA-Z0-9.:+*%?!()_#=~]+))''')
43
44
45# XML parser base class -- find tags and call handler functions.
46# Usage: p = XMLParser(); p.feed(data); ...; p.close().
47# The dtd is defined by deriving a class which defines methods
48# with special names to handle tags: start_foo and end_foo to handle
49# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
50# (Tags are converted to lower case for this purpose.) The data
51# between tags is passed to the parser by calling self.handle_data()
52# with some data as argument (the data may be split up in arbitrary
53# chunks). Entity references are passed by calling
54# self.handle_entityref() with the entity reference as argument.
55
class XMLParser:
    # Incremental XML parser base class.  Text is supplied with feed()
    # and close(); markup is dispatched to start_<tag>/end_<tag> methods
    # of the derived class when they exist and to the unknown_* hooks
    # otherwise.  The module-level compiled patterns do the scanning.

    # Interface -- initialize and reset this instance
    def __init__(self, verbose=0):
        self.verbose = verbose
        self.reset()

    # Interface -- reset this instance.  Loses all unprocessed data
    def reset(self):
        self.rawdata = ''       # text fed but not yet processed
        self.stack = []         # names of the currently open elements
        self.lasttag = '???'    # name of the last start tag parsed
        self.nomoretags = 0     # true: everything up to EOF is data
        self.literal = 0        # true: markup is data until an end tag
        self.lineno = 1         # line number used in error reports

    # For derived classes only -- enter literal mode (CDATA) till EOF
    def setnomoretags(self):
        self.nomoretags = self.literal = 1

    # For derived classes only -- enter literal mode (CDATA)
    def setliteral(self, *args):
        self.literal = 1

    # Interface -- feed some data to the parser.  Call this as
    # often as you want, with as little or as much text as you
    # want (may include '\n').  (This just saves the text, all the
    # processing is done by goahead().)
    def feed(self, data):
        self.rawdata = self.rawdata + data
        self.goahead(0)

    # Interface -- handle the remaining data
    def close(self):
        self.goahead(1)

    # Interface -- translate references
    # Returns data with character references replaced by chr() of their
    # value and entity references replaced from self.entitydefs; unknown
    # entities are kept as '&name;'.  A reference without ';' is
    # reported through syntax_error() and translated anyway.
    def translate_references(self, data):
        pieces = []
        i = 0
        while 1:
            res = ref.search(data, i)
            if res is None:
                pieces.append(data[i:])
                return ''.join(pieces)
            if data[res.end(0) - 1] != ';':
                self.syntax_error(self.lineno,
                                  '; missing after entity/char reference')
            pieces.append(data[i:res.start(0)])
            name = res.group(1)
            if name[0] == '#':
                if name[1] == 'x':
                    pieces.append(chr(int(name[2:], 16)))
                else:
                    pieces.append(chr(int(name[1:])))
            else:
                try:
                    pieces.append(self.entitydefs[name])
                except KeyError:
                    # can't do it, so keep the entity ref in
                    pieces.append('&' + name + ';')
            i = res.end(0)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                data = rawdata[i:n]
                self.handle_data(data)
                self.lineno = self.lineno + data.count('\n')
                i = n
                break
            res = interesting.search(rawdata, i)
            if res:
                j = res.start(0)
            else:
                j = n
            if i < j:
                # plain text up to the next '<' or '&'
                data = rawdata[i:j]
                self.handle_data(data)
                self.lineno = self.lineno + data.count('\n')
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # in literal mode a start tag is just data
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if endtagopen.match(rawdata, i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    # any end tag terminates literal mode
                    self.literal = 0
                    continue
                if commentopen.match(rawdata, i):
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    k = self.parse_comment(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if cdataopen.match(rawdata, i):
                    k = self.parse_cdata(i)
                    if k < 0: break
                    # count the newlines of the whole section, i.e. the
                    # text between i and k
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                res = procopen.match(rawdata, i)
                if res:
                    k = self.parse_proc(i, res)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                res = special.match(rawdata, i)
                if res:
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    self.handle_special(res.group('special'))
                    self.lineno = self.lineno + res.group(0).count('\n')
                    i = res.end(0)
                    continue
            elif rawdata[i] == '&':
                # The text consumed for a reference ('&', the name or
                # number and an optional ';') never contains a newline.
                # A terminator other than ';' is pushed back and counted
                # when it is handled as data, so lineno is not touched
                # in the two branches below.
                res = charref.match(rawdata, i)
                if res is not None:
                    i = res.end(0)
                    if rawdata[i-1] != ';':
                        self.syntax_error(self.lineno, '; missing in charref')
                        i = i-1
                    # group 'char' includes the terminator; strip it
                    self.handle_charref(res.group('char')[:-1])
                    continue
                res = entityref.match(rawdata, i)
                if res is not None:
                    i = res.end(0)
                    if rawdata[i-1] != ';':
                        self.syntax_error(self.lineno, '; missing in entityref')
                        i = i-1
                    self.handle_entityref(res.group('name'))
                    continue
            else:
                raise RuntimeError('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            res = incomplete.match(rawdata, i)
            if not res:
                data = rawdata[i]
                self.handle_data(data)
                self.lineno = self.lineno + data.count('\n')
                i = i+1
                continue
            j = res.end(0)
            if j == n:
                break # Really incomplete
            self.syntax_error(self.lineno, 'bogus < or &')
            data = res.group(0)
            self.handle_data(data)
            self.lineno = self.lineno + data.count('\n')
            i = j
        # end while
        if end and i < n:
            # at EOF whatever is left is passed on as data
            data = rawdata[i:n]
            self.handle_data(data)
            self.lineno = self.lineno + data.count('\n')
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Internal -- parse comment, return the index just past the comment
    # or -1 if not terminated
    def parse_comment(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            raise RuntimeError('unexpected call to handle_comment')
        res = commentclose.search(rawdata, i+4)
        if not res:
            return -1
        # doubledash search will succeed because it's a subset of commentclose
        if doubledash.search(rawdata, i+4).start(0) < res.start(0):
            self.syntax_error(self.lineno, "`--' inside comment")
        self.handle_comment(rawdata[i+4: res.start(0)])
        return res.end(0)

    # Internal -- handle CDATA tag, return the index just past the
    # section or -1 if not terminated
    def parse_cdata(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+9] != '<![CDATA[':
            raise RuntimeError('unexpected call to handle_cdata')
        res = cdataclose.search(rawdata, i+9)
        if not res:
            return -1
        self.handle_cdata(rawdata[i+9:res.start(0)])
        return res.end(0)

    # Internal -- handle a processing instruction; res is the procopen
    # match found at i.  Return the index just past '?>' or -1 if not
    # terminated
    def parse_proc(self, i, res):
        rawdata = self.rawdata
        if not res:
            raise RuntimeError('unexpected call to parse_proc')
        name = res.group('proc')
        res = procclose.search(rawdata, res.end(0))
        if not res:
            return -1
        # res.pos is where the search started, i.e. the end of the
        # opening '<?name ' part
        self.handle_proc(name, rawdata[res.pos:res.start(0)])
        return res.end(0)

    # Internal -- handle starttag, return the index just past '>' or
    # -1 if not terminated
    def parse_starttag(self, i):
        rawdata = self.rawdata
        # i points to start of tag
        end = endbracket.search(rawdata, i+1)
        if not end:
            return -1
        j = end.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrdict = {}
        res = tagfind.match(rawdata, i+1)
        if not res:
            raise RuntimeError('unexpected call to parse_starttag')
        k = res.end(0)
        tag = res.group(0)
        # a derived class may list the valid attributes of an element
        # in an attribute called <tag>_attributes
        attrlist = getattr(self, tag + '_attributes', None)
        self.lasttag = tag
        while k < j:
            res = attrfind.match(rawdata, k)
            if not res: break
            attrname, attrvalue = res.group('name', 'value')
            if attrvalue is None:
                self.syntax_error(self.lineno, 'no attribute value specified')
                attrvalue = attrname
            elif attrvalue[:1] == "'" == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            else:
                self.syntax_error(self.lineno, 'attribute value not quoted')
            # XXXX are attribute names case sensitive?
            attrname = attrname.lower()
            if attrlist is not None and attrname not in attrlist:
                self.syntax_error(self.lineno,
                                  'unknown attribute %s of element %s' %
                                  (attrname, tag))
            if attrname in attrdict:
                self.syntax_error(self.lineno, 'attribute specified twice')
            attrdict[attrname] = self.translate_references(attrvalue)
            k = res.end(0)
        res = starttagend.match(rawdata, k)
        if not res:
            self.syntax_error(self.lineno, 'garbage in start tag')
        self.finish_starttag(tag, attrdict)
        if res and res.group('slash') == '/':
            # empty-element tag: close it right away
            self.finish_endtag(tag)
        return end.end(0)

    # Internal -- parse endtag, return the index just past '>' or -1 if
    # not terminated
    def parse_endtag(self, i):
        rawdata = self.rawdata
        end = endbracket.search(rawdata, i+1)
        if not end:
            return -1
        res = tagfind.match(rawdata, i+2)
        if not res:
            self.syntax_error(self.lineno, 'no name specified in end tag')
            tag = ''
            k = i+2
        else:
            tag = res.group(0)
            k = res.end(0)
        if k != end.start(0):
            # check that there is only white space at end of tag
            res = space.match(rawdata, k)
            if res is None or res.end(0) != end.start(0):
                self.syntax_error(self.lineno, 'garbage in end tag')
        self.finish_endtag(tag)
        return end.end(0)

    # Internal -- finish processing of start tag
    # Pushes the tag on the stack.  Returns -1 when there is no
    # start_<tag> method (unknown_starttag() is called), 1 otherwise
    def finish_starttag(self, tag, attrs):
        self.stack.append(tag)
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            self.unknown_starttag(tag, attrs)
            return -1
        else:
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    # An empty tag closes the innermost open element.  A tag that is
    # not open is reported through unknown_endtag() when no end_<tag>
    # method exists and is otherwise ignored.  Closing an open element
    # also closes everything opened after it.
    def finish_endtag(self, tag):
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                if not hasattr(self, 'end_' + tag):
                    self.unknown_endtag(tag)
                return
            # index of the innermost (last) occurrence of tag
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag: found = i
        while len(self.stack) > found:
            tag = self.stack[-1]
            method = getattr(self, 'end_' + tag, None)
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- handle character reference, no need to override
    # name is the text between '&#' and ';': decimal digits, or 'x'
    # followed by hexadecimal digits.  Values outside 0..255 and
    # unparsable names go to unknown_charref()
    def handle_charref(self, name):
        try:
            if name[:1] == 'x':
                n = int(name[1:], 16)
            else:
                n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    # Example -- handle entity reference, no need to override
    def handle_entityref(self, name):
        table = self.entitydefs
        if name in table:
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)
            return

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle cdata, could be overridden
    def handle_cdata(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle processing instructions, could be overridden
    def handle_proc(self, name, data):
        pass

    # Example -- handle special instructions, could be overridden
    def handle_special(self, data):
        pass

    # Example -- handle relatively harmless syntax errors, could be overridden
    def syntax_error(self, lineno, message):
        raise RuntimeError('Syntax error at line %d: %s' % (lineno, message))

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass

464
class TestXMLParser(XMLParser):
    # Demonstration parser: prints every event it receives.  Character
    # data is collected in self.testdata and printed when its repr()
    # reaches 70 characters or when another event arrives.

    def __init__(self, verbose=0):
        self.testdata = ""
        XMLParser.__init__(self, verbose)

    # Collect data; print it once the pending text gets long
    def handle_data(self, data):
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    # Print and forget the pending character data, if any
    def flush(self):
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    def handle_cdata(self, data):
        self.flush()
        print('cdata: ' + repr(data))

    def handle_proc(self, name, data):
        self.flush()
        print('processing: ' + name + ' ' + repr(data))

    def handle_special(self, data):
        self.flush()
        print('special: ' + repr(data))

    # Long comments are abbreviated to their first and last 32
    # characters of repr()
    def handle_comment(self, data):
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    # Report syntax errors instead of raising
    def syntax_error(self, lineno, message):
        print('error at line %d: %s' % (lineno, message))

    # attrs is the dictionary built by parse_starttag(), so the
    # name/value pairs come from items()
    def unknown_starttag(self, tag, attrs):
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            parts = ['start tag: <' + tag]
            for name, value in attrs.items():
                parts.append(name + '=' + '"' + value + '"')
            parts.append('>')
            print(' '.join(parts))

    def unknown_endtag(self, tag):
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    # Make sure pending data is printed at EOF
    def close(self):
        XMLParser.close(self)
        self.flush()

529
def test(args = None):
    # Command line driver: test([-s] [file]).
    # Parses the named file ('test.xml' when none is given, standard
    # input for '-').  With -s the silent XMLParser base class is used,
    # otherwise TestXMLParser, which prints what it sees.  When args is
    # empty or None, sys.argv[1:] is used.  Exits with status 1 if the
    # file cannot be opened.
    import sys

    if not args:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = XMLParser
    else:
        klass = TestXMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.xml'

    if filename == '-':
        # stdin is not ours to close
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError as msg:
            sys.stdout.write('%s : %s\n' % (filename, msg))
            sys.exit(1)
        try:
            data = f.read()
        finally:
            # close the file even when reading fails
            f.close()

    x = klass()
    # feed one character at a time to exercise incremental parsing
    for c in data:
        x.feed(c)
    x.close()

564
565
# Script entry point: run the command line driver.
if "__main__" == __name__:
    test()
568