blob: 7b2a76a7c6b181e077fd2f4ebf82befcec3e9916 [file] [log] [blame]
# A parser for XML, using the derived class as static DTD.
# Author: Sjoerd Mullender.

4import re
5import string
6
7
# Regular expressions used for parsing.
#
# Patterns containing regex escapes such as \[ \] \? are written as raw
# strings so the backslash reaches the regex engine untouched and the
# literal never contains an invalid string escape.  The building blocks
# below keep ordinary literals on purpose: \t, \r and \n are real
# whitespace characters inside the character class.

_S = '[ \t\r\n]+'                       # mandatory white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # element / attribute / entity name

# Next character that may start markup or a reference.
interesting = re.compile('[&<]')
# Prefix of a reference or of markup that may still be completed by
# more input.
incomplete = re.compile('&(' + _Name + '|#[0-9]*|#x[0-9a-fA-F]*)?|'
                        '<([a-zA-Z_:][^<>]*|'
                        '/([a-zA-Z_:][^<>]*)?|'
                        '![^<>]*|'
                        r'\?[^<>]*)?')

# Entity or character reference inside an attribute value; the ';' is
# optional here so that its absence can be reported by the caller.
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+);?')
# The two patterns below also swallow the single character following
# the reference (normally ';') so that the caller can inspect it.
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S)
newline = re.compile('\n')

starttagopen = re.compile('<' + _Name)
endtagopen = re.compile('</')
starttagend = re.compile(_opS + '(?P<slash>/?)>')
endbracket = re.compile('>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
special = re.compile('<!(?P<special>[^<>]*)>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _S)
procclose = re.compile(r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
# One attribute: white space, a name, '=', then a quoted or bare value.
attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9.:+*%?!()_#=~]+))')
43
44
# XML parser base class -- find tags and call handler functions.
# Usage: p = XMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods with
# special names to handle tags: start_foo and end_foo to handle <foo>
# and </foo>, respectively.  The data between tags is passed to the
# parser by calling self.handle_data() with some data as argument (the
# data may be split up in arbitrary chunks).  Entity references are
# passed by calling self.handle_entityref() with the entity reference
# as argument.

class XMLParser:
    """Incremental XML tokenizer that dispatches to handler methods.

    Text is supplied with feed() and flushed with close().  For every
    start tag <foo> the parser calls self.start_foo(attrs) if such a
    method exists, otherwise unknown_starttag(); end tags are handled
    likewise through end_foo() / unknown_endtag().  Character data,
    CDATA sections, comments, processing instructions, "<!...>"
    declarations and references are passed to the handle_*() methods,
    all of which may be overridden.

    If the instance has an attribute named <tag>_attributes, attribute
    names of <tag> that are not contained in it are reported through
    syntax_error().
    """

    # Definition of entities -- derived classes may override
    entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    # Interface -- initialize and reset this instance
    def __init__(self, verbose=0):
        """Store the verbose flag and put the parser in its initial state."""
        self.verbose = verbose
        self.reset()

    # Interface -- reset this instance.  Loses all unprocessed data
    def reset(self):
        """Discard buffered input and reinitialise all parser state."""
        self.rawdata = ''       # input received but not yet consumed
        self.stack = []         # names of the currently open elements
        self.lasttag = '???'    # name of the most recent start tag
        self.nomoretags = 0     # true: everything that follows is data
        self.literal = 0        # true: markup is data until an end tag
        self.lineno = 1         # line number used in syntax_error()

    # For derived classes only -- enter literal mode (CDATA) till EOF
    def setnomoretags(self):
        """Treat all remaining input as character data."""
        self.nomoretags = self.literal = 1

    # For derived classes only -- enter literal mode (CDATA)
    def setliteral(self, *args):
        """Treat start tags, comments and declarations as data.

        goahead() leaves this mode after the next end tag.
        """
        self.literal = 1

    # Interface -- feed some data to the parser.  Call this as
    # often as you want, with as little or as much text as you
    # want (may include '\n').  (This just saves the text, all the
    # processing is done by goahead().)
    def feed(self, data):
        """Append data to the input buffer and parse as far as possible."""
        self.rawdata = self.rawdata + data
        self.goahead(0)

    # Interface -- handle the remaining data
    def close(self):
        """Process whatever is still buffered as if EOF followed it."""
        self.goahead(1)

    # Interface -- translate references
    def translate_references(self, data):
        """Return data with entity and character references expanded.

        Entities missing from self.entitydefs are left in the text as
        '&name;'.  A reference without the closing ';' is reported
        through syntax_error() and expanded anyway.
        """
        newdata = []
        i = 0
        while 1:
            res = ref.search(data, i)
            if res is None:
                newdata.append(data[i:])
                return ''.join(newdata)
            if data[res.end(0) - 1] != ';':
                self.syntax_error(self.lineno,
                                  '; missing after entity/char reference')
            newdata.append(data[i:res.start(0)])
            name = res.group(1)
            if name[0] == '#':
                if name[1] == 'x':
                    code = int(name[2:], 16)
                else:
                    code = int(name[1:])
                try:
                    newdata.append(chr(code))
                except (ValueError, OverflowError):
                    # chr() cannot represent this code; keep the
                    # reference in, as is done for unknown entities,
                    # instead of aborting the whole parse.
                    newdata.append('&' + name + ';')
            else:
                try:
                    newdata.append(self.entitydefs[name])
                except KeyError:
                    # can't do it, so keep the entity ref in
                    newdata.append('&' + name + ';')
            i = res.end(0)

    # Internal -- pass text to handle_data() and account for its lines
    def _emit_data(self, data):
        """Call handle_data(data), then advance lineno past data."""
        self.handle_data(data)
        self.lineno = self.lineno + data.count('\n')

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Consume self.rawdata, dispatching to the parse_*/handle_* methods.

        Stops at the first construct that is not yet complete and keeps
        the unconsumed tail in self.rawdata.  When end is true the tail
        is passed to handle_data() instead of being kept.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                self._emit_data(rawdata[i:n])
                i = n
                break
            res = interesting.search(rawdata, i)
            if res:
                j = res.start(0)
            else:
                j = n
            if i < j:
                self._emit_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self._emit_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if endtagopen.match(rawdata, i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    self.literal = 0
                    continue
                if commentopen.match(rawdata, i):
                    if self.literal:
                        self._emit_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if cdataopen.match(rawdata, i):
                    k = self.parse_cdata(i)
                    if k < 0:
                        break
                    # Count over the whole section (i..k); counting an
                    # empty slice here would freeze lineno across CDATA.
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                res = procopen.match(rawdata, i)
                if res:
                    k = self.parse_proc(i, res)
                    if k < 0:
                        break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                res = special.match(rawdata, i)
                if res:
                    if self.literal:
                        self._emit_data(rawdata[i])
                        i = i+1
                        continue
                    self.handle_special(res.group('special'))
                    self.lineno = self.lineno + res.group(0).count('\n')
                    i = res.end(0)
                    continue
            elif rawdata[i] == '&':
                res = charref.match(rawdata, i)
                if res is not None:
                    k = res.end(0)
                    if rawdata[k-1] != ';':
                        self.syntax_error(self.lineno, '; missing in charref')
                        # The pattern swallowed one character that is
                        # not part of the reference; give it back.
                        k = k-1
                    self.handle_charref(res.group('char')[:-1])
                    # Count only what was really consumed, so that a
                    # newline that was given back is not counted twice.
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                res = entityref.match(rawdata, i)
                if res is not None:
                    k = res.end(0)
                    if rawdata[k-1] != ';':
                        self.syntax_error(self.lineno,
                                          '; missing in entityref')
                        k = k-1     # give the extra character back
                    self.handle_entityref(res.group('name'))
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
            else:
                raise RuntimeError('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            res = incomplete.match(rawdata, i)
            if not res:
                self._emit_data(rawdata[i])
                i = i+1
                continue
            j = res.end(0)
            if j == n:
                break   # Really incomplete
            self.syntax_error(self.lineno, 'bogus < or &')
            self._emit_data(res.group(0))
            i = j
        # end while
        if end and i < n:
            self._emit_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Internal -- parse comment, return index past it or -1 if not
    # terminated
    def parse_comment(self, i):
        """Handle the comment starting at rawdata[i].

        Returns the index just past '-->', or -1 if the comment is not
        complete yet.  A '--' inside the comment is a syntax error.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            raise RuntimeError('unexpected call to handle_comment')
        res = commentclose.search(rawdata, i+4)
        if not res:
            return -1
        # doubledash search will succeed because it's a subset of commentclose
        if doubledash.search(rawdata, i+4).start(0) < res.start(0):
            self.syntax_error(self.lineno, "`--' inside comment")
        self.handle_comment(rawdata[i+4: res.start(0)])
        return res.end(0)

    # Internal -- handle CDATA tag, return index past it or -1 if not
    # terminated
    def parse_cdata(self, i):
        """Handle the CDATA section starting at rawdata[i].

        Returns the index just past ']]>', or -1 if the section is not
        complete yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+9] != '<![CDATA[':
            raise RuntimeError('unexpected call to handle_cdata')
        res = cdataclose.search(rawdata, i+9)
        if not res:
            return -1
        self.handle_cdata(rawdata[i+9:res.start(0)])
        return res.end(0)

    # Internal -- handle processing instruction, return index past it
    # or -1 if not terminated
    def parse_proc(self, i, res):
        """Handle a processing instruction; res is the procopen match.

        Returns the index just past '?>', or -1 if the instruction is
        not complete yet.
        """
        rawdata = self.rawdata
        if not res:
            raise RuntimeError('unexpected call to parse_proc')
        name = res.group('proc')
        res = procclose.search(rawdata, res.end(0))
        if not res:
            return -1
        # res.pos is where the search began, i.e. just past the target
        # name and the white space following it.
        self.handle_proc(name, rawdata[res.pos:res.start(0)])
        return res.end(0)

    # Internal -- handle starttag, return index past it or -1 if not
    # terminated
    def parse_starttag(self, i):
        """Handle the start tag beginning at rawdata[i].

        Collects the attributes in a dictionary (references in the
        values are expanded) and calls finish_starttag(); for an empty
        element ('/>') finish_endtag() is called as well.  Returns the
        index just past '>', or -1 if no '>' has been received yet.
        """
        rawdata = self.rawdata
        # i points to start of tag
        end = endbracket.search(rawdata, i+1)
        if not end:
            return -1
        j = end.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrdict = {}
        res = tagfind.match(rawdata, i+1)
        if not res:
            raise RuntimeError('unexpected call to parse_starttag')
        k = res.end(0)
        tag = res.group(0)
        # Optional per-element collection of permitted attribute names.
        if hasattr(self, tag + '_attributes'):
            attrlist = getattr(self, tag + '_attributes')
        else:
            attrlist = None
        self.lasttag = tag
        while k < j:
            res = attrfind.match(rawdata, k)
            if not res:
                break
            attrname, attrvalue = res.group('name', 'value')
            if attrvalue is None:
                self.syntax_error(self.lineno, 'no attribute value specified')
                attrvalue = attrname
            elif attrvalue[:1] == "'" == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            else:
                self.syntax_error(self.lineno, 'attribute value not quoted')
            if attrlist is not None and attrname not in attrlist:
                self.syntax_error(self.lineno,
                                  'unknown attribute %s of element %s' %
                                  (attrname, tag))
            if attrname in attrdict:
                self.syntax_error(self.lineno, 'attribute specified twice')
            attrdict[attrname] = self.translate_references(attrvalue)
            k = res.end(0)
        res = starttagend.match(rawdata, k)
        if not res:
            self.syntax_error(self.lineno, 'garbage in start tag')
        self.finish_starttag(tag, attrdict)
        if res and res.group('slash') == '/':
            self.finish_endtag(tag)
        return end.end(0)

    # Internal -- parse endtag
    def parse_endtag(self, i):
        """Handle the end tag beginning at rawdata[i].

        Returns the index just past '>', or -1 if no '>' has been
        received yet.  A missing name or anything other than white
        space after the name is a syntax error.
        """
        rawdata = self.rawdata
        end = endbracket.search(rawdata, i+1)
        if not end:
            return -1
        res = tagfind.match(rawdata, i+2)
        if not res:
            self.syntax_error(self.lineno, 'no name specified in end tag')
            tag = ''
            k = i+2
        else:
            tag = res.group(0)
            k = res.end(0)
        if k != end.start(0):
            # check that there is only white space at end of tag
            res = space.match(rawdata, k)
            if res is None or res.end(0) != end.start(0):
                self.syntax_error(self.lineno, 'garbage in end tag')
        self.finish_endtag(tag)
        return end.end(0)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        """Push tag on the stack and dispatch to its start handler."""
        self.stack.append(tag)
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            self.unknown_starttag(tag, attrs)
            return -1
        else:
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        """Close tag together with every element opened after it.

        An empty tag name closes the innermost open element.  A name
        that is not on the stack closes nothing; unknown_endtag() is
        called for it only if there is no end_<tag> method.
        """
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    method = getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                    return
            # The scan keeps the last match, i.e. the innermost open
            # element with this name; without a match nothing is popped.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        """Call the start_<tag> method with the attribute dictionary."""
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        """Call the end_<tag> method."""
        method()

    # Example -- handle character reference, no need to override
    def handle_charref(self, name):
        """Pass the character for reference name to handle_data().

        name is the text between '&#' and ';': decimal digits, or 'x'
        followed by hexadecimal digits.  Unparsable references and
        codes outside 0..255 go to unknown_charref() instead.
        """
        try:
            if name[0] == 'x':
                n = int(name[1:], 16)
            else:
                n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Example -- handle entity reference, no need to override
    def handle_entityref(self, name):
        """Pass the replacement text of entity name to handle_data().

        Names missing from self.entitydefs go to unknown_entityref().
        """
        table = self.entitydefs
        if name in table:
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)
            return

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle cdata, could be overridden
    def handle_cdata(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle processing instructions, could be overridden
    def handle_proc(self, name, data):
        pass

    # Example -- handle special instructions, could be overridden
    def handle_special(self, data):
        pass

    # Example -- handle relatively harmless syntax errors, could be overridden
    def syntax_error(self, lineno, message):
        """Report a syntax error; this implementation raises RuntimeError."""
        raise RuntimeError('Syntax error at line %d: %s' % (lineno, message))

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
460
461
class TestXMLParser(XMLParser):
    """XMLParser that prints every event it receives to standard output.

    Character data is buffered in self.testdata and printed by flush(),
    which runs before any other event is reported so that the output
    keeps document order.
    """

    def __init__(self, verbose=0):
        # Must exist before XMLParser.__init__() runs.
        self.testdata = ""
        XMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer data; print it once its repr reaches 70 characters."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if there is any."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    def handle_cdata(self, data):
        self.flush()
        print('cdata: ' + repr(data))

    def handle_proc(self, name, data):
        self.flush()
        print('processing: ' + name + ' ' + repr(data))

    def handle_special(self, data):
        self.flush()
        print('special: ' + repr(data))

    def handle_comment(self, data):
        """Print the comment, abbreviating the middle of long ones."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    def syntax_error(self, lineno, message):
        """Print the error instead of raising, so that parsing goes on."""
        print('error at line %d: %s' % (lineno, message))

    def unknown_starttag(self, tag, attrs):
        """Print the start tag with its attributes on a single line."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # attrs is the attribute dictionary built by parse_starttag();
            # iterate over its items, not over its keys.
            parts = ['start tag: <' + tag]
            for name, value in attrs.items():
                parts.append(name + '=' + '"' + value + '"')
            parts.append('>')
            print(' '.join(parts))

    def unknown_endtag(self, tag):
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def close(self):
        """Finish parsing and print any character data still buffered."""
        XMLParser.close(self)
        self.flush()
526
def test(args=None):
    """Parse a file and report what the parser finds.

    args is a list of command line arguments; when it is empty or None,
    sys.argv[1:] is used.  A leading '-s' selects the silent XMLParser
    base class instead of TestXMLParser.  The next argument names the
    input file ('test.xml' by default, '-' for standard input).  If the
    file cannot be opened, a message is printed and the process exits
    with status 1.
    """
    import sys

    if not args:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = XMLParser
    else:
        klass = TestXMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.xml'

    if filename == '-':
        f = sys.stdin
    else:
        try:
            f = open(filename, 'r')
        except IOError as msg:
            print('%s : %s' % (filename, msg))
            sys.exit(1)

    try:
        data = f.read()
    finally:
        # Close the file even when reading fails; never close stdin.
        if f is not sys.stdin:
            f.close()

    x = klass()
    # Feed one character at a time to exercise the incremental parser.
    for c in data:
        x.feed(c)
    x.close()


if __name__ == '__main__':
    test()