blob: 9f6e23ecca3da43cc026b894d744b014b9906ace [file] [log] [blame]
# A parser for XML, using the derived class as static DTD.
# Author: Sjoerd Mullender.
Guido van Rossuma219efa1997-11-18 15:09:54 +00003
4import re
5import string
6
7
# Regular expressions used for parsing

# Building blocks: mandatory white space, optional white space, XML name.
_S = '[ \t\r\n]+'
_opS = '[ \t\r\n]*'
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'

# The two characters that can start markup or a reference.
interesting = re.compile('[&<]')
# Prefix of a reference or tag that may still be completed by more input.
incomplete = re.compile('&(' + _Name + '|#[0-9]*|#x[0-9a-fA-F]*)?|'
                        '<([a-zA-Z_:][^<>]*|'
                           '/([a-zA-Z_:][^<>]*)?|'
                           '![^<>]*|'
                           r'\?[^<>]*)?')

# Entity or character reference inside a string; the ';' is optional so
# that a missing one can be diagnosed by the caller.
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+);?')
# Both patterns below also consume the single character that follows the
# reference (normally ';'); group 'char' includes that character.
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S)
newline = re.compile('\n')

starttagopen = re.compile('<' + _Name)
endtagopen = re.compile('</')
starttagend = re.compile(_opS + '(?P<slash>/?)>')
endbracket = re.compile('>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
doctype = re.compile('<!DOCTYPE' + _S + '(?P<name>' + _Name + ')' + _S)
special = re.compile('<!(?P<special>[^<>]*)>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _S)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
# One attribute: white space, a name and an optional "= value" part.
# The value part is optional so that a bare attribute name still matches
# with group 'value' set to None; the attribute parser reports that case
# as 'no attribute value specified' instead of losing the attribute.
attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9.:+*%?!()_#=~]+))?')
44
45
# XML parser base class -- find tags and call handler functions.
# Usage: p = XMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods with
# special names to handle tags: start_foo and end_foo to handle <foo>
# and </foo>, respectively.  The data between tags is passed to the
# parser by calling self.handle_data() with some data as argument (the
# data may be split up in arbitrary chunks).  Entity references are
# passed by calling self.handle_entityref() with the entity reference
# as argument.
Guido van Rossuma219efa1997-11-18 15:09:54 +000055
56class XMLParser:
57
58 # Interface -- initialize and reset this instance
59 def __init__(self, verbose=0):
60 self.verbose = verbose
61 self.reset()
62
63 # Interface -- reset this instance. Loses all unprocessed data
64 def reset(self):
65 self.rawdata = ''
66 self.stack = []
Guido van Rossuma219efa1997-11-18 15:09:54 +000067 self.nomoretags = 0
68 self.literal = 0
69 self.lineno = 1
Guido van Rossum02505e41998-01-29 14:55:24 +000070 self.__at_start = 1
71 self.__seen_doctype = None
72 self.__seen_starttag = 0
Guido van Rossuma219efa1997-11-18 15:09:54 +000073
74 # For derived classes only -- enter literal mode (CDATA) till EOF
75 def setnomoretags(self):
76 self.nomoretags = self.literal = 1
77
78 # For derived classes only -- enter literal mode (CDATA)
79 def setliteral(self, *args):
80 self.literal = 1
81
82 # Interface -- feed some data to the parser. Call this as
83 # often as you want, with as little or as much text as you
84 # want (may include '\n'). (This just saves the text, all the
85 # processing is done by goahead().)
86 def feed(self, data):
87 self.rawdata = self.rawdata + data
88 self.goahead(0)
89
90 # Interface -- handle the remaining data
91 def close(self):
92 self.goahead(1)
93
94 # Interface -- translate references
95 def translate_references(self, data):
96 newdata = []
97 i = 0
98 while 1:
99 res = ref.search(data, i)
100 if res is None:
101 newdata.append(data[i:])
102 return string.join(newdata, '')
103 if data[res.end(0) - 1] != ';':
Guido van Rossum02505e41998-01-29 14:55:24 +0000104 self.syntax_error("`;' missing after entity/char reference")
Guido van Rossuma219efa1997-11-18 15:09:54 +0000105 newdata.append(data[i:res.start(0)])
106 str = res.group(1)
107 if str[0] == '#':
108 if str[1] == 'x':
109 newdata.append(chr(string.atoi(str[2:], 16)))
110 else:
111 newdata.append(chr(string.atoi(str[1:])))
112 else:
113 try:
114 newdata.append(self.entitydefs[str])
115 except KeyError:
116 # can't do it, so keep the entity ref in
117 newdata.append('&' + str + ';')
118 i = res.end(0)
119
120 # Internal -- handle data as far as reasonable. May leave state
121 # and data to be processed by a subsequent call. If 'end' is
122 # true, force handling all data as if followed by EOF marker.
123 def goahead(self, end):
124 rawdata = self.rawdata
125 i = 0
126 n = len(rawdata)
127 while i < n:
Guido van Rossum02505e41998-01-29 14:55:24 +0000128 if i > 0:
129 self.__at_start = 0
Guido van Rossuma219efa1997-11-18 15:09:54 +0000130 if self.nomoretags:
131 data = rawdata[i:n]
132 self.handle_data(data)
133 self.lineno = self.lineno + string.count(data, '\n')
134 i = n
135 break
136 res = interesting.search(rawdata, i)
137 if res:
138 j = res.start(0)
139 else:
140 j = n
141 if i < j:
Guido van Rossum02505e41998-01-29 14:55:24 +0000142 self.__at_start = 0
Guido van Rossuma219efa1997-11-18 15:09:54 +0000143 data = rawdata[i:j]
144 self.handle_data(data)
145 self.lineno = self.lineno + string.count(data, '\n')
146 i = j
147 if i == n: break
148 if rawdata[i] == '<':
149 if starttagopen.match(rawdata, i):
150 if self.literal:
151 data = rawdata[i]
152 self.handle_data(data)
153 self.lineno = self.lineno + string.count(data, '\n')
154 i = i+1
155 continue
156 k = self.parse_starttag(i)
157 if k < 0: break
Guido van Rossum02505e41998-01-29 14:55:24 +0000158 self.__seen_starttag = 1
Guido van Rossuma219efa1997-11-18 15:09:54 +0000159 self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
160 i = k
161 continue
162 if endtagopen.match(rawdata, i):
163 k = self.parse_endtag(i)
164 if k < 0: break
165 self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
166 i = k
167 self.literal = 0
168 continue
169 if commentopen.match(rawdata, i):
170 if self.literal:
171 data = rawdata[i]
172 self.handle_data(data)
173 self.lineno = self.lineno + string.count(data, '\n')
174 i = i+1
175 continue
176 k = self.parse_comment(i)
177 if k < 0: break
178 self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
179 i = k
180 continue
181 if cdataopen.match(rawdata, i):
182 k = self.parse_cdata(i)
183 if k < 0: break
184 self.lineno = self.lineno + string.count(rawdata[i:i], '\n')
185 i = k
186 continue
187 res = procopen.match(rawdata, i)
188 if res:
Guido van Rossum02505e41998-01-29 14:55:24 +0000189 k = self.parse_proc(i)
Guido van Rossuma219efa1997-11-18 15:09:54 +0000190 if k < 0: break
191 self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
192 i = k
193 continue
Guido van Rossum02505e41998-01-29 14:55:24 +0000194 res = doctype.match(rawdata, i)
195 if res:
196 if self.literal:
197 data = rawdata[i]
198 self.handle_data(data)
199 self.lineno = self.lineno + string.count(data, '\n')
200 i = i+1
201 continue
202 if self.__seen_doctype:
203 self.syntax_error('multiple DOCTYPE elements')
204 if self.__seen_starttag:
205 self.syntax_error('DOCTYPE not at beginning of document')
206 k = self.parse_doctype(res)
207 if k < 0: break
208 self.__seen_doctype = res.group('name')
209 self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
210 i = k
211 continue
Guido van Rossuma219efa1997-11-18 15:09:54 +0000212 res = special.match(rawdata, i)
213 if res:
214 if self.literal:
215 data = rawdata[i]
216 self.handle_data(data)
217 self.lineno = self.lineno + string.count(data, '\n')
218 i = i+1
219 continue
220 self.handle_special(res.group('special'))
221 self.lineno = self.lineno + string.count(res.group(0), '\n')
222 i = res.end(0)
223 continue
224 elif rawdata[i] == '&':
225 res = charref.match(rawdata, i)
226 if res is not None:
227 i = res.end(0)
228 if rawdata[i-1] != ';':
Guido van Rossum02505e41998-01-29 14:55:24 +0000229 self.syntax_error("`;' missing in charref")
Guido van Rossuma219efa1997-11-18 15:09:54 +0000230 i = i-1
231 self.handle_charref(res.group('char')[:-1])
232 self.lineno = self.lineno + string.count(res.group(0), '\n')
233 continue
234 res = entityref.match(rawdata, i)
235 if res is not None:
236 i = res.end(0)
237 if rawdata[i-1] != ';':
Guido van Rossum02505e41998-01-29 14:55:24 +0000238 self.syntax_error("`;' missing in entityref")
Guido van Rossuma219efa1997-11-18 15:09:54 +0000239 i = i-1
240 self.handle_entityref(res.group('name'))
241 self.lineno = self.lineno + string.count(res.group(0), '\n')
242 continue
243 else:
244 raise RuntimeError, 'neither < nor & ??'
245 # We get here only if incomplete matches but
246 # nothing else
247 res = incomplete.match(rawdata, i)
248 if not res:
249 data = rawdata[i]
250 self.handle_data(data)
251 self.lineno = self.lineno + string.count(data, '\n')
252 i = i+1
253 continue
254 j = res.end(0)
255 if j == n:
256 break # Really incomplete
Guido van Rossum02505e41998-01-29 14:55:24 +0000257 self.syntax_error("bogus `<' or `&'")
Guido van Rossuma219efa1997-11-18 15:09:54 +0000258 data = res.group(0)
259 self.handle_data(data)
260 self.lineno = self.lineno + string.count(data, '\n')
261 i = j
262 # end while
263 if end and i < n:
264 data = rawdata[i:n]
265 self.handle_data(data)
266 self.lineno = self.lineno + string.count(data, '\n')
267 i = n
268 self.rawdata = rawdata[i:]
Guido van Rossum02505e41998-01-29 14:55:24 +0000269 if end:
270 if self.stack:
271 self.syntax_error('missing end tags')
272 while self.stack:
273 self.finish_endtag(self.stack[-1])
Guido van Rossuma219efa1997-11-18 15:09:54 +0000274
275 # Internal -- parse comment, return length or -1 if not terminated
276 def parse_comment(self, i):
277 rawdata = self.rawdata
278 if rawdata[i:i+4] <> '<!--':
279 raise RuntimeError, 'unexpected call to handle_comment'
280 res = commentclose.search(rawdata, i+4)
281 if not res:
282 return -1
283 # doubledash search will succeed because it's a subset of commentclose
284 if doubledash.search(rawdata, i+4).start(0) < res.start(0):
Guido van Rossum02505e41998-01-29 14:55:24 +0000285 self.syntax_error("`--' inside comment")
Guido van Rossuma219efa1997-11-18 15:09:54 +0000286 self.handle_comment(rawdata[i+4: res.start(0)])
287 return res.end(0)
288
Guido van Rossum02505e41998-01-29 14:55:24 +0000289 # Internal -- handle DOCTYPE tag, return length or -1 if not terminated
290 def parse_doctype(self, res):
291 rawdata = self.rawdata
292 n = len(rawdata)
293 name = res.group('name')
294 j = k = res.end(0)
295 level = 0
296 while k < n:
297 c = rawdata[k]
298 if c == '<':
299 level = level + 1
300 elif c == '>':
301 if level == 0:
302 self.handle_doctype(name, rawdata[j:k])
303 return k+1
304 level = level - 1
305 k = k+1
306 return -1
307
308 # Internal -- handle CDATA tag, return length or -1 if not terminated
Guido van Rossuma219efa1997-11-18 15:09:54 +0000309 def parse_cdata(self, i):
310 rawdata = self.rawdata
311 if rawdata[i:i+9] <> '<![CDATA[':
312 raise RuntimeError, 'unexpected call to handle_cdata'
313 res = cdataclose.search(rawdata, i+9)
314 if not res:
315 return -1
316 self.handle_cdata(rawdata[i+9:res.start(0)])
317 return res.end(0)
318
Guido van Rossum02505e41998-01-29 14:55:24 +0000319 __xml_attributes = {'version': '1.0', 'standalone': 'no', 'encoding': None}
320 # Internal -- handle a processing instruction tag
321 def parse_proc(self, i):
Guido van Rossuma219efa1997-11-18 15:09:54 +0000322 rawdata = self.rawdata
Guido van Rossum02505e41998-01-29 14:55:24 +0000323 end = procclose.search(rawdata, i)
324 if not end:
325 return -1
326 j = end.start(0)
327 res = tagfind.match(rawdata, i+2)
Guido van Rossuma219efa1997-11-18 15:09:54 +0000328 if not res:
329 raise RuntimeError, 'unexpected call to parse_proc'
Guido van Rossum02505e41998-01-29 14:55:24 +0000330 k = res.end(0)
331 name = res.group(0)
332 if name == 'xml':
333 if self.__at_start:
334 attrdict, k = self.parse_attributes('xml', k, j,
335 self.__xml_attributes)
336 if k != j:
337 self.syntax_error('garbage at end of <?xml?>')
338 if attrdict['version'] != '1.0':
339 self.syntax_error('only XML version 1.0 supported')
340 self.handle_xml(attrdict.get('encoding', None),
341 attrdict['standalone'])
342 return end.end(0)
343 else:
344 self.syntax_error("<?xml?> tag not at start of document")
345 self.handle_proc(name, rawdata[k:j])
346 return end.end(0)
347
348 # Internal -- parse attributes between i and j
349 def parse_attributes(self, tag, k, j, attributes = None):
350 rawdata = self.rawdata
351 # Now parse the data between k and j into a tag and attrs
352 attrdict = {}
353 try:
354 # convert attributes list to dictionary
355 d = {}
356 for a in attributes:
357 d[a] = None
358 attributes = d
359 except TypeError:
360 pass
361 while k < j:
362 res = attrfind.match(rawdata, k)
363 if not res: break
364 attrname, attrvalue = res.group('name', 'value')
365 if attrvalue is None:
366 self.syntax_error('no attribute value specified')
367 attrvalue = attrname
368 elif attrvalue[:1] == "'" == attrvalue[-1:] or \
369 attrvalue[:1] == '"' == attrvalue[-1:]:
370 attrvalue = attrvalue[1:-1]
371 else:
372 self.syntax_error('attribute value not quoted')
373 if attributes is not None and not attributes.has_key(attrname):
374 self.syntax_error('unknown attribute %s of element %s' %
375 (attrname, tag))
376 if attrdict.has_key(attrname):
377 self.syntax_error('attribute specified twice')
378 attrdict[attrname] = self.translate_references(attrvalue)
379 k = res.end(0)
380 if attributes is not None:
381 # fill in with default attributes
382 for key, val in attributes.items():
383 if val is not None and not attrdict.has_key(key):
384 attrdict[key] = val
385 return attrdict, k
Guido van Rossuma219efa1997-11-18 15:09:54 +0000386
387 # Internal -- handle starttag, return length or -1 if not terminated
388 def parse_starttag(self, i):
389 rawdata = self.rawdata
390 # i points to start of tag
391 end = endbracket.search(rawdata, i+1)
392 if not end:
393 return -1
394 j = end.start(0)
Guido van Rossuma219efa1997-11-18 15:09:54 +0000395 res = tagfind.match(rawdata, i+1)
396 if not res:
397 raise RuntimeError, 'unexpected call to parse_starttag'
398 k = res.end(0)
399 tag = res.group(0)
Guido van Rossum02505e41998-01-29 14:55:24 +0000400 if not self.__seen_starttag and self.__seen_doctype:
401 if tag != self.__seen_doctype:
402 self.syntax_error('starttag does not match DOCTYPE')
Guido van Rossuma219efa1997-11-18 15:09:54 +0000403 if hasattr(self, tag + '_attributes'):
Guido van Rossum02505e41998-01-29 14:55:24 +0000404 attributes = getattr(self, tag + '_attributes')
Guido van Rossuma219efa1997-11-18 15:09:54 +0000405 else:
Guido van Rossum02505e41998-01-29 14:55:24 +0000406 attributes = None
407 attrdict, k = self.parse_attributes(tag, k, j, attributes)
Guido van Rossuma219efa1997-11-18 15:09:54 +0000408 res = starttagend.match(rawdata, k)
409 if not res:
Guido van Rossum02505e41998-01-29 14:55:24 +0000410 self.syntax_error('garbage in start tag')
Guido van Rossuma219efa1997-11-18 15:09:54 +0000411 self.finish_starttag(tag, attrdict)
412 if res and res.group('slash') == '/':
413 self.finish_endtag(tag)
414 return end.end(0)
415
416 # Internal -- parse endtag
417 def parse_endtag(self, i):
418 rawdata = self.rawdata
419 end = endbracket.search(rawdata, i+1)
420 if not end:
421 return -1
422 res = tagfind.match(rawdata, i+2)
423 if not res:
Guido van Rossum02505e41998-01-29 14:55:24 +0000424 self.syntax_error('no name specified in end tag')
Guido van Rossuma219efa1997-11-18 15:09:54 +0000425 tag = ''
426 k = i+2
427 else:
428 tag = res.group(0)
429 k = res.end(0)
430 if k != end.start(0):
431 # check that there is only white space at end of tag
432 res = space.match(rawdata, k)
433 if res is None or res.end(0) != end.start(0):
Guido van Rossum02505e41998-01-29 14:55:24 +0000434 self.syntax_error('garbage in end tag')
Guido van Rossuma219efa1997-11-18 15:09:54 +0000435 self.finish_endtag(tag)
436 return end.end(0)
437
438 # Internal -- finish processing of start tag
439 # Return -1 for unknown tag, 1 for balanced tag
440 def finish_starttag(self, tag, attrs):
441 self.stack.append(tag)
442 try:
443 method = getattr(self, 'start_' + tag)
444 except AttributeError:
445 self.unknown_starttag(tag, attrs)
446 return -1
447 else:
448 self.handle_starttag(tag, method, attrs)
449 return 1
450
451 # Internal -- finish processing of end tag
452 def finish_endtag(self, tag):
453 if not tag:
Guido van Rossum02505e41998-01-29 14:55:24 +0000454 self.syntax_error('name-less end tag')
Guido van Rossuma219efa1997-11-18 15:09:54 +0000455 found = len(self.stack) - 1
456 if found < 0:
457 self.unknown_endtag(tag)
458 return
459 else:
460 if tag not in self.stack:
Guido van Rossum02505e41998-01-29 14:55:24 +0000461 self.syntax_error('unopened end tag')
Guido van Rossuma219efa1997-11-18 15:09:54 +0000462 try:
463 method = getattr(self, 'end_' + tag)
464 except AttributeError:
465 self.unknown_endtag(tag)
466 return
467 found = len(self.stack)
468 for i in range(found):
Guido van Rossum02505e41998-01-29 14:55:24 +0000469 if self.stack[i] == tag:
470 found = i
Guido van Rossuma219efa1997-11-18 15:09:54 +0000471 while len(self.stack) > found:
Guido van Rossum02505e41998-01-29 14:55:24 +0000472 if found < len(self.stack) - 1:
473 self.syntax_error('missing close tag for %s' % self.stack[-1])
Guido van Rossuma219efa1997-11-18 15:09:54 +0000474 tag = self.stack[-1]
475 try:
476 method = getattr(self, 'end_' + tag)
477 except AttributeError:
478 method = None
479 if method:
480 self.handle_endtag(tag, method)
481 else:
482 self.unknown_endtag(tag)
483 del self.stack[-1]
484
Guido van Rossum02505e41998-01-29 14:55:24 +0000485 # Overridable -- handle xml processing instruction
486 def handle_xml(self, encoding, standalone):
487 pass
488
489 # Overridable -- handle DOCTYPE
490 def handle_doctype(self, tag, data):
491 pass
492
Guido van Rossuma219efa1997-11-18 15:09:54 +0000493 # Overridable -- handle start tag
494 def handle_starttag(self, tag, method, attrs):
495 method(attrs)
496
497 # Overridable -- handle end tag
498 def handle_endtag(self, tag, method):
499 method()
500
501 # Example -- handle character reference, no need to override
502 def handle_charref(self, name):
503 try:
504 if name[0] == 'x':
505 n = string.atoi(name[1:], 16)
506 else:
507 n = string.atoi(name)
508 except string.atoi_error:
509 self.unknown_charref(name)
510 return
511 if not 0 <= n <= 255:
512 self.unknown_charref(name)
513 return
514 self.handle_data(chr(n))
515
516 # Definition of entities -- derived classes may override
Guido van Rossum02505e41998-01-29 14:55:24 +0000517 entitydefs = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': "'"}
Guido van Rossuma219efa1997-11-18 15:09:54 +0000518
519 # Example -- handle entity reference, no need to override
520 def handle_entityref(self, name):
521 table = self.entitydefs
522 if table.has_key(name):
523 self.handle_data(table[name])
524 else:
525 self.unknown_entityref(name)
526 return
527
528 # Example -- handle data, should be overridden
529 def handle_data(self, data):
530 pass
531
532 # Example -- handle cdata, could be overridden
533 def handle_cdata(self, data):
534 pass
535
536 # Example -- handle comment, could be overridden
537 def handle_comment(self, data):
538 pass
539
540 # Example -- handle processing instructions, could be overridden
541 def handle_proc(self, name, data):
542 pass
543
544 # Example -- handle special instructions, could be overridden
545 def handle_special(self, data):
546 pass
547
548 # Example -- handle relatively harmless syntax errors, could be overridden
Guido van Rossum02505e41998-01-29 14:55:24 +0000549 def syntax_error(self, message):
550 raise RuntimeError, 'Syntax error at line %d: %s' % (self.lineno, message)
Guido van Rossuma219efa1997-11-18 15:09:54 +0000551
552 # To be overridden -- handlers for unknown objects
553 def unknown_starttag(self, tag, attrs): pass
554 def unknown_endtag(self, tag): pass
555 def unknown_charref(self, ref): pass
556 def unknown_entityref(self, ref): pass
557
558
559class TestXMLParser(XMLParser):
560
561 def __init__(self, verbose=0):
562 self.testdata = ""
563 XMLParser.__init__(self, verbose)
564
Guido van Rossum02505e41998-01-29 14:55:24 +0000565 def handle_xml(self, encoding, standalone):
566 self.flush()
567 print 'xml: encoding =',encoding,'standalone =',standalone
568
569 def handle_doctype(self, tag, data):
570 self.flush()
571 print 'DOCTYPE:',tag, `data`
572
Guido van Rossuma219efa1997-11-18 15:09:54 +0000573 def handle_data(self, data):
574 self.testdata = self.testdata + data
575 if len(`self.testdata`) >= 70:
576 self.flush()
577
578 def flush(self):
579 data = self.testdata
580 if data:
581 self.testdata = ""
582 print 'data:', `data`
583
584 def handle_cdata(self, data):
585 self.flush()
586 print 'cdata:', `data`
587
588 def handle_proc(self, name, data):
589 self.flush()
590 print 'processing:',name,`data`
591
592 def handle_special(self, data):
593 self.flush()
594 print 'special:',`data`
595
596 def handle_comment(self, data):
597 self.flush()
598 r = `data`
599 if len(r) > 68:
600 r = r[:32] + '...' + r[-32:]
601 print 'comment:', r
602
Guido van Rossum02505e41998-01-29 14:55:24 +0000603 def syntax_error(self, message):
604 print 'error at line %d:' % self.lineno, message
Guido van Rossuma219efa1997-11-18 15:09:54 +0000605
606 def unknown_starttag(self, tag, attrs):
607 self.flush()
608 if not attrs:
609 print 'start tag: <' + tag + '>'
610 else:
611 print 'start tag: <' + tag,
Guido van Rossum02505e41998-01-29 14:55:24 +0000612 for name, value in attrs.items():
Guido van Rossuma219efa1997-11-18 15:09:54 +0000613 print name + '=' + '"' + value + '"',
614 print '>'
615
616 def unknown_endtag(self, tag):
617 self.flush()
618 print 'end tag: </' + tag + '>'
619
620 def unknown_entityref(self, ref):
621 self.flush()
622 print '*** unknown entity ref: &' + ref + ';'
623
624 def unknown_charref(self, ref):
625 self.flush()
626 print '*** unknown char ref: &#' + ref + ';'
627
628 def close(self):
629 XMLParser.close(self)
630 self.flush()
631
632def test(args = None):
633 import sys
634
635 if not args:
636 args = sys.argv[1:]
637
638 if args and args[0] == '-s':
639 args = args[1:]
640 klass = XMLParser
641 else:
642 klass = TestXMLParser
643
644 if args:
645 file = args[0]
646 else:
647 file = 'test.xml'
648
649 if file == '-':
650 f = sys.stdin
651 else:
652 try:
653 f = open(file, 'r')
654 except IOError, msg:
655 print file, ":", msg
656 sys.exit(1)
657
658 data = f.read()
659 if f is not sys.stdin:
660 f.close()
661
662 x = klass()
663 for c in data:
664 x.feed(c)
665 x.close()
666
667
668if __name__ == '__main__':
669 test()