blob: 355714fbbb8f2e7ab91b9b1b30a5382a1d7a063d [file] [log] [blame]
# A parser for XML, using the derived class as static DTD.
# Author: Sjoerd Mullender.
Guido van Rossuma219efa1997-11-18 15:09:54 +00003
4import re
5import string
6
7
version = '0.1'

# Regular expressions used for parsing.  The building blocks below are
# plain strings that get spliced into the compiled patterns.

_S = '[ \t\r\n]+'                       # mandatory white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # element/attribute/entity name
illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
interesting = re.compile('[]&<]')       # chars that end a run of plain data

amp = re.compile('&')
# Entity or character reference followed by one terminating character
# (normally `;'); callers check that last character themselves.
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
# Note that group `char' includes the terminating character.
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S + '$')
newline = re.compile('\n')

starttagopen = re.compile('<' + _Name)
endtagopen = re.compile('</')
starttagend = re.compile(_opS + '(?P<slash>/?)>')
endbracket = re.compile(_opS + '>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
_SystemLiteral = '(?P<%s>\'[^\']*\'|"[^"]*")'
_PublicLiteral = '(?P<%s>"[-\'()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                 "'[-()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
              'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
              ')'+_S+_SystemLiteral%'syslit'
doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
                     '(?:'+_S+_ExternalId+')?'+_opS)
# Raw strings are used for the `\?' fragments: the pattern text is the
# same, but `\?' is not a valid escape in an ordinary string literal.
xmldecl = re.compile(r'<\?xml'+_S+
                     'version'+_opS+'='+_opS+'(?P<version>\'[^\']*\'|"[^"]*")'+
                     '(?:'+_S+'encoding'+_opS+'='+_opS+
                     "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                     '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:'+_S+'standalone'+_opS+'='+_opS+
                     '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
                     _opS+r'\?>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
# NOTE(review): the `=value' group is not optional here, so an attribute
# written without a value never matches this pattern.
attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9.:+*%?!()_#=~]+))')
# Translation table mapping every white space character in an attribute
# value to a plain space.  Both maketrans() arguments must have the same
# length, hence the four spaces in the second one.
try:
    attrtrans = string.maketrans(' \r\n\t', '    ')
except AttributeError:
    # The string module has no maketrans() on Python 3; str.maketrans()
    # builds an equivalent table for str.translate().
    attrtrans = str.maketrans(' \r\n\t', '    ')
Guido van Rossuma219efa1997-11-18 15:09:54 +000061
62
# XML parser base class -- find tags and call handler functions.
# Usage: p = XMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods with
# special names to handle tags: start_foo and end_foo to handle <foo>
# and </foo>, respectively.  The data between tags is passed to the
# parser by calling self.handle_data() with some data as argument (the
# data may be split up in arbitrary chunks).  References to entities
# listed in self.entitydefs are expanded in place and parsed again;
# references to unknown entities are reported by calling
# self.unknown_entityref() with the entity name as argument.
Guido van Rossuma219efa1997-11-18 15:09:54 +000072
class XMLParser:
    # Incremental XML parser.  Text is handed over with feed(); goahead()
    # consumes as much of it as forms complete constructs and dispatches
    # to the handle_*/start_*/end_*/unknown_* methods, which derived
    # classes override.

    # Interface -- initialize and reset this instance
    def __init__(self, verbose=0):
        self.verbose = verbose
        self.reset()

    # Interface -- reset this instance.  Loses all unprocessed data
    def reset(self):
        self.rawdata = ''               # text fed but not yet consumed
        self.stack = []                 # names of the currently open elements
        self.nomoretags = 0             # true: treat the rest as plain data
        self.literal = 0                # true: pass start tags etc. as data
        self.lineno = 1                 # current line, for error messages
        self.__at_start = 1             # nothing has been consumed yet
        self.__seen_doctype = None      # name from DOCTYPE once seen
        self.__seen_starttag = 0        # true once any start tag was parsed

    # For derived classes only -- enter literal mode (CDATA) till EOF
    def setnomoretags(self):
        self.nomoretags = self.literal = 1

    # For derived classes only -- enter literal mode (CDATA)
    # Any arguments are accepted and ignored.
    def setliteral(self, *args):
        self.literal = 1

    # Interface -- feed some data to the parser.  Call this as
    # often as you want, with as little or as much text as you
    # want (may include '\n').  (This just saves the text, all the
    # processing is done by goahead().)
    def feed(self, data):
        self.rawdata = self.rawdata + data
        self.goahead(0)

    # Interface -- handle the remaining data
    def close(self):
        self.goahead(1)

    # Interface -- translate references
    # Returns `data' with character references replaced by the character
    # they denote and, if `all' is true, entity references replaced by
    # their definition from self.entitydefs (the replacement text is
    # scanned again, so definitions may themselves contain references).
    # Problems are reported through self.syntax_error().
    def translate_references(self, data, all = 1):
        i = 0
        while 1:
            res = amp.search(data, i)
            if res is None:
                return data
            start = res.start(0)
            res = ref.match(data, start)
            if res is None:
                self.syntax_error("bogus `&'")
                # Resume right after this `&'; resuming from the old
                # search position would report the same `&' again.
                i = start + 1
                continue
            i = res.end(0)
            if data[i - 1] != ';':
                self.syntax_error("`;' missing after entity/char reference")
                # the pattern consumed one character that is not ours
                i = i - 1
            name = res.group(1)
            pre = data[:start]
            post = data[i:]
            if name[0] == '#':
                try:
                    if name[1] == 'x':
                        char = chr(int(name[2:], 16))
                    else:
                        char = chr(int(name[1:]))
                except (ValueError, OverflowError):
                    # chr() rejects the code; keep the reference as is
                    self.syntax_error('illegal character reference')
                    continue
                data = pre + char + post
                i = start + len(char)
            elif all:
                if name in self.entitydefs:
                    data = pre + self.entitydefs[name] + post
                    i = start           # rescan substituted text
                else:
                    self.syntax_error('reference to unknown entity')
                    # can't do it, so keep the entity ref in
                    data = pre + '&' + name + ';' + post
                    i = start + len(name) + 2
            # else: just translating character references;
            # i is already positioned correctly

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if i > 0:
                self.__at_start = 0
            if self.nomoretags:
                # everything that is left is plain data
                data = rawdata[i:n]
                self.handle_data(data)
                self.lineno = self.lineno + data.count('\n')
                i = n
                break
            res = interesting.search(rawdata, i)
            if res:
                j = res.start(0)
            else:
                j = n
            if i < j:
                # plain data up to the next `<', `&' or `]'
                if self.__at_start:
                    self.syntax_error('illegal data at start of file')
                self.__at_start = 0
                data = rawdata[i:j]
                if not self.stack and not space.match(data):
                    self.syntax_error('data not in content')
                if illegal.search(data):
                    self.syntax_error('illegal character in content')
                self.handle_data(data)
                self.lineno = self.lineno + data.count('\n')
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # in literal mode the `<' is ordinary data
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0: break
                    self.__seen_starttag = 1
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if endtagopen.match(rawdata, i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    self.literal = 0    # an end tag ends literal mode
                    continue
                if commentopen.match(rawdata, i):
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    k = self.parse_comment(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if cdataopen.match(rawdata, i):
                    k = self.parse_cdata(i)
                    if k < 0: break
                    # count the lines of the whole section, i.e. [i:k]
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                res = xmldecl.match(rawdata, i)
                if res:
                    if not self.__at_start:
                        self.syntax_error("<?xml?> declaration not at start of document")
                    version, encoding, standalone = res.group('version',
                                                              'encoding',
                                                              'standalone')
                    if version[1:-1] != '1.0':
                        raise RuntimeError('only XML version 1.0 supported')
                    if encoding: encoding = encoding[1:-1]
                    if standalone: standalone = standalone[1:-1]
                    self.handle_xml(encoding, standalone)
                    k = res.end(0)
                    # the declaration may contain newlines as well
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                res = procopen.match(rawdata, i)
                if res:
                    k = self.parse_proc(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                res = doctype.match(rawdata, i)
                if res:
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    if self.__seen_doctype:
                        self.syntax_error('multiple DOCTYPE elements')
                    if self.__seen_starttag:
                        self.syntax_error('DOCTYPE not at beginning of document')
                    k = self.parse_doctype(res)
                    if k < 0: break
                    self.__seen_doctype = res.group('name')
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
            elif rawdata[i] == '&':
                res = charref.match(rawdata, i)
                if res is not None:
                    i = res.end(0)
                    if rawdata[i-1] != ';':
                        self.syntax_error("`;' missing in charref")
                        i = i-1
                    if not self.stack:
                        self.syntax_error('data not in content')
                    # group `char' includes the terminating character
                    self.handle_charref(res.group('char')[:-1])
                    self.lineno = self.lineno + res.group(0).count('\n')
                    continue
                res = entityref.match(rawdata, i)
                if res is not None:
                    i = res.end(0)
                    if rawdata[i-1] != ';':
                        self.syntax_error("`;' missing in entityref")
                        i = i-1
                    name = res.group('name')
                    if name in self.entitydefs:
                        # splice the definition into the buffer and
                        # parse it as if it had been in the input
                        self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
                        n = len(rawdata)
                        i = res.start(0)
                    else:
                        self.syntax_error('reference to unknown entity')
                        self.unknown_entityref(name)
                    self.lineno = self.lineno + res.group(0).count('\n')
                    continue
            elif rawdata[i] == ']':
                if n-i < 3:
                    # cannot yet tell whether this starts `]]>'
                    break
                if cdataclose.match(rawdata, i):
                    self.syntax_error("bogus `]]>'")
                self.handle_data(rawdata[i])
                i = i+1
                continue
            else:
                raise RuntimeError('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            break
        # end while
        if i > 0:
            self.__at_start = 0
        if end and i < n:
            # At EOF with something that never became a complete
            # construct: pass one character on as data and try again.
            data = rawdata[i]
            self.syntax_error("bogus `%s'" % data)
            if illegal.search(data):
                self.syntax_error('illegal character in content')
            self.handle_data(data)
            self.lineno = self.lineno + data.count('\n')
            self.rawdata = rawdata[i+1:]
            return self.goahead(end)
        self.rawdata = rawdata[i:]
        if end:
            if not self.__seen_starttag:
                self.syntax_error('no elements in file')
            if self.stack:
                self.syntax_error('missing end tags')
                while self.stack:
                    self.finish_endtag(self.stack[-1])

    # Internal -- parse comment, return length or -1 if not terminated
    # `i' is the index of the `<!--' in self.rawdata; the value returned
    # is the index just past the closing `-->'.
    def parse_comment(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            raise RuntimeError('unexpected call to parse_comment')
        res = commentclose.search(rawdata, i+4)
        if not res:
            return -1
        j = res.start(0)
        if doubledash.search(rawdata, i+4, j):
            self.syntax_error("`--' inside comment")
        # Only look at the comment text itself: for the empty comment
        # `<!---->' the preceding character belongs to the `<!--'.
        if j > i+4 and rawdata[j-1] == '-':
            self.syntax_error('comment cannot end in three dashes')
        if illegal.search(rawdata, i+4, j):
            self.syntax_error('illegal character in comment')
        self.handle_comment(rawdata[i+4:j])
        return res.end(0)

    # Internal -- handle DOCTYPE tag, return length or -1 if not terminated
    # `res' is the match object of the `doctype' pattern.  The text of an
    # internal subset (between `[' and `]'), or None if there is none,
    # is passed to handle_doctype() as last argument.
    def parse_doctype(self, res):
        rawdata = self.rawdata
        n = len(rawdata)
        name = res.group('name')
        pubid, syslit = res.group('pubid', 'syslit')
        if pubid is not None:
            pubid = pubid[1:-1]                 # remove quotes
            pubid = ' '.join(pubid.split())     # normalize
        if syslit is not None: syslit = syslit[1:-1] # remove quotes
        j = k = res.end(0)
        if k >= n:
            return -1
        if rawdata[k] == '[':
            level = 0                   # nesting depth of `<' ... `>'
            k = k+1
            dq = sq = 0                 # inside "..." resp. '...'
            while k < n:
                c = rawdata[k]
                if not sq and c == '"':
                    dq = not dq
                elif not dq and c == "'":
                    sq = not sq
                elif sq or dq:
                    pass                # quoted text is skipped
                elif level <= 0 and c == ']':
                    res = endbracket.match(rawdata, k+1)
                    if not res:
                        return -1
                    self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
                    return res.end(0)
                elif c == '<':
                    level = level + 1
                elif c == '>':
                    level = level - 1
                    if level < 0:
                        self.syntax_error("bogus `>' in DOCTYPE")
                k = k+1
        res = endbracket.search(rawdata, k)
        if not res:
            return -1
        if res.start(0) != k:
            self.syntax_error('garbage in DOCTYPE')
        self.handle_doctype(name, pubid, syslit, None)
        return res.end(0)

    # Internal -- handle CDATA tag, return length or -1 if not terminated
    # `i' is the index of the `<![CDATA[' in self.rawdata.
    def parse_cdata(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+9] != '<![CDATA[':
            raise RuntimeError('unexpected call to parse_cdata')
        res = cdataclose.search(rawdata, i+9)
        if not res:
            return -1
        if illegal.search(rawdata, i+9, res.start(0)):
            self.syntax_error('illegal character in CDATA')
        if not self.stack:
            self.syntax_error('CDATA not in content')
        self.handle_cdata(rawdata[i+9:res.start(0)])
        return res.end(0)

    # Not referenced anywhere in this class.
    __xml_attributes = {'version': '1.0', 'standalone': 'no', 'encoding': None}

    # Internal -- handle a processing instruction tag
    # `i' is the index of the `<?'; returns the index just past the
    # closing `?>', or -1 if that has not been seen yet.
    def parse_proc(self, i):
        rawdata = self.rawdata
        end = procclose.search(rawdata, i)
        if not end:
            return -1
        j = end.start(0)
        if illegal.search(rawdata, i+2, j):
            self.syntax_error('illegal character in processing instruction')
        res = tagfind.match(rawdata, i+2)
        if not res:
            raise RuntimeError('unexpected call to parse_proc')
        k = res.end(0)
        name = res.group(0)
        # Only the target `xml' itself (in any case) is refused; targets
        # that merely contain it, such as xml-stylesheet, are accepted.
        if name.lower() == 'xml':
            self.syntax_error('illegal processing instruction target name')
        self.handle_proc(name, rawdata[k:j])
        return end.end(0)

    # Internal -- parse attributes between i and j
    # Parses self.rawdata[k:j] and returns (attrdict, k) where k is the
    # index just past the last attribute that was recognized.
    # `attributes' is None (anything goes), a sequence of the permitted
    # attribute names, or a dictionary mapping permitted names to their
    # default value (None for no default).
    def parse_attributes(self, tag, k, j, attributes = None):
        rawdata = self.rawdata
        # Now parse the data between k and j into a tag and attrs
        attrdict = {}
        if attributes is not None and not hasattr(attributes, 'items'):
            # Convert a plain list of names to a dictionary.  A
            # dictionary must be left alone: looping over it would
            # succeed and throw away the default values.
            d = {}
            for a in attributes:
                d[a] = None
            attributes = d
        while k < j:
            res = attrfind.match(rawdata, k)
            if not res: break
            attrname, attrvalue = res.group('name', 'value')
            if attrvalue is None:
                self.syntax_error('no attribute value specified')
                attrvalue = attrname
            elif attrvalue[:1] == "'" == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            else:
                self.syntax_error('attribute value not quoted')
            if attributes is not None and attrname not in attributes:
                self.syntax_error('unknown attribute %s of element %s' %
                                  (attrname, tag))
            if attrname in attrdict:
                self.syntax_error('attribute specified twice')
            # white space characters in the value all become spaces
            attrvalue = attrvalue.translate(attrtrans)
            attrdict[attrname] = self.translate_references(attrvalue)
            k = res.end(0)
        if attributes is not None:
            # fill in with default attributes
            for key, val in attributes.items():
                if val is not None and key not in attrdict:
                    attrdict[key] = val
        return attrdict, k

    # Internal -- handle starttag, return length or -1 if not terminated
    # The permitted attributes are taken from self.<tag>_attributes if
    # the instance has such an attribute.
    def parse_starttag(self, i):
        rawdata = self.rawdata
        # i points to start of tag
        end = endbracket.search(rawdata, i+1)
        if not end:
            return -1
        j = end.start(0)
        res = tagfind.match(rawdata, i+1)
        if not res:
            raise RuntimeError('unexpected call to parse_starttag')
        k = res.end(0)
        tag = res.group(0)
        if not self.__seen_starttag and self.__seen_doctype:
            # the first element must be the one named in the DOCTYPE
            if tag != self.__seen_doctype:
                self.syntax_error('starttag does not match DOCTYPE')
        if self.__seen_starttag and not self.stack:
            self.syntax_error('multiple elements on top level')
        attributes = getattr(self, tag + '_attributes', None)
        attrdict, k = self.parse_attributes(tag, k, j, attributes)
        res = starttagend.match(rawdata, k)
        if not res:
            self.syntax_error('garbage in start tag')
        self.finish_starttag(tag, attrdict)
        if res and res.group('slash') == '/':
            # empty element <tag/>: close it right away
            self.finish_endtag(tag)
        return end.end(0)

    # Internal -- parse endtag
    # Returns the index just past the `>', or -1 if not terminated.
    def parse_endtag(self, i):
        rawdata = self.rawdata
        end = endbracket.search(rawdata, i+1)
        if not end:
            return -1
        res = tagfind.match(rawdata, i+2)
        if not res:
            self.syntax_error('no name specified in end tag')
            tag = ''
            k = i+2
        else:
            tag = res.group(0)
            k = res.end(0)
        if k != end.start(0):
            self.syntax_error('garbage in end tag')
        self.finish_endtag(tag)
        return end.end(0)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        self.stack.append(tag)
        method = getattr(self, 'start_' + tag, None)
        if method is None:
            self.unknown_starttag(tag, attrs)
            return -1
        self.handle_starttag(tag, method, attrs)
        return 1

    # Internal -- finish processing of end tag
    # Closes the innermost open element called `tag' together with every
    # element opened inside it.  Each element that gets closed has its
    # own end_<name> method called (or unknown_endtag()).
    def finish_endtag(self, tag):
        if not tag:
            self.syntax_error('name-less end tag')
            # close whatever element is innermost
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                self.syntax_error('unopened end tag')
                self.__call_endtag(tag)
                return
            # index of the innermost open element with this name
            found = len(self.stack) - 1
            while self.stack[found] != tag:
                found = found - 1
        while len(self.stack) > found:
            if found < len(self.stack) - 1:
                self.syntax_error('missing close tag for %s' % self.stack[-1])
            self.__call_endtag(self.stack[-1])
            del self.stack[-1]

    # Internal -- call the end handler that belongs to `tag'
    def __call_endtag(self, tag):
        method = getattr(self, 'end_' + tag, None)
        if method is None:
            self.unknown_endtag(tag)
        else:
            self.handle_endtag(tag, method)

    # Overridable -- handle xml processing instruction
    # Both arguments are passed without their quotes, or as None when
    # the declaration does not specify them.
    def handle_xml(self, encoding, standalone):
        pass

    # Overridable -- handle DOCTYPE
    # `data' is the text of the internal subset or None.
    def handle_doctype(self, tag, pubid, syslit, data):
        pass

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- handle character reference, no need to override
    # `name' is the reference without `&#' and `;', e.g. '65' or 'x41'.
    # Codes outside 0..255 are passed to unknown_charref().
    def handle_charref(self, name):
        try:
            if name[:1] == 'x':
                n = int(name[1:], 16)
            else:
                n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = {'lt': '&#60;',        # must use charref
                  'gt': '&#62;',
                  'amp': '&#38;',       # must use charref
                  'quot': '&#34;',
                  'apos': '&#39;',
                  }

    # Example -- handle entity reference, no need to override
    def handle_entityref(self, name):
        table = self.entitydefs
        if name in table:
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)
            return

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle cdata, could be overridden
    def handle_cdata(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle processing instructions, could be overridden
    def handle_proc(self, name, data):
        pass

    # Example -- handle relatively harmless syntax errors, could be overridden
    # The parser carries on after this method returns, so an override
    # that does not raise turns errors into warnings.
    def syntax_error(self, message):
        raise RuntimeError('Syntax error at line %d: %s' % (self.lineno, message))

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
631
632
class TestXMLParser(XMLParser):
    # Parser that prints every event on standard output.  Character data
    # is collected in self.testdata and printed by flush(), either when
    # its repr() gets long or just before any other event is printed.

    def __init__(self, verbose=0):
        self.testdata = ""
        XMLParser.__init__(self, verbose)

    def handle_xml(self, encoding, standalone):
        self.flush()
        print('xml: encoding = %s standalone = %s' % (encoding, standalone))

    def handle_doctype(self, tag, pubid, syslit, data):
        self.flush()
        print('DOCTYPE: %s %s' % (tag, repr(data)))

    # Prints the entity name and value; the old code printed `data',
    # which is not defined in this method.
    def handle_entity(self, name, strval, pubid, syslit, ndata):
        self.flush()
        print('ENTITY: %s %s' % (name, repr(strval)))

    def handle_data(self, data):
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    # Print the collected character data, if any, and forget it.
    def flush(self):
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    def handle_cdata(self, data):
        self.flush()
        print('cdata: ' + repr(data))

    def handle_proc(self, name, data):
        self.flush()
        print('processing: %s %s' % (name, repr(data)))

    def handle_comment(self, data):
        self.flush()
        r = repr(data)
        if len(r) > 68:
            # show only the beginning and the end of long comments
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    # Report the error and carry on instead of raising.
    def syntax_error(self, message):
        print('error at line %d: %s' % (self.lineno, message))

    def unknown_starttag(self, tag, attrs):
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            parts = ['start tag: <' + tag]
            for name, value in attrs.items():
                parts.append(name + '=' + '"' + value + '"')
            parts.append('>')
            print(' '.join(parts))

    def unknown_endtag(self, tag):
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def close(self):
        XMLParser.close(self)
        self.flush()
Guido van Rossuma219efa1997-11-18 15:09:54 +0000705
# Test driver -- parse a file and print what is found.
# Usage: test([-s] [file]).  `args' defaults to sys.argv[1:].  With -s
# the silent base class XMLParser is used instead of TestXMLParser.
# The file defaults to test.xml; `-' means standard input.  Exits with
# status 1 if the file cannot be opened or the parser raises
# RuntimeError.
def test(args = None):
    import sys

    if not args:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = XMLParser
    else:
        klass = TestXMLParser

    if args:
        path = args[0]
    else:
        path = 'test.xml'

    if path == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(path, 'r')
        except IOError as msg:
            print('%s : %s' % (path, msg))
            sys.exit(1)
        try:
            data = f.read()
        finally:
            # close the file even when reading fails
            f.close()

    x = klass()
    try:
        # Feed one character at a time to exercise the handling of
        # incomplete input in goahead().
        for c in data:
            x.feed(c)
        x.close()
    except RuntimeError as msg:
        print(msg)
        sys.exit(1)


if __name__ == '__main__':
    test()