blob: 06dc373c8e65f9f4919f4f2eb49919de4668e076 [file] [log] [blame]
Guido van Rossuma219efa1997-11-18 15:09:54 +00001# A parser for XML, using the derived class as static DTD.
Guido van Rossum5d68e8e1997-11-18 15:27:20 +00002# Author: Sjoerd Mullender.
Guido van Rossuma219efa1997-11-18 15:09:54 +00003
4import re
5import string
6
7
version = '0.1'

# Regular expressions used for parsing

# Building blocks: plain strings that are spliced into the patterns below.
_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string

# Character data
illegal = re.compile('[^\t\r\n -\176\240-\377]')    # illegal chars in content
interesting = re.compile('[]&<]')       # characters that end a run of data

# References.  `ref', `entityref' and `charref' also consume the single
# character that follows the reference (normally the `;').
amp = re.compile('&')
ref = re.compile(
    '&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)' + '[^-a-zA-Z0-9._:]')
entityref = re.compile(
    '&(?P<name>' + _Name + ')' + '[^-a-zA-Z0-9._:]')
charref = re.compile(
    '&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S + '$')
newline = re.compile('\n')

# Tags and attributes
attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')' +
    '(' + _opS + '=' + _opS +
    '(?P<value>' + _QStr + '|[-a-zA-Z0-9.:+*%?!()_#=~]+))?')
starttagopen = re.compile('<' + _Name)
starttagend = re.compile(_opS + '(?P<slash>/?)>')
starttagmatch = re.compile(
    '<(?P<tagname>' + _Name + ')' +
    '(?P<attrs>(?:' + attrfind.pattern + ')*)' +
    starttagend.pattern)
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
endbracketfind = re.compile('(?:[^>\'"]|' + _QStr + ')*>')
tagfind = re.compile(_Name)

# CDATA sections
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')

# External identifiers.  _ExternalId matches one of the following:
#       SYSTEM SystemLiteral
#       PUBLIC PubidLiteral SystemLiteral
# The two *Literal templates take the name of the group to capture into.
_SystemLiteral = '(?P<%s>' + _QStr + ')'
_PublicLiteral = ('(?P<%s>"[-\'()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|'
                  "'[-()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')")
_ExternalId = ('(?:SYSTEM|PUBLIC' + _S + _PublicLiteral % 'pubid' + ')' +
               _S + _SystemLiteral % 'syslit')
doctype = re.compile(
    '<!DOCTYPE' + _S + '(?P<name>' + _Name + ')' +
    '(?:' + _S + _ExternalId + ')?' + _opS)

# XML declaration and processing instructions
xmldecl = re.compile(
    r'<\?xml' + _S +
    'version' + _opS + '=' + _opS + '(?P<version>' + _QStr + ')' +
    '(?:' + _S + 'encoding' + _opS + '=' + _opS +
    "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
    '"[A-Za-z][-A-Za-z0-9._]*"))?' +
    '(?:' + _S + 'standalone' + _opS + '=' + _opS +
    '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?' +
    _opS + r'\?>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')

# Comments
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
Guido van Rossum7e07b381998-04-03 16:02:39 +000065attrtrans = string.maketrans(' \r\n\t', ' ')
Guido van Rossuma219efa1997-11-18 15:09:54 +000066
67
68# XML parser base class -- find tags and call handler functions.
69# Usage: p = XMLParser(); p.feed(data); ...; p.close().
Guido van Rossum5d68e8e1997-11-18 15:27:20 +000070# The dtd is defined by deriving a class which defines methods with
71# special names to handle tags: start_foo and end_foo to handle <foo>
72# and </foo>, respectively. The data between tags is passed to the
73# parser by calling self.handle_data() with some data as argument (the
74# data may be split up in arbitrary chunks). Entity references are
75# passed by calling self.handle_entityref() with the entity reference
76# as argument.
Guido van Rossuma219efa1997-11-18 15:09:54 +000077
78class XMLParser:
79
80 # Interface -- initialize and reset this instance
81 def __init__(self, verbose=0):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000082 self.verbose = verbose
83 self.reset()
Guido van Rossuma219efa1997-11-18 15:09:54 +000084
85 # Interface -- reset this instance. Loses all unprocessed data
86 def reset(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000087 self.rawdata = ''
88 self.stack = []
89 self.nomoretags = 0
90 self.literal = 0
91 self.lineno = 1
92 self.__at_start = 1
93 self.__seen_doctype = None
94 self.__seen_starttag = 0
Guido van Rossuma219efa1997-11-18 15:09:54 +000095
96 # For derived classes only -- enter literal mode (CDATA) till EOF
97 def setnomoretags(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +000098 self.nomoretags = self.literal = 1
Guido van Rossuma219efa1997-11-18 15:09:54 +000099
100 # For derived classes only -- enter literal mode (CDATA)
101 def setliteral(self, *args):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000102 self.literal = 1
Guido van Rossuma219efa1997-11-18 15:09:54 +0000103
104 # Interface -- feed some data to the parser. Call this as
105 # often as you want, with as little or as much text as you
106 # want (may include '\n'). (This just saves the text, all the
107 # processing is done by goahead().)
108 def feed(self, data):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000109 self.rawdata = self.rawdata + data
110 self.goahead(0)
Guido van Rossuma219efa1997-11-18 15:09:54 +0000111
112 # Interface -- handle the remaining data
113 def close(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000114 self.goahead(1)
Guido van Rossuma219efa1997-11-18 15:09:54 +0000115
116 # Interface -- translate references
Guido van Rossum7e07b381998-04-03 16:02:39 +0000117 def translate_references(self, data, all = 1):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000118 i = 0
119 while 1:
Guido van Rossum7e07b381998-04-03 16:02:39 +0000120 res = amp.search(data, i)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000121 if res is None:
Guido van Rossum7e07b381998-04-03 16:02:39 +0000122 return data
123 res = ref.match(data, res.start(0))
124 if res is None:
125 self.syntax_error("bogus `&'")
126 i =i+1
127 continue
128 i = res.end(0)
129 if data[i - 1] != ';':
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000130 self.syntax_error("`;' missing after entity/char reference")
Guido van Rossum7e07b381998-04-03 16:02:39 +0000131 i = i-1
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000132 str = res.group(1)
Guido van Rossum7e07b381998-04-03 16:02:39 +0000133 pre = data[:res.start(0)]
134 post = data[i:]
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000135 if str[0] == '#':
136 if str[1] == 'x':
Guido van Rossum7e07b381998-04-03 16:02:39 +0000137 str = chr(string.atoi(str[2:], 16))
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000138 else:
Guido van Rossum7e07b381998-04-03 16:02:39 +0000139 str = chr(string.atoi(str[1:]))
140 data = pre + str + post
141 i = res.start(0)+len(str)
142 elif all:
143 if self.entitydefs.has_key(str):
144 data = pre + self.entitydefs[str] + post
145 i = res.start(0) # rescan substituted text
146 else:
147 self.syntax_error('reference to unknown entity')
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000148 # can't do it, so keep the entity ref in
Guido van Rossum7e07b381998-04-03 16:02:39 +0000149 data = pre + '&' + str + ';' + post
150 i = res.start(0) + len(str) + 2
151 else:
152 # just translating character references
153 pass # i is already postioned correctly
Guido van Rossuma219efa1997-11-18 15:09:54 +0000154
155 # Internal -- handle data as far as reasonable. May leave state
156 # and data to be processed by a subsequent call. If 'end' is
157 # true, force handling all data as if followed by EOF marker.
158 def goahead(self, end):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000159 rawdata = self.rawdata
160 i = 0
161 n = len(rawdata)
162 while i < n:
163 if i > 0:
164 self.__at_start = 0
165 if self.nomoretags:
166 data = rawdata[i:n]
167 self.handle_data(data)
168 self.lineno = self.lineno + string.count(data, '\n')
169 i = n
170 break
171 res = interesting.search(rawdata, i)
172 if res:
173 j = res.start(0)
174 else:
175 j = n
176 if i < j:
Guido van Rossum7e07b381998-04-03 16:02:39 +0000177 if self.__at_start:
178 self.syntax_error('illegal data at start of file')
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000179 self.__at_start = 0
180 data = rawdata[i:j]
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000181 if not self.stack and space.match(data) is None:
Guido van Rossum7e07b381998-04-03 16:02:39 +0000182 self.syntax_error('data not in content')
183 if illegal.search(data):
184 self.syntax_error('illegal character in content')
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000185 self.handle_data(data)
186 self.lineno = self.lineno + string.count(data, '\n')
187 i = j
188 if i == n: break
189 if rawdata[i] == '<':
190 if starttagopen.match(rawdata, i):
191 if self.literal:
192 data = rawdata[i]
193 self.handle_data(data)
194 self.lineno = self.lineno + string.count(data, '\n')
195 i = i+1
196 continue
197 k = self.parse_starttag(i)
198 if k < 0: break
199 self.__seen_starttag = 1
200 self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
201 i = k
202 continue
203 if endtagopen.match(rawdata, i):
204 k = self.parse_endtag(i)
205 if k < 0: break
206 self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
207 i = k
208 self.literal = 0
209 continue
210 if commentopen.match(rawdata, i):
211 if self.literal:
212 data = rawdata[i]
213 self.handle_data(data)
214 self.lineno = self.lineno + string.count(data, '\n')
215 i = i+1
216 continue
217 k = self.parse_comment(i)
218 if k < 0: break
219 self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
220 i = k
221 continue
222 if cdataopen.match(rawdata, i):
223 k = self.parse_cdata(i)
224 if k < 0: break
225 self.lineno = self.lineno + string.count(rawdata[i:i], '\n')
226 i = k
227 continue
Guido van Rossum7e07b381998-04-03 16:02:39 +0000228 res = xmldecl.match(rawdata, i)
229 if res:
230 if not self.__at_start:
231 self.syntax_error("<?xml?> declaration not at start of document")
232 version, encoding, standalone = res.group('version',
233 'encoding',
234 'standalone')
235 if version[1:-1] != '1.0':
236 raise RuntimeError, 'only XML version 1.0 supported'
237 if encoding: encoding = encoding[1:-1]
238 if standalone: standalone = standalone[1:-1]
239 self.handle_xml(encoding, standalone)
240 i = res.end(0)
241 continue
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000242 res = procopen.match(rawdata, i)
243 if res:
244 k = self.parse_proc(i)
245 if k < 0: break
246 self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
247 i = k
248 continue
249 res = doctype.match(rawdata, i)
250 if res:
251 if self.literal:
252 data = rawdata[i]
253 self.handle_data(data)
254 self.lineno = self.lineno + string.count(data, '\n')
255 i = i+1
256 continue
257 if self.__seen_doctype:
258 self.syntax_error('multiple DOCTYPE elements')
259 if self.__seen_starttag:
260 self.syntax_error('DOCTYPE not at beginning of document')
261 k = self.parse_doctype(res)
262 if k < 0: break
263 self.__seen_doctype = res.group('name')
264 self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
265 i = k
266 continue
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000267 elif rawdata[i] == '&':
268 res = charref.match(rawdata, i)
269 if res is not None:
270 i = res.end(0)
271 if rawdata[i-1] != ';':
272 self.syntax_error("`;' missing in charref")
273 i = i-1
Guido van Rossum7e07b381998-04-03 16:02:39 +0000274 if not self.stack:
275 self.syntax_error('data not in content')
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000276 self.handle_charref(res.group('char')[:-1])
277 self.lineno = self.lineno + string.count(res.group(0), '\n')
278 continue
279 res = entityref.match(rawdata, i)
280 if res is not None:
281 i = res.end(0)
282 if rawdata[i-1] != ';':
283 self.syntax_error("`;' missing in entityref")
284 i = i-1
Guido van Rossum7e07b381998-04-03 16:02:39 +0000285 name = res.group('name')
286 if self.entitydefs.has_key(name):
287 self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
288 n = len(rawdata)
289 i = res.start(0)
290 else:
291 self.syntax_error('reference to unknown entity')
292 self.unknown_entityref(name)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000293 self.lineno = self.lineno + string.count(res.group(0), '\n')
294 continue
Guido van Rossum7e07b381998-04-03 16:02:39 +0000295 elif rawdata[i] == ']':
296 if n-i < 3:
297 break
298 if cdataclose.match(rawdata, i):
299 self.syntax_error("bogus `]]>'")
300 self.handle_data(rawdata[i])
301 i = i+1
302 continue
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000303 else:
304 raise RuntimeError, 'neither < nor & ??'
305 # We get here only if incomplete matches but
306 # nothing else
Guido van Rossum7e07b381998-04-03 16:02:39 +0000307 break
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000308 # end while
Guido van Rossum7e07b381998-04-03 16:02:39 +0000309 if i > 0:
310 self.__at_start = 0
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000311 if end and i < n:
Guido van Rossum7e07b381998-04-03 16:02:39 +0000312 data = rawdata[i]
313 self.syntax_error("bogus `%s'" % data)
314 if illegal.search(data):
315 self.syntax_error('illegal character in content')
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000316 self.handle_data(data)
317 self.lineno = self.lineno + string.count(data, '\n')
Guido van Rossum7e07b381998-04-03 16:02:39 +0000318 self.rawdata = rawdata[i+1:]
319 return self.goahead(end)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000320 self.rawdata = rawdata[i:]
321 if end:
Guido van Rossum7e07b381998-04-03 16:02:39 +0000322 if not self.__seen_starttag:
323 self.syntax_error('no elements in file')
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000324 if self.stack:
325 self.syntax_error('missing end tags')
326 while self.stack:
327 self.finish_endtag(self.stack[-1])
Guido van Rossuma219efa1997-11-18 15:09:54 +0000328
329 # Internal -- parse comment, return length or -1 if not terminated
330 def parse_comment(self, i):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000331 rawdata = self.rawdata
332 if rawdata[i:i+4] <> '<!--':
333 raise RuntimeError, 'unexpected call to handle_comment'
334 res = commentclose.search(rawdata, i+4)
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000335 if res is None:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000336 return -1
Guido van Rossum7e07b381998-04-03 16:02:39 +0000337 if doubledash.search(rawdata, i+4, res.start(0)):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000338 self.syntax_error("`--' inside comment")
Guido van Rossum7e07b381998-04-03 16:02:39 +0000339 if rawdata[res.start(0)-1] == '-':
340 self.syntax_error('comment cannot end in three dashes')
341 if illegal.search(rawdata, i+4, res.start(0)):
342 self.syntax_error('illegal character in comment')
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000343 self.handle_comment(rawdata[i+4: res.start(0)])
344 return res.end(0)
Guido van Rossuma219efa1997-11-18 15:09:54 +0000345
Guido van Rossum02505e41998-01-29 14:55:24 +0000346 # Internal -- handle DOCTYPE tag, return length or -1 if not terminated
347 def parse_doctype(self, res):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000348 rawdata = self.rawdata
349 n = len(rawdata)
350 name = res.group('name')
Guido van Rossum7e07b381998-04-03 16:02:39 +0000351 pubid, syslit = res.group('pubid', 'syslit')
352 if pubid is not None:
353 pubid = pubid[1:-1] # remove quotes
354 pubid = string.join(string.split(pubid)) # normalize
355 if syslit is not None: syslit = syslit[1:-1] # remove quotes
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000356 j = k = res.end(0)
Guido van Rossum7e07b381998-04-03 16:02:39 +0000357 if k >= n:
358 return -1
359 if rawdata[k] == '[':
360 level = 0
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000361 k = k+1
Guido van Rossum7e07b381998-04-03 16:02:39 +0000362 dq = sq = 0
363 while k < n:
364 c = rawdata[k]
365 if not sq and c == '"':
366 dq = not dq
367 elif not dq and c == "'":
368 sq = not sq
369 elif sq or dq:
370 pass
371 elif level <= 0 and c == ']':
372 res = endbracket.match(rawdata, k+1)
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000373 if res is None:
Guido van Rossum7e07b381998-04-03 16:02:39 +0000374 return -1
375 self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
376 return res.end(0)
377 elif c == '<':
378 level = level + 1
379 elif c == '>':
380 level = level - 1
381 if level < 0:
382 self.syntax_error("bogus `>' in DOCTYPE")
383 k = k+1
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000384 res = endbracketfind.match(rawdata, k)
385 if res is None:
Guido van Rossum7e07b381998-04-03 16:02:39 +0000386 return -1
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000387 if endbracket.match(rawdata, k) is None:
Guido van Rossum7e07b381998-04-03 16:02:39 +0000388 self.syntax_error('garbage in DOCTYPE')
389 self.handle_doctype(name, pubid, syslit, None)
390 return res.end(0)
Guido van Rossum02505e41998-01-29 14:55:24 +0000391
392 # Internal -- handle CDATA tag, return length or -1 if not terminated
Guido van Rossuma219efa1997-11-18 15:09:54 +0000393 def parse_cdata(self, i):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000394 rawdata = self.rawdata
395 if rawdata[i:i+9] <> '<![CDATA[':
Guido van Rossum7e07b381998-04-03 16:02:39 +0000396 raise RuntimeError, 'unexpected call to parse_cdata'
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000397 res = cdataclose.search(rawdata, i+9)
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000398 if res is None:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000399 return -1
Guido van Rossum7e07b381998-04-03 16:02:39 +0000400 if illegal.search(rawdata, i+9, res.start(0)):
401 self.syntax_error('illegal character in CDATA')
402 if not self.stack:
403 self.syntax_error('CDATA not in content')
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000404 self.handle_cdata(rawdata[i+9:res.start(0)])
405 return res.end(0)
Guido van Rossuma219efa1997-11-18 15:09:54 +0000406
Guido van Rossum02505e41998-01-29 14:55:24 +0000407 __xml_attributes = {'version': '1.0', 'standalone': 'no', 'encoding': None}
408 # Internal -- handle a processing instruction tag
409 def parse_proc(self, i):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000410 rawdata = self.rawdata
411 end = procclose.search(rawdata, i)
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000412 if end is None:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000413 return -1
414 j = end.start(0)
Guido van Rossum7e07b381998-04-03 16:02:39 +0000415 if illegal.search(rawdata, i+2, j):
416 self.syntax_error('illegal character in processing instruction')
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000417 res = tagfind.match(rawdata, i+2)
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000418 if res is None:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000419 raise RuntimeError, 'unexpected call to parse_proc'
420 k = res.end(0)
421 name = res.group(0)
Guido van Rossum7e07b381998-04-03 16:02:39 +0000422 if string.find(string.lower(name), 'xml') >= 0:
423 self.syntax_error('illegal processing instruction target name')
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000424 self.handle_proc(name, rawdata[k:j])
425 return end.end(0)
Guido van Rossum02505e41998-01-29 14:55:24 +0000426
427 # Internal -- parse attributes between i and j
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000428 def parse_attributes(self, tag, i, j, attributes = None):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000429 rawdata = self.rawdata
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000430 # Now parse the data between i and j into a tag and attrs
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000431 attrdict = {}
432 try:
433 # convert attributes list to dictionary
434 d = {}
435 for a in attributes:
436 d[a] = None
437 attributes = d
438 except TypeError:
439 pass
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000440 while i < j:
441 res = attrfind.match(rawdata, i)
442 if res is None:
443 break
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000444 attrname, attrvalue = res.group('name', 'value')
445 if attrvalue is None:
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000446 self.syntax_error("no value specified for attribute `%s'" % attrname)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000447 attrvalue = attrname
448 elif attrvalue[:1] == "'" == attrvalue[-1:] or \
449 attrvalue[:1] == '"' == attrvalue[-1:]:
450 attrvalue = attrvalue[1:-1]
451 else:
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000452 self.syntax_error("attribute `%s' value not quoted" % attrname)
453 if '<' in attrvalue:
454 self.syntax_error("`<' illegal in attribute value")
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000455 if attributes is not None and not attributes.has_key(attrname):
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000456 self.syntax_error("unknown attribute `%s' of element `%s'" %
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000457 (attrname, tag))
458 if attrdict.has_key(attrname):
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000459 self.syntax_error("attribute `%s' specified twice" % attrname)
Guido van Rossum7e07b381998-04-03 16:02:39 +0000460 attrvalue = string.translate(attrvalue, attrtrans)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000461 attrdict[attrname] = self.translate_references(attrvalue)
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000462 i = res.end(0)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000463 if attributes is not None:
464 # fill in with default attributes
465 for key, val in attributes.items():
466 if val is not None and not attrdict.has_key(key):
467 attrdict[key] = val
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000468 return attrdict, i
Guido van Rossuma219efa1997-11-18 15:09:54 +0000469
470 # Internal -- handle starttag, return length or -1 if not terminated
471 def parse_starttag(self, i):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000472 rawdata = self.rawdata
473 # i points to start of tag
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000474 end = endbracketfind.match(rawdata, i+1)
475 if end is None:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000476 return -1
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000477 tag = starttagmatch.match(rawdata, i)
478 if tag is None or tag.end(0) != end.end(0):
479 self.syntax_error('garbage in starttag')
480 return end.end(0)
481 tagname = tag.group('tagname')
482 if not self.__seen_starttag and self.__seen_doctype and \
483 tagname != self.__seen_doctype:
484 self.syntax_error('starttag does not match DOCTYPE')
Guido van Rossum7e07b381998-04-03 16:02:39 +0000485 if self.__seen_starttag and not self.stack:
486 self.syntax_error('multiple elements on top level')
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000487 if hasattr(self, tagname + '_attributes'):
488 attributes = getattr(self, tagname + '_attributes')
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000489 else:
490 attributes = None
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000491 k, j = tag.span('attrs')
492 attrdict, k = self.parse_attributes(tagname, k, j, attributes)
493 self.finish_starttag(tagname, attrdict)
494 if tag.group('slash') == '/':
495 self.finish_endtag(tagname)
496 return tag.end(0)
Guido van Rossuma219efa1997-11-18 15:09:54 +0000497
498 # Internal -- parse endtag
499 def parse_endtag(self, i):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000500 rawdata = self.rawdata
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000501 end = endbracketfind.match(rawdata, i+1)
502 if end is None:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000503 return -1
504 res = tagfind.match(rawdata, i+2)
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000505 if res is None:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000506 self.syntax_error('no name specified in end tag')
507 tag = ''
508 k = i+2
509 else:
510 tag = res.group(0)
511 k = res.end(0)
Guido van Rossumeeb2f321998-10-19 13:28:26 +0000512 if endbracket.match(rawdata, k) is None:
Guido van Rossum7e07b381998-04-03 16:02:39 +0000513 self.syntax_error('garbage in end tag')
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000514 self.finish_endtag(tag)
515 return end.end(0)
Guido van Rossuma219efa1997-11-18 15:09:54 +0000516
517 # Internal -- finish processing of start tag
518 # Return -1 for unknown tag, 1 for balanced tag
519 def finish_starttag(self, tag, attrs):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000520 self.stack.append(tag)
Guido van Rossum7e07b381998-04-03 16:02:39 +0000521 methodname = 'start_' + tag
522 if hasattr(self, methodname):
523 method = getattr(self, methodname)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000524 self.handle_starttag(tag, method, attrs)
525 return 1
Guido van Rossum7e07b381998-04-03 16:02:39 +0000526 else:
527 self.unknown_starttag(tag, attrs)
528 return -1
Guido van Rossuma219efa1997-11-18 15:09:54 +0000529
530 # Internal -- finish processing of end tag
531 def finish_endtag(self, tag):
Guido van Rossum7e07b381998-04-03 16:02:39 +0000532 methodname = 'end_' + tag
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000533 if not tag:
534 self.syntax_error('name-less end tag')
535 found = len(self.stack) - 1
536 if found < 0:
537 self.unknown_endtag(tag)
538 return
539 else:
540 if tag not in self.stack:
541 self.syntax_error('unopened end tag')
Guido van Rossum7e07b381998-04-03 16:02:39 +0000542 if hasattr(self, methodname):
543 method = getattr(self, methodname)
544 self.handle_endtag(tag, method)
545 else:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000546 self.unknown_endtag(tag)
547 return
548 found = len(self.stack)
549 for i in range(found):
550 if self.stack[i] == tag:
551 found = i
552 while len(self.stack) > found:
553 if found < len(self.stack) - 1:
554 self.syntax_error('missing close tag for %s' % self.stack[-1])
555 tag = self.stack[-1]
Guido van Rossum7e07b381998-04-03 16:02:39 +0000556 if hasattr(self, methodname):
557 method = getattr(self, methodname)
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000558 self.handle_endtag(tag, method)
559 else:
560 self.unknown_endtag(tag)
561 del self.stack[-1]
Guido van Rossuma219efa1997-11-18 15:09:54 +0000562
Guido van Rossum02505e41998-01-29 14:55:24 +0000563 # Overridable -- handle xml processing instruction
564 def handle_xml(self, encoding, standalone):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000565 pass
Guido van Rossum02505e41998-01-29 14:55:24 +0000566
567 # Overridable -- handle DOCTYPE
Guido van Rossum7e07b381998-04-03 16:02:39 +0000568 def handle_doctype(self, tag, pubid, syslit, data):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000569 pass
Guido van Rossum02505e41998-01-29 14:55:24 +0000570
Guido van Rossuma219efa1997-11-18 15:09:54 +0000571 # Overridable -- handle start tag
572 def handle_starttag(self, tag, method, attrs):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000573 method(attrs)
Guido van Rossuma219efa1997-11-18 15:09:54 +0000574
575 # Overridable -- handle end tag
576 def handle_endtag(self, tag, method):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000577 method()
Guido van Rossuma219efa1997-11-18 15:09:54 +0000578
579 # Example -- handle character reference, no need to override
580 def handle_charref(self, name):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000581 try:
582 if name[0] == 'x':
583 n = string.atoi(name[1:], 16)
584 else:
585 n = string.atoi(name)
586 except string.atoi_error:
587 self.unknown_charref(name)
588 return
589 if not 0 <= n <= 255:
590 self.unknown_charref(name)
591 return
592 self.handle_data(chr(n))
Guido van Rossuma219efa1997-11-18 15:09:54 +0000593
594 # Definition of entities -- derived classes may override
Guido van Rossum7e07b381998-04-03 16:02:39 +0000595 entitydefs = {'lt': '&#60;', # must use charref
596 'gt': '&#62;',
597 'amp': '&#38;', # must use charref
598 'quot': '&#34;',
599 'apos': '&#39;',
600 }
Guido van Rossuma219efa1997-11-18 15:09:54 +0000601
602 # Example -- handle entity reference, no need to override
603 def handle_entityref(self, name):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000604 table = self.entitydefs
605 if table.has_key(name):
606 self.handle_data(table[name])
607 else:
608 self.unknown_entityref(name)
609 return
Guido van Rossuma219efa1997-11-18 15:09:54 +0000610
611 # Example -- handle data, should be overridden
612 def handle_data(self, data):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000613 pass
Guido van Rossuma219efa1997-11-18 15:09:54 +0000614
615 # Example -- handle cdata, could be overridden
616 def handle_cdata(self, data):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000617 pass
Guido van Rossuma219efa1997-11-18 15:09:54 +0000618
619 # Example -- handle comment, could be overridden
620 def handle_comment(self, data):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000621 pass
Guido van Rossuma219efa1997-11-18 15:09:54 +0000622
623 # Example -- handle processing instructions, could be overridden
624 def handle_proc(self, name, data):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000625 pass
Guido van Rossuma219efa1997-11-18 15:09:54 +0000626
Guido van Rossuma219efa1997-11-18 15:09:54 +0000627 # Example -- handle relatively harmless syntax errors, could be overridden
Guido van Rossum02505e41998-01-29 14:55:24 +0000628 def syntax_error(self, message):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000629 raise RuntimeError, 'Syntax error at line %d: %s' % (self.lineno, message)
Guido van Rossuma219efa1997-11-18 15:09:54 +0000630
631 # To be overridden -- handlers for unknown objects
632 def unknown_starttag(self, tag, attrs): pass
633 def unknown_endtag(self, tag): pass
634 def unknown_charref(self, ref): pass
635 def unknown_entityref(self, ref): pass
636
637
638class TestXMLParser(XMLParser):
639
640 def __init__(self, verbose=0):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000641 self.testdata = ""
642 XMLParser.__init__(self, verbose)
Guido van Rossuma219efa1997-11-18 15:09:54 +0000643
Guido van Rossum02505e41998-01-29 14:55:24 +0000644 def handle_xml(self, encoding, standalone):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000645 self.flush()
646 print 'xml: encoding =',encoding,'standalone =',standalone
Guido van Rossum02505e41998-01-29 14:55:24 +0000647
Guido van Rossum7e07b381998-04-03 16:02:39 +0000648 def handle_doctype(self, tag, pubid, syslit, data):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000649 self.flush()
650 print 'DOCTYPE:',tag, `data`
Guido van Rossum02505e41998-01-29 14:55:24 +0000651
Guido van Rossum7e07b381998-04-03 16:02:39 +0000652 def handle_entity(self, name, strval, pubid, syslit, ndata):
653 self.flush()
654 print 'ENTITY:',`data`
655
Guido van Rossuma219efa1997-11-18 15:09:54 +0000656 def handle_data(self, data):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000657 self.testdata = self.testdata + data
658 if len(`self.testdata`) >= 70:
659 self.flush()
Guido van Rossuma219efa1997-11-18 15:09:54 +0000660
661 def flush(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000662 data = self.testdata
663 if data:
664 self.testdata = ""
665 print 'data:', `data`
Guido van Rossuma219efa1997-11-18 15:09:54 +0000666
667 def handle_cdata(self, data):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000668 self.flush()
669 print 'cdata:', `data`
Guido van Rossuma219efa1997-11-18 15:09:54 +0000670
671 def handle_proc(self, name, data):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000672 self.flush()
673 print 'processing:',name,`data`
Guido van Rossuma219efa1997-11-18 15:09:54 +0000674
Guido van Rossuma219efa1997-11-18 15:09:54 +0000675 def handle_comment(self, data):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000676 self.flush()
677 r = `data`
678 if len(r) > 68:
679 r = r[:32] + '...' + r[-32:]
680 print 'comment:', r
Guido van Rossuma219efa1997-11-18 15:09:54 +0000681
Guido van Rossum02505e41998-01-29 14:55:24 +0000682 def syntax_error(self, message):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000683 print 'error at line %d:' % self.lineno, message
Guido van Rossuma219efa1997-11-18 15:09:54 +0000684
685 def unknown_starttag(self, tag, attrs):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000686 self.flush()
687 if not attrs:
688 print 'start tag: <' + tag + '>'
689 else:
690 print 'start tag: <' + tag,
691 for name, value in attrs.items():
692 print name + '=' + '"' + value + '"',
693 print '>'
Guido van Rossuma219efa1997-11-18 15:09:54 +0000694
695 def unknown_endtag(self, tag):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000696 self.flush()
697 print 'end tag: </' + tag + '>'
Guido van Rossuma219efa1997-11-18 15:09:54 +0000698
699 def unknown_entityref(self, ref):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000700 self.flush()
701 print '*** unknown entity ref: &' + ref + ';'
Guido van Rossuma219efa1997-11-18 15:09:54 +0000702
703 def unknown_charref(self, ref):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000704 self.flush()
705 print '*** unknown char ref: &#' + ref + ';'
Guido van Rossuma219efa1997-11-18 15:09:54 +0000706
707 def close(self):
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000708 XMLParser.close(self)
709 self.flush()
Guido van Rossuma219efa1997-11-18 15:09:54 +0000710
711def test(args = None):
712 import sys
713
714 if not args:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000715 args = sys.argv[1:]
Guido van Rossuma219efa1997-11-18 15:09:54 +0000716
717 if args and args[0] == '-s':
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000718 args = args[1:]
719 klass = XMLParser
Guido van Rossuma219efa1997-11-18 15:09:54 +0000720 else:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000721 klass = TestXMLParser
Guido van Rossuma219efa1997-11-18 15:09:54 +0000722
723 if args:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000724 file = args[0]
Guido van Rossuma219efa1997-11-18 15:09:54 +0000725 else:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000726 file = 'test.xml'
Guido van Rossuma219efa1997-11-18 15:09:54 +0000727
728 if file == '-':
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000729 f = sys.stdin
Guido van Rossuma219efa1997-11-18 15:09:54 +0000730 else:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000731 try:
732 f = open(file, 'r')
733 except IOError, msg:
734 print file, ":", msg
735 sys.exit(1)
Guido van Rossuma219efa1997-11-18 15:09:54 +0000736
737 data = f.read()
738 if f is not sys.stdin:
Guido van Rossum45e2fbc1998-03-26 21:13:24 +0000739 f.close()
Guido van Rossuma219efa1997-11-18 15:09:54 +0000740
741 x = klass()
Guido van Rossum7e07b381998-04-03 16:02:39 +0000742 try:
743 for c in data:
744 x.feed(c)
745 x.close()
746 except RuntimeError, msg:
747 print msg
748 sys.exit(1)
Guido van Rossuma219efa1997-11-18 15:09:54 +0000749
750
751if __name__ == '__main__':
752 test()