blob: 4e62ae5486ebf0a77f1bd7a0e3b9543d9f3f2124 [file] [log] [blame]
# A parser for XML, using the derived class as static DTD.
# Author: Sjoerd Mullender.
Guido van Rossuma219efa1997-11-18 15:09:54 +00003
import re
import string
6
7
version = '0.1'

# Regular expressions used for parsing.
# The building blocks below are plain pattern strings; everything that is
# compiled is used by XMLParser through its module-level name.

_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
interesting = re.compile('[]&<]')       # characters that end a run of data

amp = re.compile('&')
# A reference of any kind, followed by the character that terminates it
# (normally `;'); the caller checks what that character actually is.
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
# Group `char' includes the terminating character.
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S + '$')
newline = re.compile('\n')

attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>'+_QStr+'|[-a-zA-Z0-9.:+*%?!()_#=~]+))?')
starttagopen = re.compile('<' + _Name)
starttagend = re.compile(_opS + '(?P<slash>/?)>')
starttagmatch = re.compile('<(?P<tagname>'+_Name+')'
                           '(?P<attrs>(?:'+attrfind.pattern+')*)'+
                           starttagend.pattern)
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
# Everything up to the next `>' that is not inside a quoted string.
endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
_SystemLiteral = '(?P<%s>'+_QStr+')'
_PublicLiteral = '(?P<%s>"[-\'()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                        "'[-()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
                 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
              ')'+_S+_SystemLiteral%'syslit'
doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
                     '(?:'+_S+_ExternalId+')?'+_opS)
# Raw strings are used for the `\?' pieces so that the backslash reaches
# the regular expression compiler untouched.
xmldecl = re.compile(r'<\?xml'+_S+
                     'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
                     '(?:'+_S+'encoding'+_opS+'='+_opS+
                        "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                        '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:'+_S+'standalone'+_opS+'='+_opS+
                        '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
                     _opS+r'\?>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
# Translation table that turns every white space character of an attribute
# value into a plain space.  Both arguments must have the same length, one
# replacement per source character.  string.maketrans() is used where it
# exists; otherwise str.maketrans() builds the equivalent table.
_maketrans = getattr(string, 'maketrans', None) or str.maketrans
attrtrans = _maketrans(' \r\n\t', '    ')
Guido van Rossuma219efa1997-11-18 15:09:54 +000066
67
# XML parser base class -- find tags and call handler functions.
# Usage: p = XMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods with
# special names to handle tags: start_foo and end_foo to handle <foo>
# and </foo>, respectively.  The data between tags is passed to the
# parser by calling self.handle_data() with some data as argument (the
# data may be split up in arbitrary chunks).  Entity references are
# passed by calling self.handle_entityref() with the entity reference
# as argument.
Guido van Rossuma219efa1997-11-18 15:09:54 +000077
class XMLParser:
    """Incremental XML parser that reports what it finds through methods.

    Text is handed over with feed() and the end of input is signalled
    with close().  A derived class supplies start_<tag>() / end_<tag>()
    methods (and optionally a <tag>_attributes attribute) for the
    elements it knows, and may override the handle_*() and unknown_*()
    methods.  Problems in the input are reported through syntax_error(),
    which raises RuntimeError unless it is overridden.
    """

    # Interface -- initialize and reset this instance
    def __init__(self, verbose=0):
        self.verbose = verbose
        self.reset()

    # Interface -- reset this instance.  Loses all unprocessed data
    def reset(self):
        self.rawdata = ''               # text fed but not yet consumed
        self.stack = []                 # names of the open elements
        self.nomoretags = 0
        self.literal = 0
        self.lineno = 1                 # line number used in error messages
        self.__at_start = 1             # nothing has been consumed yet
        self.__seen_doctype = None      # name given in <!DOCTYPE>, if any
        self.__seen_starttag = 0

    # For derived classes only -- enter literal mode (CDATA) till EOF
    def setnomoretags(self):
        self.nomoretags = self.literal = 1

    # For derived classes only -- enter literal mode (CDATA)
    def setliteral(self, *args):
        self.literal = 1

    # Interface -- feed some data to the parser.  Call this as
    # often as you want, with as little or as much text as you
    # want (may include '\n').  (This just saves the text, all the
    # processing is done by goahead().)
    def feed(self, data):
        self.rawdata = self.rawdata + data
        self.goahead(0)

    # Interface -- handle the remaining data
    def close(self):
        self.goahead(1)

    # Interface -- translate references
    def translate_references(self, data, all = 1):
        """Return data with its references replaced.

        Character references are always replaced.  Entity references
        are replaced from self.entitydefs only when `all' is true; the
        replacement text is scanned again so that references inside it
        are translated too.
        """
        i = 0
        while 1:
            res = amp.search(data, i)
            if res is None:
                return data
            s = res.start(0)            # position of the `&'
            res = ref.match(data, s)
            if res is None:
                self.syntax_error("bogus `&'")
                # Continue just past this `&' so it is reported once
                # instead of once per character that precedes it.
                i = s + 1
                continue
            i = res.end(0)
            if data[i - 1] != ';':
                self.syntax_error("`;' missing after entity/char reference")
                # the terminating character is ordinary data, keep it
                i = i - 1
            name = res.group(1)
            pre = data[:s]
            post = data[i:]
            if name[0] == '#':
                if name[1] == 'x':
                    char = chr(int(name[2:], 16))
                else:
                    char = chr(int(name[1:]))
                data = pre + char + post
                i = s + len(char)
            elif all:
                if name in self.entitydefs:
                    data = pre + self.entitydefs[name] + post
                    i = s               # rescan substituted text
                else:
                    self.syntax_error('reference to unknown entity')
                    # can't do it, so keep the entity ref in
                    data = pre + '&' + name + ';' + post
                    i = s + len(name) + 2
            else:
                # just translating character references
                pass                    # i is already positioned correctly

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if i > 0:
                self.__at_start = 0
            if self.nomoretags:
                data = rawdata[i:n]
                self.handle_data(data)
                self.lineno = self.lineno + data.count('\n')
                i = n
                break
            res = interesting.search(rawdata, i)
            if res:
                j = res.start(0)
            else:
                j = n
            if i < j:
                # plain character data up to the next `<', `&' or `]'
                if self.__at_start:
                    self.syntax_error('illegal data at start of file')
                self.__at_start = 0
                data = rawdata[i:j]
                if not self.stack and space.match(data) is None:
                    self.syntax_error('data not in content')
                if illegal.search(data):
                    self.syntax_error('illegal character in content')
                self.handle_data(data)
                self.lineno = self.lineno + data.count('\n')
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0: break
                    self.__seen_starttag = 1
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if endtagopen.match(rawdata, i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if commentopen.match(rawdata, i):
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    k = self.parse_comment(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if cdataopen.match(rawdata, i):
                    k = self.parse_cdata(i)
                    if k < 0: break
                    # count the newlines of the whole section, i.e. i..k
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                res = xmldecl.match(rawdata, i)
                if res:
                    if not self.__at_start:
                        self.syntax_error("<?xml?> declaration not at start of document")
                    version, encoding, standalone = res.group('version',
                                                              'encoding',
                                                              'standalone')
                    if version[1:-1] != '1.0':
                        raise RuntimeError('only XML version 1.0 supported')
                    if encoding: encoding = encoding[1:-1]
                    if standalone: standalone = standalone[1:-1]
                    self.handle_xml(encoding, standalone)
                    # the declaration may contain newlines between its parts
                    self.lineno = self.lineno + res.group(0).count('\n')
                    i = res.end(0)
                    continue
                res = procopen.match(rawdata, i)
                if res:
                    k = self.parse_proc(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                res = doctype.match(rawdata, i)
                if res:
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    if self.__seen_doctype:
                        self.syntax_error('multiple DOCTYPE elements')
                    if self.__seen_starttag:
                        self.syntax_error('DOCTYPE not at beginning of document')
                    k = self.parse_doctype(res)
                    if k < 0: break
                    self.__seen_doctype = res.group('name')
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
            elif rawdata[i] == '&':
                res = charref.match(rawdata, i)
                if res is not None:
                    i = res.end(0)
                    if rawdata[i-1] != ';':
                        self.syntax_error("`;' missing in charref")
                        i = i-1
                    if not self.stack:
                        self.syntax_error('data not in content')
                    # group `char' includes the terminator; strip it
                    self.handle_charref(res.group('char')[:-1])
                    self.lineno = self.lineno + res.group(0).count('\n')
                    continue
                res = entityref.match(rawdata, i)
                if res is not None:
                    i = res.end(0)
                    if rawdata[i-1] != ';':
                        self.syntax_error("`;' missing in entityref")
                        i = i-1
                    name = res.group('name')
                    if name in self.entitydefs:
                        # splice the replacement text into the buffer and
                        # parse it as if it had been part of the input
                        self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
                        n = len(rawdata)
                        i = res.start(0)
                    else:
                        self.syntax_error('reference to unknown entity')
                        self.unknown_entityref(name)
                    self.lineno = self.lineno + res.group(0).count('\n')
                    continue
            elif rawdata[i] == ']':
                if n-i < 3:
                    # too short to decide whether this starts `]]>'
                    break
                if cdataclose.match(rawdata, i):
                    self.syntax_error("bogus `]]>'")
                self.handle_data(rawdata[i])
                i = i+1
                continue
            else:
                raise RuntimeError('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            break
        # end while
        if i > 0:
            self.__at_start = 0
        if end and i < n:
            # At EOF the pending construct can never be completed: pass
            # its first character on as data and try again with the rest.
            data = rawdata[i]
            self.syntax_error("bogus `%s'" % data)
            if illegal.search(data):
                self.syntax_error('illegal character in content')
            self.handle_data(data)
            self.lineno = self.lineno + data.count('\n')
            self.rawdata = rawdata[i+1:]
            return self.goahead(end)
        self.rawdata = rawdata[i:]
        if end:
            if not self.__seen_starttag:
                self.syntax_error('no elements in file')
            if self.stack:
                self.syntax_error('missing end tags')
                while self.stack:
                    self.finish_endtag(self.stack[-1])

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            raise RuntimeError('unexpected call to parse_comment')
        res = commentclose.search(rawdata, i+4)
        if res is None:
            return -1
        if doubledash.search(rawdata, i+4, res.start(0)):
            self.syntax_error("`--' inside comment")
        if rawdata[res.start(0)-1] == '-':
            self.syntax_error('comment cannot end in three dashes')
        if illegal.search(rawdata, i+4, res.start(0)):
            self.syntax_error('illegal character in comment')
        self.handle_comment(rawdata[i+4: res.start(0)])
        return res.end(0)

    # Internal -- handle DOCTYPE tag, return length or -1 if not terminated
    def parse_doctype(self, res):
        # res is the match object of the `doctype' expression
        rawdata = self.rawdata
        n = len(rawdata)
        name = res.group('name')
        pubid, syslit = res.group('pubid', 'syslit')
        if pubid is not None:
            pubid = pubid[1:-1]                 # remove quotes
            pubid = ' '.join(pubid.split())     # normalize
        if syslit is not None: syslit = syslit[1:-1] # remove quotes
        j = k = res.end(0)
        if k >= n:
            return -1
        if rawdata[k] == '[':
            # Internal subset: look for the closing `]' that is neither
            # inside a quoted string nor inside a nested <...> construct.
            level = 0
            k = k+1
            dq = sq = 0
            while k < n:
                c = rawdata[k]
                if not sq and c == '"':
                    dq = not dq
                elif not dq and c == "'":
                    sq = not sq
                elif sq or dq:
                    pass
                elif level <= 0 and c == ']':
                    res = endbracket.match(rawdata, k+1)
                    if res is None:
                        return -1
                    self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
                    return res.end(0)
                elif c == '<':
                    level = level + 1
                elif c == '>':
                    level = level - 1
                    if level < 0:
                        self.syntax_error("bogus `>' in DOCTYPE")
                k = k+1
        res = endbracketfind.match(rawdata, k)
        if res is None:
            return -1
        if endbracket.match(rawdata, k) is None:
            self.syntax_error('garbage in DOCTYPE')
        self.handle_doctype(name, pubid, syslit, None)
        return res.end(0)

    # Internal -- handle CDATA tag, return length or -1 if not terminated
    def parse_cdata(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+9] != '<![CDATA[':
            raise RuntimeError('unexpected call to parse_cdata')
        res = cdataclose.search(rawdata, i+9)
        if res is None:
            return -1
        if illegal.search(rawdata, i+9, res.start(0)):
            self.syntax_error('illegal character in CDATA')
        if not self.stack:
            self.syntax_error('CDATA not in content')
        self.handle_cdata(rawdata[i+9:res.start(0)])
        return res.end(0)

    __xml_attributes = {'version': '1.0', 'standalone': 'no', 'encoding': None}
    # Internal -- handle a processing instruction tag
    def parse_proc(self, i):
        rawdata = self.rawdata
        end = procclose.search(rawdata, i)
        if end is None:
            return -1
        j = end.start(0)
        if illegal.search(rawdata, i+2, j):
            self.syntax_error('illegal character in processing instruction')
        res = tagfind.match(rawdata, i+2)
        if res is None:
            raise RuntimeError('unexpected call to parse_proc')
        k = res.end(0)
        name = res.group(0)
        if name.lower().find('xml') >= 0:
            self.syntax_error('illegal processing instruction target name')
        self.handle_proc(name, rawdata[k:j])
        return end.end(0)

    # Internal -- parse attributes between i and j
    def parse_attributes(self, tag, i, j, attributes = None):
        """Parse the attributes in self.rawdata[i:j] of element `tag'.

        `attributes' is None (accept anything), a sequence of allowed
        attribute names, or a dictionary mapping allowed names to their
        default value (None for no default).  Returns the pair
        (attrdict, position just after the last attribute parsed).
        """
        rawdata = self.rawdata
        attrdict = {}
        if attributes is not None and not hasattr(attributes, 'items'):
            # convert attributes list to dictionary; a dictionary is
            # left alone so that its default values are not lost
            d = {}
            for a in attributes:
                d[a] = None
            attributes = d
        while i < j:
            res = attrfind.match(rawdata, i)
            if res is None:
                break
            attrname, attrvalue = res.group('name', 'value')
            if attrvalue is None:
                self.syntax_error("no value specified for attribute `%s'" % attrname)
                attrvalue = attrname
            elif attrvalue[:1] == "'" == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            else:
                self.syntax_error("attribute `%s' value not quoted" % attrname)
            if '<' in attrvalue:
                self.syntax_error("`<' illegal in attribute value")
            if attributes is not None and attrname not in attributes:
                self.syntax_error("unknown attribute `%s' of element `%s'" %
                                  (attrname, tag))
            if attrname in attrdict:
                self.syntax_error("attribute `%s' specified twice" % attrname)
            # white space in the value becomes plain spaces
            attrvalue = attrvalue.translate(attrtrans)
            attrdict[attrname] = self.translate_references(attrvalue)
            i = res.end(0)
        if attributes is not None:
            # fill in with default attributes
            for key, val in attributes.items():
                if val is not None and key not in attrdict:
                    attrdict[key] = val
        return attrdict, i

    # Internal -- handle starttag, return length or -1 if not terminated
    def parse_starttag(self, i):
        rawdata = self.rawdata
        # i points to start of tag
        end = endbracketfind.match(rawdata, i+1)
        if end is None:
            return -1
        tag = starttagmatch.match(rawdata, i)
        if tag is None or tag.end(0) != end.end(0):
            self.syntax_error('garbage in starttag')
            return end.end(0)
        tagname = tag.group('tagname')
        if not self.__seen_starttag and self.__seen_doctype and \
           tagname != self.__seen_doctype:
            self.syntax_error('starttag does not match DOCTYPE')
        if self.__seen_starttag and not self.stack:
            self.syntax_error('multiple elements on top level')
        if hasattr(self, tagname + '_attributes'):
            attributes = getattr(self, tagname + '_attributes')
        else:
            attributes = None
        k, j = tag.span('attrs')
        attrdict, k = self.parse_attributes(tagname, k, j, attributes)
        self.finish_starttag(tagname, attrdict)
        if tag.group('slash') == '/':
            # empty element: <tag/>
            self.finish_endtag(tagname)
        return tag.end(0)

    # Internal -- parse endtag
    def parse_endtag(self, i):
        rawdata = self.rawdata
        end = endbracketfind.match(rawdata, i+1)
        if end is None:
            return -1
        res = tagfind.match(rawdata, i+2)
        if res is None:
            if self.literal:
                self.handle_data(rawdata[i])
                return i+1
            self.syntax_error('no name specified in end tag')
            tag = ''
            k = i+2
        else:
            tag = res.group(0)
            if self.literal:
                # in literal mode only the end tag of the innermost
                # open element is recognized; it ends literal mode
                if not self.stack or tag != self.stack[-1]:
                    self.handle_data(rawdata[i])
                    return i+1
                self.literal = 0
            k = res.end(0)
        if endbracket.match(rawdata, k) is None:
            self.syntax_error('garbage in end tag')
        self.finish_endtag(tag)
        return end.end(0)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        self.stack.append(tag)
        methodname = 'start_' + tag
        if hasattr(self, methodname):
            method = getattr(self, methodname)
            self.handle_starttag(tag, method, attrs)
            return 1
        else:
            self.unknown_starttag(tag, attrs)
            return -1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        if not tag:
            self.syntax_error('name-less end tag')
            # close the innermost open element, if there is one
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                self.syntax_error('unopened end tag')
                methodname = 'end_' + tag
                if hasattr(self, methodname):
                    method = getattr(self, methodname)
                    self.handle_endtag(tag, method)
                else:
                    self.unknown_endtag(tag)
                return
            # index of the innermost open element with this name
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        while len(self.stack) > found:
            if found < len(self.stack) - 1:
                self.syntax_error('missing close tag for %s' % self.stack[-1])
            tag = self.stack[-1]
            # look the handler up for the element actually being closed,
            # which differs from the requested one for implied end tags
            methodname = 'end_' + tag
            if hasattr(self, methodname):
                method = getattr(self, methodname)
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle xml processing instruction
    def handle_xml(self, encoding, standalone):
        pass

    # Overridable -- handle DOCTYPE
    def handle_doctype(self, tag, pubid, syslit, data):
        pass

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- handle character reference, no need to override
    def handle_charref(self, name):
        # name is the reference without `&#' and `;', e.g. '65' or 'x41'
        try:
            if name[0] == 'x':
                n = int(name[1:], 16)
            else:
                n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = {'lt': '&#60;',        # must use charref
                  'gt': '&#62;',
                  'amp': '&#38;',       # must use charref
                  'quot': '&#34;',
                  'apos': '&#39;',
                  }

    # Example -- handle entity reference, no need to override
    def handle_entityref(self, name):
        table = self.entitydefs
        if name in table:
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)
            return

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle cdata, could be overridden
    def handle_cdata(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle processing instructions, could be overridden
    def handle_proc(self, name, data):
        pass

    # Example -- handle relatively harmless syntax errors, could be overridden
    def syntax_error(self, message):
        raise RuntimeError('Syntax error at line %d: %s' % (self.lineno, message))

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
643
644
class TestXMLParser(XMLParser):
    """Parser that prints every event it is told about; used by test().

    Character data is collected in self.testdata and printed by flush()
    once its repr() has grown to 70 characters or more, or when another
    kind of event is reported.
    """

    def __init__(self, verbose=0):
        self.testdata = ""
        XMLParser.__init__(self, verbose)

    def handle_xml(self, encoding, standalone):
        self.flush()
        print('xml: encoding = %s standalone = %s' % (encoding, standalone))

    def handle_doctype(self, tag, pubid, syslit, data):
        self.flush()
        print('DOCTYPE: %s %s' % (tag, repr(data)))

    def handle_entity(self, name, strval, pubid, syslit, ndata):
        self.flush()
        # report the entity name and its replacement text
        print('ENTITY: %s %s' % (name, repr(strval)))

    def handle_data(self, data):
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    # Print the character data collected so far, if any
    def flush(self):
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: %s' % repr(data))

    def handle_cdata(self, data):
        self.flush()
        print('cdata: %s' % repr(data))

    def handle_proc(self, name, data):
        self.flush()
        print('processing: %s %s' % (name, repr(data)))

    def handle_comment(self, data):
        self.flush()
        r = repr(data)
        if len(r) > 68:
            # long comment: show only its beginning and its end
            r = r[:32] + '...' + r[-32:]
        print('comment: %s' % r)

    def syntax_error(self, message):
        # report the problem and carry on instead of raising
        print('error at line %d: %s' % (self.lineno, message))

    def unknown_starttag(self, tag, attrs):
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            parts = ['start tag: <' + tag]
            for name, value in attrs.items():
                parts.append(name + '=' + '"' + value + '"')
            parts.append('>')
            print(' '.join(parts))

    def unknown_endtag(self, tag):
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def close(self):
        XMLParser.close(self)
        self.flush()
Guido van Rossuma219efa1997-11-18 15:09:54 +0000717
def test(args = None):
    """Parse one file and report what the parser finds.

    `args' defaults to the command line arguments.  A leading '-s'
    selects the silent XMLParser instead of TestXMLParser.  The next
    argument is the file to parse ('test.xml' when absent, '-' for
    standard input).  Exits with status 1 when the file cannot be
    opened or the parser raises RuntimeError.
    """
    import sys

    if not args:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = XMLParser
    else:
        klass = TestXMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.xml'

    if filename == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError:
            print('%s : %s' % (filename, sys.exc_info()[1]))
            sys.exit(1)
        try:
            data = f.read()
        finally:
            # close the file even when reading fails
            f.close()

    x = klass()
    try:
        # feed one character at a time to exercise incremental parsing
        for c in data:
            x.feed(c)
        x.close()
    except RuntimeError:
        print(sys.exc_info()[1])
        sys.exit(1)
757
# Run the self-test when executed as a script
if __name__ == '__main__':
    test()