blob: bea210b6fc24cdfa214686f8cf56de3bf7a197eb [file] [log] [blame]
Guido van Rossuma219efa1997-11-18 15:09:54 +00001# A parser for XML, using the derived class as static DTD.
Guido van Rossum5d68e8e1997-11-18 15:27:20 +00002# Author: Sjoerd Mullender.
Guido van Rossuma219efa1997-11-18 15:09:54 +00003
4import re
5import string
6
7
Guido van Rossum7e07b381998-04-03 16:02:39 +00008version = '0.1'
9
Guido van Rossuma219efa1997-11-18 15:09:54 +000010# Regular expressions used for parsing
11
Guido van Rossumeeb2f321998-10-19 13:28:26 +000012_S = '[ \t\r\n]+' # white space
13_opS = '[ \t\r\n]*' # optional white space
14_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*' # valid XML name
15_QStr = "(?:'[^']*'|\"[^\"]*\")" # quoted XML string
Guido van Rossum7e07b381998-04-03 16:02:39 +000016illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
17interesting = re.compile('[]&<]')
Guido van Rossuma219efa1997-11-18 15:09:54 +000018
Guido van Rossum7e07b381998-04-03 16:02:39 +000019amp = re.compile('&')
20ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
Guido van Rossuma219efa1997-11-18 15:09:54 +000021entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
22charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
Guido van Rossum7e07b381998-04-03 16:02:39 +000023space = re.compile(_S + '$')
Guido van Rossuma219efa1997-11-18 15:09:54 +000024newline = re.compile('\n')
25
Guido van Rossumeeb2f321998-10-19 13:28:26 +000026attrfind = re.compile(
27 _S + '(?P<name>' + _Name + ')'
28 '(' + _opS + '=' + _opS +
29 '(?P<value>'+_QStr+'|[-a-zA-Z0-9.:+*%?!()_#=~]+))?')
Guido van Rossuma219efa1997-11-18 15:09:54 +000030starttagopen = re.compile('<' + _Name)
Guido van Rossuma219efa1997-11-18 15:09:54 +000031starttagend = re.compile(_opS + '(?P<slash>/?)>')
Guido van Rossumeeb2f321998-10-19 13:28:26 +000032starttagmatch = re.compile('<(?P<tagname>'+_Name+')'
33 '(?P<attrs>(?:'+attrfind.pattern+')*)'+
34 starttagend.pattern)
35endtagopen = re.compile('</')
Guido van Rossum7e07b381998-04-03 16:02:39 +000036endbracket = re.compile(_opS + '>')
Guido van Rossumeeb2f321998-10-19 13:28:26 +000037endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')
Guido van Rossuma219efa1997-11-18 15:09:54 +000038tagfind = re.compile(_Name)
Guido van Rossum02505e41998-01-29 14:55:24 +000039cdataopen = re.compile(r'<!\[CDATA\[')
40cdataclose = re.compile(r'\]\]>')
Guido van Rossum7e07b381998-04-03 16:02:39 +000041# this matches one of the following:
42# SYSTEM SystemLiteral
43# PUBLIC PubidLiteral SystemLiteral
Guido van Rossumeeb2f321998-10-19 13:28:26 +000044_SystemLiteral = '(?P<%s>'+_QStr+')'
Guido van Rossum7e07b381998-04-03 16:02:39 +000045_PublicLiteral = '(?P<%s>"[-\'()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
46 "'[-()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
47_ExternalId = '(?:SYSTEM|' \
48 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
49 ')'+_S+_SystemLiteral%'syslit'
50doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
51 '(?:'+_S+_ExternalId+')?'+_opS)
52xmldecl = re.compile('<\?xml'+_S+
Guido van Rossumeeb2f321998-10-19 13:28:26 +000053 'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
Guido van Rossum7e07b381998-04-03 16:02:39 +000054 '(?:'+_S+'encoding'+_opS+'='+_opS+
55 "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
56 '"[A-Za-z][-A-Za-z0-9._]*"))?'
57 '(?:'+_S+'standalone'+_opS+'='+_opS+
58 '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
59 _opS+'\?>')
60procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
Guido van Rossum02505e41998-01-29 14:55:24 +000061procclose = re.compile(_opS + r'\?>')
Guido van Rossuma219efa1997-11-18 15:09:54 +000062commentopen = re.compile('<!--')
63commentclose = re.compile('-->')
64doubledash = re.compile('--')
Guido van Rossum7e07b381998-04-03 16:02:39 +000065attrtrans = string.maketrans(' \r\n\t', ' ')
Guido van Rossuma219efa1997-11-18 15:09:54 +000066
67
# XML parser base class -- find tags and call handler functions.
# Usage: p = XMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods with
# special names to handle tags: start_foo and end_foo to handle <foo>
# and </foo>, respectively.  The data between tags is passed to the
# parser by calling self.handle_data() with some data as argument (the
# data may be split up in arbitrary chunks).  Entity references that
# are defined in self.entitydefs are expanded in place; unknown ones
# are reported by calling self.unknown_entityref() with the name.
Guido van Rossuma219efa1997-11-18 15:09:54 +000077
class XMLParser:
    """Incremental XML parser that dispatches to handler methods.

    Text is supplied with feed() (any number of times, in chunks of any
    size) and parsing is finished with close().  A derived class acts as
    the "DTD": for an element <foo> the parser calls start_foo(attrs) and
    end_foo() when those methods exist and unknown_starttag() /
    unknown_endtag() otherwise.  An optional attribute foo_attributes
    (a sequence of names, or a dictionary mapping names to default values)
    lists the attributes allowed on <foo>.

    Recoverable problems are reported through syntax_error(), which raises
    RuntimeError unless it is overridden.
    """

    # Interface -- initialize and reset this instance
    def __init__(self, verbose=0):
        self.verbose = verbose
        self.reset()

    # Interface -- reset this instance.  Loses all unprocessed data
    def reset(self):
        self.rawdata = ''           # text fed but not yet consumed
        self.stack = []             # names of the currently open elements
        self.nomoretags = 0         # true: rest of input is plain data
        self.literal = 0            # true: markup is passed on as data
        self.lineno = 1             # current line, used in error messages
        self.__at_start = 1         # nothing has been consumed yet
        self.__seen_doctype = None  # name given in the DOCTYPE, if any
        self.__seen_starttag = 0    # a start tag has been parsed

    # For derived classes only -- enter literal mode (CDATA) till EOF
    def setnomoretags(self):
        self.nomoretags = self.literal = 1

    # For derived classes only -- enter literal mode (CDATA)
    def setliteral(self, *args):
        self.literal = 1

    # Interface -- feed some data to the parser.  Call this as
    # often as you want, with as little or as much text as you
    # want (may include '\n').  (This just saves the text, all the
    # processing is done by goahead().)
    def feed(self, data):
        self.rawdata = self.rawdata + data
        self.goahead(0)

    # Interface -- handle the remaining data
    def close(self):
        self.goahead(1)

    # Interface -- translate references
    def translate_references(self, data, all = 1):
        """Return data with character (and entity) references expanded.

        Character references are always replaced.  Entity references are
        replaced from self.entitydefs only when `all' is true; with `all'
        false they are left untouched.  Problems are reported through
        syntax_error().
        """
        i = 0
        while 1:
            res = amp.search(data, i)
            if res is None:
                return data
            ampersand = res.start(0)
            res = ref.match(data, ampersand)
            if res is None:
                self.syntax_error("bogus `&'")
                # Resume just past this ampersand so that it is reported
                # once instead of once per preceding character.
                i = ampersand + 1
                continue
            i = res.end(0)
            if data[i - 1] != ';':
                self.syntax_error("`;' missing after entity/char reference")
                # the character after the reference is not part of it
                i = i-1
            name = res.group(1)
            pre = data[:res.start(0)]
            post = data[i:]
            if name[0] == '#':
                if name[1] == 'x':
                    char = chr(string.atoi(name[2:], 16))
                else:
                    char = chr(string.atoi(name[1:]))
                data = pre + char + post
                i = res.start(0)+len(char)
            elif all:
                if self.entitydefs.has_key(name):
                    data = pre + self.entitydefs[name] + post
                    i = res.start(0)    # rescan substituted text
                else:
                    self.syntax_error('reference to unknown entity')
                    # can't do it, so keep the entity ref in
                    data = pre + '&' + name + ';' + post
                    i = res.start(0) + len(name) + 2
            else:
                # just translating character references
                pass                    # i is already positioned correctly

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if i > 0:
                self.__at_start = 0
            if self.nomoretags:
                data = rawdata[i:n]
                self.handle_data(data)
                self.lineno = self.lineno + string.count(data, '\n')
                i = n
                break
            res = interesting.search(rawdata, i)
            if res:
                j = res.start(0)
            else:
                j = n
            if i < j:
                # plain character data up to the next '<', '&' or ']'
                if self.__at_start:
                    self.syntax_error('illegal data at start of file')
                self.__at_start = 0
                data = rawdata[i:j]
                if not self.stack and space.match(data) is None:
                    self.syntax_error('data not in content')
                if illegal.search(data):
                    self.syntax_error('illegal character in content')
                self.handle_data(data)
                self.lineno = self.lineno + string.count(data, '\n')
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + string.count(data, '\n')
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0: break
                    self.__seen_starttag = 1
                    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
                    i = k
                    continue
                if endtagopen.match(rawdata, i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
                    i = k
                    continue
                if commentopen.match(rawdata, i):
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + string.count(data, '\n')
                        i = i+1
                        continue
                    k = self.parse_comment(i)
                    if k < 0: break
                    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
                    i = k
                    continue
                if cdataopen.match(rawdata, i):
                    k = self.parse_cdata(i)
                    if k < 0: break
                    # count the newlines of the whole section just consumed
                    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
                    i = k
                    continue
                res = xmldecl.match(rawdata, i)
                if res:
                    if not self.__at_start:
                        self.syntax_error("<?xml?> declaration not at start of document")
                    version, encoding, standalone = res.group('version',
                                                              'encoding',
                                                              'standalone')
                    # the groups still carry their quotes
                    if version[1:-1] != '1.0':
                        raise RuntimeError('only XML version 1.0 supported')
                    if encoding: encoding = encoding[1:-1]
                    if standalone: standalone = standalone[1:-1]
                    self.handle_xml(encoding, standalone)
                    # the declaration may span several lines
                    self.lineno = self.lineno + string.count(res.group(0), '\n')
                    i = res.end(0)
                    continue
                res = procopen.match(rawdata, i)
                if res:
                    k = self.parse_proc(i)
                    if k < 0: break
                    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
                    i = k
                    continue
                res = doctype.match(rawdata, i)
                if res:
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + string.count(data, '\n')
                        i = i+1
                        continue
                    if self.__seen_doctype:
                        self.syntax_error('multiple DOCTYPE elements')
                    if self.__seen_starttag:
                        self.syntax_error('DOCTYPE not at beginning of document')
                    k = self.parse_doctype(res)
                    if k < 0: break
                    self.__seen_doctype = res.group('name')
                    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    data = rawdata[i]
                    self.handle_data(data)
                    i = i+1
                    continue
                res = charref.match(rawdata, i)
                if res is not None:
                    i = res.end(0)
                    if rawdata[i-1] != ';':
                        self.syntax_error("`;' missing in charref")
                        i = i-1
                    if not self.stack:
                        self.syntax_error('data not in content')
                    # the 'char' group includes the terminating character
                    self.handle_charref(res.group('char')[:-1])
                    self.lineno = self.lineno + string.count(res.group(0), '\n')
                    continue
                res = entityref.match(rawdata, i)
                if res is not None:
                    i = res.end(0)
                    if rawdata[i-1] != ';':
                        self.syntax_error("`;' missing in entityref")
                        i = i-1
                    name = res.group('name')
                    if self.entitydefs.has_key(name):
                        # splice the replacement text into the buffer and
                        # parse it as if it had been part of the input
                        self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
                        n = len(rawdata)
                        i = res.start(0)
                    else:
                        self.syntax_error('reference to unknown entity')
                        self.unknown_entityref(name)
                    self.lineno = self.lineno + string.count(res.group(0), '\n')
                    continue
            elif rawdata[i] == ']':
                if self.literal:
                    data = rawdata[i]
                    self.handle_data(data)
                    i = i+1
                    continue
                if n-i < 3:
                    # not enough text yet to decide whether this is ']]>'
                    break
                if cdataclose.match(rawdata, i):
                    self.syntax_error("bogus `]]>'")
                self.handle_data(rawdata[i])
                i = i+1
                continue
            else:
                raise RuntimeError('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            break
        # end while
        if i > 0:
            self.__at_start = 0
        if end and i < n:
            # At EOF the pending character can never become valid markup:
            # report it, pass it on as data and parse the remainder.
            data = rawdata[i]
            self.syntax_error("bogus `%s'" % data)
            if illegal.search(data):
                self.syntax_error('illegal character in content')
            self.handle_data(data)
            self.lineno = self.lineno + string.count(data, '\n')
            self.rawdata = rawdata[i+1:]
            return self.goahead(end)
        self.rawdata = rawdata[i:]
        if end:
            if not self.__seen_starttag:
                self.syntax_error('no elements in file')
            if self.stack:
                self.syntax_error('missing end tags')
                while self.stack:
                    self.finish_endtag(self.stack[-1])

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            raise RuntimeError('unexpected call to parse_comment')
        res = commentclose.search(rawdata, i+4)
        if res is None:
            return -1
        if doubledash.search(rawdata, i+4, res.start(0)):
            self.syntax_error("`--' inside comment")
        # Only look at the character before '-->' when the comment has a
        # body; for the empty comment '<!---->' that character belongs to
        # the opening '<!--'.
        if res.start(0) > i+4 and rawdata[res.start(0)-1] == '-':
            self.syntax_error('comment cannot end in three dashes')
        if illegal.search(rawdata, i+4, res.start(0)):
            self.syntax_error('illegal character in comment')
        self.handle_comment(rawdata[i+4: res.start(0)])
        return res.end(0)

    # Internal -- handle DOCTYPE tag, return length or -1 if not terminated
    def parse_doctype(self, res):
        # res is the match of the module-level `doctype' pattern
        rawdata = self.rawdata
        n = len(rawdata)
        name = res.group('name')
        pubid, syslit = res.group('pubid', 'syslit')
        if pubid is not None:
            pubid = pubid[1:-1]                     # remove quotes
            pubid = string.join(string.split(pubid)) # normalize
        if syslit is not None: syslit = syslit[1:-1] # remove quotes
        j = k = res.end(0)
        if k >= n:
            return -1
        if rawdata[k] == '[':
            # Internal subset: scan for the matching ']', skipping quoted
            # strings and nested '<...>' declarations.
            level = 0
            k = k+1
            dq = sq = 0
            while k < n:
                c = rawdata[k]
                if not sq and c == '"':
                    dq = not dq
                elif not dq and c == "'":
                    sq = not sq
                elif sq or dq:
                    pass
                elif level <= 0 and c == ']':
                    res = endbracket.match(rawdata, k+1)
                    if res is None:
                        return -1
                    self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
                    return res.end(0)
                elif c == '<':
                    level = level + 1
                elif c == '>':
                    level = level - 1
                    if level < 0:
                        self.syntax_error("bogus `>' in DOCTYPE")
                k = k+1
        res = endbracketfind.match(rawdata, k)
        if res is None:
            return -1
        if endbracket.match(rawdata, k) is None:
            self.syntax_error('garbage in DOCTYPE')
        self.handle_doctype(name, pubid, syslit, None)
        return res.end(0)

    # Internal -- handle CDATA tag, return length or -1 if not terminated
    def parse_cdata(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+9] != '<![CDATA[':
            raise RuntimeError('unexpected call to parse_cdata')
        res = cdataclose.search(rawdata, i+9)
        if res is None:
            return -1
        if illegal.search(rawdata, i+9, res.start(0)):
            self.syntax_error('illegal character in CDATA')
        if not self.stack:
            self.syntax_error('CDATA not in content')
        self.handle_cdata(rawdata[i+9:res.start(0)])
        return res.end(0)

    __xml_attributes = {'version': '1.0', 'standalone': 'no', 'encoding': None}
    # Internal -- handle a processing instruction tag, return length or
    # -1 if not terminated
    def parse_proc(self, i):
        rawdata = self.rawdata
        end = procclose.search(rawdata, i)
        if end is None:
            return -1
        j = end.start(0)
        if illegal.search(rawdata, i+2, j):
            self.syntax_error('illegal character in processing instruction')
        res = tagfind.match(rawdata, i+2)
        if res is None:
            raise RuntimeError('unexpected call to parse_proc')
        k = res.end(0)
        name = res.group(0)
        # Only the target "xml" itself (in any letter case) is excluded by
        # the PITarget production; targets that merely contain "xml", such
        # as "xml-stylesheet", are legal.
        if string.lower(name) == 'xml':
            self.syntax_error('illegal processing instruction target name')
        self.handle_proc(name, rawdata[k:j])
        return end.end(0)

    # Internal -- parse attributes between i and j
    def parse_attributes(self, tag, i, j, attributes = None):
        """Parse the attributes in self.rawdata[i:j] of element `tag'.

        `attributes' is None (anything is allowed), a sequence of allowed
        attribute names, or a dictionary mapping allowed names to default
        values (None meaning "no default").  Returns a tuple (attrdict, i)
        of the attribute dictionary and the position where parsing stopped.
        """
        rawdata = self.rawdata
        # Now parse the data between i and j into a tag and attrs
        attrdict = {}
        if attributes is not None and not hasattr(attributes, 'items'):
            # Convert an attributes list to a dictionary without defaults.
            # A dictionary is deliberately left alone: iterating over it
            # would replace every declared default value with None.
            try:
                d = {}
                for a in attributes:
                    d[a] = None
                attributes = d
            except TypeError:
                pass
        while i < j:
            res = attrfind.match(rawdata, i)
            if res is None:
                break
            attrname, attrvalue = res.group('name', 'value')
            if attrvalue is None:
                self.syntax_error("no value specified for attribute `%s'" % attrname)
                attrvalue = attrname
            elif attrvalue[:1] == "'" == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            else:
                self.syntax_error("attribute `%s' value not quoted" % attrname)
            if '<' in attrvalue:
                self.syntax_error("`<' illegal in attribute value")
            if attributes is not None and not attributes.has_key(attrname):
                self.syntax_error("unknown attribute `%s' of element `%s'" %
                                  (attrname, tag))
            if attrdict.has_key(attrname):
                self.syntax_error("attribute `%s' specified twice" % attrname)
            # turn every white space character into a plain space
            attrvalue = string.translate(attrvalue, attrtrans)
            attrdict[attrname] = self.translate_references(attrvalue)
            i = res.end(0)
        if attributes is not None:
            # fill in with default attributes
            for key, val in attributes.items():
                if val is not None and not attrdict.has_key(key):
                    attrdict[key] = val
        return attrdict, i

    # Internal -- handle starttag, return length or -1 if not terminated
    def parse_starttag(self, i):
        rawdata = self.rawdata
        # i points to start of tag
        end = endbracketfind.match(rawdata, i+1)
        if end is None:
            return -1
        tag = starttagmatch.match(rawdata, i)
        if tag is None or tag.end(0) != end.end(0):
            # the text up to the closing '>' is not a well-formed tag
            self.syntax_error('garbage in starttag')
            return end.end(0)
        tagname = tag.group('tagname')
        if not self.__seen_starttag and self.__seen_doctype and \
           tagname != self.__seen_doctype:
            self.syntax_error('starttag does not match DOCTYPE')
        if self.__seen_starttag and not self.stack:
            self.syntax_error('multiple elements on top level')
        if hasattr(self, tagname + '_attributes'):
            attributes = getattr(self, tagname + '_attributes')
        else:
            attributes = None
        k, j = tag.span('attrs')
        attrdict, k = self.parse_attributes(tagname, k, j, attributes)
        self.finish_starttag(tagname, attrdict)
        if tag.group('slash') == '/':
            # empty-element tag: close it right away
            self.finish_endtag(tagname)
        return tag.end(0)

    # Internal -- parse endtag, return length or -1 if not terminated
    def parse_endtag(self, i):
        rawdata = self.rawdata
        end = endbracketfind.match(rawdata, i+1)
        if end is None:
            return -1
        res = tagfind.match(rawdata, i+2)
        if res is None:
            if self.literal:
                self.handle_data(rawdata[i])
                return i+1
            self.syntax_error('no name specified in end tag')
            tag = ''
            k = i+2
        else:
            tag = res.group(0)
            if self.literal:
                # in literal mode only the end tag of the innermost open
                # element is markup; anything else is passed on as data
                if not self.stack or tag != self.stack[-1]:
                    self.handle_data(rawdata[i])
                    return i+1
                self.literal = 0
            k = res.end(0)
        if endbracket.match(rawdata, k) is None:
            self.syntax_error('garbage in end tag')
        self.finish_endtag(tag)
        return end.end(0)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        self.stack.append(tag)
        methodname = 'start_' + tag
        if hasattr(self, methodname):
            method = getattr(self, methodname)
            self.handle_starttag(tag, method, attrs)
            return 1
        else:
            self.unknown_starttag(tag, attrs)
            return -1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        """Close element `tag' and every element still open inside it.

        An empty `tag' (name-less end tag) closes the innermost open
        element.  Each closed element is passed to its own end_<name>
        method when there is one and to unknown_endtag() otherwise.
        """
        if not tag:
            self.syntax_error('name-less end tag')
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                self.syntax_error('unopened end tag')
                methodname = 'end_' + tag
                if hasattr(self, methodname):
                    method = getattr(self, methodname)
                    self.handle_endtag(tag, method)
                else:
                    self.unknown_endtag(tag)
                return
            # position of the innermost open element with this name
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        while len(self.stack) > found:
            if found < len(self.stack) - 1:
                self.syntax_error('missing close tag for %s' % self.stack[-1])
            tag = self.stack[-1]
            # look the handler up for the element that is actually being
            # closed, not for the tag named in the end tag
            methodname = 'end_' + tag
            if hasattr(self, methodname):
                method = getattr(self, methodname)
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle xml processing instruction
    def handle_xml(self, encoding, standalone):
        pass

    # Overridable -- handle DOCTYPE
    def handle_doctype(self, tag, pubid, syslit, data):
        pass

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- handle character reference, no need to override
    def handle_charref(self, name):
        # name is the reference without '&#' and ';', e.g. '60' or 'x3c'
        try:
            if name[0] == 'x':
                n = string.atoi(name[1:], 16)
            else:
                n = string.atoi(name)
        except string.atoi_error:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = {'lt': '&#60;',        # must use charref
                  'gt': '&#62;',
                  'amp': '&#38;',       # must use charref
                  'quot': '&#34;',
                  'apos': '&#39;',
                  }

    # Example -- handle entity reference, no need to override
    def handle_entityref(self, name):
        table = self.entitydefs
        if table.has_key(name):
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)
            return

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle cdata, could be overridden
    def handle_cdata(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle processing instructions, could be overridden
    def handle_proc(self, name, data):
        pass

    # Example -- handle relatively harmless syntax errors, could be overridden
    def syntax_error(self, message):
        raise RuntimeError('Syntax error at line %d: %s' % (self.lineno, message))

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
653
654
class TestXMLParser(XMLParser):
    """XMLParser that prints every parser event on standard output.

    Character data is collected in self.testdata and printed by flush(),
    either when its repr() reaches 70 characters or just before any other
    event is reported.
    """

    def __init__(self, verbose=0):
        # must exist before XMLParser.__init__() runs
        self.testdata = ""
        XMLParser.__init__(self, verbose)

    def handle_xml(self, encoding, standalone):
        self.flush()
        print('xml: encoding = %s standalone = %s' % (encoding, standalone))

    def handle_doctype(self, tag, pubid, syslit, data):
        self.flush()
        print('DOCTYPE: %s %s' % (tag, repr(data)))

    def handle_entity(self, name, strval, pubid, syslit, ndata):
        # Report the entity's name and value; there is no `data' here.
        self.flush()
        print('ENTITY: %s %s' % (name, repr(strval)))

    def handle_data(self, data):
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    # Print and forget the character data collected so far
    def flush(self):
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    def handle_cdata(self, data):
        self.flush()
        print('cdata: ' + repr(data))

    def handle_proc(self, name, data):
        self.flush()
        print('processing: %s %s' % (name, repr(data)))

    def handle_comment(self, data):
        self.flush()
        r = repr(data)
        if len(r) > 68:
            # keep only the beginning and the end of long comments
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    # Report syntax errors instead of raising, so that parsing continues
    def syntax_error(self, message):
        print('error at line %d: %s' % (self.lineno, message))

    def unknown_starttag(self, tag, attrs):
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            pieces = ['start tag: <' + tag]
            for name, value in attrs.items():
                pieces.append(name + '=' + '"' + value + '"')
            pieces.append('>')
            print(string.join(pieces, ' '))

    def unknown_endtag(self, tag):
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def close(self):
        XMLParser.close(self)
        # print whatever data is still pending
        self.flush()
Guido van Rossuma219efa1997-11-18 15:09:54 +0000727
def test(args = None):
    """Parse one file and report what the parser finds.

    args defaults to the command line arguments.  A leading '-s' selects
    the silent XMLParser instead of TestXMLParser; the next argument is
    the file to parse ('-' for standard input, 'test.xml' when omitted).
    The text is fed to the parser one character at a time.  Exits with
    status 1 when the file cannot be opened or parsing raises RuntimeError.
    """
    import sys

    if not args:
        args = sys.argv[1:]

    parser_class = TestXMLParser
    if args and args[0] == '-s':
        args = args[1:]
        parser_class = XMLParser

    filename = 'test.xml'
    if args:
        filename = args[0]

    if filename == '-':
        f = sys.stdin
    else:
        try:
            f = open(filename, 'r')
        except IOError:
            print('%s : %s' % (filename, sys.exc_info()[1]))
            sys.exit(1)

    data = f.read()
    if f is not sys.stdin:
        f.close()

    x = parser_class()
    try:
        # one character per call exercises the incremental parsing
        for c in data:
            x.feed(c)
        x.close()
    except RuntimeError:
        print(sys.exc_info()[1])
        sys.exit(1)
Guido van Rossuma219efa1997-11-18 15:09:54 +0000766
767
# Run the command line test driver when executed as a script.
if __name__ == '__main__':
    test()