blob: 6c7b77ed50f093ff837416769e3ef4d35099e967 [file] [log] [blame]
"""An XML Reader is the SAX 2 name for an XML parser. XML Parsers
should be based on this code. """
Fred Drake07cbc4e2000-09-21 17:43:48 +00003
4import handler
Martin v. Löwis22adac52001-06-07 05:49:05 +00005from _exceptions import *
Fred Drake904f2fc2001-03-14 22:43:47 +00006
# ===== XMLREADER =====
8
class XMLReader:
    """Base interface through which an application drives an XML parser.

    A SAX2 driver implements this interface.  It lets the application
    query and change parser features and properties, register the
    handler objects that receive parse events, and start parsing a
    document.

    SAX interfaces are assumed to be synchronous: parse does not
    return until parsing is complete, and a reader waits for each
    handler callback to return before it reports the next event.
    """

    def __init__(self):
        # Every reader starts with the default handler objects supplied
        # by the handler module; applications replace them via the
        # set*Handler / setEntityResolver methods below.
        self._cont_handler = handler.ContentHandler()
        self._dtd_handler = handler.DTDHandler()
        self._ent_handler = handler.EntityResolver()
        self._err_handler = handler.ErrorHandler()

    def parse(self, source):
        """Parse the document named by a system identifier or InputSource.

        Abstract here: this base implementation always raises
        NotImplementedError."""
        raise NotImplementedError("This method must be implemented!")

    # ----- content handler -----

    def getContentHandler(self):
        """Return the ContentHandler currently registered."""
        return self._cont_handler

    def setContentHandler(self, handler):
        """Register *handler* as the receiver of document content events."""
        self._cont_handler = handler

    # ----- DTD handler -----

    def getDTDHandler(self):
        """Return the DTD handler currently registered."""
        return self._dtd_handler

    def setDTDHandler(self, handler):
        """Register *handler* as the receiver of basic DTD-related events."""
        self._dtd_handler = handler

    # ----- entity resolver -----

    def getEntityResolver(self):
        """Return the EntityResolver currently registered."""
        return self._ent_handler

    def setEntityResolver(self, resolver):
        """Register *resolver* as the object that resolves external entities."""
        self._ent_handler = resolver

    # ----- error handler -----

    def getErrorHandler(self):
        """Return the ErrorHandler currently registered."""
        return self._err_handler

    def setErrorHandler(self, handler):
        """Register *handler* as the receiver of error-message events."""
        self._err_handler = handler

    # ----- locale, features and properties -----

    def setLocale(self, locale):
        """Request a locale for error and warning messages.

        SAX parsers need not localize errors and warnings, but one
        that cannot honour the requested locale must raise a SAX
        exception.  Applications may ask for a locale change in the
        middle of a parse.

        This base implementation supports no locale and always raises
        SAXNotSupportedException."""
        raise SAXNotSupportedException("Locale support not implemented")

    def getFeature(self, name):
        """Return the state of the SAX2 feature *name*.

        This base implementation recognizes no feature and always
        raises SAXNotRecognizedException."""
        message = "Feature '%s' not recognized" % name
        raise SAXNotRecognizedException(message)

    def setFeature(self, name, state):
        """Set the SAX2 feature *name* to *state*.

        This base implementation recognizes no feature and always
        raises SAXNotRecognizedException."""
        message = "Feature '%s' not recognized" % name
        raise SAXNotRecognizedException(message)

    def getProperty(self, name):
        """Return the value of the SAX2 property *name*.

        This base implementation recognizes no property and always
        raises SAXNotRecognizedException."""
        message = "Property '%s' not recognized" % name
        raise SAXNotRecognizedException(message)

    def setProperty(self, name, value):
        """Set the SAX2 property *name* to *value*.

        This base implementation recognizes no property and always
        raises SAXNotRecognizedException."""
        message = "Property '%s' not recognized" % name
        raise SAXNotRecognizedException(message)
88
class IncrementalParser(XMLReader):
    """This interface adds three extra methods to the XMLReader
    interface that allow XML parsers to support incremental
    parsing. Support for this interface is optional, since not all
    underlying XML parsers support this functionality.

    When the parser is instantiated it is ready to begin accepting
    data from the feed method immediately. After parsing has been
    finished with a call to close the reset method must be called to
    make the parser ready to accept new data, either from feed or
    using the parse method.

    Note that these methods must _not_ be called during parsing, that
    is, after parse has been called and before it returns.

    By default, the class also implements the parse method of the XMLReader
    interface using the feed and close methods of the
    IncrementalParser interface as a convenience to SAX 2.0 driver
    writers."""

    def __init__(self, bufsize=2**16):
        # bufsize is the chunk size handed to read() on each pass of
        # the loop in parse().
        self._bufsize = bufsize
        XMLReader.__init__(self)

    def parse(self, source):
        """Parse a whole document by feeding it to the parser in chunks.

        source is passed through saxutils.prepare_input_source and the
        result is given to prepareParser.  Its byte stream is then read
        in pieces of at most self._bufsize and each piece is passed to
        feed; close is called once the stream is exhausted.

        reset is not called here; callers must call it before reusing
        the parser for another document."""
        import saxutils
        source = saxutils.prepare_input_source(source)

        self.prepareParser(source)
        stream = source.getByteStream()
        chunk = stream.read(self._bufsize)
        # Test truthiness instead of comparing with "": an exhausted
        # stream may return "" or b"" (or None), and b"" != "" would
        # otherwise keep the loop running forever.
        while chunk:
            self.feed(chunk)
            chunk = stream.read(self._bufsize)
        self.close()

    def feed(self, data):
        """This method gives the raw XML data in the data parameter to
        the parser and makes it parse the data, emitting the
        corresponding events. It is allowed for XML constructs to be
        split across several calls to feed.

        feed may raise SAXException.

        Abstract here: this base implementation always raises
        NotImplementedError."""
        raise NotImplementedError("This method must be implemented!")

    def prepareParser(self, source):
        """This method is called by the parse implementation to allow
        the SAX 2.0 driver to prepare itself for parsing.

        Abstract here: this base implementation always raises
        NotImplementedError."""
        raise NotImplementedError("prepareParser must be overridden!")

    def close(self):
        """This method is called when the entire XML document has been
        passed to the parser through the feed method, to notify the
        parser that there are no more data. This allows the parser to
        do the final checks on the document and empty the internal
        data buffer.

        The parser will not be ready to parse another document until
        the reset method has been called.

        close may raise SAXException.

        Abstract here: this base implementation always raises
        NotImplementedError."""
        raise NotImplementedError("This method must be implemented!")

    def reset(self):
        """This method is called after close has been called to reset
        the parser so that it is ready to parse new documents. The
        results of calling parse or feed after close without calling
        reset are undefined.

        Abstract here: this base implementation always raises
        NotImplementedError."""
        raise NotImplementedError("This method must be implemented!")
158
# ===== LOCATOR =====
Lars Gustäbel32bf12e2000-09-24 18:39:23 +0000160
class Locator:
    """Interface that ties a SAX event to a position in the document.

    A locator is only expected to give valid answers while a
    ContentHandler method is being called; at any other time the
    results are unpredictable.

    The methods defined here are placeholder defaults: -1 for the
    numeric positions and None for the identifiers.
    """

    def getColumnNumber(self):
        """Return the column number at which the current event ends.

        This default implementation always returns -1."""
        return -1

    def getLineNumber(self):
        """Return the line number at which the current event ends.

        This default implementation always returns -1."""
        return -1

    def getPublicId(self):
        """Return the public identifier for the current event.

        This default implementation always returns None."""
        return None

    def getSystemId(self):
        """Return the system identifier for the current event.

        This default implementation always returns None."""
        return None
Fred Drake45cd9de2000-06-29 19:34:54 +0000182
# ===== INPUTSOURCE =====
184
class InputSource:
    """Bundle of everything an XMLReader needs in order to read an entity.

    An instance can carry a public identifier, a system identifier, a
    byte stream (optionally with character-encoding information) and/or
    a character stream for the entity.

    Applications create these objects to hand to XMLReader.parse and
    to return from EntityResolver.resolveEntity.

    The InputSource belongs to the application: an XMLReader must not
    modify one that was passed to it, although it may copy it and
    modify the copy.
    """

    def __init__(self, system_id=None):
        # Only the system identifier can be given up front; every other
        # field starts out unset and is filled in through its setter.
        self.__system_id = system_id
        self.__public_id = self.__encoding = None
        self.__bytefile = self.__charfile = None

    # ----- identifiers -----

    def setPublicId(self, public_id):
        """Store *public_id* as this InputSource's public identifier."""
        self.__public_id = public_id

    def getPublicId(self):
        """Return this InputSource's public identifier."""
        return self.__public_id

    def setSystemId(self, system_id):
        """Store *system_id* as this InputSource's system identifier."""
        self.__system_id = system_id

    def getSystemId(self):
        """Return this InputSource's system identifier."""
        return self.__system_id

    # ----- encoding -----

    def setEncoding(self, encoding):
        """Store the character encoding of this InputSource.

        *encoding* must be a string that would be acceptable in an XML
        encoding declaration (section 4.3.3 of the XML recommendation).

        The encoding is ignored when the InputSource also carries a
        character stream."""
        self.__encoding = encoding

    def getEncoding(self):
        """Return the character encoding of this InputSource."""
        return self.__encoding

    # ----- streams -----

    def setByteStream(self, bytefile):
        """Store the byte stream for this input source.

        *bytefile* is a Python file-like object that does not perform
        byte-to-character conversion.

        A SAX parser ignores the byte stream if a character stream is
        also present, but prefers it to opening a URI connection
        itself.

        An application that knows the stream's character encoding
        should record it with setEncoding."""
        self.__bytefile = bytefile

    def getByteStream(self):
        """Return the byte stream for this input source.

        getEncoding gives the character encoding for this byte stream,
        or None when it is unknown."""
        return self.__bytefile

    def setCharacterStream(self, charfile):
        """Store the character stream for this input source.

        *charfile* must be a Python 2.0 Unicode-wrapped file-like
        object that performs the conversion to Unicode strings.

        When a character stream is present, a SAX parser ignores any
        byte stream and does not try to open a URI connection to the
        system identifier."""
        self.__charfile = charfile

    def getCharacterStream(self):
        """Return the character stream for this input source."""
        return self.__charfile
Fred Drake16f63292000-10-23 18:09:50 +0000271
# ===== ATTRIBUTESIMPL =====
273
class AttributesImpl:
    """Mapping-like wrapper around an element's attributes.

    This is the non-namespace-aware variant: attribute names are plain
    strings, so a name and its qualified name (QName) are the same
    thing.  The class offers the SAX accessor methods (getValue,
    getNames, ...) alongside a dictionary-style read interface
    (len(), [], in, keys, items, ...) and defines no mutating methods.
    """

    def __init__(self, attrs):
        """Non-NS-aware implementation.

        attrs should be of the form {name : value}.  The mapping is
        stored by reference, not copied."""
        self._attrs = attrs

    def getLength(self):
        """Return the number of attributes."""
        return len(self._attrs)

    def getType(self, name):
        """Return the attribute type, which is always "CDATA" here.

        name is not looked up, so no KeyError is raised for an unknown
        attribute."""
        return "CDATA"

    def getValue(self, name):
        """Return the value of attribute name; KeyError if absent."""
        return self._attrs[name]

    def getValueByQName(self, name):
        """Return the value for QName name; KeyError if absent."""
        return self._attrs[name]

    def getNameByQName(self, name):
        """Return the name for QName name (the same string here).

        Raises KeyError if there is no such attribute."""
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getQNameByName(self, name):
        """Return the QName for attribute name (the same string here).

        Raises KeyError if there is no such attribute."""
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getNames(self):
        """Return a list of the attribute names."""
        # list() makes the result a real list whether keys() returns a
        # list or a view object.
        return list(self._attrs.keys())

    def getQNames(self):
        """Return a list of the attribute QNames (same as the names)."""
        return list(self._attrs.keys())

    def __len__(self):
        return len(self._attrs)

    def __getitem__(self, name):
        return self._attrs[name]

    def __contains__(self, name):
        # Without this, "name in attrs" would fall back to probing
        # __getitem__ with integer indexes and fail with KeyError.
        return name in self._attrs

    def keys(self):
        """Return a list of the attribute names."""
        return list(self._attrs.keys())

    def has_key(self, name):
        """Return true if there is an attribute called name."""
        return name in self._attrs

    def get(self, name, alternative=None):
        """Return the value of attribute name, or alternative if absent."""
        return self._attrs.get(name, alternative)

    def copy(self):
        """Return a new object of the same class over the same mapping.

        The underlying mapping itself is shared, not duplicated."""
        return self.__class__(self._attrs)

    def items(self):
        """Return a list of (name, value) pairs."""
        return list(self._attrs.items())

    def values(self):
        """Return a list of the attribute values."""
        return list(self._attrs.values())
333
# ===== ATTRIBUTESNSIMPL =====
335
class AttributesNSImpl(AttributesImpl):
    """Namespace-aware variant of AttributesImpl.

    Attributes are keyed by (ns_uri, lname) pairs; a second mapping
    records the qualified name (QName) that belongs to each pair.
    Methods not overridden here are inherited from AttributesImpl and
    operate on the (ns_uri, lname) keys.
    """

    def __init__(self, attrs, qnames):
        """NS-aware implementation.

        attrs should be of the form {(ns_uri, lname): value, ...}.
        qnames of the form {(ns_uri, lname): qname, ...}.

        Both mappings are stored by reference, not copied."""
        self._attrs = attrs
        self._qnames = qnames

    def getValueByQName(self, name):
        """Return the value of the attribute whose QName is name.

        Raises KeyError if no attribute has that QName."""
        # _qnames maps name pair -> QName, so finding the pair for a
        # given QName needs a linear scan.
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return self._attrs[nsname]

        raise KeyError(name)

    def getNameByQName(self, name):
        """Return the (ns_uri, lname) pair whose QName is name.

        Raises KeyError if no attribute has that QName."""
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return nsname

        raise KeyError(name)

    def getQNameByName(self, name):
        """Return the QName for the (ns_uri, lname) pair name.

        Raises KeyError if the pair is unknown."""
        return self._qnames[name]

    def getQNames(self):
        """Return a list of the QNames of all attributes."""
        # list() makes the result a real list whether values() returns
        # a list or a view object.
        return list(self._qnames.values())

    def copy(self):
        """Return a new object of the same class over the same mappings.

        The underlying mappings themselves are shared, not duplicated."""
        return self.__class__(self._attrs, self._qnames)
Fred Drake16f63292000-10-23 18:09:50 +0000368
Fred Drake07cbc4e2000-09-21 17:43:48 +0000369
def _test():
    """Smoke test: instantiate, in order, each class that needs no arguments."""
    for factory in (XMLReader, IncrementalParser, Locator):
        factory()


# Run the smoke test only when this file is executed as a script.
if __name__ == "__main__":
    _test()