# Source blob: 58ee814baf2f14ae8cebe655496fd8c5c01773f4
"""An XML Reader is the SAX 2 name for an XML parser. XML Parsers
should be based on this code. """

import handler
from xml.sax._exceptions import SAXNotRecognizedException, SAXNotSupportedException

# ===== XMLREADER =====

class XMLReader:
    """Interface for reading an XML document using callbacks.

    XMLReader is the interface that an XML parser's SAX2 driver must
    implement. This interface allows an application to set and query
    features and properties in the parser, to register event handlers
    for document processing, and to initiate a document parse.

    All SAX interfaces are assumed to be synchronous: the parse
    methods must not return until parsing is complete, and readers
    must wait for an event-handler callback to return before reporting
    the next event.

    This base class only stores the four handler objects; parse() and
    every feature/property/locale accessor raise until a driver
    overrides them."""

    def __init__(self):
        # Start with the default handler objects from the handler module
        # so the getters always return a usable object.
        self._cont_handler = handler.ContentHandler()
        self._dtd_handler = handler.DTDHandler()
        self._ent_handler = handler.EntityResolver()
        self._err_handler = handler.ErrorHandler()

    def parse(self, source):
        "Parse an XML document from a system identifier or an InputSource."
        raise NotImplementedError("This method must be implemented!")

    def getContentHandler(self):
        "Returns the current ContentHandler."
        return self._cont_handler

    def setContentHandler(self, handler):
        "Registers a new object to receive document content events."
        # NOTE: the parameter shadows the handler module inside this
        # method only; the name is part of the public signature.
        self._cont_handler = handler

    def getDTDHandler(self):
        "Returns the current DTD handler."
        return self._dtd_handler

    def setDTDHandler(self, handler):
        "Register an object to receive basic DTD-related events."
        self._dtd_handler = handler

    def getEntityResolver(self):
        "Returns the current EntityResolver."
        return self._ent_handler

    def setEntityResolver(self, resolver):
        "Register an object to resolve external entities."
        self._ent_handler = resolver

    def getErrorHandler(self):
        "Returns the current ErrorHandler."
        return self._err_handler

    def setErrorHandler(self, handler):
        "Register an object to receive error-message events."
        self._err_handler = handler

    def setLocale(self, locale):
        """Allow an application to set the locale for errors and warnings.

        SAX parsers are not required to provide localization for errors
        and warnings; if they cannot support the requested locale,
        however, they must throw a SAX exception. Applications may
        request a locale change in the middle of a parse.

        This base implementation always raises SAXNotSupportedException."""
        raise SAXNotSupportedException("Locale support not implemented")

    def getFeature(self, name):
        """Looks up and returns the state of a SAX2 feature.

        This base implementation knows no features and always raises
        SAXNotRecognizedException."""
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Sets the state of a SAX2 feature.

        This base implementation knows no features and always raises
        SAXNotRecognizedException."""
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Looks up and returns the value of a SAX2 property.

        This base implementation knows no properties and always raises
        SAXNotRecognizedException."""
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Sets the value of a SAX2 property.

        This base implementation knows no properties and always raises
        SAXNotRecognizedException."""
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)
class IncrementalParser(XMLReader):
    """This interface adds three extra methods to the XMLReader
    interface that allow XML parsers to support incremental
    parsing. Support for this interface is optional, since not all
    underlying XML parsers support this functionality.

    When the parser is instantiated it is ready to begin accepting
    data from the feed method immediately. After parsing has been
    finished with a call to close the reset method must be called to
    make the parser ready to accept new data, either from feed or
    using the parse method.

    Note that these methods must _not_ be called during parsing, that
    is, after parse has been called and before it returns.

    By default, the class also implements the parse method of the XMLReader
    interface using the feed, close and reset methods of the
    IncrementalParser interface as a convenience to SAX 2.0 driver
    writers."""

    def __init__(self, bufsize=2**16):
        # bufsize is the number of bytes parse() requests per read() call.
        self._bufsize = bufsize
        XMLReader.__init__(self)

    def parse(self, source):
        """Parse a whole document by feeding it to the parser in chunks.

        source is normalised with saxutils.prepare_input_source, handed
        to prepareParser, and its byte stream is then read in pieces of
        at most self._bufsize and passed to feed until read() returns
        an empty result. close is called once the stream is exhausted."""
        import saxutils
        source = saxutils.prepare_input_source(source)

        self.prepareParser(source)
        stream = source.getByteStream()
        chunk = stream.read(self._bufsize)
        # Test truthiness rather than comparing with "": an empty bytes
        # object is not equal to "" on Python 3, which would loop forever.
        while chunk:
            self.feed(chunk)
            chunk = stream.read(self._bufsize)
        self.close()

    def feed(self, data):
        """This method gives the raw XML data in the data parameter to
        the parser and makes it parse the data, emitting the
        corresponding events. It is allowed for XML constructs to be
        split across several calls to feed.

        feed may raise SAXException."""
        raise NotImplementedError("This method must be implemented!")

    def prepareParser(self, source):
        """This method is called by the parse implementation to allow
        the SAX 2.0 driver to prepare itself for parsing."""
        raise NotImplementedError("prepareParser must be overridden!")

    def close(self):
        """This method is called when the entire XML document has been
        passed to the parser through the feed method, to notify the
        parser that there are no more data. This allows the parser to
        do the final checks on the document and empty the internal
        data buffer.

        The parser will not be ready to parse another document until
        the reset method has been called.

        close may raise SAXException."""
        raise NotImplementedError("This method must be implemented!")

    def reset(self):
        """This method is called after close has been called to reset
        the parser so that it is ready to parse new documents. The
        results of calling parse or feed after close without calling
        reset are undefined."""
        raise NotImplementedError("This method must be implemented!")

# ===== LOCATOR =====

class Locator:
    """Interface for associating a SAX event with a document
    location. A locator object will return valid results only during
    calls to DocumentHandler methods; at any other time, the
    results are unpredictable.

    The defaults defined here report "unknown": -1 for positions and
    None for identifiers."""

    def getColumnNumber(self):
        """Return the column number where the current event ends.

        This default implementation always returns -1."""
        return -1

    def getLineNumber(self):
        """Return the line number where the current event ends.

        This default implementation always returns -1."""
        return -1

    def getPublicId(self):
        """Return the public identifier for the current event.

        This default implementation always returns None."""
        return None

    def getSystemId(self):
        """Return the system identifier for the current event.

        This default implementation always returns None."""
        return None

# ===== INPUTSOURCE =====

class InputSource:
    """Encapsulation of the information needed by the XMLReader to
    read entities.

    This class may include information about the public identifier,
    system identifier, byte stream (possibly with character encoding
    information) and/or the character stream of an entity.

    Applications will create objects of this class for use in the
    XMLReader.parse method and for returning from
    EntityResolver.resolveEntity.

    An InputSource belongs to the application, the XMLReader is not
    allowed to modify InputSource objects passed to it from the
    application, although it may make copies and modify those."""

    def __init__(self, system_id = None):
        # Everything except the system identifier starts out unset.
        self.__system_id = system_id
        self.__public_id = None
        self.__encoding = None
        self.__bytefile = None
        self.__charfile = None

    # --- identifiers ---

    def setPublicId(self, public_id):
        """Store public_id as the public identifier of this InputSource."""
        self.__public_id = public_id

    def getPublicId(self):
        """Return the stored public identifier (None if never set)."""
        return self.__public_id

    def setSystemId(self, system_id):
        """Store system_id as the system identifier of this InputSource."""
        self.__system_id = system_id

    def getSystemId(self):
        """Return the stored system identifier."""
        return self.__system_id

    # --- encoding ---

    def setEncoding(self, encoding):
        """Store the character encoding of this InputSource.

        The encoding must be a string acceptable for an XML encoding
        declaration (see section 4.3.3 of the XML recommendation).

        The encoding attribute of the InputSource is ignored if the
        InputSource also contains a character stream."""
        self.__encoding = encoding

    def getEncoding(self):
        """Return the stored character encoding (None if never set)."""
        return self.__encoding

    # --- streams ---

    def setByteStream(self, bytefile):
        """Store the byte stream (a Python file-like object which does
        not perform byte-to-character conversion) for this input
        source.

        The SAX parser will ignore this if there is also a character
        stream specified, but it will use a byte stream in preference
        to opening a URI connection itself.

        If the application knows the character encoding of the byte
        stream, it should set it with the setEncoding method."""
        self.__bytefile = bytefile

    def getByteStream(self):
        """Return the stored byte stream (None if never set).

        The getEncoding method will return the character encoding for
        this byte stream, or None if unknown."""
        return self.__bytefile

    def setCharacterStream(self, charfile):
        """Store the character stream for this input source. (The stream
        must be a Python 1.6 Unicode-wrapped file-like that performs
        conversion to Unicode strings.)

        If there is a character stream specified, the SAX parser will
        ignore any byte stream and will not attempt to open a URI
        connection to the system identifier."""
        self.__charfile = charfile

    def getCharacterStream(self):
        """Return the stored character stream (None if never set)."""
        return self.__charfile

# ===== ATTRIBUTESIMPL =====

class AttributesImpl:
    """Attribute collection backed by a plain {name: value} mapping.

    Names and qualified names are the same thing in this
    non-namespace-aware implementation, so the *ByQName and *ByName
    lookups simply validate that the name exists and hand it back."""

    def __init__(self, attrs):
        """Non-NS-aware implementation.

        attrs should be of the form {name : value}."""
        # The mapping is stored by reference, not copied.
        self._attrs = attrs

    def getLength(self):
        "Return the number of attributes."
        return len(self._attrs)

    def getType(self, name):
        "Return the attribute type; always \"CDATA\" in this implementation."
        return "CDATA"

    def getValue(self, name):
        "Return the value for name; raises KeyError if it is absent."
        return self._attrs[name]

    def getValueByQName(self, name):
        "Return the value for the qualified name; raises KeyError if absent."
        return self._attrs[name]

    def getNameByQName(self, name):
        "Return name unchanged if it is a known attribute, else raise KeyError."
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getQNameByName(self, name):
        "Return name unchanged if it is a known attribute, else raise KeyError."
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getNames(self):
        "Return a list of the attribute names."
        return list(self._attrs.keys())

    def getQNames(self):
        "Return a list of the qualified names (identical to getNames here)."
        return list(self._attrs.keys())

    def __len__(self):
        return len(self._attrs)

    def __getitem__(self, name):
        return self._attrs[name]

    def __contains__(self, name):
        # Without this, "name in attrs" falls back to __getitem__(0) and
        # surfaces a confusing KeyError instead of returning False.
        return name in self._attrs

    def keys(self):
        "Return a list of the attribute names."
        return list(self._attrs.keys())

    def has_key(self, name):
        "Return true if name is a known attribute."
        return name in self._attrs

    def get(self, name, alternative=None):
        "Return the value for name, or alternative if it is absent."
        return self._attrs.get(name, alternative)

    def copy(self):
        """Return a new instance of the same class.

        The new object is built from the same underlying mapping, which
        is shared rather than duplicated."""
        return self.__class__(self._attrs)

    def items(self):
        "Return a list of (name, value) pairs."
        return list(self._attrs.items())

    def values(self):
        "Return a list of the attribute values."
        return list(self._attrs.values())

# ===== ATTRIBUTESNSIMPL =====

class AttributesNSImpl(AttributesImpl):
    """Namespace-aware attribute collection.

    Values are keyed by (ns_uri, lname) tuples, and a second mapping
    with the same keys records the qualified name of each attribute."""

    def __init__(self, attrs, qnames):
        """NS-aware implementation.

        attrs should be of the form {(ns_uri, lname): value, ...}.
        qnames of the form {(ns_uri, lname): qname, ...}."""
        # Both mappings are stored by reference, not copied.
        self._attrs = attrs
        self._qnames = qnames

    def getValueByQName(self, name):
        """Return the value of the attribute whose qualified name is name.

        Raises KeyError if no attribute has that qualified name."""
        # The qname mapping is keyed by (ns_uri, lname), so a reverse
        # lookup needs a linear scan.
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return self._attrs[nsname]

        raise KeyError(name)

    def getNameByQName(self, name):
        """Return the (ns_uri, lname) key whose qualified name is name.

        Raises KeyError if no attribute has that qualified name."""
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return nsname

        raise KeyError(name)

    def getQNameByName(self, name):
        "Return the qualified name for the (ns_uri, lname) key name."
        return self._qnames[name]

    def getQNames(self):
        "Return a list of the qualified names."
        return list(self._qnames.values())

    def copy(self):
        """Return a new instance of the same class.

        The new object is built from the same two underlying mappings,
        which are shared rather than duplicated."""
        return self.__class__(self._attrs, self._qnames)
def _test():
    """Smoke test: instantiate each no-argument class once.

    The instances are discarded; the only check is that construction
    does not raise."""
    for factory in (XMLReader, IncrementalParser, Locator):
        factory()


if __name__ == "__main__":
    _test()