"""An XML Reader is the SAX 2 name for an XML parser. XML Parsers
should be based on this code. """
Fred Drake07cbc4e2000-09-21 17:43:48 +00003
4import handler
5
Fred Drake904f2fc2001-03-14 22:43:47 +00006from _exceptions import SAXNotSupportedException, SAXNotRecognizedException
7
8
# ===== XMLREADER =====
10
class XMLReader:
    """Interface for reading an XML document using callbacks.

    XMLReader is the interface that an XML parser's SAX2 driver must
    implement. Through it an application can set and query features
    and properties in the parser, register event handlers for
    document processing, and initiate a document parse.

    All SAX interfaces are assumed to be synchronous: the parse
    methods must not return until parsing is complete, and readers
    must wait for an event-handler callback to return before reporting
    the next event."""

    def __init__(self):
        # Each handler slot starts out holding a default instance taken
        # from the handler module; the set*() methods replace them.
        self._cont_handler = handler.ContentHandler()
        self._dtd_handler = handler.DTDHandler()
        self._ent_handler = handler.EntityResolver()
        self._err_handler = handler.ErrorHandler()

    def parse(self, source):
        """Parse an XML document from a system identifier or an InputSource.

        This base implementation always raises NotImplementedError;
        concrete readers must override it."""
        raise NotImplementedError("This method must be implemented!")

    # --- handler registration ------------------------------------------

    def getContentHandler(self):
        """Return the ContentHandler currently registered."""
        return self._cont_handler

    def setContentHandler(self, handler):
        """Register a new object to receive document content events."""
        self._cont_handler = handler

    def getDTDHandler(self):
        """Return the DTD handler currently registered."""
        return self._dtd_handler

    def setDTDHandler(self, handler):
        """Register an object to receive basic DTD-related events."""
        self._dtd_handler = handler

    def getEntityResolver(self):
        """Return the EntityResolver currently registered."""
        return self._ent_handler

    def setEntityResolver(self, resolver):
        """Register an object to resolve external entities."""
        self._ent_handler = resolver

    def getErrorHandler(self):
        """Return the ErrorHandler currently registered."""
        return self._err_handler

    def setErrorHandler(self, handler):
        """Register an object to receive error-message events."""
        self._err_handler = handler

    # --- locale, features and properties -------------------------------

    def setLocale(self, locale):
        """Allow an application to set the locale for errors and warnings.

        SAX parsers are not required to provide localization for errors
        and warnings; if they cannot support the requested locale,
        however, they must throw a SAX exception. Applications may
        request a locale change in the middle of a parse.

        This base implementation supports no locale and always raises
        SAXNotSupportedException."""
        raise SAXNotSupportedException("Locale support not implemented")

    def getFeature(self, name):
        """Look up and return the state of a SAX2 feature.

        The base class knows no features, so it always raises
        SAXNotRecognizedException."""
        message = "Feature '%s' not recognized" % name
        raise SAXNotRecognizedException(message)

    def setFeature(self, name, state):
        """Set the state of a SAX2 feature.

        The base class knows no features, so it always raises
        SAXNotRecognizedException."""
        message = "Feature '%s' not recognized" % name
        raise SAXNotRecognizedException(message)

    def getProperty(self, name):
        """Look up and return the value of a SAX2 property.

        The base class knows no properties, so it always raises
        SAXNotRecognizedException."""
        message = "Property '%s' not recognized" % name
        raise SAXNotRecognizedException(message)

    def setProperty(self, name, value):
        """Set the value of a SAX2 property.

        The base class knows no properties, so it always raises
        SAXNotRecognizedException."""
        message = "Property '%s' not recognized" % name
        raise SAXNotRecognizedException(message)
90
class IncrementalParser(XMLReader):
    """This interface adds three extra methods to the XMLReader
    interface that allow XML parsers to support incremental
    parsing. Support for this interface is optional, since not all
    underlying XML parsers support this functionality.

    When the parser is instantiated it is ready to begin accepting
    data from the feed method immediately. After parsing has been
    finished with a call to close the reset method must be called to
    make the parser ready to accept new data, either from feed or
    using the parse method.

    Note that these methods must _not_ be called during parsing, that
    is, after parse has been called and before it returns.

    By default, the class also implements the parse method of the XMLReader
    interface using the feed, close and reset methods of the
    IncrementalParser interface as a convenience to SAX 2.0 driver
    writers."""

    def __init__(self, bufsize=2**16):
        """Remember bufsize, the size passed to each read() call made
        by parse, then run the XMLReader initialisation."""
        self._bufsize = bufsize
        XMLReader.__init__(self)

    def parse(self, source):
        """Parse source by pushing its byte stream through feed.

        source is normalised with saxutils.prepare_input_source, then
        prepareParser is called once, the byte stream is read in
        pieces of at most bufsize and each non-empty piece is handed
        to feed, and finally close is called."""
        import saxutils
        source = saxutils.prepare_input_source(source)

        self.prepareParser(source)
        stream = source.getByteStream()
        chunk = stream.read(self._bufsize)
        # Test truthiness instead of comparing with "": an empty bytes
        # object is not equal to "" on every Python version, so the old
        # comparison could keep looping after end of file.
        while chunk:
            self.feed(chunk)
            chunk = stream.read(self._bufsize)
        self.close()

    def feed(self, data):
        """This method gives the raw XML data in the data parameter to
        the parser and makes it parse the data, emitting the
        corresponding events. It is allowed for XML constructs to be
        split across several calls to feed.

        feed may raise SAXException."""
        raise NotImplementedError("This method must be implemented!")

    def prepareParser(self, source):
        """This method is called by the parse implementation to allow
        the SAX 2.0 driver to prepare itself for parsing."""
        raise NotImplementedError("prepareParser must be overridden!")

    def close(self):
        """This method is called when the entire XML document has been
        passed to the parser through the feed method, to notify the
        parser that there are no more data. This allows the parser to
        do the final checks on the document and empty the internal
        data buffer.

        The parser will not be ready to parse another document until
        the reset method has been called.

        close may raise SAXException."""
        raise NotImplementedError("This method must be implemented!")

    def reset(self):
        """This method is called after close has been called to reset
        the parser so that it is ready to parse new documents. The
        results of calling parse or feed after close without calling
        reset are undefined."""
        raise NotImplementedError("This method must be implemented!")
160
# ===== LOCATOR =====
Lars Gustäbel32bf12e2000-09-24 18:39:23 +0000162
class Locator:
    """Interface for associating a SAX event with a document
    location. A locator object will return valid results only during
    calls to DocumentHandler methods; at any other time, the
    results are unpredictable.

    This base implementation knows no location: the numeric accessors
    return -1 and the identifier accessors return None."""

    def getColumnNumber(self):
        """Return the column number where the current event ends."""
        return -1

    def getLineNumber(self):
        """Return the line number where the current event ends."""
        return -1

    def getPublicId(self):
        """Return the public identifier for the current event."""
        return None

    def getSystemId(self):
        """Return the system identifier for the current event."""
        return None
Fred Drake45cd9de2000-06-29 19:34:54 +0000184
# ===== INPUTSOURCE =====
186
class InputSource:
    """Encapsulation of the information needed by the XMLReader to
    read entities.

    This class may include information about the public identifier,
    system identifier, byte stream (possibly with character encoding
    information) and/or the character stream of an entity.

    Applications will create objects of this class for use in the
    XMLReader.parse method and for returning from
    EntityResolver.resolveEntity.

    An InputSource belongs to the application, the XMLReader is not
    allowed to modify InputSource objects passed to it from the
    application, although it may make copies and modify those."""

    def __init__(self, system_id = None):
        # Only the system identifier can be supplied up front; every
        # other field stays None until its setter is called.
        self.__system_id = system_id
        self.__public_id = None
        self.__encoding = None
        self.__bytefile = None
        self.__charfile = None

    # --- identifiers ---------------------------------------------------

    def setPublicId(self, public_id):
        """Set the public identifier of this InputSource."""
        self.__public_id = public_id

    def getPublicId(self):
        """Return the public identifier of this InputSource."""
        return self.__public_id

    def setSystemId(self, system_id):
        """Set the system identifier of this InputSource."""
        self.__system_id = system_id

    def getSystemId(self):
        """Return the system identifier of this InputSource."""
        return self.__system_id

    # --- encoding ------------------------------------------------------

    def setEncoding(self, encoding):
        """Set the character encoding of this InputSource.

        The encoding must be a string acceptable for an XML encoding
        declaration (see section 4.3.3 of the XML recommendation).

        The encoding attribute of the InputSource is ignored if the
        InputSource also contains a character stream."""
        self.__encoding = encoding

    def getEncoding(self):
        """Return the character encoding of this InputSource."""
        return self.__encoding

    # --- streams -------------------------------------------------------

    def setByteStream(self, bytefile):
        """Set the byte stream (a Python file-like object which does
        not perform byte-to-character conversion) for this input
        source.

        The SAX parser will ignore this if there is also a character
        stream specified, but it will use a byte stream in preference
        to opening a URI connection itself.

        If the application knows the character encoding of the byte
        stream, it should set it with the setEncoding method."""
        self.__bytefile = bytefile

    def getByteStream(self):
        """Return the byte stream for this input source.

        The getEncoding method will return the character encoding for
        this byte stream, or None if unknown."""
        return self.__bytefile

    def setCharacterStream(self, charfile):
        """Set the character stream for this input source. (The stream
        must be a Python 2.0 Unicode-wrapped file-like that performs
        conversion to Unicode strings.)

        If there is a character stream specified, the SAX parser will
        ignore any byte stream and will not attempt to open a URI
        connection to the system identifier."""
        self.__charfile = charfile

    def getCharacterStream(self):
        """Return the character stream for this input source."""
        return self.__charfile
Fred Drake16f63292000-10-23 18:09:50 +0000273
# ===== ATTRIBUTESIMPL =====
275
class AttributesImpl:
    """Attribute collection for an element, without namespace support.

    Wraps a {name: value} mapping and exposes it both through
    SAX-style accessors (getValue, getNames, ...) and through a
    dictionary-like interface (len(), [], in, keys, get, ...).
    Because there are no namespaces here, a name and its qualified
    name are the same string."""

    def __init__(self, attrs):
        """Non-NS-aware implementation.

        attrs should be of the form {name : value}."""
        self._attrs = attrs

    def getLength(self):
        """Return the number of attributes."""
        return len(self._attrs)

    def getType(self, name):
        """Return the attribute type, which is always "CDATA" here."""
        return "CDATA"

    def getValue(self, name):
        """Return the value of attribute name; KeyError if absent."""
        return self._attrs[name]

    def getValueByQName(self, name):
        """Return the value for qualified name name; KeyError if absent."""
        return self._attrs[name]

    def getNameByQName(self, name):
        """Return name unchanged after checking that it exists.

        Raises KeyError if there is no such attribute."""
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getQNameByName(self, name):
        """Return name unchanged after checking that it exists.

        Raises KeyError if there is no such attribute."""
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getNames(self):
        """Return a list of the attribute names."""
        return list(self._attrs.keys())

    def getQNames(self):
        """Return a list of the qualified names (same as getNames)."""
        return list(self._attrs.keys())

    def __len__(self):
        return len(self._attrs)

    def __getitem__(self, name):
        return self._attrs[name]

    def __contains__(self, name):
        # Without this hook "name in attrs" falls back to calling
        # __getitem__ with 0, 1, 2, ... and fails with KeyError.
        return name in self._attrs

    def keys(self):
        """Return a list of the attribute names."""
        return list(self._attrs.keys())

    def has_key(self, name):
        """Return a true value if an attribute called name exists."""
        return name in self._attrs

    def get(self, name, alternative=None):
        """Return the value of name, or alternative if it is absent."""
        return self._attrs.get(name, alternative)

    def copy(self):
        """Return a new object of the same class.

        The underlying mapping is passed on as-is, not duplicated."""
        return self.__class__(self._attrs)

    def items(self):
        """Return a list of (name, value) pairs."""
        return list(self._attrs.items())

    def values(self):
        """Return a list of the attribute values."""
        return list(self._attrs.values())
335
# ===== ATTRIBUTESNSIMPL =====
337
class AttributesNSImpl(AttributesImpl):
    """Namespace-aware attribute collection.

    Attributes are keyed by (ns_uri, lname) tuples; a second mapping
    with the same keys records the qualified name of each attribute."""

    def __init__(self, attrs, qnames):
        """NS-aware implementation.

        attrs should be of the form {(ns_uri, lname): value, ...}.
        qnames of the form {(ns_uri, lname): qname, ...}."""
        self._attrs = attrs
        self._qnames = qnames

    def getValueByQName(self, name):
        """Return the value of the attribute whose qualified name is name.

        Raises KeyError if no attribute has that qualified name."""
        # qnames maps the other way round, so scan it for a match.
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return self._attrs[nsname]

        raise KeyError(name)

    def getNameByQName(self, name):
        """Return the (ns_uri, lname) key whose qualified name is name.

        Raises KeyError if no attribute has that qualified name."""
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return nsname

        raise KeyError(name)

    def getQNameByName(self, name):
        """Return the qualified name stored for the (ns_uri, lname) key
        name; KeyError if there is none."""
        return self._qnames[name]

    def getQNames(self):
        """Return a list of the qualified names."""
        return list(self._qnames.values())

    def copy(self):
        """Return a new object of the same class.

        Both underlying mappings are passed on as-is, not duplicated."""
        return self.__class__(self._attrs, self._qnames)
Fred Drake16f63292000-10-23 18:09:50 +0000370
Fred Drake07cbc4e2000-09-21 17:43:48 +0000371
def _test():
    """Smoke test: instantiate each class that takes no required
    constructor arguments."""
    for cls in (XMLReader, IncrementalParser, Locator):
        cls()


if __name__ == "__main__":
    _test()