blob: 716f22840414e604ab38661f91eb07f094309833 [file] [log] [blame]
"""An XML Reader is the SAX 2 name for an XML parser. XML Parsers
should be based on this code. """
Fred Drake07cbc4e2000-09-21 17:43:48 +00003
Guido van Rossum3b271052006-08-17 09:10:09 +00004from . import handler
Martin v. Löwis2c071952001-06-07 05:52:17 +00005
Guido van Rossum3b271052006-08-17 09:10:09 +00006from ._exceptions import SAXNotSupportedException, SAXNotRecognizedException
Martin v. Löwis2c071952001-06-07 05:52:17 +00007
Fred Drake904f2fc2001-03-14 22:43:47 +00008
# ===== XMLREADER =====
10
class XMLReader:
    """Interface for reading an XML document using callbacks.

    XMLReader is the interface that an XML parser's SAX2 driver must
    implement. This interface allows an application to set and query
    features and properties in the parser, to register event handlers
    for document processing, and to initiate a document parse.

    All SAX interfaces are assumed to be synchronous: the parse
    methods must not return until parsing is complete, and readers
    must wait for an event-handler callback to return before reporting
    the next event.
    """

    def __init__(self):
        # Install a default instance of every handler type so that the
        # getters always return an object, even before the application
        # registers its own handlers.
        self._cont_handler = handler.ContentHandler()
        self._dtd_handler = handler.DTDHandler()
        self._ent_handler = handler.EntityResolver()
        self._err_handler = handler.ErrorHandler()

    def parse(self, source):
        """Parse an XML document from a system identifier or an InputSource.

        This base implementation always raises NotImplementedError;
        concrete readers must override it.
        """
        raise NotImplementedError("This method must be implemented!")

    def getContentHandler(self):
        """Return the current ContentHandler."""
        return self._cont_handler

    def setContentHandler(self, handler):
        """Register a new object to receive document content events."""
        # NOTE: the parameter shadows the imported ``handler`` module inside
        # this method only; the name is part of the public signature.
        self._cont_handler = handler

    def getDTDHandler(self):
        """Return the current DTD handler."""
        return self._dtd_handler

    def setDTDHandler(self, handler):
        """Register an object to receive basic DTD-related events."""
        self._dtd_handler = handler

    def getEntityResolver(self):
        """Return the current EntityResolver."""
        return self._ent_handler

    def setEntityResolver(self, resolver):
        """Register an object to resolve external entities."""
        self._ent_handler = resolver

    def getErrorHandler(self):
        """Return the current ErrorHandler."""
        return self._err_handler

    def setErrorHandler(self, handler):
        """Register an object to receive error-message events."""
        self._err_handler = handler

    def setLocale(self, locale):
        """Allow an application to set the locale for errors and warnings.

        SAX parsers are not required to provide localization for errors
        and warnings; if they cannot support the requested locale,
        however, they must raise a SAX exception. Applications may
        request a locale change in the middle of a parse.

        This base implementation always raises SAXNotSupportedException.
        """
        raise SAXNotSupportedException("Locale support not implemented")

    # The four feature/property accessors below recognize nothing in this
    # base class.  The name is wrapped in a 1-tuple before %-formatting so
    # that a tuple-valued name is reported as-is instead of being unpacked
    # as the format arguments.

    def getFeature(self, name):
        """Look up and return the state of a SAX2 feature.

        Raises SAXNotRecognizedException for every name in this base class.
        """
        raise SAXNotRecognizedException(
            "Feature '%s' not recognized" % (name,))

    def setFeature(self, name, state):
        """Set the state of a SAX2 feature.

        Raises SAXNotRecognizedException for every name in this base class.
        """
        raise SAXNotRecognizedException(
            "Feature '%s' not recognized" % (name,))

    def getProperty(self, name):
        """Look up and return the value of a SAX2 property.

        Raises SAXNotRecognizedException for every name in this base class.
        """
        raise SAXNotRecognizedException(
            "Property '%s' not recognized" % (name,))

    def setProperty(self, name, value):
        """Set the value of a SAX2 property.

        Raises SAXNotRecognizedException for every name in this base class.
        """
        raise SAXNotRecognizedException(
            "Property '%s' not recognized" % (name,))
90
class IncrementalParser(XMLReader):
    """This interface adds three extra methods to the XMLReader
    interface that allow XML parsers to support incremental
    parsing. Support for this interface is optional, since not all
    underlying XML parsers support this functionality.

    When the parser is instantiated it is ready to begin accepting
    data from the feed method immediately. After parsing has been
    finished with a call to close the reset method must be called to
    make the parser ready to accept new data, either from feed or
    using the parse method.

    Note that these methods must _not_ be called during parsing, that
    is, after parse has been called and before it returns.

    By default, the class also implements the parse method of the XMLReader
    interface using the feed, close and reset methods of the
    IncrementalParser interface as a convenience to SAX 2.0 driver
    writers.
    """

    def __init__(self, bufsize=2**16):
        # Size passed to each read() call made by parse().
        self._bufsize = bufsize
        XMLReader.__init__(self)

    def parse(self, source):
        """Parse *source* by reading it in chunks and feeding each to feed().

        The source is first normalized with saxutils.prepare_input_source
        and handed to prepareParser. Data is then read from the character
        stream if the source has one, otherwise from the byte stream, in
        pieces of at most ``bufsize``, until a read returns an empty
        result. close() is called once all data has been fed.
        """
        # Function-scope import, kept exactly as in the original
        # (presumably to avoid a circular import with saxutils -- confirm).
        from . import saxutils
        source = saxutils.prepare_input_source(source)

        self.prepareParser(source)

        # A character stream takes precedence over a byte stream.
        stream = source.getCharacterStream()
        if stream is None:
            stream = source.getByteStream()

        chunk = stream.read(self._bufsize)
        while chunk:
            self.feed(chunk)
            chunk = stream.read(self._bufsize)
        self.close()

    def feed(self, data):
        """This method gives the raw XML data in the data parameter to
        the parser and makes it parse the data, emitting the
        corresponding events. It is allowed for XML constructs to be
        split across several calls to feed.

        feed may raise SAXException.
        """
        raise NotImplementedError("This method must be implemented!")

    def prepareParser(self, source):
        """This method is called by the parse implementation to allow
        the SAX 2.0 driver to prepare itself for parsing.
        """
        raise NotImplementedError("prepareParser must be overridden!")

    def close(self):
        """This method is called when the entire XML document has been
        passed to the parser through the feed method, to notify the
        parser that there are no more data. This allows the parser to
        do the final checks on the document and empty the internal
        data buffer.

        The parser will not be ready to parse another document until
        the reset method has been called.

        close may raise SAXException.
        """
        raise NotImplementedError("This method must be implemented!")

    def reset(self):
        """This method is called after close has been called to reset
        the parser so that it is ready to parse new documents. The
        results of calling parse or feed after close without calling
        reset are undefined.
        """
        raise NotImplementedError("This method must be implemented!")
162
# ===== LOCATOR =====
Lars Gustäbel32bf12e2000-09-24 18:39:23 +0000164
class Locator:
    """Interface for associating a SAX event with a document
    location. A locator object will return valid results only during
    calls to content handler methods (the SAX 1 name for that handler
    was DocumentHandler); at any other time, the results are
    unpredictable.

    This base class reports "unknown" for everything: -1 for the
    numeric positions and None for the identifiers.
    """

    def getColumnNumber(self):
        """Return the column number where the current event ends."""
        return -1

    def getLineNumber(self):
        """Return the line number where the current event ends."""
        return -1

    def getPublicId(self):
        """Return the public identifier for the current event."""
        return None

    def getSystemId(self):
        """Return the system identifier for the current event."""
        return None
Fred Drake45cd9de2000-06-29 19:34:54 +0000186
# ===== INPUTSOURCE =====
188
class InputSource:
    """Encapsulation of the information needed by the XMLReader to
    read entities.

    This class may include information about the public identifier,
    system identifier, byte stream (possibly with character encoding
    information) and/or the character stream of an entity.

    Applications will create objects of this class for use in the
    XMLReader.parse method and for returning from
    EntityResolver.resolveEntity.

    An InputSource belongs to the application, the XMLReader is not
    allowed to modify InputSource objects passed to it from the
    application, although it may make copies and modify those.
    """

    def __init__(self, system_id=None):
        # Double-underscore names are class-private (name-mangled); all
        # access goes through the getter/setter methods below.
        self.__system_id = system_id
        self.__public_id = None
        self.__encoding = None
        self.__bytefile = None
        self.__charfile = None

    def setPublicId(self, public_id):
        """Set the public identifier of this InputSource."""
        self.__public_id = public_id

    def getPublicId(self):
        """Return the public identifier of this InputSource."""
        return self.__public_id

    def setSystemId(self, system_id):
        """Set the system identifier of this InputSource."""
        self.__system_id = system_id

    def getSystemId(self):
        """Return the system identifier of this InputSource."""
        return self.__system_id

    def setEncoding(self, encoding):
        """Set the character encoding of this InputSource.

        The encoding must be a string acceptable for an XML encoding
        declaration (see section 4.3.3 of the XML recommendation).

        The encoding attribute of the InputSource is ignored if the
        InputSource also contains a character stream.
        """
        self.__encoding = encoding

    def getEncoding(self):
        """Return the character encoding of this InputSource."""
        return self.__encoding

    def setByteStream(self, bytefile):
        """Set the byte stream (a Python file-like object which does
        not perform byte-to-character conversion) for this input
        source.

        The SAX parser will ignore this if there is also a character
        stream specified, but it will use a byte stream in preference
        to opening a URI connection itself.

        If the application knows the character encoding of the byte
        stream, it should set it with the setEncoding method.
        """
        self.__bytefile = bytefile

    def getByteStream(self):
        """Return the byte stream for this input source.

        The getEncoding method will return the character encoding for
        this byte stream, or None if unknown.
        """
        return self.__bytefile

    def setCharacterStream(self, charfile):
        """Set the character stream for this input source. (The stream
        must be a file-like object that performs the conversion to
        Unicode strings itself, i.e. reading from it yields text
        rather than bytes.)

        If there is a character stream specified, the SAX parser will
        ignore any byte stream and will not attempt to open a URI
        connection to the system identifier.
        """
        self.__charfile = charfile

    def getCharacterStream(self):
        """Return the character stream for this input source."""
        return self.__charfile
Fred Drake16f63292000-10-23 18:09:50 +0000275
# ===== ATTRIBUTESIMPL =====
277
class AttributesImpl:
    """Read-only, mapping-like view of an element's attributes.

    This is the non-namespace-aware variant: attribute names and
    qualified names are the same strings, so every name/qname
    conversion is the identity (after checking that the name exists).
    """

    def __init__(self, attrs):
        """Non-NS-aware implementation.

        attrs should be of the form {name : value}.
        """
        # The mapping is stored by reference, not copied.
        self._attrs = attrs

    def getLength(self):
        """Return the number of attributes."""
        return len(self._attrs)

    def getType(self, name):
        """Return the attribute type, which is always "CDATA" here.

        The name is not looked up, so no error is raised for an
        unknown attribute.
        """
        return "CDATA"

    def getValue(self, name):
        """Return the value of attribute *name*; KeyError if absent."""
        return self._attrs[name]

    def getValueByQName(self, name):
        """Return the value for qualified name *name*; KeyError if absent."""
        return self._attrs[name]

    def getNameByQName(self, name):
        """Return the attribute name for qualified name *name*.

        Names and qnames coincide in this class, so *name* itself is
        returned; KeyError is raised if there is no such attribute.
        """
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getQNameByName(self, name):
        """Return the qualified name for attribute *name*.

        Names and qnames coincide in this class, so *name* itself is
        returned; KeyError is raised if there is no such attribute.
        """
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getNames(self):
        """Return a new list of the attribute names."""
        return list(self._attrs.keys())

    def getQNames(self):
        """Return a new list of the qualified attribute names."""
        return list(self._attrs.keys())

    def __len__(self):
        return len(self._attrs)

    def __getitem__(self, name):
        return self._attrs[name]

    def keys(self):
        """Return a new list of the attribute names."""
        return list(self._attrs.keys())

    def __contains__(self, name):
        return name in self._attrs

    def get(self, name, alternative=None):
        """Return the value of *name*, or *alternative* if it is absent."""
        return self._attrs.get(name, alternative)

    def copy(self):
        """Return a new object of the same class with the same attributes.

        The underlying mapping is snapshotted so that later changes to
        the mapping this object was built from do not show up in the
        copy. The values themselves are not copied.
        """
        return self.__class__(dict(self._attrs))

    def items(self):
        """Return a new list of (name, value) pairs."""
        return list(self._attrs.items())

    def values(self):
        """Return a new list of the attribute values."""
        return list(self._attrs.values())
Fred Drake45cd9de2000-06-29 19:34:54 +0000337
# ===== ATTRIBUTESNSIMPL =====
339
class AttributesNSImpl(AttributesImpl):
    """Namespace-aware variant of AttributesImpl.

    Attributes are keyed by (ns_uri, lname) tuples, and a second
    mapping translates each such key to its qualified name.
    """

    def __init__(self, attrs, qnames):
        """NS-aware implementation.

        attrs should be of the form {(ns_uri, lname): value, ...}.
        qnames of the form {(ns_uri, lname): qname, ...}.
        """
        # Both mappings are stored by reference, not copied.
        self._attrs = attrs
        self._qnames = qnames

    def getValueByQName(self, name):
        """Return the value of the attribute whose qualified name is *name*.

        The qname mapping is searched linearly; KeyError is raised if
        no entry has that qualified name.
        """
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return self._attrs[nsname]

        raise KeyError(name)

    def getNameByQName(self, name):
        """Return the (ns_uri, lname) key whose qualified name is *name*.

        The qname mapping is searched linearly; KeyError is raised if
        no entry has that qualified name.
        """
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return nsname

        raise KeyError(name)

    def getQNameByName(self, name):
        """Return the qualified name for key *name*; KeyError if absent."""
        return self._qnames[name]

    def getQNames(self):
        """Return a new list of the qualified attribute names."""
        return list(self._qnames.values())

    def copy(self):
        """Return a new object of the same class with the same attributes.

        Both underlying mappings are snapshotted so that later changes
        to the mappings this object was built from do not show up in
        the copy. The values themselves are not copied.
        """
        return self.__class__(dict(self._attrs), dict(self._qnames))
Fred Drake16f63292000-10-23 18:09:50 +0000372
Fred Drake07cbc4e2000-09-21 17:43:48 +0000373
def _test():
    """Smoke test: instantiate the interface classes defined in this module."""
    XMLReader()
    IncrementalParser()
    Locator()
Fred Drake45cd9de2000-06-29 19:34:54 +0000378
# Run the smoke test when this module is executed as a script.
if __name__ == "__main__":
    _test()