blob: 71b1c67dba9933355ad6d6ead5b7d929cfc38f49 [file] [log] [blame]
William M. Bracke9449c52004-07-11 14:41:20 +00001# -*- coding: iso-8859-1 -*-
""" A SAX2 driver for libxml2, on top of its XmlReader API

USAGE
    # put this file (drv_libxml2.py) in PYTHONPATH
    import xml.sax
    reader = xml.sax.make_parser(["drv_libxml2"])
    # ...and the rest is standard python sax.

CAVEATS
    - Lexical handlers are supported, except for start/endEntity
      (waiting for XmlReader.ResolveEntity) and start/endDTD
    - Error callbacks are not exactly synchronous, they tend
      to be invoked before the corresponding content callback,
      because the underlying reader interface parses
      data by chunks of 512 bytes

TODO
    - search for TODO
    - some ErrorHandler events (warning)
    - some ContentHandler events (setDocumentLocator, skippedEntity)
    - EntityResolver (using libxml2.?)
    - DTDHandler (if/when libxml2 exposes such node types)
    - DeclHandler (if/when libxml2 exposes such node types)
    - property_xml_string?
    - feature_string_interning?
    - Incremental parser
    - additional performance tuning:
      - one might cache callbacks to avoid some name lookups
      - one might implement a smarter way to pass attributes to startElement
        (some kind of lazy evaluation?)
      - there might be room for improvement in start/endPrefixMapping
      - other?

"""
36
Alexandre Rostovtsev6c9c6112013-07-10 23:00:54 -040037__author__ = "Stéphane Bidoul <sbi@skynet.be>"
Daniel Veillard417be3a2003-01-20 21:26:34 +000038__version__ = "0.3"
Daniel Veillard4f860202003-01-02 13:00:02 +000039
Alexandre Rostovtsev6c9c6112013-07-10 23:00:54 -040040import sys
Daniel Veillard4f860202003-01-02 13:00:02 +000041import codecs
Alexandre Rostovtsev6c9c6112013-07-10 23:00:54 -040042
if sys.version_info[0] >= 3:
    StringTypes = str

    def _d(s):
        """Return *s* as is: it is a Unicode `str` already."""
        return s
else:
    # Python 2 only: run the author string through the unicode-escape codec
    __author__ = codecs.unicode_escape_decode(__author__)[0]

    StringTypes = (str, unicode)
    # libxml2 returns strings as UTF8
    _decoder = codecs.lookup("utf8")[1]

    def _d(s):
        """Decode the UTF-8 string *s*; ``None`` is passed through untouched."""
        if s is not None:
            return _decoder(s)[0]
        return s
Daniel Veillard4f860202003-01-02 13:00:02 +000059
60from xml.sax._exceptions import *
61from xml.sax import xmlreader, saxutils
62from xml.sax.handler import \
63 feature_namespaces, \
64 feature_namespace_prefixes, \
65 feature_string_interning, \
66 feature_validation, \
67 feature_external_ges, \
68 feature_external_pes, \
69 property_lexical_handler, \
70 property_declaration_handler, \
71 property_dom_node, \
72 property_xml_string
73
try:
    import libxml2
except ImportError:
    # Report the missing binding through the SAX-specific exception; the
    # active ImportError is fetched with sys.exc_info() and embedded in the
    # message.
    raise SAXReaderNotAvailable(
        ("libxml2 not available: "
         "import error was: %s") % sys.exc_info()[1])
Daniel Veillard4f860202003-01-02 13:00:02 +000079
class Locator(xmlreader.Locator):
    """Present a libxml2.xmlTextReaderLocator through the SAX Locator API."""

    def __init__(self, locator):
        # the wrapped libxml2 locator; only queried when a getter is called
        self.__locator = locator

    def getLineNumber(self):
        """Return the line number reported by the wrapped locator."""
        wrapped = self.__locator
        return wrapped.LineNumber()

    def getColumnNumber(self):
        """Return -1: no column information is provided by this adapter."""
        return -1

    def getSystemId(self):
        """Return the base URI reported by the wrapped locator."""
        wrapped = self.__locator
        return wrapped.BaseURI()

    def getPublicId(self):
        """Return None: no public identifier is provided by this adapter."""
        return None
Daniel Veillard4f860202003-01-02 13:00:02 +0000101
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls events from a libxml2 text reader.

    parse() reads a document node by node and forwards each node to the
    registered ContentHandler, ErrorHandler and (optional) lexical handler.
    DTDHandler, EntityResolver and the declaration handler are rejected
    with SAXNotSupportedException.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator: None, or a list of
        # (severity, SAXParseException) tuples filled by _errorHandler
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Callback registered on the libxml2 reader; queues one error.

        Errors are only stored here; they are forwarded to the SAX
        ErrorHandler by _reportErrors() after the current Read() returns.
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward all queued errors to the SAX ErrorHandler.

        Warnings go to warning(); everything else goes to error(), except
        that when *fatal* is true the last queued exception is sent to
        fatalError() instead.
        """
        # Detach the queue first so that a handler which raises does not
        # leave already-reported errors behind for a later call.
        errors = self.__errors
        self.__errors = None
        if not errors:
            return
        # when fatal is set, the parse will stop;
        # we consider that the last error reported
        # is the fatal one.
        lastException = errors[-1][1]
        for severity, exception in errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            elif fatal and exception is lastException:
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    @staticmethod
    def _nsDeclPrefix(qname):
        """Classify an attribute qname as a namespace declaration.

        Returns (True, None) for "xmlns", (True, prefix) for
        "xmlns:prefix" and (False, None) for any other name, including
        names that merely begin with the letters "xmlns".
        """
        if qname == "xmlns":
            return True, None
        if qname.startswith("xmlns:"):
            return True, qname[6:]
        return False, None

    def parse(self, source):
        """Parse *source* and emit SAX events to the registered handlers.

        *source* is either a string (handed to libxml2 as a file name) or
        anything accepted by saxutils.prepare_input_source().  The reader
        is closed and the parsing flag reset even if a handler raises.
        """
        self.__parsing = 1
        # drop errors that a previous, aborted parse may have left queued
        self.__errors = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                # this local stays referenced for the whole parse, as the
                # original local did
                inputBuffer = libxml2.inputBuffer(source.getByteStream())
                reader = inputBuffer.newTextReader(source.getSystemId())
            try:
                reader.SetErrorHandler(self._errorHandler, None)
                self._configureReader(reader)
                self._dispatchEvents(reader)
            finally:
                reader.Close()
        finally:
            self.__parsing = 0

    def _configureReader(self, reader):
        """Apply the current feature settings to a new libxml2 reader."""
        if self.__extparams:
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
            reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
            reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
            reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
        else:
            reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)

    def _dispatchEvents(self, reader):
        """Read nodes from *reader* until the end and emit SAX events."""
        # we reuse attribute maps (for a slight performance gain)
        if self.__ns:
            attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
        else:
            attributesImpl = xmlreader.AttributesImpl({})
        # prefixes to pop (for endPrefixMapping): one list per open element
        prefixes = []
        # start loop
        self._cont_handler.startDocument()
        while True:
            r = reader.Read()
            # check for errors
            if r == 1:
                if self.__errors is not None:
                    self._reportErrors(0)
            elif r == 0:
                if self.__errors is not None:
                    self._reportErrors(0)
                break  # end of parse
            else:
                if self.__errors is not None:
                    self._reportErrors(1)
                else:
                    self._err_handler.fatalError(
                        SAXException("Read failed (no details available)"))
                break  # fatal parse error
            # get node type
            nodeType = reader.NodeType()
            # Element
            if nodeType == 1:
                if self.__ns:
                    eltName = (_d(reader.NamespaceUri()),
                               _d(reader.LocalName()))
                    eltQName = _d(reader.Name())
                    attributesNSImpl._attrs = attrs = {}
                    attributesNSImpl._qnames = qnames = {}
                    newPrefixes = []
                    while reader.MoveToNextAttribute():
                        qname = _d(reader.Name())
                        value = _d(reader.Value())
                        isDecl, newPrefix = self._nsDeclPrefix(qname)
                        if isDecl:
                            newPrefixes.append(newPrefix)
                            self._cont_handler.startPrefixMapping(
                                newPrefix, value)
                            if not self.__nspfx:
                                continue  # don't report xmlns attribute
                        attName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        qnames[attName] = qname
                        attrs[attName] = value
                    reader.MoveToElement()
                    self._cont_handler.startElementNS(
                        eltName, eltQName, attributesNSImpl)
                    if reader.IsEmptyElement():
                        # no EndElement node follows: close it right away
                        self._cont_handler.endElementNS(eltName, eltQName)
                        for newPrefix in newPrefixes:
                            self._cont_handler.endPrefixMapping(newPrefix)
                    else:
                        prefixes.append(newPrefixes)
                else:
                    eltName = _d(reader.Name())
                    attributesImpl._attrs = attrs = {}
                    while reader.MoveToNextAttribute():
                        attName = _d(reader.Name())
                        attrs[attName] = _d(reader.Value())
                    reader.MoveToElement()
                    self._cont_handler.startElement(eltName, attributesImpl)
                    if reader.IsEmptyElement():
                        self._cont_handler.endElement(eltName)
            # EndElement
            elif nodeType == 15:
                if self.__ns:
                    self._cont_handler.endElementNS(
                        (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                        _d(reader.Name()))
                    for prefix in prefixes.pop():
                        self._cont_handler.endPrefixMapping(prefix)
                else:
                    self._cont_handler.endElement(_d(reader.Name()))
            # Text
            elif nodeType == 3:
                self._cont_handler.characters(_d(reader.Value()))
            # Whitespace
            elif nodeType == 13:
                self._cont_handler.ignorableWhitespace(_d(reader.Value()))
            # SignificantWhitespace
            elif nodeType == 14:
                self._cont_handler.characters(_d(reader.Value()))
            # CDATA
            elif nodeType == 4:
                if self.__lex_handler is not None:
                    self.__lex_handler.startCDATA()
                self._cont_handler.characters(_d(reader.Value()))
                if self.__lex_handler is not None:
                    self.__lex_handler.endCDATA()
            # EntityReference
            elif nodeType == 5:
                if self.__lex_handler is not None:
                    # entity events belong to the lexical handler; the
                    # reader itself has no startEntity method
                    self.__lex_handler.startEntity(_d(reader.Name()))
                # NOTE(review): the module caveats say start/endEntity is
                # "waiting for XmlReader.ResolveEntity" - confirm that the
                # libxml2 binding in use provides this method.
                reader.ResolveEntity()
            # EndEntity
            elif nodeType == 16:
                if self.__lex_handler is not None:
                    self.__lex_handler.endEntity(_d(reader.Name()))
            # ProcessingInstruction
            elif nodeType == 7:
                self._cont_handler.processingInstruction(
                    _d(reader.Name()), _d(reader.Value()))
            # Comment
            elif nodeType == 8:
                if self.__lex_handler is not None:
                    self.__lex_handler.comment(_d(reader.Value()))
            # DocumentType
            elif nodeType == 10:
                #if not self.__lex_handler is None:
                #    self.__lex_handler.startDTD()
                pass  # TODO (how to detect endDTD? on first non-dtd event?)
            # XmlDeclaration
            elif nodeType == 17:
                pass  # TODO
            # Entity
            elif nodeType == 6:
                pass  # TODO (entity decl)
            # Notation (decl)
            elif nodeType == 12:
                pass  # TODO
            # Attribute (2, never in this loop), Document (9, not exposed),
            # DocumentFragment (11, never returned by XmlReader) and
            # None (0) are deliberately not handled.
            else:
                raise SAXException("Unexpected node type %d" % nodeType)
        if r == 0:
            self._cont_handler.endDocument()

    def setDTDHandler(self, handler):
        """Always raise SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Always raise SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of feature *name*.

        Raises SAXNotRecognizedException for unknown feature names.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1  # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def setFeature(self, name, state):
        """Set feature *name* to *state*.

        Raises SAXNotSupportedException while a parse is running or when
        trying to switch off external general entities, and
        SAXNotRecognizedException for unknown feature names.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" %
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def getProperty(self, name):
        """Return the lexical or declaration handler property.

        Raises SAXNotRecognizedException for any other property name.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    def setProperty(self, name, value):
        """Set the lexical handler property.

        The declaration handler is rejected with SAXNotSupportedException;
        any other property raises SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler instead if/when libxml2 supports
            # dtd events
            raise SAXNotSupportedException("Property '%s' not supported" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)
376
def create_parser():
    """Build and return a fresh LibXml2Reader.

    Per the module usage notes, this driver is selected through
    xml.sax.make_parser(["drv_libxml2"]).
    """
    parser = LibXml2Reader()
    return parser
379