blob: 421f9455b0b3f71ffb9e04ad59bb75b6a2734967 [file] [log] [blame]
William M. Bracke9449c52004-07-11 14:41:20 +00001# -*- coding: iso-8859-1 -*-
""" A SAX2 driver for libxml2, on top of its XmlReader API
3
4USAGE
5 # put this file (drv_libxml2.py) in PYTHONPATH
6 import xml.sax
7 reader = xml.sax.make_parser(["drv_libxml2"])
8 # ...and the rest is standard python sax.
9
10CAVEATS
11 - Lexical handlers are supported, except for start/endEntity
12 (waiting for XmlReader.ResolveEntity) and start/endDTD
Daniel Veillard417be3a2003-01-20 21:26:34 +000013 - Error callbacks are not exactly synchronous, they tend
14 to be invoked before the corresponding content callback,
15 because the underlying reader interface parses
16 data by chunks of 512 bytes
Daniel Veillard4f860202003-01-02 13:00:02 +000017
18TODO
19 - search for TODO
20 - some ErrorHandler events (warning)
21 - some ContentHandler events (setDocumentLocator, skippedEntity)
22 - EntityResolver (using libxml2.?)
23 - DTDHandler (if/when libxml2 exposes such node types)
24 - DeclHandler (if/when libxml2 exposes such node types)
25 - property_xml_string?
26 - feature_string_interning?
27 - Incremental parser
28 - additional performance tuning:
29 - one might cache callbacks to avoid some name lookups
30 - one might implement a smarter way to pass attributes to startElement
31 (some kind of lazy evaluation?)
32 - there might be room for improvement in start/endPrefixMapping
33 - other?
34
35"""
36
37__author__ = u"Stéphane Bidoul <sbi@skynet.be>"
Daniel Veillard417be3a2003-01-20 21:26:34 +000038__version__ = "0.3"
Daniel Veillard4f860202003-01-02 13:00:02 +000039
40import codecs
Daniel Veillarde329fc22003-01-09 21:36:42 +000041import sys
42from types import StringType, UnicodeType
43StringTypes = (StringType,UnicodeType)
Daniel Veillard4f860202003-01-02 13:00:02 +000044
45from xml.sax._exceptions import *
46from xml.sax import xmlreader, saxutils
47from xml.sax.handler import \
48 feature_namespaces, \
49 feature_namespace_prefixes, \
50 feature_string_interning, \
51 feature_validation, \
52 feature_external_ges, \
53 feature_external_pes, \
54 property_lexical_handler, \
55 property_declaration_handler, \
56 property_dom_node, \
57 property_xml_string
58
# Strings coming out of libxml2 are UTF-8 encoded; keep the codec's
# decode function at hand so that _d() does not look it up on every call.
_decoder = codecs.lookup("utf8")[1]

def _d(s):
    """Decode a UTF-8 string returned by libxml2.

    None is passed through unchanged, so optional values (for instance a
    missing namespace URI) can be fed to this function directly.
    """
    if s is not None:
        # the decoder returns a (text, consumed length) pair
        return _decoder(s)[0]
    return s
66
67try:
68 import libxml2
69except ImportError, e:
Daniel Veillarde329fc22003-01-09 21:36:42 +000070 raise SAXReaderNotAvailable("libxml2 not available: " \
71 "import error was: %s" % e)
Daniel Veillard4f860202003-01-02 13:00:02 +000072
class Locator(xmlreader.Locator):
    """Present a libxml2 xmlTextReaderLocator through the SAX Locator API.

    Only the line number and the system identifier are obtained from the
    wrapped object; the column number and the public identifier are always
    reported as unavailable (-1 and None respectively).
    """

    def __init__(self, locator):
        # wrapped locator: LineNumber() and BaseURI() are called on it
        self.__locator = locator

    def getLineNumber(self):
        """Return the line number where the current event ends."""
        return self.__locator.LineNumber()

    def getColumnNumber(self):
        """Return the column number where the current event ends.

        Always -1: no column is read from the wrapped locator.
        """
        return -1

    def getSystemId(self):
        """Return the system identifier for the current event.

        This is the value of BaseURI() on the wrapped locator.
        """
        return self.__locator.BaseURI()

    def getPublicId(self):
        """Return the public identifier for the current event (always None)."""
        return None
Daniel Veillard4f860202003-01-02 13:00:02 +000094
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader driven by a libxml2 text reader.

    parse() pulls nodes one at a time from the libxml2 reader and turns them
    into ContentHandler / ErrorHandler calls, plus LexicalHandler calls when
    a lexical handler has been installed through setProperty().

    Supported features: namespaces, namespace-prefixes and validation.
    The external-ges / external-pes features are always on and cannot be
    switched off.  DTDHandler, EntityResolver and the declaration handler
    property are not supported.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        # parsing flag (features cannot be changed while it is set)
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error accumulator: None, or a list of (severity, exception) pairs
        # filled by _errorHandler() and flushed by _reportErrors()
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Callback registered on the libxml2 reader: queue one message.

        The message is wrapped in a SAXParseException and stored together
        with its severity; it is handed to the SAX ErrorHandler later, by
        _reportErrors().  `arg` is the user data given to SetErrorHandler()
        and is not used.
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Hand every queued message to the SAX ErrorHandler.

        Messages with a warning severity go to warning(), all others to
        error().  When `fatal` is true the parse is about to stop, and the
        last queued exception, if it is not a warning, is considered the
        fatal one and goes to fatalError() instead.
        """
        # Detach the queue before dispatching: an error handler is allowed
        # to raise (the stock xml.sax ErrorHandler does so for error() and
        # fatalError()), and the queue must not survive that to be reported
        # a second time by a later parse.
        errors = self.__errors
        self.__errors = None
        if not errors:
            return
        lastException = errors[-1][1]
        warnings = (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                    libxml2.PARSER_SEVERITY_WARNING)
        for severity, exception in errors:
            if severity in warnings:
                self._err_handler.warning(exception)
            elif fatal and exception is lastException:
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    def _nsDeclPrefix(self, qname):
        """Tell whether attribute `qname` is a namespace declaration.

        Returns (True, prefix) for "xmlns:prefix", (True, None) for the
        default namespace declaration "xmlns", and (False, None) for any
        other attribute, including names that merely begin with the
        letters "xmlns" (e.g. "xmlnsfoo").
        """
        if qname == "xmlns":
            return True, None
        if qname.startswith("xmlns:"):
            return True, qname[6:]
        return False, None

    def _startElementNS(self, reader, attributesNSImpl, prefixes):
        """Emit the events of an Element node in namespace mode.

        Namespace declarations found among the attributes are reported with
        startPrefixMapping() and are only kept as attributes when the
        namespace-prefixes feature is on.  For an empty element the matching
        endElementNS()/endPrefixMapping() calls are made immediately;
        otherwise the declared prefixes are pushed on `prefixes` so that the
        EndElement node can close them.
        """
        eltName = (_d(reader.NamespaceUri()), _d(reader.LocalName()))
        eltQName = _d(reader.Name())
        # the attribute map object is reused from one element to the next
        # (for a slight performance gain); only its dictionaries are fresh
        attributesNSImpl._attrs = attrs = {}
        attributesNSImpl._qnames = qnames = {}
        newPrefixes = []
        while reader.MoveToNextAttribute():
            qname = _d(reader.Name())
            value = _d(reader.Value())
            isDecl, newPrefix = self._nsDeclPrefix(qname)
            if isDecl:
                newPrefixes.append(newPrefix)
                self._cont_handler.startPrefixMapping(newPrefix, value)
                if not self.__nspfx:
                    continue # don't report xmlns attribute
            attName = (_d(reader.NamespaceUri()), _d(reader.LocalName()))
            qnames[attName] = qname
            attrs[attName] = value
        reader.MoveToElement()
        self._cont_handler.startElementNS(eltName, eltQName, attributesNSImpl)
        if reader.IsEmptyElement():
            self._cont_handler.endElementNS(eltName, eltQName)
            for newPrefix in newPrefixes:
                self._cont_handler.endPrefixMapping(newPrefix)
        else:
            prefixes.append(newPrefixes)

    def _startElement(self, reader, attributesImpl):
        """Emit the events of an Element node when namespaces are off."""
        eltName = _d(reader.Name())
        # same reuse of the attribute map object as in namespace mode
        attributesImpl._attrs = attrs = {}
        while reader.MoveToNextAttribute():
            attrs[_d(reader.Name())] = _d(reader.Value())
        reader.MoveToElement()
        self._cont_handler.startElement(eltName, attributesImpl)
        if reader.IsEmptyElement():
            self._cont_handler.endElement(eltName)

    def _readAll(self, reader):
        """Configure `reader`, then read it to the end, emitting SAX events.

        endDocument() is only called when the end of the input was reached
        without a fatal read error.
        """
        reader.SetErrorHandler(self._errorHandler, None)
        # configure reader
        reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
        reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
        reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
        reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
        # we reuse attribute maps (for a slight performance gain)
        if self.__ns:
            attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
        else:
            attributesImpl = xmlreader.AttributesImpl({})
        # stack of prefix lists to pop (for endPrefixMapping)
        prefixes = []
        # start loop
        self._cont_handler.startDocument()
        while 1:
            r = reader.Read()
            # Read() gives 1 for "node available", 0 for "end of input";
            # any other value is treated as a fatal read error
            if r == 1:
                if self.__errors is not None:
                    self._reportErrors(0)
            elif r == 0:
                if self.__errors is not None:
                    self._reportErrors(0)
                break # end of parse
            else:
                if self.__errors is not None:
                    self._reportErrors(1)
                else:
                    self._err_handler.fatalError(
                        SAXException("Read failed (no details available)"))
                break # fatal parse error
            # get node type
            nodeType = reader.NodeType()
            # Element
            if nodeType == 1:
                if self.__ns:
                    self._startElementNS(reader, attributesNSImpl, prefixes)
                else:
                    self._startElement(reader, attributesImpl)
            # EndElement
            elif nodeType == 15:
                if self.__ns:
                    self._cont_handler.endElementNS(
                        (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                        _d(reader.Name()))
                    for prefix in prefixes.pop():
                        self._cont_handler.endPrefixMapping(prefix)
                else:
                    self._cont_handler.endElement(_d(reader.Name()))
            # Text (3) and SignificantWhitespace (14)
            elif nodeType in (3, 14):
                self._cont_handler.characters(_d(reader.Value()))
            # Whitespace
            elif nodeType == 13:
                self._cont_handler.ignorableWhitespace(_d(reader.Value()))
            # CDATA
            elif nodeType == 4:
                if self.__lex_handler is not None:
                    self.__lex_handler.startCDATA()
                self._cont_handler.characters(_d(reader.Value()))
                if self.__lex_handler is not None:
                    self.__lex_handler.endCDATA()
            # EntityReference
            elif nodeType == 5:
                # entity events belong to the lexical handler; the reader
                # itself has no startEntity/endEntity methods
                if self.__lex_handler is not None:
                    self.__lex_handler.startEntity(_d(reader.Name()))
                # NOTE(review): the module docstring says entity events are
                # waiting for XmlReader.ResolveEntity; confirm that the
                # libxml2 binding in use actually provides this method.
                reader.ResolveEntity()
            # EndEntity
            elif nodeType == 16:
                if self.__lex_handler is not None:
                    self.__lex_handler.endEntity(_d(reader.Name()))
            # ProcessingInstruction
            elif nodeType == 7:
                self._cont_handler.processingInstruction(
                    _d(reader.Name()), _d(reader.Value()))
            # Comment
            elif nodeType == 8:
                if self.__lex_handler is not None:
                    self.__lex_handler.comment(_d(reader.Value()))
            # DocumentType
            elif nodeType == 10:
                pass # TODO startDTD (how to detect endDTD? on first non-dtd event?)
            # XmlDeclaration
            elif nodeType == 17:
                pass # TODO
            # Entity
            elif nodeType == 6:
                pass # TODO (entity decl)
            # Notation (decl)
            elif nodeType == 12:
                pass # TODO
            # Not handled on purpose: Attribute (2, never seen in this loop),
            # Document (9, not exposed), DocumentFragment (11, never returned
            # by XmlReader) and None (0).
            else:
                raise SAXException("Unexpected node type %d" % nodeType)
        if r == 0:
            self._cont_handler.endDocument()

    def parse(self, source):
        """Parse `source` and report its content to the registered handlers.

        `source` is either a string, which is given as is to
        libxml2.newTextReaderFilename(), or any object accepted by
        saxutils.prepare_input_source(), whose byte stream is then read.

        Problems are reported through the ErrorHandler; SAXException is
        raised directly when the reader returns a node type this driver
        does not know.  Exceptions raised by handlers propagate to the
        caller; the reader is closed in every case.
        """
        self.__parsing = 1
        # forget messages left over by a previous parse that was aborted
        self.__errors = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                # the buffer stays referenced from this frame for as long
                # as the reader is in use
                inputBuffer = libxml2.inputBuffer(source.getByteStream())
                reader = inputBuffer.newTextReader(source.getSystemId())
            try:
                self._readAll(reader)
            finally:
                reader.Close()
        finally:
            self.__parsing = 0

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of feature `name`.

        Raises SAXNotRecognizedException for an unknown feature.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def setFeature(self, name, state):
        """Set feature `name` to `state`.

        Raises SAXNotSupportedException while a parse is in progress, or
        when asked to switch off external-ges / external-pes, and
        SAXNotRecognizedException for an unknown feature.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" %
                                               name)
        elif name == feature_external_pes:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" %
                                               name)
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def getProperty(self, name):
        """Return the value of property `name`.

        Raises SAXNotRecognizedException for an unknown property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    def setProperty(self, name, value):
        """Set property `name` to `value`.

        Only the lexical handler can be set.  The declaration handler is
        recognized but raises SAXNotSupportedException; any other property
        raises SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler in self.__decl_handler instead
            # if/when libxml2 supports dtd events
            raise SAXNotSupportedException("Property '%s' not supported" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)
368
def create_parser():
    """Build and return a new LibXml2Reader.

    The module docstring shows the intended use:
    xml.sax.make_parser(["drv_libxml2"]).
    """
    parser = LibXml2Reader()
    return parser
371