blob: 514aa89954e0c0570ff9596a07d384bd036a9cc8 [file] [log] [blame]
""" A SAX2 driver for libxml2, on top of its XmlReader API
2
3USAGE
4 # put this file (drv_libxml2.py) in PYTHONPATH
5 import xml.sax
6 reader = xml.sax.make_parser(["drv_libxml2"])
7 # ...and the rest is standard python sax.
8
9CAVEATS
10 - Lexical handlers are supported, except for start/endEntity
11 (waiting for XmlReader.ResolveEntity) and start/endDTD
    - as far as I understand it, libxml2 error handlers are global
      (per thread); each call to parse() registers a new error handler,
      overwriting any previously registered handler
      --> you can't have 2 LibXml2Reader instances active at the same time
16
17TODO
18 - search for TODO
19 - some ErrorHandler events (warning)
20 - some ContentHandler events (setDocumentLocator, skippedEntity)
21 - EntityResolver (using libxml2.?)
22 - DTDHandler (if/when libxml2 exposes such node types)
23 - DeclHandler (if/when libxml2 exposes such node types)
24 - property_xml_string?
25 - feature_string_interning?
26 - Incremental parser
27 - additional performance tuning:
28 - one might cache callbacks to avoid some name lookups
29 - one might implement a smarter way to pass attributes to startElement
30 (some kind of lazy evaluation?)
31 - there might be room for improvement in start/endPrefixMapping
32 - other?
33
34"""
35
# Module metadata: author contact and the version string of this driver.
__author__ = u"Stéphane Bidoul <sbi@skynet.be>"
__version__ = "0.1"
38
39import codecs
40from types import StringTypes
41
42from xml.sax._exceptions import *
43from xml.sax import xmlreader, saxutils
44from xml.sax.handler import \
45 feature_namespaces, \
46 feature_namespace_prefixes, \
47 feature_string_interning, \
48 feature_validation, \
49 feature_external_ges, \
50 feature_external_pes, \
51 property_lexical_handler, \
52 property_declaration_handler, \
53 property_dom_node, \
54 property_xml_string
55
56# libxml2 returns strings as UTF8
57_decoder = codecs.getdecoder("utf8")
58def _d(s):
59 if s is None:
60 return s
61 else:
62 return _decoder(s)[0]
63
# libxml2 is the only hard dependency of this driver.  When it is missing,
# xml.sax.make_parser() expects SAXReaderNotAvailable so that it can try the
# next driver in its list.
try:
    import libxml2
except ImportError:
    import sys
    # Format the exception with %s: concatenating a str with the exception
    # object raises TypeError and would hide the real problem.
    # sys.exc_info() is used instead of binding the exception in the except
    # clause so the statement parses on every Python version.
    raise SAXReaderNotAvailable("libxml2 not available: %s"
                                % sys.exc_info()[1])
68
69try:
70 import libxslt
71except ImportError:
72 # normal behaviour
73 def _registerErrorHandler(handler):
74 libxml2.registerErrorHandler(handler,"drv_libxml")
75else:
76 # work around libxslt bindings bug (libxml2 bug #102181)
77 def _registerErrorHandler(handler):
78 libxml2.registerErrorHandler(handler,"drv_libxml")
79 libxslt.registerErrorHandler(handler,"drv_libxml")
80
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls nodes from a libxml2 text reader.

    parse() walks the document with the libxml2 XmlReader API and turns
    every node into the matching ContentHandler / LexicalHandler event.
    Supported features: namespaces, namespace-prefixes and validation
    (external general/parameter entities are always reported as on).
    Supported properties: the lexical handler; the declaration handler can
    be read but not set.  DTDHandler and EntityResolver are not supported.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator
        self.__errors = None

    def _errorHandler(self,ctx,str):
        """libxml2 error callback: accumulate the message fragment *str*.

        *ctx* is the context value given at registration time; it is unused.
        The fragments are joined and reported by _reportError().
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append(str)

    def _reportError(self,callback):
        """Pass the accumulated error text to *callback* and reset it.

        *callback* is one of the ErrorHandler methods (error, fatalError);
        it receives a single SAXException built from all pending messages.
        """
        # TODO: use SAXParseException, but we need a Locator for that
        # TODO: distinguish warnings from errors
        msg = "".join(self.__errors)
        self.__errors = None
        callback(SAXException(msg))

    def parse(self, source):
        """Parse *source* and fire SAX events on the registered handlers.

        *source* is either a string (handed to libxml2 as a file name) or
        anything accepted by saxutils.prepare_input_source().  Errors
        collected by the libxml2 error handler are forwarded to the SAX
        ErrorHandler: error() when Read() still succeeds, fatalError() when
        Read() fails.  Raises SAXException on an unexpected node type.
        """
        self.__parsing = 1
        # forget messages left over from a previous, aborted parse
        self.__errors = None
        _registerErrorHandler(self._errorHandler)
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                inputBuffer = libxml2.inputBuffer(source.getByteStream())
                reader = inputBuffer.newTextReader(source.getSystemId())
            # configure reader
            reader.SetParserProp(libxml2.PARSER_LOADDTD,1)
            reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS,1)
            reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES,1)
            reader.SetParserProp(libxml2.PARSER_VALIDATE,self.__validate)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({},{})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping)
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        # non-fatal error
                        self._reportError(self._err_handler.error)
                elif r == 0:
                    if self.__errors is not None:
                        # non-fatal error
                        self._reportError(self._err_handler.error)
                    break
                else:
                    # fatal error
                    if self.__errors is not None:
                        self._reportError(self._err_handler.fatalError)
                    else:
                        self._err_handler.fatalError(\
                            SAXException("Read failed (no details available)"))
                    break
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),\
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # only "xmlns" and "xmlns:prefix" declare a
                            # namespace; "xmlnsfoo" is an ordinary attribute
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                if len(qname) > 5:
                                    newPrefix = qname[6:]
                                else:
                                    newPrefix = None
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(\
                                    newPrefix,value)
                                if not self.__nspfx:
                                    continue # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        # go back to the element itself: IsEmptyElement()
                        # is only meaningful on the element node, not on
                        # the last attribute visited
                        reader.MoveToElement()
                        self._cont_handler.startElementNS( \
                            eltName,eltQName,attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node will follow: close it now
                            self._cont_handler.endElementNS(eltName,eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        # see above: reposition before IsEmptyElement()
                        reader.MoveToElement()
                        self._cont_handler.startElement( \
                            eltName,attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS( \
                            (_d(reader.NamespaceUri()),_d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    if self.__lex_handler is not None:
                        # entity boundaries are lexical events: they go to
                        # the lexical handler, not to the reader itself
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction( \
                        _d(reader.Name()),_d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    #if not self.__lex_handler is None:
                    #    self.__lex_handler.startDTD()
                    pass # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass # TODO
                # Entity
                elif nodeType == 6:
                    pass # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass # TODO
                # Attribute (never in this loop)
                #elif nodeType == 2:
                #    pass
                # Document (not exposed)
                #elif nodeType == 9:
                #    pass
                # DocumentFragment (never returned by XmlReader)
                #elif nodeType == 11:
                #    pass
                # None
                #elif nodeType == 0:
                #    pass
                # -
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            self.__parsing = 0
            # close the reader even when a handler raised in mid-document
            if reader is not None:
                reader.Close()
            # TODO: unregister error handler?

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the state of feature *name*.

        Raises SAXNotRecognizedException for an unknown feature.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def setFeature(self, name, state):
        """Set feature *name* to *state*.

        Raises SAXNotSupportedException while a parse is in progress or
        when asked to switch off external general/parameter entities, and
        SAXNotRecognizedException for an unknown feature.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s " \
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" % \
                                               name)
        elif name == feature_external_pes:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" % \
                                               name)
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def getProperty(self, name):
        """Return the value of property *name*.

        Raises SAXNotRecognizedException for an unknown property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)

    def setProperty(self, name, value):
        """Set property *name* to *value*.

        Only the lexical handler can be set.  The declaration handler
        raises SAXNotSupportedException and any other name raises
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler in self.__decl_handler instead of
            # raising, if/when libxml2 supports dtd events
            raise SAXNotSupportedException("Property '%s' not supported" % \
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)
345 name)
346
def create_parser():
    """Driver entry point: return a new LibXml2Reader instance."""
    parser = LibXml2Reader()
    return parser
349