blob: 977c8686ef0d36800e6e5af1954cb87d1cbeba39 [file] [log] [blame]
""" A SAX2 driver for libxml2, on top of its XmlReader API
2
3USAGE
4 # put this file (drv_libxml2.py) in PYTHONPATH
5 import xml.sax
6 reader = xml.sax.make_parser(["drv_libxml2"])
7 # ...and the rest is standard python sax.
8
9CAVEATS
10 - Lexical handlers are supported, except for start/endEntity
11 (waiting for XmlReader.ResolveEntity) and start/endDTD
Daniel Veillard417be3a2003-01-20 21:26:34 +000012 - Error callbacks are not exactly synchronous, they tend
13 to be invoked before the corresponding content callback,
14 because the underlying reader interface parses
15 data by chunks of 512 bytes
Daniel Veillard4f860202003-01-02 13:00:02 +000016
17TODO
18 - search for TODO
19 - some ErrorHandler events (warning)
20 - some ContentHandler events (setDocumentLocator, skippedEntity)
21 - EntityResolver (using libxml2.?)
22 - DTDHandler (if/when libxml2 exposes such node types)
23 - DeclHandler (if/when libxml2 exposes such node types)
24 - property_xml_string?
25 - feature_string_interning?
26 - Incremental parser
27 - additional performance tuning:
28 - one might cache callbacks to avoid some name lookups
29 - one might implement a smarter way to pass attributes to startElement
30 (some kind of lazy evaluation?)
31 - there might be room for improvement in start/endPrefixMapping
32 - other?
33
34"""
35
36__author__ = u"Stéphane Bidoul <sbi@skynet.be>"
Daniel Veillard417be3a2003-01-20 21:26:34 +000037__version__ = "0.3"
Daniel Veillard4f860202003-01-02 13:00:02 +000038
39import codecs
Daniel Veillarde329fc22003-01-09 21:36:42 +000040import sys
41from types import StringType, UnicodeType
42StringTypes = (StringType,UnicodeType)
Daniel Veillard4f860202003-01-02 13:00:02 +000043
44from xml.sax._exceptions import *
45from xml.sax import xmlreader, saxutils
46from xml.sax.handler import \
47 feature_namespaces, \
48 feature_namespace_prefixes, \
49 feature_string_interning, \
50 feature_validation, \
51 feature_external_ges, \
52 feature_external_pes, \
53 property_lexical_handler, \
54 property_declaration_handler, \
55 property_dom_node, \
56 property_xml_string
57
58# libxml2 returns strings as UTF8
Daniel Veillarde329fc22003-01-09 21:36:42 +000059_decoder = codecs.lookup("utf8")[1]
def _d(s):
    """Decode a UTF-8 string returned by libxml2.

    None is passed through unchanged; any other value is run through
    the module-level UTF-8 decoder and the decoded text is returned.
    """
    if s is not None:
        # the codec decoder returns (decoded_text, length_consumed)
        return _decoder(s)[0]
    return None
65
66try:
67 import libxml2
68except ImportError, e:
Daniel Veillarde329fc22003-01-09 21:36:42 +000069 raise SAXReaderNotAvailable("libxml2 not available: " \
70 "import error was: %s" % e)
Daniel Veillard4f860202003-01-02 13:00:02 +000071
class Locator(xmlreader.Locator):
    """Expose a libxml2 xmlTextReaderLocator through the SAX Locator API."""

    def __init__(self, locator):
        # wrapped libxml2 locator; queried lazily by the getters below
        self.__wrapped = locator

    def getColumnNumber(self):
        """Column number of the current event; always -1 here."""
        return -1

    def getLineNumber(self):
        """Line number as reported by the wrapped locator's LineNumber()."""
        wrapped = self.__wrapped
        return wrapped.LineNumber()

    def getPublicId(self):
        """Public identifier of the current event; always None here."""
        return None

    def getSystemId(self):
        """System identifier, taken from the wrapped locator's BaseURI()."""
        wrapped = self.__wrapped
        return wrapped.BaseURI()
Daniel Veillard4f860202003-01-02 13:00:02 +000093
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls events from a libxml2 text reader.

    parse() drives a libxml2 xmlTextReader node by node and translates
    each node into calls on the registered content, error and (optional)
    lexical handlers.  Parser errors are queued by a libxml2 callback and
    forwarded to the error handler after each Read() call.

    Supported features: namespaces, namespace-prefixes, validation.
    external-general-entities and external-parameter-entities are always
    reported as on and cannot be switched off.
    Supported properties: lexical-handler (declaration-handler can be
    read but not set).
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        # parsing flag (non-zero while parse() is running)
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator: None, or a list of
        # (severity, SAXParseException) tuples awaiting dispatch
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """libxml2 error callback: queue the error for later reporting.

        arg is the user data given to SetErrorHandler (unused), msg the
        error text, severity one of the libxml2 PARSER_SEVERITY_*
        values and locator the libxml2 locator for the error position.
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward all queued errors to the SAX error handler.

        Warnings go to warning(); everything else goes to error(),
        except that when fatal is true the last queued error is sent to
        fatalError() (the parse is about to stop, so the last error
        reported is considered the fatal one).
        """
        # Detach the queue before dispatching: an error handler may
        # raise (the default SAX ErrorHandler does), and the queue must
        # not survive to be reported again later.
        pending, self.__errors = self.__errors, None
        last_exception = pending[-1][1]
        for severity, exception in pending:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            elif fatal and exception is last_exception:
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    def parse(self, source):
        """Parse an XML document and emit SAX events.

        source is either a string (passed to libxml2 as a file name or
        URL) or anything accepted by saxutils.prepare_input_source().

        Raises SAXException for a node type this driver does not know;
        exceptions raised by the handlers propagate to the caller.  The
        underlying reader is closed and the parsing flag cleared in all
        cases.
        """
        self.__parsing = 1
        try:
            # drop errors left over from a previously aborted parse
            self.__errors = None
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                # keep a reference to the buffer for the whole parse
                input_buffer = libxml2.inputBuffer(source.getByteStream())
                reader = input_buffer.newTextReader(source.getSystemId())
            try:
                reader.SetErrorHandler(self._errorHandler, None)
                # configure reader
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE,
                                     self.__validate)
                # we reuse attribute maps (for a slight performance gain)
                if self.__ns:
                    attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
                else:
                    attributesImpl = xmlreader.AttributesImpl({})
                # stack of prefix lists to pop (for endPrefixMapping),
                # one entry per open element
                prefixes = []
                # start loop
                self._cont_handler.startDocument()
                while 1:
                    r = reader.Read()
                    # check for errors
                    if r == 1:
                        # a node is available
                        if self.__errors is not None:
                            self._reportErrors(0)
                    elif r == 0:
                        # no more nodes
                        if self.__errors is not None:
                            self._reportErrors(0)
                        break  # end of parse
                    else:
                        if self.__errors is not None:
                            self._reportErrors(1)
                        else:
                            self._err_handler.fatalError(
                                SAXException(
                                    "Read failed (no details available)"))
                        break  # fatal parse error
                    # get node type
                    nodeType = reader.NodeType()
                    # Element
                    if nodeType == 1:
                        if self.__ns:
                            eltName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            eltQName = _d(reader.Name())
                            attributesNSImpl._attrs = attrs = {}
                            attributesNSImpl._qnames = qnames = {}
                            newPrefixes = []
                            while reader.MoveToNextAttribute():
                                qname = _d(reader.Name())
                                value = _d(reader.Value())
                                # Only "xmlns" and "xmlns:prefix" declare
                                # namespaces; an ordinary attribute whose
                                # name merely starts with "xmlns" does not.
                                if qname == "xmlns" or \
                                   qname.startswith("xmlns:"):
                                    if len(qname) > 5:
                                        newPrefix = qname[6:]
                                    else:
                                        # default namespace declaration
                                        newPrefix = None
                                    newPrefixes.append(newPrefix)
                                    self._cont_handler.startPrefixMapping(
                                        newPrefix, value)
                                    if not self.__nspfx:
                                        # don't report xmlns attribute
                                        continue
                                attName = (_d(reader.NamespaceUri()),
                                           _d(reader.LocalName()))
                                qnames[attName] = qname
                                attrs[attName] = value
                            # go back to the element owning the attributes
                            reader.MoveToElement()
                            self._cont_handler.startElementNS(
                                eltName, eltQName, attributesNSImpl)
                            if reader.IsEmptyElement():
                                # no EndElement node will follow
                                self._cont_handler.endElementNS(eltName,
                                                                eltQName)
                                for newPrefix in newPrefixes:
                                    self._cont_handler.endPrefixMapping(
                                        newPrefix)
                            else:
                                prefixes.append(newPrefixes)
                        else:
                            eltName = _d(reader.Name())
                            attributesImpl._attrs = attrs = {}
                            while reader.MoveToNextAttribute():
                                attName = _d(reader.Name())
                                attrs[attName] = _d(reader.Value())
                            # go back to the element owning the attributes
                            reader.MoveToElement()
                            self._cont_handler.startElement(
                                eltName, attributesImpl)
                            if reader.IsEmptyElement():
                                # no EndElement node will follow
                                self._cont_handler.endElement(eltName)
                    # EndElement
                    elif nodeType == 15:
                        if self.__ns:
                            self._cont_handler.endElementNS(
                                (_d(reader.NamespaceUri()),
                                 _d(reader.LocalName())),
                                _d(reader.Name()))
                            for prefix in prefixes.pop():
                                self._cont_handler.endPrefixMapping(prefix)
                        else:
                            self._cont_handler.endElement(_d(reader.Name()))
                    # Text
                    elif nodeType == 3:
                        self._cont_handler.characters(_d(reader.Value()))
                    # Whitespace
                    elif nodeType == 13:
                        self._cont_handler.ignorableWhitespace(
                            _d(reader.Value()))
                    # SignificantWhitespace
                    elif nodeType == 14:
                        self._cont_handler.characters(_d(reader.Value()))
                    # CDATA
                    elif nodeType == 4:
                        if self.__lex_handler is not None:
                            self.__lex_handler.startCDATA()
                        self._cont_handler.characters(_d(reader.Value()))
                        if self.__lex_handler is not None:
                            self.__lex_handler.endCDATA()
                    # EntityReference
                    elif nodeType == 5:
                        # entity events belong to the lexical handler,
                        # not to the reader itself
                        if self.__lex_handler is not None:
                            self.__lex_handler.startEntity(
                                _d(reader.Name()))
                        # NOTE(review): the module docstring says the
                        # bindings may not expose ResolveEntity yet --
                        # confirm against the installed libxml2 version.
                        reader.ResolveEntity()
                    # EndEntity
                    elif nodeType == 16:
                        if self.__lex_handler is not None:
                            self.__lex_handler.endEntity(_d(reader.Name()))
                    # ProcessingInstruction
                    elif nodeType == 7:
                        self._cont_handler.processingInstruction(
                            _d(reader.Name()), _d(reader.Value()))
                    # Comment
                    elif nodeType == 8:
                        if self.__lex_handler is not None:
                            self.__lex_handler.comment(_d(reader.Value()))
                    # DocumentType
                    elif nodeType == 10:
                        # TODO: report startDTD to the lexical handler
                        # (how to detect endDTD? on first non-dtd event?)
                        pass
                    # XmlDeclaration
                    elif nodeType == 17:
                        pass  # TODO
                    # Entity
                    elif nodeType == 6:
                        pass  # TODO (entity decl)
                    # Notation (decl)
                    elif nodeType == 12:
                        pass  # TODO
                    # Node types deliberately not handled above:
                    #   2  Attribute (never in this loop)
                    #   9  Document (not exposed)
                    #   11 DocumentFragment (never returned by XmlReader)
                    #   0  None
                    else:
                        raise SAXException("Unexpected node type %d" %
                                           nodeType)
                if r == 0:
                    # endDocument is only sent after a complete parse
                    self._cont_handler.endDocument()
            finally:
                # release the reader even when a handler raised
                reader.Close()
        finally:
            self.__parsing = 0

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of the named SAX feature.

        Raises SAXNotRecognizedException for unknown feature names.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1  # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return 1  # TODO (does that relate to PARSER_LOADDTD)?
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set the named SAX feature to state.

        Raises SAXNotSupportedException while a parse is in progress or
        when trying to switch off an external-entities feature, and
        SAXNotRecognizedException for unknown feature names.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException(
                    "Feature '%s' not supported" % name)
        elif name == feature_external_pes:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException(
                    "Feature '%s' not supported" % name)
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of the named SAX property.

        Raises SAXNotRecognizedException for unknown property names.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException(
                "Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set the named SAX property.

        Only the lexical handler can be set.  Setting the declaration
        handler raises SAXNotSupportedException; unknown property names
        raise SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: if/when libxml2 supports dtd events, drop the raise
            # and store the handler in self.__decl_handler instead
            raise SAXNotSupportedException(
                "Property '%s' not supported" % name)
        else:
            raise SAXNotRecognizedException(
                "Property '%s' not recognized" % name)
367
def create_parser():
    """Factory hook used by xml.sax.make_parser(): build a new reader."""
    parser = LibXml2Reader()
    return parser
370