blob: e43fb1d7eaa14d5387e4a5cf5503c40b26592dbc [file] [log] [blame]
William M. Bracke9449c52004-07-11 14:41:20 +00001# -*- coding: iso-8859-1 -*-
Daniel Veillard4f860202003-01-02 13:00:02 +00002""" A SAX2 driver for libxml2, on top of its XmlReader API
3
4USAGE
5 # put this file (drv_libxml2.py) in PYTHONPATH
6 import xml.sax
7 reader = xml.sax.make_parser(["drv_libxml2"])
8 # ...and the rest is standard python sax.
9
10CAVEATS
11 - Lexical handlers are supported, except for start/endEntity
12 (waiting for XmlReader.ResolveEntity) and start/endDTD
Daniel Veillard417be3a2003-01-20 21:26:34 +000013 - Error callbacks are not exactly synchronous, they tend
14 to be invoked before the corresponding content callback,
15 because the underlying reader interface parses
16 data by chunks of 512 bytes
Daniel Veillard4f860202003-01-02 13:00:02 +000017
18TODO
19 - search for TODO
20 - some ErrorHandler events (warning)
21 - some ContentHandler events (setDocumentLocator, skippedEntity)
22 - EntityResolver (using libxml2.?)
23 - DTDHandler (if/when libxml2 exposes such node types)
24 - DeclHandler (if/when libxml2 exposes such node types)
25 - property_xml_string?
26 - feature_string_interning?
27 - Incremental parser
28 - additional performance tuning:
29 - one might cache callbacks to avoid some name lookups
30 - one might implement a smarter way to pass attributes to startElement
31 (some kind of lazy evaluation?)
32 - there might be room for improvement in start/endPrefixMapping
33 - other?
34
35"""
36
37__author__ = u"Stéphane Bidoul <sbi@skynet.be>"
Daniel Veillard417be3a2003-01-20 21:26:34 +000038__version__ = "0.3"
Daniel Veillard4f860202003-01-02 13:00:02 +000039
40import codecs
Daniel Veillarde329fc22003-01-09 21:36:42 +000041from types import StringType, UnicodeType
42StringTypes = (StringType,UnicodeType)
Daniel Veillard4f860202003-01-02 13:00:02 +000043
44from xml.sax._exceptions import *
45from xml.sax import xmlreader, saxutils
46from xml.sax.handler import \
47 feature_namespaces, \
48 feature_namespace_prefixes, \
49 feature_string_interning, \
50 feature_validation, \
51 feature_external_ges, \
52 feature_external_pes, \
53 property_lexical_handler, \
54 property_declaration_handler, \
55 property_dom_node, \
56 property_xml_string
57
58# libxml2 returns strings as UTF8
Daniel Veillarde329fc22003-01-09 21:36:42 +000059_decoder = codecs.lookup("utf8")[1]
Daniel Veillard4f860202003-01-02 13:00:02 +000060def _d(s):
61 if s is None:
62 return s
63 else:
64 return _decoder(s)[0]
65
# Import the libxml2 bindings; a missing module is reported the SAX way
# by raising SAXReaderNotAvailable.
try:
    import libxml2
except ImportError as e:
    # "except ... as e" is accepted by Python 2.6+ and by Python 3, whereas
    # the older "except ImportError, e" form is a syntax error on Python 3.
    raise SAXReaderNotAvailable("libxml2 not available: " \
                                "import error was: %s" % e)
Daniel Veillard4f860202003-01-02 13:00:02 +000071
class Locator(xmlreader.Locator):
    """Expose a libxml2.xmlTextReaderLocator through the SAX Locator API."""

    def __init__(self, locator):
        # the wrapped libxml2 locator; the getters below query it on demand
        self.__wrapped = locator

    def getSystemId(self):
        """Return the system identifier for the current event.

        This is whatever the wrapped locator reports as its base URI.
        """
        wrapped = self.__wrapped
        return wrapped.BaseURI()

    def getPublicId(self):
        """Return the public identifier for the current event (always None)."""
        return None

    def getLineNumber(self):
        """Return the line number where the current event ends."""
        wrapped = self.__wrapped
        return wrapped.LineNumber()

    def getColumnNumber(self):
        """Return the column number where the current event ends.

        The wrapped locator is not asked for a column; -1 is always returned.
        """
        return -1
Daniel Veillard4f860202003-01-02 13:00:02 +000093
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader driven by a libxml2 text reader.

    parse() pulls nodes from the reader one at a time and translates each
    of them into the matching ContentHandler / lexical handler callback.
    Parser messages are queued by _errorHandler() and forwarded to the
    ErrorHandler by _reportErrors() after each Read().
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Queue one parser message (callback given to reader.SetErrorHandler).

        arg      -- user data supplied at registration (parse() passes None)
        msg      -- the message text
        severity -- severity code, compared against the
                    libxml2.PARSER_SEVERITY_* constants in _reportErrors()
        locator  -- reader locator, wrapped in a SAX Locator adapter
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward the queued parser messages to the ErrorHandler.

        fatal -- true when the Read() that produced the messages failed.
                 In that case the last non-warning message is reported
                 through fatalError(); if only warnings were queued a
                 generic fatal error is reported after them.

        Must only be called while messages are queued.
        """
        # Detach the queue first, so that it is cleared even when one of
        # the handler callbacks raises (the default ErrorHandler does).
        errors, self.__errors = self.__errors, None
        warnings = (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                    libxml2.PARSER_SEVERITY_WARNING)
        # when fatal is set, the parse will stop;
        # we consider that the last error reported
        # (warnings excluded) is the fatal one.
        fatalException = None
        if fatal:
            for severity, exception in errors:
                if severity not in warnings:
                    fatalException = exception
        for severity, exception in errors:
            if severity in warnings:
                self._err_handler.warning(exception)
            elif exception is fatalException:
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)
        if fatal and fatalException is None:
            # only warnings were queued: still signal that the parse died
            self._err_handler.fatalError(
                SAXException("Read failed (no details available)"))

    def parse(self, source):
        """Parse a document and fire the corresponding SAX events.

        source -- a string is handed to libxml2.newTextReaderFilename();
                  anything else goes through saxutils.prepare_input_source()
                  and is read from the resulting byte stream.

        Raises SAXException for a node type this driver does not know;
        whatever the registered handlers raise is propagated as well.
        The reader is closed and the parsing flag reset in every case.
        """
        self.__parsing = 1
        # drop messages left over by a previous parse that was aborted
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                inputBuffer = libxml2.inputBuffer(source.getByteStream())
                reader = inputBuffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping)
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # A namespace declaration is exactly "xmlns" or
                            # "xmlns:<prefix>"; an ordinary attribute whose
                            # name merely starts with "xmlns" is not one.
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                if len(qname) > 5:
                                    newPrefix = qname[6:]
                                else:
                                    newPrefix = None
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        # back to the element itself before querying it
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # closed right away by this driver, so the
                            # mappings opened above end here too
                            self._cont_handler.endElementNS(eltName, eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    # entity events belong to the lexical handler; this
                    # reader class itself has no startEntity/endEntity
                    if self.__lex_handler is not None:
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): the module docstring says entity events
                    # are "waiting for XmlReader.ResolveEntity" -- confirm
                    # that the installed bindings provide this method.
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    #if not self.__lex_handler is None:
                    #    self.__lex_handler.startDTD()
                    pass # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass # TODO
                # Entity
                elif nodeType == 6:
                    pass # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass # TODO
                # Attribute (never in this loop)
                #elif nodeType == 2:
                #    pass
                # Document (not exposed)
                #elif nodeType == 9:
                #    pass
                # DocumentFragment (never returned by XmlReader)
                #elif nodeType == 11:
                #    pass
                # None
                #elif nodeType == 0:
                #    pass
                # -
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            # endDocument is only sent when the document was read to its end
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            self.__parsing = 0
            # release the reader even when a handler raised
            if reader is not None:
                reader.Close()

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of the feature called name.

        Raises SAXNotRecognizedException for an unknown feature name.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def setFeature(self, name, state):
        """Set the feature called name to state.

        Raises SAXNotSupportedException while a parse is running or when
        asked to switch feature_external_ges off, and
        SAXNotRecognizedException for an unknown feature name.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s " \
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" % \
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" % \
                                            name)

    def getProperty(self, name):
        """Return the value of the property called name.

        Raises SAXNotRecognizedException for an unknown property name.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)

    def setProperty(self, name, value):
        """Set the property called name to value.

        Only the lexical handler can be set.  Setting the declaration
        handler raises SAXNotSupportedException, any other name raises
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store value in self.__decl_handler instead of raising
            # if/when libxml2 supports dtd events
            raise SAXNotSupportedException("Property '%s' not supported" % \
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" % \
                                            name)
368
def create_parser():
    """Return a new LibXml2Reader instance.

    NOTE(review): presumably the factory that xml.sax.make_parser() looks
    up in a driver module (see USAGE in the module docstring) -- confirm.
    """
    parser = LibXml2Reader()
    return parser
371