integrated drv_libxml2.py Python xml.sax driver from Stéphane Bidoul based
* libxml.spec.in python/Makefile.am python/drv_libxml2.py:
integrated drv_libxml2.py Python xml.sax driver from Stéphane Bidoul
based on the python XmlTextReader interface.
Daniel
diff --git a/ChangeLog b/ChangeLog
index 9d589b7..9f8a3ab 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+Thu Jan 2 13:57:07 CET 2003 Daniel Veillard <daniel@veillard.com>
+
+ * libxml.spec.in python/Makefile.am python/drv_libxml2.py:
+ integrated drv_libxml2.py Python xml.sax driver from Stéphane Bidoul
+ based on the python XmlTextReader interface.
+
Wed Jan 1 22:05:40 CET 2003 Daniel Veillard <daniel@veillard.com>
* tree.c: backing out one change in the last patch which broke the
diff --git a/libxml.spec.in b/libxml.spec.in
index c8e051b..c4f1228 100644
--- a/libxml.spec.in
+++ b/libxml.spec.in
@@ -129,6 +129,7 @@
%doc AUTHORS ChangeLog NEWS README Copyright
%{_libdir}/python*/site-packages/libxml2.py
+%{_libdir}/python*/site-packages/drv_libxml2.py
%{_libdir}/python*/site-packages/libxml2mod*
%doc python/TODO
%doc python/libxml2class.txt
@@ -140,6 +141,10 @@
* @RELDATE@ Daniel Veillard <veillard@redhat.com>
- upstream release @VERSION@ see http://xmlsoft.org/news.html
+* Thu Jan 2 2003 Daniel Veillard <veillard@redhat.com>
+- integrated drv_libxml2 xml.sax driver from Stéphane Bidoul
+- provides the new XmlTextReader interfaces based on C# XML APIs
+
* Wed Oct 23 2002 Daniel Veillard <veillard@redhat.com>
- revamped the spec file, cleaned up some rpm building problems
diff --git a/python/Makefile.am b/python/Makefile.am
index 5b4769a..0fb1e31 100644
--- a/python/Makefile.am
+++ b/python/Makefile.am
@@ -19,6 +19,7 @@
generator.py \
libxml_wrap.h \
libxml.py \
+ drv_libxml2.py \
libxml2-python-api.xml \
$(DOCS)
@@ -42,6 +43,7 @@
install-data-local:
$(mkinstalldirs) $(DESTDIR)$(libdir)/python${PYTHON_VERSION}/site-packages
@INSTALL@ -m 0644 libxml2.py $(DESTDIR)$(libdir)/python${PYTHON_VERSION}/site-packages
+ @INSTALL@ -m 0644 drv_libxml2.py $(DESTDIR)$(libdir)/python${PYTHON_VERSION}/site-packages
$(mkinstalldirs) $(DESTDIR)$(DOCS_DIR)
@(for doc in $(DOCS) ; \
do @INSTALL@ -m 0644 $$doc $(DESTDIR)$(DOCS_DIR) ; done)
diff --git a/python/drv_libxml2.py b/python/drv_libxml2.py
new file mode 100644
index 0000000..514aa89
--- /dev/null
+++ b/python/drv_libxml2.py
@@ -0,0 +1,349 @@
+""" A SAX2 driver for libxml2, on top of its XmlReader API
+
+USAGE
+ # put this file (drv_libxml2.py) in PYTHONPATH
+ import xml.sax
+ reader = xml.sax.make_parser(["drv_libxml2"])
+ # ...and the rest is standard python sax.
+
+CAVEATS
+ - Lexical handlers are supported, except for start/endEntity
+ (waiting for XmlReader.ResolveEntity) and start/endDTD
+ - as I understand it, libxml2 error handlers are globals (per thread);
+ each call to parse() registers a new error handler,
+ overwriting any previously registered handler
+ --> you can't have 2 LibXml2Reader active at the same time
+
+TODO
+ - search for TODO
+ - some ErrorHandler events (warning)
+ - some ContentHandler events (setDocumentLocator, skippedEntity)
+ - EntityResolver (using libxml2.?)
+ - DTDHandler (if/when libxml2 exposes such node types)
+ - DeclHandler (if/when libxml2 exposes such node types)
+ - property_xml_string?
+ - feature_string_interning?
+ - Incremental parser
+ - additional performance tuning:
+ - one might cache callbacks to avoid some name lookups
+ - one might implement a smarter way to pass attributes to startElement
+ (some kind of lazy evaluation?)
+ - there might be room for improvement in start/endPrefixMapping
+ - other?
+
+"""
+
+__author__ = u"Stéphane Bidoul <sbi@skynet.be>"
+__version__ = "0.1"
+
+import codecs
+from types import StringTypes
+
+from xml.sax._exceptions import *
+from xml.sax import xmlreader, saxutils
+from xml.sax.handler import \
+ feature_namespaces, \
+ feature_namespace_prefixes, \
+ feature_string_interning, \
+ feature_validation, \
+ feature_external_ges, \
+ feature_external_pes, \
+ property_lexical_handler, \
+ property_declaration_handler, \
+ property_dom_node, \
+ property_xml_string
+
+# libxml2 returns strings as UTF8; _decoder(data) yields a
+# (decoded_text, length_consumed) tuple, of which _d() keeps the text
+_decoder = codecs.getdecoder("utf8")
+def _d(s):
+ if s is None:
+ return s
+ else:
+ return _decoder(s)[0]
+
+try:
+ import libxml2
+except ImportError, e:
+ raise SAXReaderNotAvailable("libxml2 not available: " + e)
+
+try:
+ import libxslt
+except ImportError:
+ # normal behaviour
+ def _registerErrorHandler(handler):
+ libxml2.registerErrorHandler(handler,"drv_libxml")
+else:
+ # work around libxslt bindings bug (libxml2 bug #102181)
+ def _registerErrorHandler(handler):
+ libxml2.registerErrorHandler(handler,"drv_libxml")
+ libxslt.registerErrorHandler(handler,"drv_libxml")
+
+class LibXml2Reader(xmlreader.XMLReader):
+
+ def __init__(self):
+ xmlreader.XMLReader.__init__(self)
+ # features
+ self.__ns = 0
+ self.__nspfx = 0
+ self.__validate = 0
+ # parsing flag
+ self.__parsing = 0
+ # additional handlers
+ self.__lex_handler = None
+ self.__decl_handler = None
+ # error messages accumulator
+ self.__errors = None
+
+ def _errorHandler(self,ctx,str):
+ if self.__errors is None:
+ self.__errors = []
+ self.__errors.append(str)
+
+ def _reportError(self,callback):
+ # TODO: use SAXParseException, but we need a Locator for that
+ # TODO: distinguish warnings from errors
+ msg = "".join(self.__errors)
+ self.__errors = None
+ callback(SAXException(msg))
+
+ def parse(self, source):
+ self.__parsing = 1
+ _registerErrorHandler(self._errorHandler)
+ try:
+ # prepare source and create reader
+ if type(source) in StringTypes:
+ reader = libxml2.newTextReaderFilename(source)
+ else:
+ source = saxutils.prepare_input_source(source)
+ input = libxml2.inputBuffer(source.getByteStream())
+ reader = input.newTextReader(source.getSystemId())
+ # configure reader
+ reader.SetParserProp(libxml2.PARSER_LOADDTD,1)
+ reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS,1)
+ reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES,1)
+ reader.SetParserProp(libxml2.PARSER_VALIDATE,self.__validate)
+ # we reuse attribute maps (for a slight performance gain)
+ if self.__ns:
+ attributesNSImpl = xmlreader.AttributesNSImpl({},{})
+ else:
+ attributesImpl = xmlreader.AttributesImpl({})
+ # prefixes to pop (for endPrefixMapping)
+ prefixes = []
+ # start loop
+ self._cont_handler.startDocument()
+ while 1:
+ r = reader.Read()
+ # check for errors
+ if r == 1:
+ if not self.__errors is None:
+ # non-fatal error
+ self._reportError(self._err_handler.error)
+ elif r == 0:
+ if not self.__errors is None:
+ # non-fatal error
+ self._reportError(self._err_handler.error)
+ break
+ else:
+ # fatal error
+ if not self.__errors is None:
+ self._reportError(self._err_handler.fatalError)
+ else:
+ self._err_handler.fatalError(\
+ SAXException("Read failed (no details available)"))
+ break
+ # get node type
+ nodeType = reader.NodeType()
+ # Element
+ if nodeType == 1:
+ if self.__ns:
+ eltName = (_d(reader.NamespaceUri()),\
+ _d(reader.LocalName()))
+ eltQName = _d(reader.Name())
+ attributesNSImpl._attrs = attrs = {}
+ attributesNSImpl._qnames = qnames = {}
+ newPrefixes = []
+ while reader.MoveToNextAttribute():
+ qname = _d(reader.Name())
+ value = _d(reader.Value())
+ if qname.startswith("xmlns"):
+ if len(qname) > 5:
+ newPrefix = qname[6:]
+ else:
+ newPrefix = None
+ newPrefixes.append(newPrefix)
+ self._cont_handler.startPrefixMapping(\
+ newPrefix,value)
+ if not self.__nspfx:
+ continue # don't report xmlns attribute
+ attName = (_d(reader.NamespaceUri()),
+ _d(reader.LocalName()))
+ qnames[attName] = qname
+ attrs[attName] = value
+ self._cont_handler.startElementNS( \
+ eltName,eltQName,attributesNSImpl)
+ if reader.IsEmptyElement():
+ self._cont_handler.endElementNS(eltName,eltQName)
+ for newPrefix in newPrefixes:
+ self._cont_handler.endPrefixMapping(newPrefix)
+ else:
+ prefixes.append(newPrefixes)
+ else:
+ eltName = _d(reader.Name())
+ attributesImpl._attrs = attrs = {}
+ while reader.MoveToNextAttribute():
+ attName = _d(reader.Name())
+ attrs[attName] = _d(reader.Value())
+ self._cont_handler.startElement( \
+ eltName,attributesImpl)
+ if reader.IsEmptyElement():
+ self._cont_handler.endElement(eltName)
+ # EndElement
+ elif nodeType == 15:
+ if self.__ns:
+ self._cont_handler.endElementNS( \
+ (_d(reader.NamespaceUri()),_d(reader.LocalName())),
+ _d(reader.Name()))
+ for prefix in prefixes.pop():
+ self._cont_handler.endPrefixMapping(prefix)
+ else:
+ self._cont_handler.endElement(_d(reader.Name()))
+ # Text
+ elif nodeType == 3:
+ self._cont_handler.characters(_d(reader.Value()))
+ # Whitespace
+ elif nodeType == 13:
+ self._cont_handler.ignorableWhitespace(_d(reader.Value()))
+ # SignificantWhitespace
+ elif nodeType == 14:
+ self._cont_handler.characters(_d(reader.Value()))
+ # CDATA
+ elif nodeType == 4:
+ if not self.__lex_handler is None:
+ self.__lex_handler.startCDATA()
+ self._cont_handler.characters(_d(reader.Value()))
+ if not self.__lex_handler is None:
+ self.__lex_handler.endCDATA()
+ # EntityReference
+ elif nodeType == 5:
+ if not self.__lex_handler is None:
+ self.startEntity(_d(reader.Name()))
+ reader.ResolveEntity()
+ # EndEntity
+ elif nodeType == 16:
+ if not self.__lex_handler is None:
+ self.endEntity(_d(reader.Name()))
+ # ProcessingInstruction
+ elif nodeType == 7:
+ self._cont_handler.processingInstruction( \
+ _d(reader.Name()),_d(reader.Value()))
+ # Comment
+ elif nodeType == 8:
+ if not self.__lex_handler is None:
+ self.__lex_handler.comment(_d(reader.Value()))
+ # DocumentType
+ elif nodeType == 10:
+ #if not self.__lex_handler is None:
+ # self.__lex_handler.startDTD()
+ pass # TODO (how to detect endDTD? on first non-dtd event?)
+ # XmlDeclaration
+ elif nodeType == 17:
+ pass # TODO
+ # Entity
+ elif nodeType == 6:
+ pass # TODO (entity decl)
+ # Notation (decl)
+ elif nodeType == 12:
+ pass # TODO
+ # Attribute (never in this loop)
+ #elif nodeType == 2:
+ # pass
+ # Document (not exposed)
+ #elif nodeType == 9:
+ # pass
+ # DocumentFragment (never returned by XmlReader)
+ #elif nodeType == 11:
+ # pass
+ # None
+ #elif nodeType == 0:
+ # pass
+ # -
+ else:
+ raise SAXException("Unexpected node type %d" % nodeType)
+ if r == 0:
+ self._cont_handler.endDocument()
+ reader.Close()
+ finally:
+ self.__parsing = 0
+ # TODO: unregister error handler?
+
+ def setDTDHandler(self, handler):
+ # TODO (when supported, the inherited method works just fine)
+ raise SAXNotSupportedException("DTDHandler not supported")
+
+ def setEntityResolver(self, resolver):
+ # TODO (when supported, the inherited method works just fine)
+ raise SAXNotSupportedException("EntityResolver not supported")
+
+ def getFeature(self, name):
+ if name == feature_namespaces:
+ return self.__ns
+ elif name == feature_namespace_prefixes:
+ return self.__nspfx
+ elif name == feature_validation:
+ return self.__validate
+ elif name == feature_external_ges:
+ return 1 # TODO (does that relate to PARSER_LOADDTD)?
+ elif name == feature_external_pes:
+ return 1 # TODO (does that relate to PARSER_LOADDTD)?
+ else:
+ raise SAXNotRecognizedException("Feature '%s' not recognized" % \
+ name)
+
+ def setFeature(self, name, state):
+ if self.__parsing:
+ raise SAXNotSupportedException("Cannot set feature %s " \
+ "while parsing" % name)
+ if name == feature_namespaces:
+ self.__ns = state
+ elif name == feature_namespace_prefixes:
+ self.__nspfx = state
+ elif name == feature_validation:
+ self.__validate = state
+ elif name == feature_external_ges:
+ if state == 0:
+ # TODO (does that relate to PARSER_LOADDTD)?
+ raise SAXNotSupportedException("Feature '%s' not supported" % \
+ name)
+ elif name == feature_external_pes:
+ if state == 0:
+ # TODO (does that relate to PARSER_LOADDTD)?
+ raise SAXNotSupportedException("Feature '%s' not supported" % \
+ name)
+ else:
+ raise SAXNotRecognizedException("Feature '%s' not recognized" % \
+ name)
+
+ def getProperty(self, name):
+ if name == property_lexical_handler:
+ return self.__lex_handler
+ elif name == property_declaration_handler:
+ return self.__decl_handler
+ else:
+ raise SAXNotRecognizedException("Property '%s' not recognized" % \
+ name)
+
+ def setProperty(self, name, value):
+ if name == property_lexical_handler:
+ self.__lex_handler = value
+ elif name == property_declaration_handler:
+ # TODO: remove if/when libxml2 supports dtd events
+ raise SAXNotSupportedException("Property '%s' not supported" % \
+ name)
+ self.__decl_handler = value
+ else:
+ raise SAXNotRecognizedException("Property '%s' not recognized" % \
+ name)
+
+def create_parser():
+ return LibXml2Reader()
+