William M. Brack | e9449c5 | 2004-07-11 14:41:20 +0000 | [diff] [blame] | 1 | # -*- coding: iso-8859-1 -*- |
Daniel Veillard | 4f86020 | 2003-01-02 13:00:02 +0000 | [diff] [blame] | 2 | """ A SAX2 driver for libxml2, on top of its XmlReader API |
| 3 | |
| 4 | USAGE |
| 5 | # put this file (drv_libxml2.py) in PYTHONPATH |
| 6 | import xml.sax |
| 7 | reader = xml.sax.make_parser(["drv_libxml2"]) |
| 8 | # ...and the rest is standard python sax. |
| 9 | |
| 10 | CAVEATS |
| 11 | - Lexical handlers are supported, except for start/endEntity |
| 12 | (waiting for XmlReader.ResolveEntity) and start/endDTD |
Daniel Veillard | 417be3a | 2003-01-20 21:26:34 +0000 | [diff] [blame] | 13 | - Error callbacks are not exactly synchronous, they tend |
| 14 | to be invoked before the corresponding content callback, |
| 15 | because the underlying reader interface parses |
| 16 | data by chunks of 512 bytes |
Daniel Veillard | 4f86020 | 2003-01-02 13:00:02 +0000 | [diff] [blame] | 17 | |
| 18 | TODO |
| 19 | - search for TODO |
| 20 | - some ErrorHandler events (warning) |
| 21 | - some ContentHandler events (setDocumentLocator, skippedEntity) |
| 22 | - EntityResolver (using libxml2.?) |
| 23 | - DTDHandler (if/when libxml2 exposes such node types) |
| 24 | - DeclHandler (if/when libxml2 exposes such node types) |
| 25 | - property_xml_string? |
| 26 | - feature_string_interning? |
| 27 | - Incremental parser |
| 28 | - additional performance tuning: |
| 29 | - one might cache callbacks to avoid some name lookups |
| 30 | - one might implement a smarter way to pass attributes to startElement |
| 31 | (some kind of lazy evaluation?) |
| 32 | - there might be room for improvement in start/endPrefixMapping |
| 33 | - other? |
| 34 | |
| 35 | """ |
| 36 | |
Alexandre Rostovtsev | 6c9c611 | 2013-07-10 23:00:54 -0400 | [diff] [blame] | 37 | __author__ = "Stéphane Bidoul <sbi@skynet.be>" |
Daniel Veillard | 417be3a | 2003-01-20 21:26:34 +0000 | [diff] [blame] | 38 | __version__ = "0.3" |
Daniel Veillard | 4f86020 | 2003-01-02 13:00:02 +0000 | [diff] [blame] | 39 | |
Alexandre Rostovtsev | 6c9c611 | 2013-07-10 23:00:54 -0400 | [diff] [blame] | 40 | import sys |
Daniel Veillard | 4f86020 | 2003-01-02 13:00:02 +0000 | [diff] [blame] | 41 | import codecs |
Alexandre Rostovtsev | 6c9c611 | 2013-07-10 23:00:54 -0400 | [diff] [blame] | 42 | |
if sys.version_info[0] >= 3:
    # Python 3: a file name is always a plain `str`.
    StringTypes = str

    def _d(s):
        """Return *s* untouched: it is a Unicode `str` already."""
        return s
else:
    # Python 2: turn the escaped author string into a unicode object.
    __author__ = codecs.unicode_escape_decode(__author__)[0]

    StringTypes = (str, unicode)

    # libxml2 returns strings as UTF8
    _decoder = codecs.lookup("utf8")[1]

    def _d(s):
        """Decode a UTF-8 string coming from libxml2; None passes through."""
        if s is None:
            return s
        return _decoder(s)[0]
Daniel Veillard | 4f86020 | 2003-01-02 13:00:02 +0000 | [diff] [blame] | 59 | |
| 60 | from xml.sax._exceptions import * |
| 61 | from xml.sax import xmlreader, saxutils |
| 62 | from xml.sax.handler import \ |
| 63 | feature_namespaces, \ |
| 64 | feature_namespace_prefixes, \ |
| 65 | feature_string_interning, \ |
| 66 | feature_validation, \ |
| 67 | feature_external_ges, \ |
| 68 | feature_external_pes, \ |
| 69 | property_lexical_handler, \ |
| 70 | property_declaration_handler, \ |
| 71 | property_dom_node, \ |
| 72 | property_xml_string |
| 73 | |
# The libxml2 Python binding is required by this driver.  When it cannot be
# imported, the failure is re-raised as SAXReaderNotAvailable (carrying the
# original import error text) instead of a bare ImportError.
# NOTE(review): presumably this lets xml.sax.make_parser() skip this driver
# and try the next one -- confirm against the xml.sax documentation.
try:
    import libxml2
except ImportError:
    raise SAXReaderNotAvailable("libxml2 not available: " \
                                "import error was: %s" % sys.exc_info()[1])
Daniel Veillard | 4f86020 | 2003-01-02 13:00:02 +0000 | [diff] [blame] | 79 | |
class Locator(xmlreader.Locator):
    """Present a libxml2.xmlTextReaderLocator through the SAX Locator API.

    Line number and system identifier are read from the wrapped locator
    each time they are requested; column number and public identifier
    are not provided by this adapter.
    """

    def __init__(self, locator):
        # the wrapped libxml2 locator object
        self.__locator = locator

    def getColumnNumber(self):
        """Return the column number where the current event ends.

        Always -1: this adapter does not report column information.
        """
        return -1

    def getLineNumber(self):
        """Return the line number where the current event ends."""
        lineNumber = self.__locator.LineNumber()
        return lineNumber

    def getPublicId(self):
        """Return the public identifier for the current event (always None)."""
        return None

    def getSystemId(self):
        """Return the system identifier for the current event."""
        systemId = self.__locator.BaseURI()
        return systemId
Daniel Veillard | 4f86020 | 2003-01-02 13:00:02 +0000 | [diff] [blame] | 101 | |
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader driving a libxml2 text reader.

    parse() pulls nodes from a libxml2 xmlTextReader one at a time and
    turns them into ContentHandler / LexicalHandler / ErrorHandler
    callbacks.  Namespace processing, namespace-prefix reporting,
    validation and external parameter entity loading are controlled
    through the standard SAX feature names (see getFeature/setFeature).
    DTDHandler, EntityResolver and DeclHandler are not supported.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """libxml2 error callback: queue the error for later reporting.

        Errors are not forwarded immediately; they are stored as
        (severity, SAXParseException) pairs and dispatched by
        _reportErrors() after the current Read() call returns.
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Dispatch all queued errors to the ErrorHandler.

        fatal -- true when the parse is about to stop; the last queued
                 non-warning error is then reported through fatalError().
        """
        # Detach the queue before dispatching: error handlers are allowed
        # to raise (the default ErrorHandler does), and a raised exception
        # must not leave stale entries behind for the next parse().
        errors, self.__errors = self.__errors, None
        if not errors:
            return
        lastException = errors[-1][1]
        for severity, exception in errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            elif fatal and exception is lastException:
                # when fatal is set, the parse will stop;
                # we consider that the last error reported
                # is the fatal one.
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    def _notifyEntity(self, event, name):
        """Forward a startEntity/endEntity event to the lexical handler.

        Nothing happens when no lexical handler is installed or when the
        installed handler does not implement the requested callback.
        """
        handler = self.__lex_handler
        if handler is None:
            return
        callback = getattr(handler, event, None)
        if callback is not None:
            callback(name)

    def parse(self, source):
        """Parse *source* and emit the corresponding SAX events.

        source -- a file name/URL string, or any object accepted by
                  saxutils.prepare_input_source().

        Raises SAXException on an unexpected node type; exceptions raised
        by the registered handlers propagate to the caller.  The libxml2
        reader is closed and the parsing flag cleared in every case.
        """
        self.__parsing = 1
        # forget errors left over from a previous, aborted parse
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                inputBuffer = libxml2.inputBuffer(source.getByteStream())
                reader = inputBuffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping)
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while True:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # Only "xmlns" and "xmlns:prefix" declare a
                            # namespace; an ordinary attribute whose name
                            # merely begins with "xmlns" does not.
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                # default namespace maps the None prefix
                                newPrefix = qname[6:] or None
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node will follow: close now
                            self._cont_handler.endElementNS(eltName, eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    # entity events belong to the lexical handler, not to
                    # the reader itself
                    self._notifyEntity("startEntity", _d(reader.Name()))
                    # NOTE(review): kept from the original code; confirm that
                    # the installed libxml2 binding provides ResolveEntity().
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    self._notifyEntity("endEntity", _d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    #if not self.__lex_handler is None:
                    #    self.__lex_handler.startDTD()
                    pass # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass # TODO
                # Entity
                elif nodeType == 6:
                    pass # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass # TODO
                # Attribute (never in this loop)
                #elif nodeType == 2:
                #    pass
                # Document (not exposed)
                #elif nodeType == 9:
                #    pass
                # DocumentFragment (never returned by XmlReader)
                #elif nodeType == 11:
                #    pass
                # None
                #elif nodeType == 0:
                #    pass
                # -
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            # endDocument is only emitted when the input was read to the end
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            self.__parsing = 0
            # release the libxml2 reader even when a handler raised
            if reader is not None:
                reader.Close()

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of the SAX feature *name*.

        Raises SAXNotRecognizedException for unknown feature names.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def setFeature(self, name, state):
        """Set the SAX feature *name* to *state*.

        Raises SAXNotSupportedException while a parse is in progress or
        when asked to turn external general entities off, and
        SAXNotRecognizedException for unknown feature names.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" %
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def getProperty(self, name):
        """Return the value of the SAX property *name*.

        Raises SAXNotRecognizedException for unknown property names.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    def setProperty(self, name, value):
        """Set the SAX property *name* to *value*.

        Only the lexical handler can be set.  The declaration handler is
        rejected with SAXNotSupportedException; any other name raises
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler in self.__decl_handler instead of
            # raising if/when libxml2 supports dtd events
            raise SAXNotSupportedException("Property '%s' not supported" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)
| 376 | |
def create_parser():
    """Build and return a fresh LibXml2Reader instance."""
    parser = LibXml2Reader()
    return parser
| 379 | |