blob: 9fd6e5e5ae03e4f3066b39f15cbc572f139e59b7 [file] [log] [blame]
Eli Bendersky84fae782013-03-09 07:12:48 -08001"""Lightweight XML support for Python.
2
3 XML is an inherently hierarchical data format, and the most natural way to
4 represent it is with a tree. This module has two classes for this purpose:
5
6 1. ElementTree represents the whole XML document as a tree and
7
8 2. Element represents a single node in this tree.
9
10 Interactions with the whole document (reading and writing to/from files) are
11 usually done on the ElementTree level. Interactions with a single XML element
12 and its sub-elements are done on the Element level.
13
14 Element is a flexible container object designed to store hierarchical data
15 structures in memory. It can be described as a cross between a list and a
16 dictionary. Each Element has a number of properties associated with it:
17
18 'tag' - a string containing the element's name.
19
20 'attributes' - a Python dictionary storing the element's attributes.
21
22 'text' - a string containing the element's text content.
23
24 'tail' - an optional string containing text after the element's end tag.
25
26 And a number of child elements stored in a Python sequence.
27
28 To create an element instance, use the Element constructor,
29 or the SubElement factory function.
30
31 You can also use the ElementTree class to wrap an element structure
32 and convert it to and from XML.
33
34"""
35
Armin Rigo9ed73062005-12-14 18:10:45 +000036#
37# ElementTree
Florent Xiclunaf15351d2010-03-13 23:24:31 +000038# Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
Armin Rigo9ed73062005-12-14 18:10:45 +000039#
40# fredrik@pythonware.com
41# http://www.pythonware.com
42#
43# --------------------------------------------------------------------
44# The ElementTree toolkit is
45#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000046# Copyright (c) 1999-2008 by Fredrik Lundh
Armin Rigo9ed73062005-12-14 18:10:45 +000047#
48# By obtaining, using, and/or copying this software and/or its
49# associated documentation, you agree that you have read, understood,
50# and will comply with the following terms and conditions:
51#
52# Permission to use, copy, modify, and distribute this software and
53# its associated documentation for any purpose and without fee is
54# hereby granted, provided that the above copyright notice appears in
55# all copies, and that both that copyright notice and this permission
56# notice appear in supporting documentation, and that the name of
57# Secret Labs AB or the author not be used in advertising or publicity
58# pertaining to distribution of the software without specific, written
59# prior permission.
60#
61# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
62# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
63# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
64# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
65# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
66# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
67# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
68# OF THIS SOFTWARE.
69# --------------------------------------------------------------------
70
Fredrik Lundh63168a52005-12-14 22:29:34 +000071# Licensed to PSF under a Contributor Agreement.
Florent Xiclunaf15351d2010-03-13 23:24:31 +000072# See http://www.python.org/psf/license for licensing details.
Fredrik Lundh63168a52005-12-14 22:29:34 +000073
Armin Rigo9ed73062005-12-14 18:10:45 +000074__all__ = [
75 # public symbols
76 "Comment",
77 "dump",
78 "Element", "ElementTree",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000079 "fromstring", "fromstringlist",
Armin Rigo9ed73062005-12-14 18:10:45 +000080 "iselement", "iterparse",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000081 "parse", "ParseError",
Armin Rigo9ed73062005-12-14 18:10:45 +000082 "PI", "ProcessingInstruction",
83 "QName",
84 "SubElement",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000085 "tostring", "tostringlist",
Armin Rigo9ed73062005-12-14 18:10:45 +000086 "TreeBuilder",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000087 "VERSION",
Florent Xiclunaa72a98f2012-02-13 11:03:30 +010088 "XML", "XMLID",
Thomas Wouters0e3f5912006-08-11 14:57:12 +000089 "XMLParser", "XMLTreeBuilder",
Florent Xiclunaa72a98f2012-02-13 11:03:30 +010090 "register_namespace",
Armin Rigo9ed73062005-12-14 18:10:45 +000091 ]
92
Florent Xiclunaf15351d2010-03-13 23:24:31 +000093VERSION = "1.3.0"
94
Florent Xiclunaf15351d2010-03-13 23:24:31 +000095import sys
96import re
97import warnings
Eli Bendersky00f402b2012-07-15 06:02:22 +030098import io
99import contextlib
Armin Rigo9ed73062005-12-14 18:10:45 +0000100
Eli Bendersky27cbb192012-06-15 09:03:19 +0300101from . import ElementPath
Armin Rigo9ed73062005-12-14 18:10:45 +0000102
Armin Rigo9ed73062005-12-14 18:10:45 +0000103
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000104class ParseError(SyntaxError):
Eli Bendersky84fae782013-03-09 07:12:48 -0800105 """An error when parsing an XML document.
106
107 In addition to its exception value, a ParseError contains
108 two extra attributes:
109 'code' - the specific exception code
110 'position' - the line and column of the error
111
112 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000113 pass
114
115# --------------------------------------------------------------------
116
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000117
118def iselement(element):
Eli Bendersky84fae782013-03-09 07:12:48 -0800119 """Return True if *element* appears to be an Element."""
Florent Xiclunaa72a98f2012-02-13 11:03:30 +0100120 return hasattr(element, 'tag')
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000121
Armin Rigo9ed73062005-12-14 18:10:45 +0000122
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000123class Element:
Eli Bendersky84fae782013-03-09 07:12:48 -0800124 """An XML element.
Armin Rigo9ed73062005-12-14 18:10:45 +0000125
Eli Bendersky84fae782013-03-09 07:12:48 -0800126 This class is the reference implementation of the Element interface.
127
128 An element's length is its number of subelements. That means if you
129 you want to check if an element is truly empty, you should check BOTH
130 its length AND its text attribute.
131
132 The element tag, attribute names, and attribute values can be either
133 bytes or strings.
134
135 *tag* is the element name. *attrib* is an optional dictionary containing
136 element attributes. *extra* are additional element attributes given as
137 keyword arguments.
138
139 Example form:
140 <tag attrib>text<child/>...</tag>tail
141
142 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000143
144 tag = None
Eli Bendersky84fae782013-03-09 07:12:48 -0800145 """The element's name."""
Armin Rigo9ed73062005-12-14 18:10:45 +0000146
147 attrib = None
Eli Bendersky84fae782013-03-09 07:12:48 -0800148 """Dictionary of the element's attributes."""
Armin Rigo9ed73062005-12-14 18:10:45 +0000149
150 text = None
Eli Bendersky84fae782013-03-09 07:12:48 -0800151 """
152 Text before first subelement. This is either a string or the value None.
153 Note that if there is no text, this attribute may be either
154 None or the empty string, depending on the parser.
Armin Rigo9ed73062005-12-14 18:10:45 +0000155
Eli Bendersky84fae782013-03-09 07:12:48 -0800156 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000157
Eli Bendersky84fae782013-03-09 07:12:48 -0800158 tail = None
159 """
160 Text after this element's end tag, but before the next sibling element's
161 start tag. This is either a string or the value None. Note that if there
162 was no text, this attribute may be either None or an empty string,
163 depending on the parser.
Armin Rigo9ed73062005-12-14 18:10:45 +0000164
Eli Bendersky84fae782013-03-09 07:12:48 -0800165 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000166
167 def __init__(self, tag, attrib={}, **extra):
Eli Bendersky737b1732012-05-29 06:02:56 +0300168 if not isinstance(attrib, dict):
169 raise TypeError("attrib must be dict, not %s" % (
170 attrib.__class__.__name__,))
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000171 attrib = attrib.copy()
172 attrib.update(extra)
Armin Rigo9ed73062005-12-14 18:10:45 +0000173 self.tag = tag
174 self.attrib = attrib
175 self._children = []
176
177 def __repr__(self):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000178 return "<Element %s at 0x%x>" % (repr(self.tag), id(self))
Armin Rigo9ed73062005-12-14 18:10:45 +0000179
Armin Rigo9ed73062005-12-14 18:10:45 +0000180 def makeelement(self, tag, attrib):
Eli Bendersky84fae782013-03-09 07:12:48 -0800181 """Create a new element with the same type.
182
183 *tag* is a string containing the element name.
184 *attrib* is a dictionary containing the element attributes.
185
186 Do not call this method, use the SubElement factory function instead.
187
188 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000189 return self.__class__(tag, attrib)
Armin Rigo9ed73062005-12-14 18:10:45 +0000190
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000191 def copy(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800192 """Return copy of current element.
193
194 This creates a shallow copy. Subelements will be shared with the
195 original tree.
196
197 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000198 elem = self.makeelement(self.tag, self.attrib)
199 elem.text = self.text
200 elem.tail = self.tail
201 elem[:] = self
202 return elem
203
Armin Rigo9ed73062005-12-14 18:10:45 +0000204 def __len__(self):
205 return len(self._children)
206
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000207 def __bool__(self):
208 warnings.warn(
209 "The behavior of this method will change in future versions. "
210 "Use specific 'len(elem)' or 'elem is not None' test instead.",
211 FutureWarning, stacklevel=2
212 )
213 return len(self._children) != 0 # emulate old behaviour, for now
214
Armin Rigo9ed73062005-12-14 18:10:45 +0000215 def __getitem__(self, index):
216 return self._children[index]
217
Armin Rigo9ed73062005-12-14 18:10:45 +0000218 def __setitem__(self, index, element):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000219 # if isinstance(index, slice):
220 # for elt in element:
221 # assert iselement(elt)
222 # else:
223 # assert iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000224 self._children[index] = element
225
Armin Rigo9ed73062005-12-14 18:10:45 +0000226 def __delitem__(self, index):
227 del self._children[index]
228
Eli Bendersky84fae782013-03-09 07:12:48 -0800229 def append(self, subelement):
230 """Add *subelement* to the end of this element.
Armin Rigo9ed73062005-12-14 18:10:45 +0000231
Eli Bendersky84fae782013-03-09 07:12:48 -0800232 The new element will appear in document order after the last existing
233 subelement (or directly after the text, if it's the first subelement),
234 but before the end tag for this element.
Armin Rigo9ed73062005-12-14 18:10:45 +0000235
Eli Bendersky84fae782013-03-09 07:12:48 -0800236 """
237 self._assert_is_element(subelement)
238 self._children.append(subelement)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000239
240 def extend(self, elements):
Eli Bendersky84fae782013-03-09 07:12:48 -0800241 """Append subelements from a sequence.
242
243 *elements* is a sequence with zero or more elements.
244
245 """
Eli Bendersky396e8fc2012-03-23 14:24:20 +0200246 for element in elements:
247 self._assert_is_element(element)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000248 self._children.extend(elements)
249
Armin Rigo9ed73062005-12-14 18:10:45 +0000250
Eli Bendersky84fae782013-03-09 07:12:48 -0800251 def insert(self, index, subelement):
252 """Insert *subelement* at position *index*."""
253 self._assert_is_element(subelement)
254 self._children.insert(index, subelement)
Armin Rigo9ed73062005-12-14 18:10:45 +0000255
Eli Bendersky396e8fc2012-03-23 14:24:20 +0200256 def _assert_is_element(self, e):
Antoine Pitrouee329312012-10-04 19:53:29 +0200257 # Need to refer to the actual Python implementation, not the
258 # shadowing C implementation.
259 if not isinstance(e, _Element):
Eli Bendersky396e8fc2012-03-23 14:24:20 +0200260 raise TypeError('expected an Element, not %s' % type(e).__name__)
261
Armin Rigo9ed73062005-12-14 18:10:45 +0000262
Eli Bendersky84fae782013-03-09 07:12:48 -0800263 def remove(self, subelement):
264 """Remove matching subelement.
265
266 Unlike the find methods, this method compares elements based on
267 identity, NOT ON tag value or contents. To remove subelements by
268 other means, the easiest way is to use a list comprehension to
269 select what elements to keep, and then use slice assignment to update
270 the parent element.
271
272 ValueError is raised if a matching element could not be found.
273
274 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000275 # assert iselement(element)
Eli Bendersky84fae782013-03-09 07:12:48 -0800276 self._children.remove(subelement)
Armin Rigo9ed73062005-12-14 18:10:45 +0000277
278 def getchildren(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800279 """(Deprecated) Return all subelements.
280
281 Elements are returned in document order.
282
283 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000284 warnings.warn(
285 "This method will be removed in future versions. "
286 "Use 'list(elem)' or iteration over elem instead.",
287 DeprecationWarning, stacklevel=2
288 )
Armin Rigo9ed73062005-12-14 18:10:45 +0000289 return self._children
290
Armin Rigo9ed73062005-12-14 18:10:45 +0000291
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000292 def find(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800293 """Find first matching element by tag name or path.
294
295 *path* is a string having either an element tag or an XPath,
296 *namespaces* is an optional mapping from namespace prefix to full name.
297
298 Return the first matching element, or None if no element was found.
299
300 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000301 return ElementPath.find(self, path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000302
Armin Rigo9ed73062005-12-14 18:10:45 +0000303
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000304 def findtext(self, path, default=None, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800305 """Find text for first matching element by tag name or path.
306
307 *path* is a string having either an element tag or an XPath,
308 *default* is the value to return if the element was not found,
309 *namespaces* is an optional mapping from namespace prefix to full name.
310
311 Return text content of first matching element, or default value if
312 none was found. Note that if an element is found having no text
313 content, the empty string is returned.
314
315 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000316 return ElementPath.findtext(self, path, default, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000317
Armin Rigo9ed73062005-12-14 18:10:45 +0000318
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000319 def findall(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800320 """Find all matching subelements by tag name or path.
321
322 *path* is a string having either an element tag or an XPath,
323 *namespaces* is an optional mapping from namespace prefix to full name.
324
325 Returns list containing all matching elements in document order.
326
327 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000328 return ElementPath.findall(self, path, namespaces)
329
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000330
331 def iterfind(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800332 """Find all matching subelements by tag name or path.
333
334 *path* is a string having either an element tag or an XPath,
335 *namespaces* is an optional mapping from namespace prefix to full name.
336
337 Return an iterable yielding all matching elements in document order.
338
339 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000340 return ElementPath.iterfind(self, path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000341
Armin Rigo9ed73062005-12-14 18:10:45 +0000342
343 def clear(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800344 """Reset element.
345
346 This function removes all subelements, clears all attributes, and sets
347 the text and tail attributes to None.
348
349 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000350 self.attrib.clear()
351 self._children = []
352 self.text = self.tail = None
353
Armin Rigo9ed73062005-12-14 18:10:45 +0000354
355 def get(self, key, default=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800356 """Get element attribute.
357
358 Equivalent to attrib.get, but some implementations may handle this a
359 bit more efficiently. *key* is what attribute to look for, and
360 *default* is what to return if the attribute was not found.
361
362 Returns a string containing the attribute value, or the default if
363 attribute was not found.
364
365 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000366 return self.attrib.get(key, default)
367
Armin Rigo9ed73062005-12-14 18:10:45 +0000368
369 def set(self, key, value):
Eli Bendersky84fae782013-03-09 07:12:48 -0800370 """Set element attribute.
371
372 Equivalent to attrib[key] = value, but some implementations may handle
373 this a bit more efficiently. *key* is what attribute to set, and
374 *value* is the attribute value to set it to.
375
376 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000377 self.attrib[key] = value
378
Armin Rigo9ed73062005-12-14 18:10:45 +0000379
380 def keys(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800381 """Get list of attribute names.
382
383 Names are returned in an arbitrary order, just like an ordinary
384 Python dict. Equivalent to attrib.keys()
385
386 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000387 return self.attrib.keys()
388
Armin Rigo9ed73062005-12-14 18:10:45 +0000389
390 def items(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800391 """Get element attributes as a sequence.
392
393 The attributes are returned in arbitrary order. Equivalent to
394 attrib.items().
395
396 Return a list of (name, value) tuples.
397
398 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000399 return self.attrib.items()
400
Armin Rigo9ed73062005-12-14 18:10:45 +0000401
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000402 def iter(self, tag=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800403 """Create tree iterator.
404
405 The iterator loops over the element and all subelements in document
406 order, returning all elements with a matching tag.
407
408 If the tree structure is modified during iteration, new or removed
409 elements may or may not be included. To get a stable set, use the
410 list() function on the iterator, and loop over the resulting list.
411
412 *tag* is what tags to look for (default is to return all elements)
413
414 Return an iterator containing all the matching elements.
415
416 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000417 if tag == "*":
418 tag = None
419 if tag is None or self.tag == tag:
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000420 yield self
421 for e in self._children:
Philip Jenveyfd0d3e52012-10-01 15:34:31 -0700422 yield from e.iter(tag)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000423
424 # compatibility
425 def getiterator(self, tag=None):
426 # Change for a DeprecationWarning in 1.4
427 warnings.warn(
428 "This method will be removed in future versions. "
429 "Use 'elem.iter()' or 'list(elem.iter())' instead.",
430 PendingDeprecationWarning, stacklevel=2
431 )
432 return list(self.iter(tag))
433
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000434
435 def itertext(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800436 """Create text iterator.
437
438 The iterator loops over the element and all subelements in document
439 order, returning all inner text.
440
441 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000442 tag = self.tag
443 if not isinstance(tag, str) and tag is not None:
444 return
445 if self.text:
446 yield self.text
447 for e in self:
Philip Jenveyfd0d3e52012-10-01 15:34:31 -0700448 yield from e.itertext()
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000449 if e.tail:
450 yield e.tail
Armin Rigo9ed73062005-12-14 18:10:45 +0000451
452# compatibility
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000453_Element = _ElementInterface = Element
Armin Rigo9ed73062005-12-14 18:10:45 +0000454
Armin Rigo9ed73062005-12-14 18:10:45 +0000455
456def SubElement(parent, tag, attrib={}, **extra):
Eli Bendersky84fae782013-03-09 07:12:48 -0800457 """Subelement factory which creates an element instance, and appends it
458 to an existing parent.
459
460 The element tag, attribute names, and attribute values can be either
461 bytes or Unicode strings.
462
463 *parent* is the parent element, *tag* is the subelements name, *attrib* is
464 an optional directory containing element attributes, *extra* are
465 additional attributes given as keyword arguments.
466
467 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000468 attrib = attrib.copy()
469 attrib.update(extra)
470 element = parent.makeelement(tag, attrib)
471 parent.append(element)
472 return element
473
Armin Rigo9ed73062005-12-14 18:10:45 +0000474
475def Comment(text=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800476 """Comment element factory.
477
478 This function creates a special element which the standard serializer
479 serializes as an XML comment.
480
481 *text* is a string containing the comment string.
482
483 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000484 element = Element(Comment)
485 element.text = text
486 return element
487
Armin Rigo9ed73062005-12-14 18:10:45 +0000488
489def ProcessingInstruction(target, text=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800490 """Processing Instruction element factory.
491
492 This function creates a special element which the standard serializer
493 serializes as an XML comment.
494
495 *target* is a string containing the processing instruction, *text* is a
496 string containing the processing instruction contents, if any.
497
498 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000499 element = Element(ProcessingInstruction)
500 element.text = target
501 if text:
502 element.text = element.text + " " + text
503 return element
504
505PI = ProcessingInstruction
506
Armin Rigo9ed73062005-12-14 18:10:45 +0000507
508class QName:
Eli Bendersky84fae782013-03-09 07:12:48 -0800509 """Qualified name wrapper.
510
511 This class can be used to wrap a QName attribute value in order to get
512 proper namespace handing on output.
513
514 *text_or_uri* is a string containing the QName value either in the form
515 {uri}local, or if the tag argument is given, the URI part of a QName.
516
517 *tag* is an optional argument which if given, will make the first
518 argument (text_or_uri) be interpreted as a URI, and this argument (tag)
519 be interpreted as a local name.
520
521 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000522 def __init__(self, text_or_uri, tag=None):
523 if tag:
524 text_or_uri = "{%s}%s" % (text_or_uri, tag)
525 self.text = text_or_uri
526 def __str__(self):
527 return self.text
Georg Brandlb56c0e22010-12-09 18:10:27 +0000528 def __repr__(self):
Georg Brandlc95c9182010-12-09 18:26:02 +0000529 return '<QName %r>' % (self.text,)
Armin Rigo9ed73062005-12-14 18:10:45 +0000530 def __hash__(self):
531 return hash(self.text)
Mark Dickinsona56c4672009-01-27 18:17:45 +0000532 def __le__(self, other):
Armin Rigo9ed73062005-12-14 18:10:45 +0000533 if isinstance(other, QName):
Mark Dickinsona56c4672009-01-27 18:17:45 +0000534 return self.text <= other.text
535 return self.text <= other
536 def __lt__(self, other):
537 if isinstance(other, QName):
538 return self.text < other.text
539 return self.text < other
540 def __ge__(self, other):
541 if isinstance(other, QName):
542 return self.text >= other.text
543 return self.text >= other
544 def __gt__(self, other):
545 if isinstance(other, QName):
546 return self.text > other.text
547 return self.text > other
548 def __eq__(self, other):
549 if isinstance(other, QName):
550 return self.text == other.text
551 return self.text == other
552 def __ne__(self, other):
553 if isinstance(other, QName):
554 return self.text != other.text
555 return self.text != other
Armin Rigo9ed73062005-12-14 18:10:45 +0000556
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000557# --------------------------------------------------------------------
558
Armin Rigo9ed73062005-12-14 18:10:45 +0000559
560class ElementTree:
Eli Bendersky84fae782013-03-09 07:12:48 -0800561 """An XML element hierarchy.
Armin Rigo9ed73062005-12-14 18:10:45 +0000562
Eli Bendersky84fae782013-03-09 07:12:48 -0800563 This class also provides support for serialization to and from
564 standard XML.
565
566 *element* is an optional root element node,
567 *file* is an optional file handle or file name of an XML file whose
568 contents will be used to initialize the tree with.
569
570 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000571 def __init__(self, element=None, file=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000572 # assert element is None or iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000573 self._root = element # first node
574 if file:
575 self.parse(file)
576
Armin Rigo9ed73062005-12-14 18:10:45 +0000577 def getroot(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800578 """Return root element of this tree."""
Armin Rigo9ed73062005-12-14 18:10:45 +0000579 return self._root
580
Armin Rigo9ed73062005-12-14 18:10:45 +0000581 def _setroot(self, element):
Eli Bendersky84fae782013-03-09 07:12:48 -0800582 """Replace root element of this tree.
583
584 This will discard the current contents of the tree and replace it
585 with the given element. Use with care!
586
587 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000588 # assert iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000589 self._root = element
590
Armin Rigo9ed73062005-12-14 18:10:45 +0000591 def parse(self, source, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800592 """Load external XML document into element tree.
593
594 *source* is a file name or file object, *parser* is an optional parser
595 instance that defaults to XMLParser.
596
597 ParseError is raised if the parser fails to parse the document.
598
599 Returns the root element of the given source document.
600
601 """
Antoine Pitroue033e062010-10-29 10:38:18 +0000602 close_source = False
Armin Rigo9ed73062005-12-14 18:10:45 +0000603 if not hasattr(source, "read"):
604 source = open(source, "rb")
Antoine Pitroue033e062010-10-29 10:38:18 +0000605 close_source = True
606 try:
607 if not parser:
608 parser = XMLParser(target=TreeBuilder())
609 while 1:
610 data = source.read(65536)
611 if not data:
612 break
613 parser.feed(data)
614 self._root = parser.close()
615 return self._root
616 finally:
617 if close_source:
618 source.close()
Armin Rigo9ed73062005-12-14 18:10:45 +0000619
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000620 def iter(self, tag=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800621 """Create and return tree iterator for the root element.
622
623 The iterator loops over all elements in this tree, in document order.
624
625 *tag* is a string with the tag name to iterate over
626 (default is to return all elements).
627
628 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000629 # assert self._root is not None
630 return self._root.iter(tag)
631
632 # compatibility
Armin Rigo9ed73062005-12-14 18:10:45 +0000633 def getiterator(self, tag=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000634 # Change for a DeprecationWarning in 1.4
635 warnings.warn(
636 "This method will be removed in future versions. "
637 "Use 'tree.iter()' or 'list(tree.iter())' instead.",
638 PendingDeprecationWarning, stacklevel=2
639 )
640 return list(self.iter(tag))
Armin Rigo9ed73062005-12-14 18:10:45 +0000641
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000642 def find(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800643 """Find first matching element by tag name or path.
644
645 Same as getroot().find(path), which is Element.find()
646
647 *path* is a string having either an element tag or an XPath,
648 *namespaces* is an optional mapping from namespace prefix to full name.
649
650 Return the first matching element, or None if no element was found.
651
652 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000653 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000654 if path[:1] == "/":
655 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000656 warnings.warn(
657 "This search is broken in 1.3 and earlier, and will be "
658 "fixed in a future version. If you rely on the current "
659 "behaviour, change it to %r" % path,
660 FutureWarning, stacklevel=2
661 )
662 return self._root.find(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000663
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000664 def findtext(self, path, default=None, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800665 """Find first matching element by tag name or path.
666
667 Same as getroot().findtext(path), which is Element.findtext()
668
669 *path* is a string having either an element tag or an XPath,
670 *namespaces* is an optional mapping from namespace prefix to full name.
671
672 Return the first matching element, or None if no element was found.
673
674 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000675 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000676 if path[:1] == "/":
677 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000678 warnings.warn(
679 "This search is broken in 1.3 and earlier, and will be "
680 "fixed in a future version. If you rely on the current "
681 "behaviour, change it to %r" % path,
682 FutureWarning, stacklevel=2
683 )
684 return self._root.findtext(path, default, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000685
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000686 def findall(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800687 """Find all matching subelements by tag name or path.
688
689 Same as getroot().findall(path), which is Element.findall().
690
691 *path* is a string having either an element tag or an XPath,
692 *namespaces* is an optional mapping from namespace prefix to full name.
693
694 Return list containing all matching elements in document order.
695
696 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000697 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000698 if path[:1] == "/":
699 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000700 warnings.warn(
701 "This search is broken in 1.3 and earlier, and will be "
702 "fixed in a future version. If you rely on the current "
703 "behaviour, change it to %r" % path,
704 FutureWarning, stacklevel=2
705 )
706 return self._root.findall(path, namespaces)
707
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000708 def iterfind(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800709 """Find all matching subelements by tag name or path.
710
711 Same as getroot().iterfind(path), which is element.iterfind()
712
713 *path* is a string having either an element tag or an XPath,
714 *namespaces* is an optional mapping from namespace prefix to full name.
715
716 Return an iterable yielding all matching elements in document order.
717
718 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000719 # assert self._root is not None
720 if path[:1] == "/":
721 path = "." + path
722 warnings.warn(
723 "This search is broken in 1.3 and earlier, and will be "
724 "fixed in a future version. If you rely on the current "
725 "behaviour, change it to %r" % path,
726 FutureWarning, stacklevel=2
727 )
728 return self._root.iterfind(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000729
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000730 def write(self, file_or_filename,
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000731 encoding=None,
732 xml_declaration=None,
733 default_namespace=None,
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800734 method=None, *,
735 short_empty_elements=True):
Eli Bendersky84fae782013-03-09 07:12:48 -0800736 """Write element tree to a file as XML.
737
738 Arguments:
739 *file_or_filename* -- file name or a file object opened for writing
740
741 *encoding* -- the output encoding (default: US-ASCII)
742
743 *xml_declaration* -- bool indicating if an XML declaration should be
744 added to the output. If None, an XML declaration
745 is added if encoding IS NOT either of:
746 US-ASCII, UTF-8, or Unicode
747
748 *default_namespace* -- sets the default XML namespace (for "xmlns")
749
750 *method* -- either "xml" (default), "html, "text", or "c14n"
751
752 *short_empty_elements* -- controls the formatting of elements
753 that contain no content. If True (default)
754 they are emitted as a single self-closed
755 tag, otherwise they are emitted as a pair
756 of start/end tags
Eli Benderskye9af8272013-01-13 06:27:51 -0800757
758 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000759 if not method:
760 method = "xml"
761 elif method not in _serialize:
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000762 raise ValueError("unknown method %r" % method)
Florent Xiclunac17f1722010-08-08 19:48:29 +0000763 if not encoding:
764 if method == "c14n":
765 encoding = "utf-8"
766 else:
767 encoding = "us-ascii"
Florent Xiclunac17f1722010-08-08 19:48:29 +0000768 else:
769 encoding = encoding.lower()
Eli Bendersky00f402b2012-07-15 06:02:22 +0300770 with _get_writer(file_or_filename, encoding) as write:
771 if method == "xml" and (xml_declaration or
772 (xml_declaration is None and
773 encoding not in ("utf-8", "us-ascii", "unicode"))):
774 declared_encoding = encoding
775 if encoding == "unicode":
776 # Retrieve the default encoding for the xml declaration
777 import locale
778 declared_encoding = locale.getpreferredencoding()
779 write("<?xml version='1.0' encoding='%s'?>\n" % (
780 declared_encoding,))
781 if method == "text":
782 _serialize_text(write, self._root)
Armin Rigo9ed73062005-12-14 18:10:45 +0000783 else:
Eli Bendersky00f402b2012-07-15 06:02:22 +0300784 qnames, namespaces = _namespaces(self._root, default_namespace)
785 serialize = _serialize[method]
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800786 serialize(write, self._root, qnames, namespaces,
787 short_empty_elements=short_empty_elements)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000788
789 def write_c14n(self, file):
790 # lxml.etree compatibility. use output method instead
791 return self.write(file, method="c14n")
Armin Rigo9ed73062005-12-14 18:10:45 +0000792
793# --------------------------------------------------------------------
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000794# serialization support
795
Eli Bendersky00f402b2012-07-15 06:02:22 +0300796@contextlib.contextmanager
797def _get_writer(file_or_filename, encoding):
798 # returns text write method and release all resourses after using
799 try:
800 write = file_or_filename.write
801 except AttributeError:
802 # file_or_filename is a file name
803 if encoding == "unicode":
804 file = open(file_or_filename, "w")
805 else:
806 file = open(file_or_filename, "w", encoding=encoding,
807 errors="xmlcharrefreplace")
808 with file:
809 yield file.write
810 else:
811 # file_or_filename is a file-like object
812 # encoding determines if it is a text or binary writer
813 if encoding == "unicode":
814 # use a text writer as is
815 yield write
816 else:
817 # wrap a binary writer with TextIOWrapper
818 with contextlib.ExitStack() as stack:
819 if isinstance(file_or_filename, io.BufferedIOBase):
820 file = file_or_filename
821 elif isinstance(file_or_filename, io.RawIOBase):
822 file = io.BufferedWriter(file_or_filename)
823 # Keep the original file open when the BufferedWriter is
824 # destroyed
825 stack.callback(file.detach)
826 else:
827 # This is to handle passed objects that aren't in the
828 # IOBase hierarchy, but just have a write method
829 file = io.BufferedIOBase()
830 file.writable = lambda: True
831 file.write = write
832 try:
833 # TextIOWrapper uses this methods to determine
834 # if BOM (for UTF-16, etc) should be added
835 file.seekable = file_or_filename.seekable
836 file.tell = file_or_filename.tell
837 except AttributeError:
838 pass
839 file = io.TextIOWrapper(file,
840 encoding=encoding,
841 errors="xmlcharrefreplace",
842 newline="\n")
843 # Keep the original file open when the TextIOWrapper is
844 # destroyed
845 stack.callback(file.detach)
846 yield file.write
847
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000848def _namespaces(elem, default_namespace=None):
849 # identify namespaces used in this tree
850
851 # maps qnames to *encoded* prefix:local names
852 qnames = {None: None}
853
854 # maps uri:s to prefixes
855 namespaces = {}
856 if default_namespace:
857 namespaces[default_namespace] = ""
858
859 def add_qname(qname):
860 # calculate serialized qname representation
861 try:
862 if qname[:1] == "{":
863 uri, tag = qname[1:].rsplit("}", 1)
864 prefix = namespaces.get(uri)
865 if prefix is None:
866 prefix = _namespace_map.get(uri)
867 if prefix is None:
868 prefix = "ns%d" % len(namespaces)
869 if prefix != "xml":
870 namespaces[uri] = prefix
871 if prefix:
872 qnames[qname] = "%s:%s" % (prefix, tag)
873 else:
874 qnames[qname] = tag # default element
875 else:
876 if default_namespace:
877 # FIXME: can this be handled in XML 1.0?
878 raise ValueError(
879 "cannot use non-qualified names with "
880 "default_namespace option"
881 )
882 qnames[qname] = qname
883 except TypeError:
884 _raise_serialization_error(qname)
885
886 # populate qname and namespaces table
Eli Bendersky64d11e62012-06-15 07:42:50 +0300887 for elem in elem.iter():
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000888 tag = elem.tag
Senthil Kumaranec30b3d2010-11-09 02:36:59 +0000889 if isinstance(tag, QName):
890 if tag.text not in qnames:
891 add_qname(tag.text)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000892 elif isinstance(tag, str):
893 if tag not in qnames:
894 add_qname(tag)
895 elif tag is not None and tag is not Comment and tag is not PI:
896 _raise_serialization_error(tag)
897 for key, value in elem.items():
898 if isinstance(key, QName):
899 key = key.text
900 if key not in qnames:
901 add_qname(key)
902 if isinstance(value, QName) and value.text not in qnames:
903 add_qname(value.text)
904 text = elem.text
905 if isinstance(text, QName) and text.text not in qnames:
906 add_qname(text.text)
907 return qnames, namespaces
908
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800909def _serialize_xml(write, elem, qnames, namespaces,
910 short_empty_elements, **kwargs):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000911 tag = elem.tag
912 text = elem.text
913 if tag is Comment:
914 write("<!--%s-->" % text)
915 elif tag is ProcessingInstruction:
916 write("<?%s?>" % text)
917 else:
918 tag = qnames[tag]
919 if tag is None:
920 if text:
921 write(_escape_cdata(text))
922 for e in elem:
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800923 _serialize_xml(write, e, qnames, None,
924 short_empty_elements=short_empty_elements)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000925 else:
926 write("<" + tag)
927 items = list(elem.items())
928 if items or namespaces:
929 if namespaces:
930 for v, k in sorted(namespaces.items(),
931 key=lambda x: x[1]): # sort on prefix
932 if k:
933 k = ":" + k
934 write(" xmlns%s=\"%s\"" % (
935 k,
936 _escape_attrib(v)
937 ))
938 for k, v in sorted(items): # lexical order
939 if isinstance(k, QName):
940 k = k.text
941 if isinstance(v, QName):
942 v = qnames[v.text]
943 else:
944 v = _escape_attrib(v)
945 write(" %s=\"%s\"" % (qnames[k], v))
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800946 if text or len(elem) or not short_empty_elements:
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000947 write(">")
948 if text:
949 write(_escape_cdata(text))
950 for e in elem:
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800951 _serialize_xml(write, e, qnames, None,
952 short_empty_elements=short_empty_elements)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000953 write("</" + tag + ">")
954 else:
955 write(" />")
956 if elem.tail:
957 write(_escape_cdata(elem.tail))
958
959HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
Ezio Melottic90111f2012-09-19 08:19:12 +0300960 "img", "input", "isindex", "link", "meta", "param")
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000961
962try:
963 HTML_EMPTY = set(HTML_EMPTY)
964except NameError:
965 pass
966
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800967def _serialize_html(write, elem, qnames, namespaces, **kwargs):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000968 tag = elem.tag
969 text = elem.text
970 if tag is Comment:
971 write("<!--%s-->" % _escape_cdata(text))
972 elif tag is ProcessingInstruction:
973 write("<?%s?>" % _escape_cdata(text))
974 else:
975 tag = qnames[tag]
976 if tag is None:
977 if text:
978 write(_escape_cdata(text))
979 for e in elem:
980 _serialize_html(write, e, qnames, None)
981 else:
982 write("<" + tag)
983 items = list(elem.items())
984 if items or namespaces:
985 if namespaces:
986 for v, k in sorted(namespaces.items(),
987 key=lambda x: x[1]): # sort on prefix
988 if k:
989 k = ":" + k
990 write(" xmlns%s=\"%s\"" % (
991 k,
992 _escape_attrib(v)
993 ))
994 for k, v in sorted(items): # lexical order
995 if isinstance(k, QName):
996 k = k.text
997 if isinstance(v, QName):
998 v = qnames[v.text]
999 else:
1000 v = _escape_attrib_html(v)
1001 # FIXME: handle boolean attributes
1002 write(" %s=\"%s\"" % (qnames[k], v))
1003 write(">")
1004 tag = tag.lower()
1005 if text:
1006 if tag == "script" or tag == "style":
1007 write(text)
1008 else:
1009 write(_escape_cdata(text))
1010 for e in elem:
1011 _serialize_html(write, e, qnames, None)
1012 if tag not in HTML_EMPTY:
1013 write("</" + tag + ">")
1014 if elem.tail:
1015 write(_escape_cdata(elem.tail))
1016
1017def _serialize_text(write, elem):
1018 for part in elem.itertext():
1019 write(part)
1020 if elem.tail:
1021 write(elem.tail)
1022
1023_serialize = {
1024 "xml": _serialize_xml,
1025 "html": _serialize_html,
1026 "text": _serialize_text,
1027# this optional method is imported at the end of the module
1028# "c14n": _serialize_c14n,
1029}
Armin Rigo9ed73062005-12-14 18:10:45 +00001030
Armin Rigo9ed73062005-12-14 18:10:45 +00001031
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001032def register_namespace(prefix, uri):
Eli Bendersky84fae782013-03-09 07:12:48 -08001033 """Register a namespace prefix.
1034
1035 The registry is global, and any existing mapping for either the
1036 given prefix or the namespace URI will be removed.
1037
1038 *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
1039 attributes in this namespace will be serialized with prefix if possible.
1040
1041 ValueError is raised if prefix is reserved or is invalid.
1042
1043 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001044 if re.match("ns\d+$", prefix):
1045 raise ValueError("Prefix format reserved for internal use")
Georg Brandl90b20672010-12-28 10:38:33 +00001046 for k, v in list(_namespace_map.items()):
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001047 if k == uri or v == prefix:
1048 del _namespace_map[k]
1049 _namespace_map[uri] = prefix
1050
1051_namespace_map = {
1052 # "well-known" namespace prefixes
1053 "http://www.w3.org/XML/1998/namespace": "xml",
1054 "http://www.w3.org/1999/xhtml": "html",
1055 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
1056 "http://schemas.xmlsoap.org/wsdl/": "wsdl",
1057 # xml schema
1058 "http://www.w3.org/2001/XMLSchema": "xs",
1059 "http://www.w3.org/2001/XMLSchema-instance": "xsi",
1060 # dublin core
1061 "http://purl.org/dc/elements/1.1/": "dc",
1062}
Florent Xicluna16395052012-02-16 23:28:35 +01001063# For tests and troubleshooting
1064register_namespace._namespace_map = _namespace_map
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001065
1066def _raise_serialization_error(text):
1067 raise TypeError(
1068 "cannot serialize %r (type %s)" % (text, type(text).__name__)
1069 )
1070
1071def _escape_cdata(text):
1072 # escape character data
1073 try:
1074 # it's worth avoiding do-nothing calls for strings that are
1075 # shorter than 500 character, or so. assume that's, by far,
1076 # the most common case in most applications.
1077 if "&" in text:
1078 text = text.replace("&", "&amp;")
1079 if "<" in text:
1080 text = text.replace("<", "&lt;")
1081 if ">" in text:
1082 text = text.replace(">", "&gt;")
1083 return text
1084 except (TypeError, AttributeError):
1085 _raise_serialization_error(text)
1086
1087def _escape_attrib(text):
1088 # escape attribute value
1089 try:
1090 if "&" in text:
1091 text = text.replace("&", "&amp;")
1092 if "<" in text:
1093 text = text.replace("<", "&lt;")
1094 if ">" in text:
1095 text = text.replace(">", "&gt;")
1096 if "\"" in text:
1097 text = text.replace("\"", "&quot;")
1098 if "\n" in text:
1099 text = text.replace("\n", "&#10;")
1100 return text
1101 except (TypeError, AttributeError):
1102 _raise_serialization_error(text)
1103
1104def _escape_attrib_html(text):
1105 # escape attribute value
1106 try:
1107 if "&" in text:
1108 text = text.replace("&", "&amp;")
1109 if ">" in text:
1110 text = text.replace(">", "&gt;")
1111 if "\"" in text:
1112 text = text.replace("\"", "&quot;")
1113 return text
1114 except (TypeError, AttributeError):
1115 _raise_serialization_error(text)
1116
1117# --------------------------------------------------------------------
1118
Eli Benderskya9a2ef52013-01-13 06:04:43 -08001119def tostring(element, encoding=None, method=None, *,
1120 short_empty_elements=True):
Eli Bendersky84fae782013-03-09 07:12:48 -08001121 """Generate string representation of XML element.
1122
1123 All subelements are included. If encoding is "unicode", a string
1124 is returned. Otherwise a bytestring is returned.
1125
1126 *element* is an Element instance, *encoding* is an optional output
1127 encoding defaulting to US-ASCII, *method* is an optional output which can
1128 be one of "xml" (default), "html", "text" or "c14n".
1129
1130 Returns an (optionally) encoded string containing the XML data.
1131
1132 """
Eli Bendersky00f402b2012-07-15 06:02:22 +03001133 stream = io.StringIO() if encoding == 'unicode' else io.BytesIO()
Eli Benderskya9a2ef52013-01-13 06:04:43 -08001134 ElementTree(element).write(stream, encoding, method=method,
1135 short_empty_elements=short_empty_elements)
Eli Bendersky00f402b2012-07-15 06:02:22 +03001136 return stream.getvalue()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001137
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001138class _ListDataStream(io.BufferedIOBase):
Eli Bendersky84fae782013-03-09 07:12:48 -08001139 """An auxiliary stream accumulating into a list reference."""
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001140 def __init__(self, lst):
1141 self.lst = lst
Eli Benderskyf90fc682012-07-17 15:09:56 +03001142
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001143 def writable(self):
1144 return True
1145
1146 def seekable(self):
1147 return True
1148
1149 def write(self, b):
1150 self.lst.append(b)
1151
1152 def tell(self):
1153 return len(self.lst)
1154
Eli Benderskya9a2ef52013-01-13 06:04:43 -08001155def tostringlist(element, encoding=None, method=None, *,
1156 short_empty_elements=True):
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001157 lst = []
1158 stream = _ListDataStream(lst)
Eli Benderskya9a2ef52013-01-13 06:04:43 -08001159 ElementTree(element).write(stream, encoding, method=method,
1160 short_empty_elements=short_empty_elements)
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001161 return lst
Armin Rigo9ed73062005-12-14 18:10:45 +00001162
Armin Rigo9ed73062005-12-14 18:10:45 +00001163
1164def dump(elem):
Eli Bendersky84fae782013-03-09 07:12:48 -08001165 """Write element tree or element structure to sys.stdout.
1166
1167 This function should be used for debugging only.
1168
1169 *elem* is either an ElementTree, or a single Element. The exact output
1170 format is implementation dependent. In this version, it's written as an
1171 ordinary XML file.
1172
1173 """
Armin Rigo9ed73062005-12-14 18:10:45 +00001174 # debugging
1175 if not isinstance(elem, ElementTree):
1176 elem = ElementTree(elem)
Florent Xiclunac17f1722010-08-08 19:48:29 +00001177 elem.write(sys.stdout, encoding="unicode")
Armin Rigo9ed73062005-12-14 18:10:45 +00001178 tail = elem.getroot().tail
1179 if not tail or tail[-1] != "\n":
1180 sys.stdout.write("\n")
1181
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001182# --------------------------------------------------------------------
1183# parsing
Armin Rigo9ed73062005-12-14 18:10:45 +00001184
Armin Rigo9ed73062005-12-14 18:10:45 +00001185
1186def parse(source, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -08001187 """Parse XML document into element tree.
1188
1189 *source* is a filename or file object containing XML data,
1190 *parser* is an optional parser instance defaulting to XMLParser.
1191
1192 Return an ElementTree instance.
1193
1194 """
Armin Rigo9ed73062005-12-14 18:10:45 +00001195 tree = ElementTree()
1196 tree.parse(source, parser)
1197 return tree
1198
Armin Rigo9ed73062005-12-14 18:10:45 +00001199
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001200def iterparse(source, events=None, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -08001201 """Incrementally parse XML document into ElementTree.
1202
1203 This class also reports what's going on to the user based on the
1204 *events* it is initialized with. The supported events are the strings
1205 "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
1206 detailed namespace information). If *events* is omitted, only
1207 "end" events are reported.
1208
1209 *source* is a filename or file object containing XML data, *events* is
1210 a list of events to report back, *parser* is an optional parser instance.
1211
1212 Returns an iterator providing (event, elem) pairs.
1213
1214 """
Antoine Pitroue033e062010-10-29 10:38:18 +00001215 close_source = False
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001216 if not hasattr(source, "read"):
1217 source = open(source, "rb")
Antoine Pitroue033e062010-10-29 10:38:18 +00001218 close_source = True
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001219 if not parser:
1220 parser = XMLParser(target=TreeBuilder())
Antoine Pitroue033e062010-10-29 10:38:18 +00001221 return _IterParseIterator(source, events, parser, close_source)
Armin Rigo9ed73062005-12-14 18:10:45 +00001222
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001223class _IterParseIterator:
1224
Antoine Pitroue033e062010-10-29 10:38:18 +00001225 def __init__(self, source, events, parser, close_source=False):
Armin Rigo9ed73062005-12-14 18:10:45 +00001226 self._file = source
Antoine Pitroue033e062010-10-29 10:38:18 +00001227 self._close_file = close_source
Armin Rigo9ed73062005-12-14 18:10:45 +00001228 self._events = []
1229 self._index = 0
Florent Xicluna91d51932011-11-01 23:31:09 +01001230 self._error = None
Armin Rigo9ed73062005-12-14 18:10:45 +00001231 self.root = self._root = None
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001232 self._parser = parser
Armin Rigo9ed73062005-12-14 18:10:45 +00001233 # wire up the parser for event reporting
1234 parser = self._parser._parser
1235 append = self._events.append
1236 if events is None:
1237 events = ["end"]
1238 for event in events:
1239 if event == "start":
1240 try:
1241 parser.ordered_attributes = 1
1242 parser.specified_attributes = 1
1243 def handler(tag, attrib_in, event=event, append=append,
1244 start=self._parser._start_list):
1245 append((event, start(tag, attrib_in)))
1246 parser.StartElementHandler = handler
1247 except AttributeError:
1248 def handler(tag, attrib_in, event=event, append=append,
1249 start=self._parser._start):
1250 append((event, start(tag, attrib_in)))
1251 parser.StartElementHandler = handler
1252 elif event == "end":
1253 def handler(tag, event=event, append=append,
1254 end=self._parser._end):
1255 append((event, end(tag)))
1256 parser.EndElementHandler = handler
1257 elif event == "start-ns":
1258 def handler(prefix, uri, event=event, append=append):
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001259 append((event, (prefix or "", uri or "")))
Armin Rigo9ed73062005-12-14 18:10:45 +00001260 parser.StartNamespaceDeclHandler = handler
1261 elif event == "end-ns":
1262 def handler(prefix, event=event, append=append):
1263 append((event, None))
1264 parser.EndNamespaceDeclHandler = handler
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001265 else:
1266 raise ValueError("unknown event %r" % event)
Armin Rigo9ed73062005-12-14 18:10:45 +00001267
Georg Brandla18af4e2007-04-21 15:47:16 +00001268 def __next__(self):
Armin Rigo9ed73062005-12-14 18:10:45 +00001269 while 1:
1270 try:
1271 item = self._events[self._index]
Florent Xicluna91d51932011-11-01 23:31:09 +01001272 self._index += 1
Armin Rigo9ed73062005-12-14 18:10:45 +00001273 return item
Florent Xicluna91d51932011-11-01 23:31:09 +01001274 except IndexError:
1275 pass
1276 if self._error:
1277 e = self._error
1278 self._error = None
1279 raise e
1280 if self._parser is None:
1281 self.root = self._root
1282 if self._close_file:
1283 self._file.close()
1284 raise StopIteration
1285 # load event buffer
1286 del self._events[:]
1287 self._index = 0
1288 data = self._file.read(16384)
1289 if data:
1290 try:
1291 self._parser.feed(data)
1292 except SyntaxError as exc:
1293 self._error = exc
1294 else:
1295 self._root = self._parser.close()
1296 self._parser = None
Armin Rigo9ed73062005-12-14 18:10:45 +00001297
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001298 def __iter__(self):
1299 return self
Armin Rigo9ed73062005-12-14 18:10:45 +00001300
Armin Rigo9ed73062005-12-14 18:10:45 +00001301
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001302def XML(text, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -08001303 """Parse XML document from string constant.
1304
1305 This function can be used to embed "XML Literals" in Python code.
1306
1307 *text* is a string containing XML data, *parser* is an
1308 optional parser instance, defaulting to the standard XMLParser.
1309
1310 Returns an Element instance.
1311
1312 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001313 if not parser:
1314 parser = XMLParser(target=TreeBuilder())
Armin Rigo9ed73062005-12-14 18:10:45 +00001315 parser.feed(text)
1316 return parser.close()
1317
Armin Rigo9ed73062005-12-14 18:10:45 +00001318
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001319def XMLID(text, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -08001320 """Parse XML document from string constant for its IDs.
1321
1322 *text* is a string containing XML data, *parser* is an
1323 optional parser instance, defaulting to the standard XMLParser.
1324
1325 Returns an (Element, dict) tuple, in which the
1326 dict maps element id:s to elements.
1327
1328 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001329 if not parser:
1330 parser = XMLParser(target=TreeBuilder())
Armin Rigo9ed73062005-12-14 18:10:45 +00001331 parser.feed(text)
1332 tree = parser.close()
1333 ids = {}
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001334 for elem in tree.iter():
Armin Rigo9ed73062005-12-14 18:10:45 +00001335 id = elem.get("id")
1336 if id:
1337 ids[id] = elem
1338 return tree, ids
1339
Victor Stinner765531d2013-03-26 01:11:54 +01001340# Parse XML document from string constant. Alias for XML().
Armin Rigo9ed73062005-12-14 18:10:45 +00001341fromstring = XML
Armin Rigo9ed73062005-12-14 18:10:45 +00001342
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001343def fromstringlist(sequence, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -08001344 """Parse XML document from sequence of string fragments.
1345
1346 *sequence* is a list of other sequence, *parser* is an optional parser
1347 instance, defaulting to the standard XMLParser.
1348
1349 Returns an Element instance.
1350
1351 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001352 if not parser:
1353 parser = XMLParser(target=TreeBuilder())
1354 for text in sequence:
1355 parser.feed(text)
1356 return parser.close()
1357
1358# --------------------------------------------------------------------
Armin Rigo9ed73062005-12-14 18:10:45 +00001359
Armin Rigo9ed73062005-12-14 18:10:45 +00001360
1361class TreeBuilder:
Eli Bendersky84fae782013-03-09 07:12:48 -08001362 """Generic element structure builder.
Armin Rigo9ed73062005-12-14 18:10:45 +00001363
Eli Bendersky84fae782013-03-09 07:12:48 -08001364 This builder converts a sequence of start, data, and end method
1365 calls to a well-formed element structure.
1366
1367 You can use this class to build an element structure using a custom XML
1368 parser, or a parser for some other XML-like format.
1369
1370 *element_factory* is an optional element factory which is called
1371 to create new Element instances, as necessary.
1372
1373 """
Armin Rigo9ed73062005-12-14 18:10:45 +00001374 def __init__(self, element_factory=None):
1375 self._data = [] # data collector
1376 self._elem = [] # element stack
1377 self._last = None # last element
1378 self._tail = None # true if we're after an end tag
1379 if element_factory is None:
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001380 element_factory = Element
Armin Rigo9ed73062005-12-14 18:10:45 +00001381 self._factory = element_factory
1382
Armin Rigo9ed73062005-12-14 18:10:45 +00001383 def close(self):
Eli Bendersky84fae782013-03-09 07:12:48 -08001384 """Flush builder buffers and return toplevel document Element."""
Armin Rigo9ed73062005-12-14 18:10:45 +00001385 assert len(self._elem) == 0, "missing end tags"
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001386 assert self._last is not None, "missing toplevel element"
Armin Rigo9ed73062005-12-14 18:10:45 +00001387 return self._last
1388
1389 def _flush(self):
1390 if self._data:
1391 if self._last is not None:
Neal Norwitz9d72bb42007-04-17 08:48:32 +00001392 text = "".join(self._data)
Armin Rigo9ed73062005-12-14 18:10:45 +00001393 if self._tail:
1394 assert self._last.tail is None, "internal error (tail)"
1395 self._last.tail = text
1396 else:
1397 assert self._last.text is None, "internal error (text)"
1398 self._last.text = text
1399 self._data = []
1400
Armin Rigo9ed73062005-12-14 18:10:45 +00001401
1402 def data(self, data):
Eli Bendersky84fae782013-03-09 07:12:48 -08001403 """Add text to current element."""
Armin Rigo9ed73062005-12-14 18:10:45 +00001404 self._data.append(data)
1405
Armin Rigo9ed73062005-12-14 18:10:45 +00001406
1407 def start(self, tag, attrs):
Eli Bendersky84fae782013-03-09 07:12:48 -08001408 """Open new element and return it.
1409
1410 *tag* is the element name, *attrs* is a dict containing element
1411 attributes.
1412
1413 """
Armin Rigo9ed73062005-12-14 18:10:45 +00001414 self._flush()
1415 self._last = elem = self._factory(tag, attrs)
1416 if self._elem:
1417 self._elem[-1].append(elem)
1418 self._elem.append(elem)
1419 self._tail = 0
1420 return elem
1421
Armin Rigo9ed73062005-12-14 18:10:45 +00001422
1423 def end(self, tag):
Eli Bendersky84fae782013-03-09 07:12:48 -08001424 """Close and return current Element.
1425
1426 *tag* is the element name.
1427
1428 """
Armin Rigo9ed73062005-12-14 18:10:45 +00001429 self._flush()
1430 self._last = self._elem.pop()
1431 assert self._last.tag == tag,\
1432 "end tag mismatch (expected %s, got %s)" % (
1433 self._last.tag, tag)
1434 self._tail = 1
1435 return self._last
1436
Armin Rigo9ed73062005-12-14 18:10:45 +00001437
Eli Bendersky84fae782013-03-09 07:12:48 -08001438# also see ElementTree and TreeBuilder
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001439class XMLParser:
Eli Bendersky84fae782013-03-09 07:12:48 -08001440 """Element structure builder for XML source data based on the expat parser.
1441
1442 *html* are predefined HTML entities (not supported currently),
1443 *target* is an optional target object which defaults to an instance of the
1444 standard TreeBuilder class, *encoding* is an optional encoding string
1445 which if given, overrides the encoding specified in the XML file:
1446 http://www.iana.org/assignments/character-sets
1447
1448 """
Armin Rigo9ed73062005-12-14 18:10:45 +00001449
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001450 def __init__(self, html=0, target=None, encoding=None):
Armin Rigo9ed73062005-12-14 18:10:45 +00001451 try:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001452 from xml.parsers import expat
Armin Rigo9ed73062005-12-14 18:10:45 +00001453 except ImportError:
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001454 try:
1455 import pyexpat as expat
1456 except ImportError:
1457 raise ImportError(
1458 "No module named expat; use SimpleXMLTreeBuilder instead"
1459 )
1460 parser = expat.ParserCreate(encoding, "}")
Armin Rigo9ed73062005-12-14 18:10:45 +00001461 if target is None:
1462 target = TreeBuilder()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001463 # underscored names are provided for compatibility only
1464 self.parser = self._parser = parser
1465 self.target = self._target = target
1466 self._error = expat.error
Armin Rigo9ed73062005-12-14 18:10:45 +00001467 self._names = {} # name memo cache
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001468 # main callbacks
Armin Rigo9ed73062005-12-14 18:10:45 +00001469 parser.DefaultHandlerExpand = self._default
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001470 if hasattr(target, 'start'):
1471 parser.StartElementHandler = self._start
1472 if hasattr(target, 'end'):
1473 parser.EndElementHandler = self._end
1474 if hasattr(target, 'data'):
1475 parser.CharacterDataHandler = target.data
1476 # miscellaneous callbacks
1477 if hasattr(target, 'comment'):
1478 parser.CommentHandler = target.comment
1479 if hasattr(target, 'pi'):
1480 parser.ProcessingInstructionHandler = target.pi
Armin Rigo9ed73062005-12-14 18:10:45 +00001481 # let expat do the buffering, if supported
1482 try:
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001483 parser.buffer_text = 1
Armin Rigo9ed73062005-12-14 18:10:45 +00001484 except AttributeError:
1485 pass
1486 # use new-style attribute handling, if supported
1487 try:
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001488 parser.ordered_attributes = 1
1489 parser.specified_attributes = 1
1490 if hasattr(target, 'start'):
1491 parser.StartElementHandler = self._start_list
Armin Rigo9ed73062005-12-14 18:10:45 +00001492 except AttributeError:
1493 pass
Armin Rigo9ed73062005-12-14 18:10:45 +00001494 self._doctype = None
1495 self.entity = {}
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001496 try:
1497 self.version = "Expat %d.%d.%d" % expat.version_info
1498 except AttributeError:
1499 pass # unknown
1500
1501 def _raiseerror(self, value):
1502 err = ParseError(value)
1503 err.code = value.code
1504 err.position = value.lineno, value.offset
1505 raise err
Armin Rigo9ed73062005-12-14 18:10:45 +00001506
Armin Rigo9ed73062005-12-14 18:10:45 +00001507 def _fixname(self, key):
1508 # expand qname, and convert name string to ascii, if possible
1509 try:
1510 name = self._names[key]
1511 except KeyError:
1512 name = key
1513 if "}" in name:
1514 name = "{" + name
Martin v. Löwisf30bb0e2007-07-28 11:40:46 +00001515 self._names[key] = name
Armin Rigo9ed73062005-12-14 18:10:45 +00001516 return name
1517
1518 def _start(self, tag, attrib_in):
1519 fixname = self._fixname
1520 tag = fixname(tag)
1521 attrib = {}
1522 for key, value in attrib_in.items():
Martin v. Löwisf30bb0e2007-07-28 11:40:46 +00001523 attrib[fixname(key)] = value
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001524 return self.target.start(tag, attrib)
Armin Rigo9ed73062005-12-14 18:10:45 +00001525
1526 def _start_list(self, tag, attrib_in):
1527 fixname = self._fixname
1528 tag = fixname(tag)
1529 attrib = {}
1530 if attrib_in:
1531 for i in range(0, len(attrib_in), 2):
Martin v. Löwisf30bb0e2007-07-28 11:40:46 +00001532 attrib[fixname(attrib_in[i])] = attrib_in[i+1]
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001533 return self.target.start(tag, attrib)
Armin Rigo9ed73062005-12-14 18:10:45 +00001534
Armin Rigo9ed73062005-12-14 18:10:45 +00001535 def _end(self, tag):
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001536 return self.target.end(self._fixname(tag))
1537
Armin Rigo9ed73062005-12-14 18:10:45 +00001538 def _default(self, text):
1539 prefix = text[:1]
1540 if prefix == "&":
1541 # deal with undefined entities
1542 try:
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001543 data_handler = self.target.data
1544 except AttributeError:
1545 return
1546 try:
1547 data_handler(self.entity[text[1:-1]])
Armin Rigo9ed73062005-12-14 18:10:45 +00001548 except KeyError:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001549 from xml.parsers import expat
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001550 err = expat.error(
Armin Rigo9ed73062005-12-14 18:10:45 +00001551 "undefined entity %s: line %d, column %d" %
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001552 (text, self.parser.ErrorLineNumber,
1553 self.parser.ErrorColumnNumber)
Armin Rigo9ed73062005-12-14 18:10:45 +00001554 )
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001555 err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001556 err.lineno = self.parser.ErrorLineNumber
1557 err.offset = self.parser.ErrorColumnNumber
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001558 raise err
Armin Rigo9ed73062005-12-14 18:10:45 +00001559 elif prefix == "<" and text[:9] == "<!DOCTYPE":
1560 self._doctype = [] # inside a doctype declaration
1561 elif self._doctype is not None:
1562 # parse doctype contents
1563 if prefix == ">":
1564 self._doctype = None
1565 return
Neal Norwitz9d72bb42007-04-17 08:48:32 +00001566 text = text.strip()
Armin Rigo9ed73062005-12-14 18:10:45 +00001567 if not text:
1568 return
1569 self._doctype.append(text)
1570 n = len(self._doctype)
1571 if n > 2:
1572 type = self._doctype[1]
1573 if type == "PUBLIC" and n == 4:
1574 name, type, pubid, system = self._doctype
Florent Xiclunaa1c974a2012-07-07 13:16:44 +02001575 if pubid:
1576 pubid = pubid[1:-1]
Armin Rigo9ed73062005-12-14 18:10:45 +00001577 elif type == "SYSTEM" and n == 3:
1578 name, type, system = self._doctype
1579 pubid = None
1580 else:
1581 return
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001582 if hasattr(self.target, "doctype"):
1583 self.target.doctype(name, pubid, system[1:-1])
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001584 elif self.doctype != self._XMLParser__doctype:
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001585 # warn about deprecated call
1586 self._XMLParser__doctype(name, pubid, system[1:-1])
1587 self.doctype(name, pubid, system[1:-1])
Armin Rigo9ed73062005-12-14 18:10:45 +00001588 self._doctype = None
1589
Armin Rigo9ed73062005-12-14 18:10:45 +00001590 def doctype(self, name, pubid, system):
Eli Bendersky84fae782013-03-09 07:12:48 -08001591 """(Deprecated) Handle doctype declaration
1592
1593 *name* is the Doctype name, *pubid* is the public identifier,
1594 and *system* is the system identifier.
1595
1596 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001597 warnings.warn(
1598 "This method of XMLParser is deprecated. Define doctype() "
1599 "method on the TreeBuilder target.",
1600 DeprecationWarning,
1601 )
1602
1603 # sentinel, if doctype is redefined in a subclass
1604 __doctype = doctype
Armin Rigo9ed73062005-12-14 18:10:45 +00001605
Armin Rigo9ed73062005-12-14 18:10:45 +00001606 def feed(self, data):
Eli Bendersky84fae782013-03-09 07:12:48 -08001607 """Feed encoded data to parser."""
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001608 try:
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001609 self.parser.Parse(data, 0)
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001610 except self._error as v:
1611 self._raiseerror(v)
Armin Rigo9ed73062005-12-14 18:10:45 +00001612
Armin Rigo9ed73062005-12-14 18:10:45 +00001613 def close(self):
Eli Bendersky84fae782013-03-09 07:12:48 -08001614 """Finish feeding data to parser and return element structure."""
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001615 try:
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001616 self.parser.Parse("", 1) # end of data
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001617 except self._error as v:
1618 self._raiseerror(v)
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001619 try:
Florent Xiclunafb067462012-03-05 11:42:49 +01001620 close_handler = self.target.close
1621 except AttributeError:
1622 pass
1623 else:
1624 return close_handler()
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001625 finally:
1626 # get rid of circular references
1627 del self.parser, self._parser
1628 del self.target, self._target
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001629
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001630
1631# Import the C accelerators
1632try:
1633 # Element, SubElement, ParseError, TreeBuilder, XMLParser
1634 from _elementtree import *
1635except ImportError:
1636 pass
1637else:
1638 # Overwrite 'ElementTree.parse' and 'iterparse' to use the C XMLParser
1639
1640 class ElementTree(ElementTree):
Eli Bendersky84fae782013-03-09 07:12:48 -08001641 __doc__ = ElementTree.__doc__
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001642 def parse(self, source, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -08001643 __doc__ = ElementTree.parse.__doc__
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001644 close_source = False
1645 if not hasattr(source, 'read'):
1646 source = open(source, 'rb')
1647 close_source = True
1648 try:
1649 if parser is not None:
1650 while True:
1651 data = source.read(65536)
1652 if not data:
1653 break
1654 parser.feed(data)
1655 self._root = parser.close()
1656 else:
1657 parser = XMLParser()
1658 self._root = parser._parse(source)
1659 return self._root
1660 finally:
1661 if close_source:
1662 source.close()
1663
1664 class iterparse:
Eli Bendersky84fae782013-03-09 07:12:48 -08001665 __doc__ = iterparse.__doc__
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001666 root = None
Eli Bendersky84fae782013-03-09 07:12:48 -08001667 def __init__(self, source, events=None, parser=None):
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001668 self._close_file = False
Eli Bendersky84fae782013-03-09 07:12:48 -08001669 if not hasattr(source, 'read'):
1670 source = open(source, 'rb')
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001671 self._close_file = True
Eli Bendersky84fae782013-03-09 07:12:48 -08001672 self._file = source
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001673 self._events = []
1674 self._index = 0
1675 self._error = None
1676 self.root = self._root = None
Eli Benderskyaaa97802013-01-24 07:15:19 -08001677 if parser is None:
1678 parser = XMLParser(target=TreeBuilder())
1679 self._parser = parser
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001680 self._parser._setevents(self._events, events)
1681
1682 def __next__(self):
1683 while True:
1684 try:
1685 item = self._events[self._index]
1686 self._index += 1
1687 return item
1688 except IndexError:
1689 pass
1690 if self._error:
1691 e = self._error
1692 self._error = None
1693 raise e
1694 if self._parser is None:
1695 self.root = self._root
1696 if self._close_file:
1697 self._file.close()
1698 raise StopIteration
1699 # load event buffer
1700 del self._events[:]
1701 self._index = 0
1702 data = self._file.read(16384)
1703 if data:
1704 try:
1705 self._parser.feed(data)
1706 except SyntaxError as exc:
1707 self._error = exc
1708 else:
1709 self._root = self._parser.close()
1710 self._parser = None
1711
1712 def __iter__(self):
1713 return self
1714
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001715# compatibility
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001716XMLTreeBuilder = XMLParser
1717
1718# workaround circular import.
1719try:
1720 from ElementC14N import _serialize_c14n
1721 _serialize["c14n"] = _serialize_c14n
1722except ImportError:
1723 pass