blob: bb32a8f03be9e3cff78d0371dbd075c647e67f4c [file] [log] [blame]
Eli Bendersky84fae782013-03-09 07:12:48 -08001"""Lightweight XML support for Python.
2
3 XML is an inherently hierarchical data format, and the most natural way to
4 represent it is with a tree. This module has two classes for this purpose:
5
6 1. ElementTree represents the whole XML document as a tree and
7
8 2. Element represents a single node in this tree.
9
10 Interactions with the whole document (reading and writing to/from files) are
11 usually done on the ElementTree level. Interactions with a single XML element
12 and its sub-elements are done on the Element level.
13
14 Element is a flexible container object designed to store hierarchical data
15 structures in memory. It can be described as a cross between a list and a
16 dictionary. Each Element has a number of properties associated with it:
17
18 'tag' - a string containing the element's name.
19
20 'attributes' - a Python dictionary storing the element's attributes.
21
22 'text' - a string containing the element's text content.
23
24 'tail' - an optional string containing text after the element's end tag.
25
26 And a number of child elements stored in a Python sequence.
27
28 To create an element instance, use the Element constructor,
29 or the SubElement factory function.
30
31 You can also use the ElementTree class to wrap an element structure
32 and convert it to and from XML.
33
34"""
35
Eli Benderskybf05df22013-04-20 05:44:01 -070036#---------------------------------------------------------------------
37# Licensed to PSF under a Contributor Agreement.
38# See http://www.python.org/psf/license for licensing details.
Armin Rigo9ed73062005-12-14 18:10:45 +000039#
40# ElementTree
Florent Xiclunaf15351d2010-03-13 23:24:31 +000041# Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
Armin Rigo9ed73062005-12-14 18:10:45 +000042#
43# fredrik@pythonware.com
44# http://www.pythonware.com
Armin Rigo9ed73062005-12-14 18:10:45 +000045# --------------------------------------------------------------------
46# The ElementTree toolkit is
47#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000048# Copyright (c) 1999-2008 by Fredrik Lundh
Armin Rigo9ed73062005-12-14 18:10:45 +000049#
50# By obtaining, using, and/or copying this software and/or its
51# associated documentation, you agree that you have read, understood,
52# and will comply with the following terms and conditions:
53#
54# Permission to use, copy, modify, and distribute this software and
55# its associated documentation for any purpose and without fee is
56# hereby granted, provided that the above copyright notice appears in
57# all copies, and that both that copyright notice and this permission
58# notice appear in supporting documentation, and that the name of
59# Secret Labs AB or the author not be used in advertising or publicity
60# pertaining to distribution of the software without specific, written
61# prior permission.
62#
63# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
70# OF THIS SOFTWARE.
71# --------------------------------------------------------------------
72
73__all__ = [
74 # public symbols
75 "Comment",
76 "dump",
77 "Element", "ElementTree",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000078 "fromstring", "fromstringlist",
Armin Rigo9ed73062005-12-14 18:10:45 +000079 "iselement", "iterparse",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000080 "parse", "ParseError",
Armin Rigo9ed73062005-12-14 18:10:45 +000081 "PI", "ProcessingInstruction",
82 "QName",
83 "SubElement",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000084 "tostring", "tostringlist",
Armin Rigo9ed73062005-12-14 18:10:45 +000085 "TreeBuilder",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000086 "VERSION",
Florent Xiclunaa72a98f2012-02-13 11:03:30 +010087 "XML", "XMLID",
Eli Benderskyc4e98a62013-05-19 09:24:43 -070088 "XMLParser",
Florent Xiclunaa72a98f2012-02-13 11:03:30 +010089 "register_namespace",
Armin Rigo9ed73062005-12-14 18:10:45 +000090 ]
91
Florent Xiclunaf15351d2010-03-13 23:24:31 +000092VERSION = "1.3.0"
93
Florent Xiclunaf15351d2010-03-13 23:24:31 +000094import sys
95import re
96import warnings
Eli Bendersky00f402b2012-07-15 06:02:22 +030097import io
98import contextlib
Armin Rigo9ed73062005-12-14 18:10:45 +000099
Eli Bendersky27cbb192012-06-15 09:03:19 +0300100from . import ElementPath
Armin Rigo9ed73062005-12-14 18:10:45 +0000101
Armin Rigo9ed73062005-12-14 18:10:45 +0000102
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000103class ParseError(SyntaxError):
Eli Bendersky84fae782013-03-09 07:12:48 -0800104 """An error when parsing an XML document.
105
106 In addition to its exception value, a ParseError contains
107 two extra attributes:
108 'code' - the specific exception code
109 'position' - the line and column of the error
110
111 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000112 pass
113
114# --------------------------------------------------------------------
115
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000116
117def iselement(element):
Eli Bendersky84fae782013-03-09 07:12:48 -0800118 """Return True if *element* appears to be an Element."""
Florent Xiclunaa72a98f2012-02-13 11:03:30 +0100119 return hasattr(element, 'tag')
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000120
Armin Rigo9ed73062005-12-14 18:10:45 +0000121
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000122class Element:
Eli Bendersky84fae782013-03-09 07:12:48 -0800123 """An XML element.
Armin Rigo9ed73062005-12-14 18:10:45 +0000124
Eli Bendersky84fae782013-03-09 07:12:48 -0800125 This class is the reference implementation of the Element interface.
126
127 An element's length is its number of subelements. That means if you
Serhiy Storchaka56a6d852014-12-01 18:28:43 +0200128 want to check if an element is truly empty, you should check BOTH
Eli Bendersky84fae782013-03-09 07:12:48 -0800129 its length AND its text attribute.
130
131 The element tag, attribute names, and attribute values can be either
132 bytes or strings.
133
134 *tag* is the element name. *attrib* is an optional dictionary containing
135 element attributes. *extra* are additional element attributes given as
136 keyword arguments.
137
138 Example form:
139 <tag attrib>text<child/>...</tag>tail
140
141 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000142
143 tag = None
Eli Bendersky84fae782013-03-09 07:12:48 -0800144 """The element's name."""
Armin Rigo9ed73062005-12-14 18:10:45 +0000145
146 attrib = None
Eli Bendersky84fae782013-03-09 07:12:48 -0800147 """Dictionary of the element's attributes."""
Armin Rigo9ed73062005-12-14 18:10:45 +0000148
149 text = None
Eli Bendersky84fae782013-03-09 07:12:48 -0800150 """
151 Text before first subelement. This is either a string or the value None.
152 Note that if there is no text, this attribute may be either
153 None or the empty string, depending on the parser.
Armin Rigo9ed73062005-12-14 18:10:45 +0000154
Eli Bendersky84fae782013-03-09 07:12:48 -0800155 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000156
Eli Bendersky84fae782013-03-09 07:12:48 -0800157 tail = None
158 """
159 Text after this element's end tag, but before the next sibling element's
160 start tag. This is either a string or the value None. Note that if there
161 was no text, this attribute may be either None or an empty string,
162 depending on the parser.
Armin Rigo9ed73062005-12-14 18:10:45 +0000163
Eli Bendersky84fae782013-03-09 07:12:48 -0800164 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000165
166 def __init__(self, tag, attrib={}, **extra):
Eli Bendersky737b1732012-05-29 06:02:56 +0300167 if not isinstance(attrib, dict):
168 raise TypeError("attrib must be dict, not %s" % (
169 attrib.__class__.__name__,))
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000170 attrib = attrib.copy()
171 attrib.update(extra)
Armin Rigo9ed73062005-12-14 18:10:45 +0000172 self.tag = tag
173 self.attrib = attrib
174 self._children = []
175
176 def __repr__(self):
Serhiy Storchaka465e60e2014-07-25 23:36:00 +0300177 return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))
Armin Rigo9ed73062005-12-14 18:10:45 +0000178
Armin Rigo9ed73062005-12-14 18:10:45 +0000179 def makeelement(self, tag, attrib):
Eli Bendersky84fae782013-03-09 07:12:48 -0800180 """Create a new element with the same type.
181
182 *tag* is a string containing the element name.
183 *attrib* is a dictionary containing the element attributes.
184
185 Do not call this method, use the SubElement factory function instead.
186
187 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000188 return self.__class__(tag, attrib)
Armin Rigo9ed73062005-12-14 18:10:45 +0000189
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000190 def copy(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800191 """Return copy of current element.
192
193 This creates a shallow copy. Subelements will be shared with the
194 original tree.
195
196 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000197 elem = self.makeelement(self.tag, self.attrib)
198 elem.text = self.text
199 elem.tail = self.tail
200 elem[:] = self
201 return elem
202
Armin Rigo9ed73062005-12-14 18:10:45 +0000203 def __len__(self):
204 return len(self._children)
205
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000206 def __bool__(self):
207 warnings.warn(
208 "The behavior of this method will change in future versions. "
209 "Use specific 'len(elem)' or 'elem is not None' test instead.",
210 FutureWarning, stacklevel=2
211 )
212 return len(self._children) != 0 # emulate old behaviour, for now
213
Armin Rigo9ed73062005-12-14 18:10:45 +0000214 def __getitem__(self, index):
215 return self._children[index]
216
Armin Rigo9ed73062005-12-14 18:10:45 +0000217 def __setitem__(self, index, element):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000218 # if isinstance(index, slice):
219 # for elt in element:
220 # assert iselement(elt)
221 # else:
222 # assert iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000223 self._children[index] = element
224
Armin Rigo9ed73062005-12-14 18:10:45 +0000225 def __delitem__(self, index):
226 del self._children[index]
227
Eli Bendersky84fae782013-03-09 07:12:48 -0800228 def append(self, subelement):
229 """Add *subelement* to the end of this element.
Armin Rigo9ed73062005-12-14 18:10:45 +0000230
Eli Bendersky84fae782013-03-09 07:12:48 -0800231 The new element will appear in document order after the last existing
232 subelement (or directly after the text, if it's the first subelement),
233 but before the end tag for this element.
Armin Rigo9ed73062005-12-14 18:10:45 +0000234
Eli Bendersky84fae782013-03-09 07:12:48 -0800235 """
236 self._assert_is_element(subelement)
237 self._children.append(subelement)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000238
239 def extend(self, elements):
Eli Bendersky84fae782013-03-09 07:12:48 -0800240 """Append subelements from a sequence.
241
242 *elements* is a sequence with zero or more elements.
243
244 """
Eli Bendersky396e8fc2012-03-23 14:24:20 +0200245 for element in elements:
246 self._assert_is_element(element)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000247 self._children.extend(elements)
248
Eli Bendersky84fae782013-03-09 07:12:48 -0800249 def insert(self, index, subelement):
250 """Insert *subelement* at position *index*."""
251 self._assert_is_element(subelement)
252 self._children.insert(index, subelement)
Armin Rigo9ed73062005-12-14 18:10:45 +0000253
Eli Bendersky396e8fc2012-03-23 14:24:20 +0200254 def _assert_is_element(self, e):
Antoine Pitrouee329312012-10-04 19:53:29 +0200255 # Need to refer to the actual Python implementation, not the
256 # shadowing C implementation.
Eli Bendersky46955b22013-05-19 09:20:50 -0700257 if not isinstance(e, _Element_Py):
Eli Bendersky396e8fc2012-03-23 14:24:20 +0200258 raise TypeError('expected an Element, not %s' % type(e).__name__)
259
Eli Bendersky84fae782013-03-09 07:12:48 -0800260 def remove(self, subelement):
261 """Remove matching subelement.
262
263 Unlike the find methods, this method compares elements based on
264 identity, NOT ON tag value or contents. To remove subelements by
265 other means, the easiest way is to use a list comprehension to
266 select what elements to keep, and then use slice assignment to update
267 the parent element.
268
269 ValueError is raised if a matching element could not be found.
270
271 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000272 # assert iselement(element)
Eli Bendersky84fae782013-03-09 07:12:48 -0800273 self._children.remove(subelement)
Armin Rigo9ed73062005-12-14 18:10:45 +0000274
275 def getchildren(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800276 """(Deprecated) Return all subelements.
277
278 Elements are returned in document order.
279
280 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000281 warnings.warn(
282 "This method will be removed in future versions. "
283 "Use 'list(elem)' or iteration over elem instead.",
284 DeprecationWarning, stacklevel=2
285 )
Armin Rigo9ed73062005-12-14 18:10:45 +0000286 return self._children
287
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000288 def find(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800289 """Find first matching element by tag name or path.
290
291 *path* is a string having either an element tag or an XPath,
292 *namespaces* is an optional mapping from namespace prefix to full name.
293
294 Return the first matching element, or None if no element was found.
295
296 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000297 return ElementPath.find(self, path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000298
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000299 def findtext(self, path, default=None, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800300 """Find text for first matching element by tag name or path.
301
302 *path* is a string having either an element tag or an XPath,
303 *default* is the value to return if the element was not found,
304 *namespaces* is an optional mapping from namespace prefix to full name.
305
306 Return text content of first matching element, or default value if
307 none was found. Note that if an element is found having no text
308 content, the empty string is returned.
309
310 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000311 return ElementPath.findtext(self, path, default, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000312
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000313 def findall(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800314 """Find all matching subelements by tag name or path.
315
316 *path* is a string having either an element tag or an XPath,
317 *namespaces* is an optional mapping from namespace prefix to full name.
318
319 Returns list containing all matching elements in document order.
320
321 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000322 return ElementPath.findall(self, path, namespaces)
323
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000324 def iterfind(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800325 """Find all matching subelements by tag name or path.
326
327 *path* is a string having either an element tag or an XPath,
328 *namespaces* is an optional mapping from namespace prefix to full name.
329
330 Return an iterable yielding all matching elements in document order.
331
332 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000333 return ElementPath.iterfind(self, path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000334
Armin Rigo9ed73062005-12-14 18:10:45 +0000335 def clear(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800336 """Reset element.
337
338 This function removes all subelements, clears all attributes, and sets
339 the text and tail attributes to None.
340
341 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000342 self.attrib.clear()
343 self._children = []
344 self.text = self.tail = None
345
Armin Rigo9ed73062005-12-14 18:10:45 +0000346 def get(self, key, default=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800347 """Get element attribute.
348
349 Equivalent to attrib.get, but some implementations may handle this a
350 bit more efficiently. *key* is what attribute to look for, and
351 *default* is what to return if the attribute was not found.
352
353 Returns a string containing the attribute value, or the default if
354 attribute was not found.
355
356 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000357 return self.attrib.get(key, default)
358
Armin Rigo9ed73062005-12-14 18:10:45 +0000359 def set(self, key, value):
Eli Bendersky84fae782013-03-09 07:12:48 -0800360 """Set element attribute.
361
362 Equivalent to attrib[key] = value, but some implementations may handle
363 this a bit more efficiently. *key* is what attribute to set, and
364 *value* is the attribute value to set it to.
365
366 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000367 self.attrib[key] = value
368
Armin Rigo9ed73062005-12-14 18:10:45 +0000369 def keys(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800370 """Get list of attribute names.
371
372 Names are returned in an arbitrary order, just like an ordinary
373 Python dict. Equivalent to attrib.keys()
374
375 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000376 return self.attrib.keys()
377
Armin Rigo9ed73062005-12-14 18:10:45 +0000378 def items(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800379 """Get element attributes as a sequence.
380
381 The attributes are returned in arbitrary order. Equivalent to
382 attrib.items().
383
384 Return a list of (name, value) tuples.
385
386 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000387 return self.attrib.items()
388
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000389 def iter(self, tag=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800390 """Create tree iterator.
391
392 The iterator loops over the element and all subelements in document
393 order, returning all elements with a matching tag.
394
395 If the tree structure is modified during iteration, new or removed
396 elements may or may not be included. To get a stable set, use the
397 list() function on the iterator, and loop over the resulting list.
398
399 *tag* is what tags to look for (default is to return all elements)
400
401 Return an iterator containing all the matching elements.
402
403 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000404 if tag == "*":
405 tag = None
406 if tag is None or self.tag == tag:
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000407 yield self
408 for e in self._children:
Philip Jenveyfd0d3e52012-10-01 15:34:31 -0700409 yield from e.iter(tag)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000410
411 # compatibility
412 def getiterator(self, tag=None):
413 # Change for a DeprecationWarning in 1.4
414 warnings.warn(
415 "This method will be removed in future versions. "
416 "Use 'elem.iter()' or 'list(elem.iter())' instead.",
417 PendingDeprecationWarning, stacklevel=2
418 )
419 return list(self.iter(tag))
420
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000421 def itertext(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800422 """Create text iterator.
423
424 The iterator loops over the element and all subelements in document
425 order, returning all inner text.
426
427 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000428 tag = self.tag
429 if not isinstance(tag, str) and tag is not None:
430 return
431 if self.text:
432 yield self.text
433 for e in self:
Philip Jenveyfd0d3e52012-10-01 15:34:31 -0700434 yield from e.itertext()
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000435 if e.tail:
436 yield e.tail
Armin Rigo9ed73062005-12-14 18:10:45 +0000437
Armin Rigo9ed73062005-12-14 18:10:45 +0000438
439def SubElement(parent, tag, attrib={}, **extra):
Eli Bendersky84fae782013-03-09 07:12:48 -0800440 """Subelement factory which creates an element instance, and appends it
441 to an existing parent.
442
443 The element tag, attribute names, and attribute values can be either
444 bytes or Unicode strings.
445
446 *parent* is the parent element, *tag* is the subelements name, *attrib* is
447 an optional directory containing element attributes, *extra* are
448 additional attributes given as keyword arguments.
449
450 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000451 attrib = attrib.copy()
452 attrib.update(extra)
453 element = parent.makeelement(tag, attrib)
454 parent.append(element)
455 return element
456
Armin Rigo9ed73062005-12-14 18:10:45 +0000457
458def Comment(text=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800459 """Comment element factory.
460
461 This function creates a special element which the standard serializer
462 serializes as an XML comment.
463
464 *text* is a string containing the comment string.
465
466 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000467 element = Element(Comment)
468 element.text = text
469 return element
470
Armin Rigo9ed73062005-12-14 18:10:45 +0000471
472def ProcessingInstruction(target, text=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800473 """Processing Instruction element factory.
474
475 This function creates a special element which the standard serializer
476 serializes as an XML comment.
477
478 *target* is a string containing the processing instruction, *text* is a
479 string containing the processing instruction contents, if any.
480
481 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000482 element = Element(ProcessingInstruction)
483 element.text = target
484 if text:
485 element.text = element.text + " " + text
486 return element
487
488PI = ProcessingInstruction
489
Armin Rigo9ed73062005-12-14 18:10:45 +0000490
491class QName:
Eli Bendersky84fae782013-03-09 07:12:48 -0800492 """Qualified name wrapper.
493
494 This class can be used to wrap a QName attribute value in order to get
495 proper namespace handing on output.
496
497 *text_or_uri* is a string containing the QName value either in the form
498 {uri}local, or if the tag argument is given, the URI part of a QName.
499
500 *tag* is an optional argument which if given, will make the first
501 argument (text_or_uri) be interpreted as a URI, and this argument (tag)
502 be interpreted as a local name.
503
504 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000505 def __init__(self, text_or_uri, tag=None):
506 if tag:
507 text_or_uri = "{%s}%s" % (text_or_uri, tag)
508 self.text = text_or_uri
509 def __str__(self):
510 return self.text
Georg Brandlb56c0e22010-12-09 18:10:27 +0000511 def __repr__(self):
Serhiy Storchaka465e60e2014-07-25 23:36:00 +0300512 return '<%s %r>' % (self.__class__.__name__, self.text)
Armin Rigo9ed73062005-12-14 18:10:45 +0000513 def __hash__(self):
514 return hash(self.text)
Mark Dickinsona56c4672009-01-27 18:17:45 +0000515 def __le__(self, other):
Armin Rigo9ed73062005-12-14 18:10:45 +0000516 if isinstance(other, QName):
Mark Dickinsona56c4672009-01-27 18:17:45 +0000517 return self.text <= other.text
518 return self.text <= other
519 def __lt__(self, other):
520 if isinstance(other, QName):
521 return self.text < other.text
522 return self.text < other
523 def __ge__(self, other):
524 if isinstance(other, QName):
525 return self.text >= other.text
526 return self.text >= other
527 def __gt__(self, other):
528 if isinstance(other, QName):
529 return self.text > other.text
530 return self.text > other
531 def __eq__(self, other):
532 if isinstance(other, QName):
533 return self.text == other.text
534 return self.text == other
Armin Rigo9ed73062005-12-14 18:10:45 +0000535
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000536# --------------------------------------------------------------------
537
Armin Rigo9ed73062005-12-14 18:10:45 +0000538
539class ElementTree:
Eli Bendersky84fae782013-03-09 07:12:48 -0800540 """An XML element hierarchy.
Armin Rigo9ed73062005-12-14 18:10:45 +0000541
Eli Bendersky84fae782013-03-09 07:12:48 -0800542 This class also provides support for serialization to and from
543 standard XML.
544
545 *element* is an optional root element node,
546 *file* is an optional file handle or file name of an XML file whose
547 contents will be used to initialize the tree with.
548
549 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000550 def __init__(self, element=None, file=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000551 # assert element is None or iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000552 self._root = element # first node
553 if file:
554 self.parse(file)
555
Armin Rigo9ed73062005-12-14 18:10:45 +0000556 def getroot(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800557 """Return root element of this tree."""
Armin Rigo9ed73062005-12-14 18:10:45 +0000558 return self._root
559
Armin Rigo9ed73062005-12-14 18:10:45 +0000560 def _setroot(self, element):
Eli Bendersky84fae782013-03-09 07:12:48 -0800561 """Replace root element of this tree.
562
563 This will discard the current contents of the tree and replace it
564 with the given element. Use with care!
565
566 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000567 # assert iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000568 self._root = element
569
Armin Rigo9ed73062005-12-14 18:10:45 +0000570 def parse(self, source, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800571 """Load external XML document into element tree.
572
573 *source* is a file name or file object, *parser* is an optional parser
574 instance that defaults to XMLParser.
575
576 ParseError is raised if the parser fails to parse the document.
577
578 Returns the root element of the given source document.
579
580 """
Antoine Pitroue033e062010-10-29 10:38:18 +0000581 close_source = False
Armin Rigo9ed73062005-12-14 18:10:45 +0000582 if not hasattr(source, "read"):
583 source = open(source, "rb")
Antoine Pitroue033e062010-10-29 10:38:18 +0000584 close_source = True
585 try:
Eli Benderskya3699232013-05-19 18:47:23 -0700586 if parser is None:
587 # If no parser was specified, create a default XMLParser
588 parser = XMLParser()
589 if hasattr(parser, '_parse_whole'):
590 # The default XMLParser, when it comes from an accelerator,
591 # can define an internal _parse_whole API for efficiency.
592 # It can be used to parse the whole source without feeding
593 # it with chunks.
594 self._root = parser._parse_whole(source)
595 return self._root
596 while True:
Antoine Pitroue033e062010-10-29 10:38:18 +0000597 data = source.read(65536)
598 if not data:
599 break
600 parser.feed(data)
601 self._root = parser.close()
602 return self._root
603 finally:
604 if close_source:
605 source.close()
Armin Rigo9ed73062005-12-14 18:10:45 +0000606
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000607 def iter(self, tag=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800608 """Create and return tree iterator for the root element.
609
610 The iterator loops over all elements in this tree, in document order.
611
612 *tag* is a string with the tag name to iterate over
613 (default is to return all elements).
614
615 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000616 # assert self._root is not None
617 return self._root.iter(tag)
618
619 # compatibility
Armin Rigo9ed73062005-12-14 18:10:45 +0000620 def getiterator(self, tag=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000621 # Change for a DeprecationWarning in 1.4
622 warnings.warn(
623 "This method will be removed in future versions. "
624 "Use 'tree.iter()' or 'list(tree.iter())' instead.",
625 PendingDeprecationWarning, stacklevel=2
626 )
627 return list(self.iter(tag))
Armin Rigo9ed73062005-12-14 18:10:45 +0000628
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000629 def find(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800630 """Find first matching element by tag name or path.
631
632 Same as getroot().find(path), which is Element.find()
633
634 *path* is a string having either an element tag or an XPath,
635 *namespaces* is an optional mapping from namespace prefix to full name.
636
637 Return the first matching element, or None if no element was found.
638
639 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000640 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000641 if path[:1] == "/":
642 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000643 warnings.warn(
644 "This search is broken in 1.3 and earlier, and will be "
645 "fixed in a future version. If you rely on the current "
646 "behaviour, change it to %r" % path,
647 FutureWarning, stacklevel=2
648 )
649 return self._root.find(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000650
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000651 def findtext(self, path, default=None, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800652 """Find first matching element by tag name or path.
653
654 Same as getroot().findtext(path), which is Element.findtext()
655
656 *path* is a string having either an element tag or an XPath,
657 *namespaces* is an optional mapping from namespace prefix to full name.
658
659 Return the first matching element, or None if no element was found.
660
661 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000662 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000663 if path[:1] == "/":
664 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000665 warnings.warn(
666 "This search is broken in 1.3 and earlier, and will be "
667 "fixed in a future version. If you rely on the current "
668 "behaviour, change it to %r" % path,
669 FutureWarning, stacklevel=2
670 )
671 return self._root.findtext(path, default, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000672
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000673 def findall(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800674 """Find all matching subelements by tag name or path.
675
676 Same as getroot().findall(path), which is Element.findall().
677
678 *path* is a string having either an element tag or an XPath,
679 *namespaces* is an optional mapping from namespace prefix to full name.
680
681 Return list containing all matching elements in document order.
682
683 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000684 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000685 if path[:1] == "/":
686 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000687 warnings.warn(
688 "This search is broken in 1.3 and earlier, and will be "
689 "fixed in a future version. If you rely on the current "
690 "behaviour, change it to %r" % path,
691 FutureWarning, stacklevel=2
692 )
693 return self._root.findall(path, namespaces)
694
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000695 def iterfind(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800696 """Find all matching subelements by tag name or path.
697
698 Same as getroot().iterfind(path), which is element.iterfind()
699
700 *path* is a string having either an element tag or an XPath,
701 *namespaces* is an optional mapping from namespace prefix to full name.
702
703 Return an iterable yielding all matching elements in document order.
704
705 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000706 # assert self._root is not None
707 if path[:1] == "/":
708 path = "." + path
709 warnings.warn(
710 "This search is broken in 1.3 and earlier, and will be "
711 "fixed in a future version. If you rely on the current "
712 "behaviour, change it to %r" % path,
713 FutureWarning, stacklevel=2
714 )
715 return self._root.iterfind(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000716
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000717 def write(self, file_or_filename,
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000718 encoding=None,
719 xml_declaration=None,
720 default_namespace=None,
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800721 method=None, *,
722 short_empty_elements=True):
Eli Bendersky84fae782013-03-09 07:12:48 -0800723 """Write element tree to a file as XML.
724
725 Arguments:
726 *file_or_filename* -- file name or a file object opened for writing
727
728 *encoding* -- the output encoding (default: US-ASCII)
729
730 *xml_declaration* -- bool indicating if an XML declaration should be
731 added to the output. If None, an XML declaration
732 is added if encoding IS NOT either of:
733 US-ASCII, UTF-8, or Unicode
734
735 *default_namespace* -- sets the default XML namespace (for "xmlns")
736
737 *method* -- either "xml" (default), "html, "text", or "c14n"
738
739 *short_empty_elements* -- controls the formatting of elements
740 that contain no content. If True (default)
741 they are emitted as a single self-closed
742 tag, otherwise they are emitted as a pair
743 of start/end tags
Eli Benderskye9af8272013-01-13 06:27:51 -0800744
745 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000746 if not method:
747 method = "xml"
748 elif method not in _serialize:
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000749 raise ValueError("unknown method %r" % method)
Florent Xiclunac17f1722010-08-08 19:48:29 +0000750 if not encoding:
751 if method == "c14n":
752 encoding = "utf-8"
753 else:
754 encoding = "us-ascii"
Martin Panter89f76d32015-09-23 01:14:35 +0000755 enc_lower = encoding.lower()
756 with _get_writer(file_or_filename, enc_lower) as write:
Eli Bendersky00f402b2012-07-15 06:02:22 +0300757 if method == "xml" and (xml_declaration or
758 (xml_declaration is None and
Martin Panter89f76d32015-09-23 01:14:35 +0000759 enc_lower not in ("utf-8", "us-ascii", "unicode"))):
Eli Bendersky00f402b2012-07-15 06:02:22 +0300760 declared_encoding = encoding
Martin Panter89f76d32015-09-23 01:14:35 +0000761 if enc_lower == "unicode":
Eli Bendersky00f402b2012-07-15 06:02:22 +0300762 # Retrieve the default encoding for the xml declaration
763 import locale
764 declared_encoding = locale.getpreferredencoding()
765 write("<?xml version='1.0' encoding='%s'?>\n" % (
766 declared_encoding,))
767 if method == "text":
768 _serialize_text(write, self._root)
Armin Rigo9ed73062005-12-14 18:10:45 +0000769 else:
Eli Bendersky00f402b2012-07-15 06:02:22 +0300770 qnames, namespaces = _namespaces(self._root, default_namespace)
771 serialize = _serialize[method]
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800772 serialize(write, self._root, qnames, namespaces,
773 short_empty_elements=short_empty_elements)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000774
775 def write_c14n(self, file):
776 # lxml.etree compatibility. use output method instead
777 return self.write(file, method="c14n")
Armin Rigo9ed73062005-12-14 18:10:45 +0000778
779# --------------------------------------------------------------------
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000780# serialization support
781
Eli Bendersky00f402b2012-07-15 06:02:22 +0300782@contextlib.contextmanager
783def _get_writer(file_or_filename, encoding):
Ezio Melottib5bc3532013-08-17 16:11:40 +0300784 # returns text write method and release all resources after using
Eli Bendersky00f402b2012-07-15 06:02:22 +0300785 try:
786 write = file_or_filename.write
787 except AttributeError:
788 # file_or_filename is a file name
789 if encoding == "unicode":
790 file = open(file_or_filename, "w")
791 else:
792 file = open(file_or_filename, "w", encoding=encoding,
793 errors="xmlcharrefreplace")
794 with file:
795 yield file.write
796 else:
797 # file_or_filename is a file-like object
798 # encoding determines if it is a text or binary writer
799 if encoding == "unicode":
800 # use a text writer as is
801 yield write
802 else:
803 # wrap a binary writer with TextIOWrapper
804 with contextlib.ExitStack() as stack:
805 if isinstance(file_or_filename, io.BufferedIOBase):
806 file = file_or_filename
807 elif isinstance(file_or_filename, io.RawIOBase):
808 file = io.BufferedWriter(file_or_filename)
809 # Keep the original file open when the BufferedWriter is
810 # destroyed
811 stack.callback(file.detach)
812 else:
813 # This is to handle passed objects that aren't in the
814 # IOBase hierarchy, but just have a write method
815 file = io.BufferedIOBase()
816 file.writable = lambda: True
817 file.write = write
818 try:
819 # TextIOWrapper uses this methods to determine
820 # if BOM (for UTF-16, etc) should be added
821 file.seekable = file_or_filename.seekable
822 file.tell = file_or_filename.tell
823 except AttributeError:
824 pass
825 file = io.TextIOWrapper(file,
826 encoding=encoding,
827 errors="xmlcharrefreplace",
828 newline="\n")
829 # Keep the original file open when the TextIOWrapper is
830 # destroyed
831 stack.callback(file.detach)
832 yield file.write
833
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000834def _namespaces(elem, default_namespace=None):
835 # identify namespaces used in this tree
836
837 # maps qnames to *encoded* prefix:local names
838 qnames = {None: None}
839
840 # maps uri:s to prefixes
841 namespaces = {}
842 if default_namespace:
843 namespaces[default_namespace] = ""
844
845 def add_qname(qname):
846 # calculate serialized qname representation
847 try:
848 if qname[:1] == "{":
849 uri, tag = qname[1:].rsplit("}", 1)
850 prefix = namespaces.get(uri)
851 if prefix is None:
852 prefix = _namespace_map.get(uri)
853 if prefix is None:
854 prefix = "ns%d" % len(namespaces)
855 if prefix != "xml":
856 namespaces[uri] = prefix
857 if prefix:
858 qnames[qname] = "%s:%s" % (prefix, tag)
859 else:
860 qnames[qname] = tag # default element
861 else:
862 if default_namespace:
863 # FIXME: can this be handled in XML 1.0?
864 raise ValueError(
865 "cannot use non-qualified names with "
866 "default_namespace option"
867 )
868 qnames[qname] = qname
869 except TypeError:
870 _raise_serialization_error(qname)
871
872 # populate qname and namespaces table
Eli Bendersky64d11e62012-06-15 07:42:50 +0300873 for elem in elem.iter():
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000874 tag = elem.tag
Senthil Kumaranec30b3d2010-11-09 02:36:59 +0000875 if isinstance(tag, QName):
876 if tag.text not in qnames:
877 add_qname(tag.text)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000878 elif isinstance(tag, str):
879 if tag not in qnames:
880 add_qname(tag)
881 elif tag is not None and tag is not Comment and tag is not PI:
882 _raise_serialization_error(tag)
883 for key, value in elem.items():
884 if isinstance(key, QName):
885 key = key.text
886 if key not in qnames:
887 add_qname(key)
888 if isinstance(value, QName) and value.text not in qnames:
889 add_qname(value.text)
890 text = elem.text
891 if isinstance(text, QName) and text.text not in qnames:
892 add_qname(text.text)
893 return qnames, namespaces
894
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800895def _serialize_xml(write, elem, qnames, namespaces,
896 short_empty_elements, **kwargs):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000897 tag = elem.tag
898 text = elem.text
899 if tag is Comment:
900 write("<!--%s-->" % text)
901 elif tag is ProcessingInstruction:
902 write("<?%s?>" % text)
903 else:
904 tag = qnames[tag]
905 if tag is None:
906 if text:
907 write(_escape_cdata(text))
908 for e in elem:
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800909 _serialize_xml(write, e, qnames, None,
910 short_empty_elements=short_empty_elements)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000911 else:
912 write("<" + tag)
913 items = list(elem.items())
914 if items or namespaces:
915 if namespaces:
916 for v, k in sorted(namespaces.items(),
917 key=lambda x: x[1]): # sort on prefix
918 if k:
919 k = ":" + k
920 write(" xmlns%s=\"%s\"" % (
921 k,
922 _escape_attrib(v)
923 ))
924 for k, v in sorted(items): # lexical order
925 if isinstance(k, QName):
926 k = k.text
927 if isinstance(v, QName):
928 v = qnames[v.text]
929 else:
930 v = _escape_attrib(v)
931 write(" %s=\"%s\"" % (qnames[k], v))
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800932 if text or len(elem) or not short_empty_elements:
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000933 write(">")
934 if text:
935 write(_escape_cdata(text))
936 for e in elem:
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800937 _serialize_xml(write, e, qnames, None,
938 short_empty_elements=short_empty_elements)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000939 write("</" + tag + ">")
940 else:
941 write(" />")
942 if elem.tail:
943 write(_escape_cdata(elem.tail))
944
945HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
Ezio Melottic90111f2012-09-19 08:19:12 +0300946 "img", "input", "isindex", "link", "meta", "param")
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000947
948try:
949 HTML_EMPTY = set(HTML_EMPTY)
950except NameError:
951 pass
952
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800953def _serialize_html(write, elem, qnames, namespaces, **kwargs):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000954 tag = elem.tag
955 text = elem.text
956 if tag is Comment:
957 write("<!--%s-->" % _escape_cdata(text))
958 elif tag is ProcessingInstruction:
959 write("<?%s?>" % _escape_cdata(text))
960 else:
961 tag = qnames[tag]
962 if tag is None:
963 if text:
964 write(_escape_cdata(text))
965 for e in elem:
966 _serialize_html(write, e, qnames, None)
967 else:
968 write("<" + tag)
969 items = list(elem.items())
970 if items or namespaces:
971 if namespaces:
972 for v, k in sorted(namespaces.items(),
973 key=lambda x: x[1]): # sort on prefix
974 if k:
975 k = ":" + k
976 write(" xmlns%s=\"%s\"" % (
977 k,
978 _escape_attrib(v)
979 ))
980 for k, v in sorted(items): # lexical order
981 if isinstance(k, QName):
982 k = k.text
983 if isinstance(v, QName):
984 v = qnames[v.text]
985 else:
986 v = _escape_attrib_html(v)
987 # FIXME: handle boolean attributes
988 write(" %s=\"%s\"" % (qnames[k], v))
989 write(">")
Christian Heimes54ad7e32013-07-05 01:39:49 +0200990 ltag = tag.lower()
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000991 if text:
Christian Heimes54ad7e32013-07-05 01:39:49 +0200992 if ltag == "script" or ltag == "style":
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000993 write(text)
994 else:
995 write(_escape_cdata(text))
996 for e in elem:
997 _serialize_html(write, e, qnames, None)
Christian Heimes54ad7e32013-07-05 01:39:49 +0200998 if ltag not in HTML_EMPTY:
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000999 write("</" + tag + ">")
1000 if elem.tail:
1001 write(_escape_cdata(elem.tail))
1002
1003def _serialize_text(write, elem):
1004 for part in elem.itertext():
1005 write(part)
1006 if elem.tail:
1007 write(elem.tail)
1008
1009_serialize = {
1010 "xml": _serialize_xml,
1011 "html": _serialize_html,
1012 "text": _serialize_text,
1013# this optional method is imported at the end of the module
1014# "c14n": _serialize_c14n,
1015}
Armin Rigo9ed73062005-12-14 18:10:45 +00001016
Armin Rigo9ed73062005-12-14 18:10:45 +00001017
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001018def register_namespace(prefix, uri):
Eli Bendersky84fae782013-03-09 07:12:48 -08001019 """Register a namespace prefix.
1020
1021 The registry is global, and any existing mapping for either the
1022 given prefix or the namespace URI will be removed.
1023
1024 *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
1025 attributes in this namespace will be serialized with prefix if possible.
1026
1027 ValueError is raised if prefix is reserved or is invalid.
1028
1029 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001030 if re.match("ns\d+$", prefix):
1031 raise ValueError("Prefix format reserved for internal use")
Georg Brandl90b20672010-12-28 10:38:33 +00001032 for k, v in list(_namespace_map.items()):
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001033 if k == uri or v == prefix:
1034 del _namespace_map[k]
1035 _namespace_map[uri] = prefix
1036
1037_namespace_map = {
1038 # "well-known" namespace prefixes
1039 "http://www.w3.org/XML/1998/namespace": "xml",
1040 "http://www.w3.org/1999/xhtml": "html",
1041 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
1042 "http://schemas.xmlsoap.org/wsdl/": "wsdl",
1043 # xml schema
1044 "http://www.w3.org/2001/XMLSchema": "xs",
1045 "http://www.w3.org/2001/XMLSchema-instance": "xsi",
1046 # dublin core
1047 "http://purl.org/dc/elements/1.1/": "dc",
1048}
Florent Xicluna16395052012-02-16 23:28:35 +01001049# For tests and troubleshooting
1050register_namespace._namespace_map = _namespace_map
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001051
1052def _raise_serialization_error(text):
1053 raise TypeError(
1054 "cannot serialize %r (type %s)" % (text, type(text).__name__)
1055 )
1056
1057def _escape_cdata(text):
1058 # escape character data
1059 try:
1060 # it's worth avoiding do-nothing calls for strings that are
1061 # shorter than 500 character, or so. assume that's, by far,
1062 # the most common case in most applications.
1063 if "&" in text:
1064 text = text.replace("&", "&amp;")
1065 if "<" in text:
1066 text = text.replace("<", "&lt;")
1067 if ">" in text:
1068 text = text.replace(">", "&gt;")
1069 return text
1070 except (TypeError, AttributeError):
1071 _raise_serialization_error(text)
1072
1073def _escape_attrib(text):
1074 # escape attribute value
1075 try:
1076 if "&" in text:
1077 text = text.replace("&", "&amp;")
1078 if "<" in text:
1079 text = text.replace("<", "&lt;")
1080 if ">" in text:
1081 text = text.replace(">", "&gt;")
1082 if "\"" in text:
1083 text = text.replace("\"", "&quot;")
1084 if "\n" in text:
1085 text = text.replace("\n", "&#10;")
1086 return text
1087 except (TypeError, AttributeError):
1088 _raise_serialization_error(text)
1089
1090def _escape_attrib_html(text):
1091 # escape attribute value
1092 try:
1093 if "&" in text:
1094 text = text.replace("&", "&amp;")
1095 if ">" in text:
1096 text = text.replace(">", "&gt;")
1097 if "\"" in text:
1098 text = text.replace("\"", "&quot;")
1099 return text
1100 except (TypeError, AttributeError):
1101 _raise_serialization_error(text)
1102
1103# --------------------------------------------------------------------
1104
Eli Benderskya9a2ef52013-01-13 06:04:43 -08001105def tostring(element, encoding=None, method=None, *,
1106 short_empty_elements=True):
Eli Bendersky84fae782013-03-09 07:12:48 -08001107 """Generate string representation of XML element.
1108
1109 All subelements are included. If encoding is "unicode", a string
1110 is returned. Otherwise a bytestring is returned.
1111
1112 *element* is an Element instance, *encoding* is an optional output
1113 encoding defaulting to US-ASCII, *method* is an optional output which can
1114 be one of "xml" (default), "html", "text" or "c14n".
1115
1116 Returns an (optionally) encoded string containing the XML data.
1117
1118 """
Eli Bendersky00f402b2012-07-15 06:02:22 +03001119 stream = io.StringIO() if encoding == 'unicode' else io.BytesIO()
Eli Benderskya9a2ef52013-01-13 06:04:43 -08001120 ElementTree(element).write(stream, encoding, method=method,
1121 short_empty_elements=short_empty_elements)
Eli Bendersky00f402b2012-07-15 06:02:22 +03001122 return stream.getvalue()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001123
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001124class _ListDataStream(io.BufferedIOBase):
Eli Bendersky84fae782013-03-09 07:12:48 -08001125 """An auxiliary stream accumulating into a list reference."""
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001126 def __init__(self, lst):
1127 self.lst = lst
Eli Benderskyf90fc682012-07-17 15:09:56 +03001128
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001129 def writable(self):
1130 return True
1131
1132 def seekable(self):
1133 return True
1134
1135 def write(self, b):
1136 self.lst.append(b)
1137
1138 def tell(self):
1139 return len(self.lst)
1140
Eli Benderskya9a2ef52013-01-13 06:04:43 -08001141def tostringlist(element, encoding=None, method=None, *,
1142 short_empty_elements=True):
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001143 lst = []
1144 stream = _ListDataStream(lst)
Eli Benderskya9a2ef52013-01-13 06:04:43 -08001145 ElementTree(element).write(stream, encoding, method=method,
1146 short_empty_elements=short_empty_elements)
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001147 return lst
Armin Rigo9ed73062005-12-14 18:10:45 +00001148
Armin Rigo9ed73062005-12-14 18:10:45 +00001149
1150def dump(elem):
Eli Bendersky84fae782013-03-09 07:12:48 -08001151 """Write element tree or element structure to sys.stdout.
1152
1153 This function should be used for debugging only.
1154
1155 *elem* is either an ElementTree, or a single Element. The exact output
1156 format is implementation dependent. In this version, it's written as an
1157 ordinary XML file.
1158
1159 """
Armin Rigo9ed73062005-12-14 18:10:45 +00001160 # debugging
1161 if not isinstance(elem, ElementTree):
1162 elem = ElementTree(elem)
Florent Xiclunac17f1722010-08-08 19:48:29 +00001163 elem.write(sys.stdout, encoding="unicode")
Armin Rigo9ed73062005-12-14 18:10:45 +00001164 tail = elem.getroot().tail
1165 if not tail or tail[-1] != "\n":
1166 sys.stdout.write("\n")
1167
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001168# --------------------------------------------------------------------
1169# parsing
Armin Rigo9ed73062005-12-14 18:10:45 +00001170
Armin Rigo9ed73062005-12-14 18:10:45 +00001171
1172def parse(source, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -08001173 """Parse XML document into element tree.
1174
1175 *source* is a filename or file object containing XML data,
1176 *parser* is an optional parser instance defaulting to XMLParser.
1177
1178 Return an ElementTree instance.
1179
1180 """
Armin Rigo9ed73062005-12-14 18:10:45 +00001181 tree = ElementTree()
1182 tree.parse(source, parser)
1183 return tree
1184
Armin Rigo9ed73062005-12-14 18:10:45 +00001185
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001186def iterparse(source, events=None, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -08001187 """Incrementally parse XML document into ElementTree.
1188
1189 This class also reports what's going on to the user based on the
1190 *events* it is initialized with. The supported events are the strings
1191 "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
1192 detailed namespace information). If *events* is omitted, only
1193 "end" events are reported.
1194
1195 *source* is a filename or file object containing XML data, *events* is
1196 a list of events to report back, *parser* is an optional parser instance.
1197
1198 Returns an iterator providing (event, elem) pairs.
1199
1200 """
Antoine Pitroue033e062010-10-29 10:38:18 +00001201 close_source = False
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001202 if not hasattr(source, "read"):
1203 source = open(source, "rb")
Antoine Pitroue033e062010-10-29 10:38:18 +00001204 close_source = True
Antoine Pitroue033e062010-10-29 10:38:18 +00001205 return _IterParseIterator(source, events, parser, close_source)
Armin Rigo9ed73062005-12-14 18:10:45 +00001206
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001207
Eli Benderskyb5869342013-08-30 05:51:20 -07001208class XMLPullParser:
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001209
Eli Benderskyb5869342013-08-30 05:51:20 -07001210 def __init__(self, events=None, *, _parser=None):
1211 # The _parser argument is for internal use only and must not be relied
1212 # upon in user code. It will be removed in a future release.
1213 # See http://bugs.python.org/issue17741 for more details.
1214
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001215 # _elementtree.c expects a list, not a deque
1216 self._events_queue = []
Armin Rigo9ed73062005-12-14 18:10:45 +00001217 self._index = 0
Eli Benderskyb5869342013-08-30 05:51:20 -07001218 self._parser = _parser or XMLParser(target=TreeBuilder())
Armin Rigo9ed73062005-12-14 18:10:45 +00001219 # wire up the parser for event reporting
Armin Rigo9ed73062005-12-14 18:10:45 +00001220 if events is None:
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001221 events = ("end",)
1222 self._parser._setevents(self._events_queue, events)
1223
Eli Benderskyb5869342013-08-30 05:51:20 -07001224 def feed(self, data):
Nick Coghlan4cc2afa2013-09-28 23:50:35 +10001225 """Feed encoded data to parser."""
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001226 if self._parser is None:
Eli Benderskyb5869342013-08-30 05:51:20 -07001227 raise ValueError("feed() called after end of stream")
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001228 if data:
1229 try:
1230 self._parser.feed(data)
1231 except SyntaxError as exc:
1232 self._events_queue.append(exc)
1233
Nick Coghlan4cc2afa2013-09-28 23:50:35 +10001234 def _close_and_return_root(self):
1235 # iterparse needs this to set its root attribute properly :(
1236 root = self._parser.close()
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001237 self._parser = None
Nick Coghlan4cc2afa2013-09-28 23:50:35 +10001238 return root
1239
1240 def close(self):
1241 """Finish feeding data to parser.
1242
1243 Unlike XMLParser, does not return the root element. Use
1244 read_events() to consume elements from XMLPullParser.
1245 """
1246 self._close_and_return_root()
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001247
Eli Benderskyb5869342013-08-30 05:51:20 -07001248 def read_events(self):
R David Murray410d3202014-01-04 23:52:50 -05001249 """Return an iterator over currently available (event, elem) pairs.
Nick Coghlan4cc2afa2013-09-28 23:50:35 +10001250
1251 Events are consumed from the internal event queue as they are
1252 retrieved from the iterator.
1253 """
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001254 events = self._events_queue
1255 while True:
1256 index = self._index
1257 try:
1258 event = events[self._index]
1259 # Avoid retaining references to past events
1260 events[self._index] = None
1261 except IndexError:
1262 break
1263 index += 1
1264 # Compact the list in a O(1) amortized fashion
Nick Coghlan4cc2afa2013-09-28 23:50:35 +10001265 # As noted above, _elementree.c needs a list, not a deque
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001266 if index * 2 >= len(events):
1267 events[:index] = []
1268 self._index = 0
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001269 else:
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001270 self._index = index
1271 if isinstance(event, Exception):
1272 raise event
1273 else:
1274 yield event
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001275
1276
Antoine Pitrou0acbcb52013-08-23 23:04:30 +02001277class _IterParseIterator:
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001278
1279 def __init__(self, source, events, parser, close_source=False):
Eli Benderskyb5869342013-08-30 05:51:20 -07001280 # Use the internal, undocumented _parser argument for now; When the
1281 # parser argument of iterparse is removed, this can be killed.
1282 self._parser = XMLPullParser(events=events, _parser=parser)
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001283 self._file = source
1284 self._close_file = close_source
Nick Coghlan4cc2afa2013-09-28 23:50:35 +10001285 self.root = self._root = None
Armin Rigo9ed73062005-12-14 18:10:45 +00001286
Georg Brandla18af4e2007-04-21 15:47:16 +00001287 def __next__(self):
Armin Rigo9ed73062005-12-14 18:10:45 +00001288 while 1:
Eli Benderskyb5869342013-08-30 05:51:20 -07001289 for event in self._parser.read_events():
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001290 return event
Antoine Pitrou0acbcb52013-08-23 23:04:30 +02001291 if self._parser._parser is None:
Nick Coghlan4cc2afa2013-09-28 23:50:35 +10001292 self.root = self._root
Florent Xicluna91d51932011-11-01 23:31:09 +01001293 if self._close_file:
1294 self._file.close()
1295 raise StopIteration
1296 # load event buffer
Eli Benderskyb5869342013-08-30 05:51:20 -07001297 data = self._file.read(16 * 1024)
Florent Xicluna91d51932011-11-01 23:31:09 +01001298 if data:
Eli Benderskyb5869342013-08-30 05:51:20 -07001299 self._parser.feed(data)
Florent Xicluna91d51932011-11-01 23:31:09 +01001300 else:
Nick Coghlan4cc2afa2013-09-28 23:50:35 +10001301 self._root = self._parser._close_and_return_root()
Armin Rigo9ed73062005-12-14 18:10:45 +00001302
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001303 def __iter__(self):
1304 return self
Armin Rigo9ed73062005-12-14 18:10:45 +00001305
Armin Rigo9ed73062005-12-14 18:10:45 +00001306
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001307def XML(text, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -08001308 """Parse XML document from string constant.
1309
1310 This function can be used to embed "XML Literals" in Python code.
1311
1312 *text* is a string containing XML data, *parser* is an
1313 optional parser instance, defaulting to the standard XMLParser.
1314
1315 Returns an Element instance.
1316
1317 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001318 if not parser:
1319 parser = XMLParser(target=TreeBuilder())
Armin Rigo9ed73062005-12-14 18:10:45 +00001320 parser.feed(text)
1321 return parser.close()
1322
Armin Rigo9ed73062005-12-14 18:10:45 +00001323
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001324def XMLID(text, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -08001325 """Parse XML document from string constant for its IDs.
1326
1327 *text* is a string containing XML data, *parser* is an
1328 optional parser instance, defaulting to the standard XMLParser.
1329
1330 Returns an (Element, dict) tuple, in which the
1331 dict maps element id:s to elements.
1332
1333 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001334 if not parser:
1335 parser = XMLParser(target=TreeBuilder())
Armin Rigo9ed73062005-12-14 18:10:45 +00001336 parser.feed(text)
1337 tree = parser.close()
1338 ids = {}
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001339 for elem in tree.iter():
Armin Rigo9ed73062005-12-14 18:10:45 +00001340 id = elem.get("id")
1341 if id:
1342 ids[id] = elem
1343 return tree, ids
1344
Victor Stinner765531d2013-03-26 01:11:54 +01001345# Parse XML document from string constant. Alias for XML().
Armin Rigo9ed73062005-12-14 18:10:45 +00001346fromstring = XML
Armin Rigo9ed73062005-12-14 18:10:45 +00001347
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001348def fromstringlist(sequence, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -08001349 """Parse XML document from sequence of string fragments.
1350
1351 *sequence* is a list of other sequence, *parser* is an optional parser
1352 instance, defaulting to the standard XMLParser.
1353
1354 Returns an Element instance.
1355
1356 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001357 if not parser:
1358 parser = XMLParser(target=TreeBuilder())
1359 for text in sequence:
1360 parser.feed(text)
1361 return parser.close()
1362
1363# --------------------------------------------------------------------
Armin Rigo9ed73062005-12-14 18:10:45 +00001364
Armin Rigo9ed73062005-12-14 18:10:45 +00001365
1366class TreeBuilder:
Eli Bendersky84fae782013-03-09 07:12:48 -08001367 """Generic element structure builder.
Armin Rigo9ed73062005-12-14 18:10:45 +00001368
Eli Bendersky84fae782013-03-09 07:12:48 -08001369 This builder converts a sequence of start, data, and end method
1370 calls to a well-formed element structure.
1371
1372 You can use this class to build an element structure using a custom XML
1373 parser, or a parser for some other XML-like format.
1374
1375 *element_factory* is an optional element factory which is called
1376 to create new Element instances, as necessary.
1377
1378 """
Armin Rigo9ed73062005-12-14 18:10:45 +00001379 def __init__(self, element_factory=None):
1380 self._data = [] # data collector
1381 self._elem = [] # element stack
1382 self._last = None # last element
1383 self._tail = None # true if we're after an end tag
1384 if element_factory is None:
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001385 element_factory = Element
Armin Rigo9ed73062005-12-14 18:10:45 +00001386 self._factory = element_factory
1387
Armin Rigo9ed73062005-12-14 18:10:45 +00001388 def close(self):
Eli Bendersky84fae782013-03-09 07:12:48 -08001389 """Flush builder buffers and return toplevel document Element."""
Armin Rigo9ed73062005-12-14 18:10:45 +00001390 assert len(self._elem) == 0, "missing end tags"
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001391 assert self._last is not None, "missing toplevel element"
Armin Rigo9ed73062005-12-14 18:10:45 +00001392 return self._last
1393
1394 def _flush(self):
1395 if self._data:
1396 if self._last is not None:
Neal Norwitz9d72bb42007-04-17 08:48:32 +00001397 text = "".join(self._data)
Armin Rigo9ed73062005-12-14 18:10:45 +00001398 if self._tail:
1399 assert self._last.tail is None, "internal error (tail)"
1400 self._last.tail = text
1401 else:
1402 assert self._last.text is None, "internal error (text)"
1403 self._last.text = text
1404 self._data = []
1405
Armin Rigo9ed73062005-12-14 18:10:45 +00001406 def data(self, data):
Eli Bendersky84fae782013-03-09 07:12:48 -08001407 """Add text to current element."""
Armin Rigo9ed73062005-12-14 18:10:45 +00001408 self._data.append(data)
1409
Armin Rigo9ed73062005-12-14 18:10:45 +00001410 def start(self, tag, attrs):
Eli Bendersky84fae782013-03-09 07:12:48 -08001411 """Open new element and return it.
1412
1413 *tag* is the element name, *attrs* is a dict containing element
1414 attributes.
1415
1416 """
Armin Rigo9ed73062005-12-14 18:10:45 +00001417 self._flush()
1418 self._last = elem = self._factory(tag, attrs)
1419 if self._elem:
1420 self._elem[-1].append(elem)
1421 self._elem.append(elem)
1422 self._tail = 0
1423 return elem
1424
Armin Rigo9ed73062005-12-14 18:10:45 +00001425 def end(self, tag):
Eli Bendersky84fae782013-03-09 07:12:48 -08001426 """Close and return current Element.
1427
1428 *tag* is the element name.
1429
1430 """
Armin Rigo9ed73062005-12-14 18:10:45 +00001431 self._flush()
1432 self._last = self._elem.pop()
1433 assert self._last.tag == tag,\
1434 "end tag mismatch (expected %s, got %s)" % (
1435 self._last.tag, tag)
1436 self._tail = 1
1437 return self._last
1438
Armin Rigo9ed73062005-12-14 18:10:45 +00001439
Eli Bendersky84fae782013-03-09 07:12:48 -08001440# also see ElementTree and TreeBuilder
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001441class XMLParser:
Eli Bendersky84fae782013-03-09 07:12:48 -08001442 """Element structure builder for XML source data based on the expat parser.
1443
1444 *html* are predefined HTML entities (not supported currently),
1445 *target* is an optional target object which defaults to an instance of the
1446 standard TreeBuilder class, *encoding* is an optional encoding string
1447 which if given, overrides the encoding specified in the XML file:
1448 http://www.iana.org/assignments/character-sets
1449
1450 """
Armin Rigo9ed73062005-12-14 18:10:45 +00001451
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001452 def __init__(self, html=0, target=None, encoding=None):
Armin Rigo9ed73062005-12-14 18:10:45 +00001453 try:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001454 from xml.parsers import expat
Brett Cannoncd171c82013-07-04 17:43:24 -04001455 except ImportError:
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001456 try:
1457 import pyexpat as expat
Brett Cannoncd171c82013-07-04 17:43:24 -04001458 except ImportError:
1459 raise ImportError(
1460 "No module named expat; use SimpleXMLTreeBuilder instead"
1461 )
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001462 parser = expat.ParserCreate(encoding, "}")
Armin Rigo9ed73062005-12-14 18:10:45 +00001463 if target is None:
1464 target = TreeBuilder()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001465 # underscored names are provided for compatibility only
1466 self.parser = self._parser = parser
1467 self.target = self._target = target
1468 self._error = expat.error
Armin Rigo9ed73062005-12-14 18:10:45 +00001469 self._names = {} # name memo cache
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001470 # main callbacks
Armin Rigo9ed73062005-12-14 18:10:45 +00001471 parser.DefaultHandlerExpand = self._default
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001472 if hasattr(target, 'start'):
1473 parser.StartElementHandler = self._start
1474 if hasattr(target, 'end'):
1475 parser.EndElementHandler = self._end
1476 if hasattr(target, 'data'):
1477 parser.CharacterDataHandler = target.data
1478 # miscellaneous callbacks
1479 if hasattr(target, 'comment'):
1480 parser.CommentHandler = target.comment
1481 if hasattr(target, 'pi'):
1482 parser.ProcessingInstructionHandler = target.pi
Eli Bendersky6206a7e2013-08-25 18:58:18 -07001483 # Configure pyexpat: buffering, new-style attribute handling.
1484 parser.buffer_text = 1
1485 parser.ordered_attributes = 1
1486 parser.specified_attributes = 1
Armin Rigo9ed73062005-12-14 18:10:45 +00001487 self._doctype = None
1488 self.entity = {}
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001489 try:
1490 self.version = "Expat %d.%d.%d" % expat.version_info
1491 except AttributeError:
1492 pass # unknown
1493
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001494 def _setevents(self, events_queue, events_to_report):
Eli Benderskyb5869342013-08-30 05:51:20 -07001495 # Internal API for XMLPullParser
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001496 # events_to_report: a list of events to report during parsing (same as
Eli Benderskyb5869342013-08-30 05:51:20 -07001497 # the *events* of XMLPullParser's constructor.
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001498 # events_queue: a list of actual parsing events that will be populated
1499 # by the underlying parser.
1500 #
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001501 parser = self._parser
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001502 append = events_queue.append
1503 for event_name in events_to_report:
1504 if event_name == "start":
Eli Benderskyc9f5ca22013-04-20 09:11:37 -07001505 parser.ordered_attributes = 1
1506 parser.specified_attributes = 1
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001507 def handler(tag, attrib_in, event=event_name, append=append,
Eli Bendersky6206a7e2013-08-25 18:58:18 -07001508 start=self._start):
Eli Benderskyc9f5ca22013-04-20 09:11:37 -07001509 append((event, start(tag, attrib_in)))
1510 parser.StartElementHandler = handler
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001511 elif event_name == "end":
1512 def handler(tag, event=event_name, append=append,
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001513 end=self._end):
1514 append((event, end(tag)))
1515 parser.EndElementHandler = handler
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001516 elif event_name == "start-ns":
1517 def handler(prefix, uri, event=event_name, append=append):
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001518 append((event, (prefix or "", uri or "")))
1519 parser.StartNamespaceDeclHandler = handler
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001520 elif event_name == "end-ns":
1521 def handler(prefix, event=event_name, append=append):
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001522 append((event, None))
1523 parser.EndNamespaceDeclHandler = handler
1524 else:
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001525 raise ValueError("unknown event %r" % event_name)
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001526
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001527 def _raiseerror(self, value):
1528 err = ParseError(value)
1529 err.code = value.code
1530 err.position = value.lineno, value.offset
1531 raise err
Armin Rigo9ed73062005-12-14 18:10:45 +00001532
Armin Rigo9ed73062005-12-14 18:10:45 +00001533 def _fixname(self, key):
1534 # expand qname, and convert name string to ascii, if possible
1535 try:
1536 name = self._names[key]
1537 except KeyError:
1538 name = key
1539 if "}" in name:
1540 name = "{" + name
Martin v. Löwisf30bb0e2007-07-28 11:40:46 +00001541 self._names[key] = name
Armin Rigo9ed73062005-12-14 18:10:45 +00001542 return name
1543
Eli Bendersky6206a7e2013-08-25 18:58:18 -07001544 def _start(self, tag, attr_list):
1545 # Handler for expat's StartElementHandler. Since ordered_attributes
1546 # is set, the attributes are reported as a list of alternating
1547 # attribute name,value.
Armin Rigo9ed73062005-12-14 18:10:45 +00001548 fixname = self._fixname
1549 tag = fixname(tag)
1550 attrib = {}
Eli Bendersky6206a7e2013-08-25 18:58:18 -07001551 if attr_list:
1552 for i in range(0, len(attr_list), 2):
1553 attrib[fixname(attr_list[i])] = attr_list[i+1]
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001554 return self.target.start(tag, attrib)
Armin Rigo9ed73062005-12-14 18:10:45 +00001555
Armin Rigo9ed73062005-12-14 18:10:45 +00001556 def _end(self, tag):
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001557 return self.target.end(self._fixname(tag))
1558
Armin Rigo9ed73062005-12-14 18:10:45 +00001559 def _default(self, text):
1560 prefix = text[:1]
1561 if prefix == "&":
1562 # deal with undefined entities
1563 try:
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001564 data_handler = self.target.data
1565 except AttributeError:
1566 return
1567 try:
1568 data_handler(self.entity[text[1:-1]])
Armin Rigo9ed73062005-12-14 18:10:45 +00001569 except KeyError:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001570 from xml.parsers import expat
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001571 err = expat.error(
Armin Rigo9ed73062005-12-14 18:10:45 +00001572 "undefined entity %s: line %d, column %d" %
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001573 (text, self.parser.ErrorLineNumber,
1574 self.parser.ErrorColumnNumber)
Armin Rigo9ed73062005-12-14 18:10:45 +00001575 )
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001576 err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001577 err.lineno = self.parser.ErrorLineNumber
1578 err.offset = self.parser.ErrorColumnNumber
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001579 raise err
Armin Rigo9ed73062005-12-14 18:10:45 +00001580 elif prefix == "<" and text[:9] == "<!DOCTYPE":
1581 self._doctype = [] # inside a doctype declaration
1582 elif self._doctype is not None:
1583 # parse doctype contents
1584 if prefix == ">":
1585 self._doctype = None
1586 return
Neal Norwitz9d72bb42007-04-17 08:48:32 +00001587 text = text.strip()
Armin Rigo9ed73062005-12-14 18:10:45 +00001588 if not text:
1589 return
1590 self._doctype.append(text)
1591 n = len(self._doctype)
1592 if n > 2:
1593 type = self._doctype[1]
1594 if type == "PUBLIC" and n == 4:
1595 name, type, pubid, system = self._doctype
Florent Xiclunaa1c974a2012-07-07 13:16:44 +02001596 if pubid:
1597 pubid = pubid[1:-1]
Armin Rigo9ed73062005-12-14 18:10:45 +00001598 elif type == "SYSTEM" and n == 3:
1599 name, type, system = self._doctype
1600 pubid = None
1601 else:
1602 return
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001603 if hasattr(self.target, "doctype"):
1604 self.target.doctype(name, pubid, system[1:-1])
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001605 elif self.doctype != self._XMLParser__doctype:
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001606 # warn about deprecated call
1607 self._XMLParser__doctype(name, pubid, system[1:-1])
1608 self.doctype(name, pubid, system[1:-1])
Armin Rigo9ed73062005-12-14 18:10:45 +00001609 self._doctype = None
1610
Armin Rigo9ed73062005-12-14 18:10:45 +00001611 def doctype(self, name, pubid, system):
Eli Bendersky84fae782013-03-09 07:12:48 -08001612 """(Deprecated) Handle doctype declaration
1613
1614 *name* is the Doctype name, *pubid* is the public identifier,
1615 and *system* is the system identifier.
1616
1617 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001618 warnings.warn(
1619 "This method of XMLParser is deprecated. Define doctype() "
1620 "method on the TreeBuilder target.",
1621 DeprecationWarning,
1622 )
1623
1624 # sentinel, if doctype is redefined in a subclass
1625 __doctype = doctype
Armin Rigo9ed73062005-12-14 18:10:45 +00001626
Armin Rigo9ed73062005-12-14 18:10:45 +00001627 def feed(self, data):
Eli Bendersky84fae782013-03-09 07:12:48 -08001628 """Feed encoded data to parser."""
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001629 try:
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001630 self.parser.Parse(data, 0)
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001631 except self._error as v:
1632 self._raiseerror(v)
Armin Rigo9ed73062005-12-14 18:10:45 +00001633
Armin Rigo9ed73062005-12-14 18:10:45 +00001634 def close(self):
Eli Bendersky84fae782013-03-09 07:12:48 -08001635 """Finish feeding data to parser and return element structure."""
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001636 try:
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001637 self.parser.Parse("", 1) # end of data
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001638 except self._error as v:
1639 self._raiseerror(v)
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001640 try:
Florent Xiclunafb067462012-03-05 11:42:49 +01001641 close_handler = self.target.close
1642 except AttributeError:
1643 pass
1644 else:
1645 return close_handler()
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001646 finally:
1647 # get rid of circular references
1648 del self.parser, self._parser
1649 del self.target, self._target
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001650
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001651
1652# Import the C accelerators
1653try:
Eli Bendersky46955b22013-05-19 09:20:50 -07001654 # Element is going to be shadowed by the C implementation. We need to keep
1655 # the Python version of it accessible for some "creative" by external code
1656 # (see tests)
1657 _Element_Py = Element
1658
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001659 # Element, SubElement, ParseError, TreeBuilder, XMLParser
1660 from _elementtree import *
Eli Benderskyc4e98a62013-05-19 09:24:43 -07001661except ImportError:
1662 pass