blob: 5b26ac72fd1aaea96caa93c95bf02b49299e255d [file] [log] [blame]
Eli Bendersky84fae782013-03-09 07:12:48 -08001"""Lightweight XML support for Python.
2
3 XML is an inherently hierarchical data format, and the most natural way to
4 represent it is with a tree. This module has two classes for this purpose:
5
6 1. ElementTree represents the whole XML document as a tree and
7
8 2. Element represents a single node in this tree.
9
10 Interactions with the whole document (reading and writing to/from files) are
11 usually done on the ElementTree level. Interactions with a single XML element
12 and its sub-elements are done on the Element level.
13
14 Element is a flexible container object designed to store hierarchical data
15 structures in memory. It can be described as a cross between a list and a
16 dictionary. Each Element has a number of properties associated with it:
17
18 'tag' - a string containing the element's name.
19
20 'attributes' - a Python dictionary storing the element's attributes.
21
22 'text' - a string containing the element's text content.
23
24 'tail' - an optional string containing text after the element's end tag.
25
26 And a number of child elements stored in a Python sequence.
27
28 To create an element instance, use the Element constructor,
29 or the SubElement factory function.
30
31 You can also use the ElementTree class to wrap an element structure
32 and convert it to and from XML.
33
34"""
35
Eli Benderskybf05df22013-04-20 05:44:01 -070036#---------------------------------------------------------------------
37# Licensed to PSF under a Contributor Agreement.
38# See http://www.python.org/psf/license for licensing details.
Armin Rigo9ed73062005-12-14 18:10:45 +000039#
40# ElementTree
Florent Xiclunaf15351d2010-03-13 23:24:31 +000041# Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
Armin Rigo9ed73062005-12-14 18:10:45 +000042#
43# fredrik@pythonware.com
44# http://www.pythonware.com
Armin Rigo9ed73062005-12-14 18:10:45 +000045# --------------------------------------------------------------------
46# The ElementTree toolkit is
47#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000048# Copyright (c) 1999-2008 by Fredrik Lundh
Armin Rigo9ed73062005-12-14 18:10:45 +000049#
50# By obtaining, using, and/or copying this software and/or its
51# associated documentation, you agree that you have read, understood,
52# and will comply with the following terms and conditions:
53#
54# Permission to use, copy, modify, and distribute this software and
55# its associated documentation for any purpose and without fee is
56# hereby granted, provided that the above copyright notice appears in
57# all copies, and that both that copyright notice and this permission
58# notice appear in supporting documentation, and that the name of
59# Secret Labs AB or the author not be used in advertising or publicity
60# pertaining to distribution of the software without specific, written
61# prior permission.
62#
63# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
70# OF THIS SOFTWARE.
71# --------------------------------------------------------------------
72
73__all__ = [
74 # public symbols
75 "Comment",
76 "dump",
77 "Element", "ElementTree",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000078 "fromstring", "fromstringlist",
Armin Rigo9ed73062005-12-14 18:10:45 +000079 "iselement", "iterparse",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000080 "parse", "ParseError",
Armin Rigo9ed73062005-12-14 18:10:45 +000081 "PI", "ProcessingInstruction",
82 "QName",
83 "SubElement",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000084 "tostring", "tostringlist",
Armin Rigo9ed73062005-12-14 18:10:45 +000085 "TreeBuilder",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000086 "VERSION",
Florent Xiclunaa72a98f2012-02-13 11:03:30 +010087 "XML", "XMLID",
Martin Panterdcfebb32016-04-01 06:55:55 +000088 "XMLParser", "XMLPullParser",
Florent Xiclunaa72a98f2012-02-13 11:03:30 +010089 "register_namespace",
Armin Rigo9ed73062005-12-14 18:10:45 +000090 ]
91
Florent Xiclunaf15351d2010-03-13 23:24:31 +000092VERSION = "1.3.0"
93
Florent Xiclunaf15351d2010-03-13 23:24:31 +000094import sys
95import re
96import warnings
Eli Bendersky00f402b2012-07-15 06:02:22 +030097import io
Serhiy Storchaka9ec5e252015-12-07 02:31:11 +020098import collections
Serhiy Storchaka2e576f52017-04-24 09:05:00 +030099import collections.abc
Eli Bendersky00f402b2012-07-15 06:02:22 +0300100import contextlib
Armin Rigo9ed73062005-12-14 18:10:45 +0000101
Eli Bendersky27cbb192012-06-15 09:03:19 +0300102from . import ElementPath
Armin Rigo9ed73062005-12-14 18:10:45 +0000103
Armin Rigo9ed73062005-12-14 18:10:45 +0000104
class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, a ParseError carries two extra
    attributes:
        'code' - the specific exception code
        'position' - the line and column of the error

    """
115
116# --------------------------------------------------------------------
117
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000118
def iselement(element):
    """Return True if *element* appears to be an Element.

    The check is duck-typed: any object exposing a 'tag' attribute
    qualifies.

    """
    try:
        element.tag
    except AttributeError:
        return False
    return True
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000122
Armin Rigo9ed73062005-12-14 18:10:45 +0000123
class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes. *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement.  This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        """Create an element named *tag*.

        *attrib* must be a dict; it is copied (and merged with *extra*), so
        neither the caller's dictionary nor the shared default is ever
        mutated.  TypeError is raised if *attrib* is not a dict.

        """
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        self.tag = tag
        self.attrib = {**attrib, **extra}
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy. Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        return len(self._children)

    def __bool__(self):
        warnings.warn(
            "The behavior of this method will change in future versions.  "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        return self._children[index]

    def __setitem__(self, index, element):
        if isinstance(index, slice):
            # Materialize first: validating a one-shot iterable (such as a
            # generator) would otherwise exhaust it before the assignment
            # below, silently storing nothing.
            element = list(element)
            for elt in element:
                self._assert_is_element(elt)
        else:
            self._assert_is_element(element)
        self._children[index] = element

    def __delitem__(self, index):
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is an iterable with zero or more elements.  It is consumed
        exactly once, so generators are supported.  All items are validated
        before any is added; TypeError is raised (and this element is left
        unchanged) if an item is not an Element.

        """
        # Snapshot the iterable: a second pass over a generator would find
        # it already exhausted and add nothing.
        elements = list(elements)
        for element in elements:
            self._assert_is_element(element)
        self._children.extend(elements)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*."""
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order.

        """
        warnings.warn(
            "This method will be removed in future versions.  "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was
        not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(): the result is a dictionary view of the
        names, not a list.

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(): the result is a dictionary view of
        (name, value) tuples, not a list.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements)

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        """(Deprecated) Return list(self.iter(tag)) after warning."""
        warnings.warn(
            "This method will be removed in future versions.  "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            DeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None (for example the
        # factory-function tags used for comments and processing
        # instructions) contribute no text.
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t
Armin Rigo9ed73062005-12-14 18:10:45 +0000438
Armin Rigo9ed73062005-12-14 18:10:45 +0000439
def SubElement(parent, tag, attrib={}, **extra):
    """Create a child element and attach it to *parent*.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, and *extra* are
    additional attributes given as keyword arguments.

    The child is built with parent.makeelement(), appended to *parent*, and
    returned.

    """
    # Merge into a fresh dict so the caller's mapping is left untouched.
    merged = {**attrib}
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
456
Armin Rigo9ed73062005-12-14 18:10:45 +0000457
def Comment(text=None):
    """Comment element factory.

    Build a special element, tagged with this factory function itself,
    which the standard serializer writes out as an XML comment.

    *text* is a string containing the comment string.

    """
    comment = Element(Comment)
    comment.text = text
    return comment
470
Armin Rigo9ed73062005-12-14 18:10:45 +0000471
def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Build a special element, tagged with this factory function itself,
    which the standard serializer writes out as an XML processing
    instruction.

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.

    """
    pi = Element(ProcessingInstruction)
    pi.text = target
    if not text:
        return pi
    # The element text holds "target contents", separated by one space.
    pi.text = pi.text + " " + text
    return pi

PI = ProcessingInstruction
489
Armin Rigo9ed73062005-12-14 18:10:45 +0000490
class QName:
    """Qualified name wrapper.

    Wrap a QName attribute value so that namespaces are handled properly
    on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Instances hash and compare by their text; the other operand may be
    another QName or a plain value, which is compared directly against
    the text.

    """

    def __init__(self, text_or_uri, tag=None):
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    def __le__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text <= rhs

    def __lt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text < rhs

    def __ge__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text >= rhs

    def __gt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text > rhs

    def __eq__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text == rhs
Armin Rigo9ed73062005-12-14 18:10:45 +0000535
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000536# --------------------------------------------------------------------
537
Armin Rigo9ed73062005-12-14 18:10:45 +0000538
539class ElementTree:
Eli Bendersky84fae782013-03-09 07:12:48 -0800540 """An XML element hierarchy.
Armin Rigo9ed73062005-12-14 18:10:45 +0000541
Eli Bendersky84fae782013-03-09 07:12:48 -0800542 This class also provides support for serialization to and from
543 standard XML.
544
545 *element* is an optional root element node,
546 *file* is an optional file handle or file name of an XML file whose
547 contents will be used to initialize the tree with.
548
549 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000550 def __init__(self, element=None, file=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000551 # assert element is None or iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000552 self._root = element # first node
553 if file:
554 self.parse(file)
555
Armin Rigo9ed73062005-12-14 18:10:45 +0000556 def getroot(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800557 """Return root element of this tree."""
Armin Rigo9ed73062005-12-14 18:10:45 +0000558 return self._root
559
Armin Rigo9ed73062005-12-14 18:10:45 +0000560 def _setroot(self, element):
Eli Bendersky84fae782013-03-09 07:12:48 -0800561 """Replace root element of this tree.
562
563 This will discard the current contents of the tree and replace it
564 with the given element. Use with care!
565
566 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000567 # assert iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000568 self._root = element
569
Armin Rigo9ed73062005-12-14 18:10:45 +0000570 def parse(self, source, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800571 """Load external XML document into element tree.
572
573 *source* is a file name or file object, *parser* is an optional parser
574 instance that defaults to XMLParser.
575
576 ParseError is raised if the parser fails to parse the document.
577
578 Returns the root element of the given source document.
579
580 """
Antoine Pitroue033e062010-10-29 10:38:18 +0000581 close_source = False
Armin Rigo9ed73062005-12-14 18:10:45 +0000582 if not hasattr(source, "read"):
583 source = open(source, "rb")
Antoine Pitroue033e062010-10-29 10:38:18 +0000584 close_source = True
585 try:
Eli Benderskya3699232013-05-19 18:47:23 -0700586 if parser is None:
587 # If no parser was specified, create a default XMLParser
588 parser = XMLParser()
589 if hasattr(parser, '_parse_whole'):
590 # The default XMLParser, when it comes from an accelerator,
591 # can define an internal _parse_whole API for efficiency.
592 # It can be used to parse the whole source without feeding
593 # it with chunks.
594 self._root = parser._parse_whole(source)
595 return self._root
596 while True:
Antoine Pitroue033e062010-10-29 10:38:18 +0000597 data = source.read(65536)
598 if not data:
599 break
600 parser.feed(data)
601 self._root = parser.close()
602 return self._root
603 finally:
604 if close_source:
605 source.close()
Armin Rigo9ed73062005-12-14 18:10:45 +0000606
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000607 def iter(self, tag=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800608 """Create and return tree iterator for the root element.
609
610 The iterator loops over all elements in this tree, in document order.
611
612 *tag* is a string with the tag name to iterate over
613 (default is to return all elements).
614
615 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000616 # assert self._root is not None
617 return self._root.iter(tag)
618
619 # compatibility
Armin Rigo9ed73062005-12-14 18:10:45 +0000620 def getiterator(self, tag=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000621 warnings.warn(
622 "This method will be removed in future versions. "
623 "Use 'tree.iter()' or 'list(tree.iter())' instead.",
Serhiy Storchaka02ec92f2018-07-24 12:03:34 +0300624 DeprecationWarning, stacklevel=2
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000625 )
626 return list(self.iter(tag))
Armin Rigo9ed73062005-12-14 18:10:45 +0000627
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000628 def find(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800629 """Find first matching element by tag name or path.
630
631 Same as getroot().find(path), which is Element.find()
632
633 *path* is a string having either an element tag or an XPath,
634 *namespaces* is an optional mapping from namespace prefix to full name.
635
636 Return the first matching element, or None if no element was found.
637
638 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000639 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000640 if path[:1] == "/":
641 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000642 warnings.warn(
643 "This search is broken in 1.3 and earlier, and will be "
644 "fixed in a future version. If you rely on the current "
645 "behaviour, change it to %r" % path,
646 FutureWarning, stacklevel=2
647 )
648 return self._root.find(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000649
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000650 def findtext(self, path, default=None, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800651 """Find first matching element by tag name or path.
652
653 Same as getroot().findtext(path), which is Element.findtext()
654
655 *path* is a string having either an element tag or an XPath,
656 *namespaces* is an optional mapping from namespace prefix to full name.
657
658 Return the first matching element, or None if no element was found.
659
660 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000661 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000662 if path[:1] == "/":
663 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000664 warnings.warn(
665 "This search is broken in 1.3 and earlier, and will be "
666 "fixed in a future version. If you rely on the current "
667 "behaviour, change it to %r" % path,
668 FutureWarning, stacklevel=2
669 )
670 return self._root.findtext(path, default, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000671
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000672 def findall(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800673 """Find all matching subelements by tag name or path.
674
675 Same as getroot().findall(path), which is Element.findall().
676
677 *path* is a string having either an element tag or an XPath,
678 *namespaces* is an optional mapping from namespace prefix to full name.
679
680 Return list containing all matching elements in document order.
681
682 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000683 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000684 if path[:1] == "/":
685 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000686 warnings.warn(
687 "This search is broken in 1.3 and earlier, and will be "
688 "fixed in a future version. If you rely on the current "
689 "behaviour, change it to %r" % path,
690 FutureWarning, stacklevel=2
691 )
692 return self._root.findall(path, namespaces)
693
def iterfind(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Delegates to the iterfind() method of the root element.

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    A *path* that starts with "/" is rewritten to start with "./" and a
    FutureWarning is issued before the search is performed.

    Return whatever the root element's iterfind() returns.

    """
    # assert self._root is not None
    if path[:1] == "/":
        path = "." + path
        notice = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        # stacklevel=2 attributes the warning to the caller of iterfind()
        warnings.warn(notice, FutureWarning, stacklevel=2)
    root = self._root
    return root.iterfind(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000715
def write(self, file_or_filename,
          encoding=None,
          xml_declaration=None,
          default_namespace=None,
          method=None, *,
          short_empty_elements=True):
    """Write element tree to a file as XML.

    Arguments:
      *file_or_filename* -- file name or a file object opened for writing

      *encoding* -- the output encoding (default: US-ASCII, or UTF-8
                    when *method* is "c14n")

      *xml_declaration* -- bool indicating if an XML declaration should be
                           added to the output. If None, an XML declaration
                           is added if encoding IS NOT either of:
                           US-ASCII, UTF-8, or Unicode

      *default_namespace* -- sets the default XML namespace (for "xmlns")

      *method* -- either "xml" (default), "html", "text", or "c14n"

      *short_empty_elements* -- controls the formatting of elements
                                that contain no content. If True (default)
                                they are emitted as a single self-closed
                                tag, otherwise they are emitted as a pair
                                of start/end tags

    Raises ValueError if *method* is not a known serialization method.

    """
    if not method:
        method = "xml"
    elif method not in _serialize:
        raise ValueError("unknown method %r" % method)
    if not encoding:
        encoding = "utf-8" if method == "c14n" else "us-ascii"
    enc_lower = encoding.lower()
    with _get_writer(file_or_filename, enc_lower) as emit:
        if method == "xml":
            # An explicit true value forces the declaration; None means
            # "only when the encoding is not one of the implicit ones".
            wants_declaration = xml_declaration or (
                xml_declaration is None and
                enc_lower not in ("utf-8", "us-ascii", "unicode"))
            if wants_declaration:
                if enc_lower == "unicode":
                    # Retrieve the default encoding for the xml declaration
                    import locale
                    declared_encoding = locale.getpreferredencoding()
                else:
                    declared_encoding = encoding
                emit("<?xml version='1.0' encoding='%s'?>\n" % (
                    declared_encoding,))
        if method == "text":
            _serialize_text(emit, self._root)
        else:
            qnames, namespaces = _namespaces(self._root, default_namespace)
            serializer = _serialize[method]
            serializer(emit, self._root, qnames, namespaces,
                       short_empty_elements=short_empty_elements)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000773
def write_c14n(self, file):
    """Write the tree to *file* using the "c14n" output method.

    Provided for lxml.etree compatibility; prefer calling write() with
    method="c14n" directly.
    """
    outcome = self.write(file, method="c14n")
    return outcome
Armin Rigo9ed73062005-12-14 18:10:45 +0000777
778# --------------------------------------------------------------------
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000779# serialization support
780
Eli Bendersky00f402b2012-07-15 06:02:22 +0300781@contextlib.contextmanager
782def _get_writer(file_or_filename, encoding):
Ezio Melottib5bc3532013-08-17 16:11:40 +0300783 # returns text write method and release all resources after using
Eli Bendersky00f402b2012-07-15 06:02:22 +0300784 try:
785 write = file_or_filename.write
786 except AttributeError:
787 # file_or_filename is a file name
788 if encoding == "unicode":
789 file = open(file_or_filename, "w")
790 else:
791 file = open(file_or_filename, "w", encoding=encoding,
792 errors="xmlcharrefreplace")
793 with file:
794 yield file.write
795 else:
796 # file_or_filename is a file-like object
797 # encoding determines if it is a text or binary writer
798 if encoding == "unicode":
799 # use a text writer as is
800 yield write
801 else:
802 # wrap a binary writer with TextIOWrapper
803 with contextlib.ExitStack() as stack:
804 if isinstance(file_or_filename, io.BufferedIOBase):
805 file = file_or_filename
806 elif isinstance(file_or_filename, io.RawIOBase):
807 file = io.BufferedWriter(file_or_filename)
808 # Keep the original file open when the BufferedWriter is
809 # destroyed
810 stack.callback(file.detach)
811 else:
812 # This is to handle passed objects that aren't in the
813 # IOBase hierarchy, but just have a write method
814 file = io.BufferedIOBase()
815 file.writable = lambda: True
816 file.write = write
817 try:
818 # TextIOWrapper uses this methods to determine
819 # if BOM (for UTF-16, etc) should be added
820 file.seekable = file_or_filename.seekable
821 file.tell = file_or_filename.tell
822 except AttributeError:
823 pass
824 file = io.TextIOWrapper(file,
825 encoding=encoding,
826 errors="xmlcharrefreplace",
827 newline="\n")
828 # Keep the original file open when the TextIOWrapper is
829 # destroyed
830 stack.callback(file.detach)
831 yield file.write
832
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000833def _namespaces(elem, default_namespace=None):
834 # identify namespaces used in this tree
835
836 # maps qnames to *encoded* prefix:local names
837 qnames = {None: None}
838
839 # maps uri:s to prefixes
840 namespaces = {}
841 if default_namespace:
842 namespaces[default_namespace] = ""
843
844 def add_qname(qname):
845 # calculate serialized qname representation
846 try:
847 if qname[:1] == "{":
848 uri, tag = qname[1:].rsplit("}", 1)
849 prefix = namespaces.get(uri)
850 if prefix is None:
851 prefix = _namespace_map.get(uri)
852 if prefix is None:
853 prefix = "ns%d" % len(namespaces)
854 if prefix != "xml":
855 namespaces[uri] = prefix
856 if prefix:
857 qnames[qname] = "%s:%s" % (prefix, tag)
858 else:
859 qnames[qname] = tag # default element
860 else:
861 if default_namespace:
862 # FIXME: can this be handled in XML 1.0?
863 raise ValueError(
864 "cannot use non-qualified names with "
865 "default_namespace option"
866 )
867 qnames[qname] = qname
868 except TypeError:
869 _raise_serialization_error(qname)
870
871 # populate qname and namespaces table
Eli Bendersky64d11e62012-06-15 07:42:50 +0300872 for elem in elem.iter():
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000873 tag = elem.tag
Senthil Kumaranec30b3d2010-11-09 02:36:59 +0000874 if isinstance(tag, QName):
875 if tag.text not in qnames:
876 add_qname(tag.text)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000877 elif isinstance(tag, str):
878 if tag not in qnames:
879 add_qname(tag)
880 elif tag is not None and tag is not Comment and tag is not PI:
881 _raise_serialization_error(tag)
882 for key, value in elem.items():
883 if isinstance(key, QName):
884 key = key.text
885 if key not in qnames:
886 add_qname(key)
887 if isinstance(value, QName) and value.text not in qnames:
888 add_qname(value.text)
889 text = elem.text
890 if isinstance(text, QName) and text.text not in qnames:
891 add_qname(text.text)
892 return qnames, namespaces
893
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800894def _serialize_xml(write, elem, qnames, namespaces,
895 short_empty_elements, **kwargs):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000896 tag = elem.tag
897 text = elem.text
898 if tag is Comment:
899 write("<!--%s-->" % text)
900 elif tag is ProcessingInstruction:
901 write("<?%s?>" % text)
902 else:
903 tag = qnames[tag]
904 if tag is None:
905 if text:
906 write(_escape_cdata(text))
907 for e in elem:
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800908 _serialize_xml(write, e, qnames, None,
909 short_empty_elements=short_empty_elements)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000910 else:
911 write("<" + tag)
912 items = list(elem.items())
913 if items or namespaces:
914 if namespaces:
915 for v, k in sorted(namespaces.items(),
916 key=lambda x: x[1]): # sort on prefix
917 if k:
918 k = ":" + k
919 write(" xmlns%s=\"%s\"" % (
920 k,
921 _escape_attrib(v)
922 ))
Raymond Hettingere3685fd2018-10-28 11:18:22 -0700923 for k, v in items:
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000924 if isinstance(k, QName):
925 k = k.text
926 if isinstance(v, QName):
927 v = qnames[v.text]
928 else:
929 v = _escape_attrib(v)
930 write(" %s=\"%s\"" % (qnames[k], v))
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800931 if text or len(elem) or not short_empty_elements:
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000932 write(">")
933 if text:
934 write(_escape_cdata(text))
935 for e in elem:
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800936 _serialize_xml(write, e, qnames, None,
937 short_empty_elements=short_empty_elements)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000938 write("</" + tag + ">")
939 else:
940 write(" />")
941 if elem.tail:
942 write(_escape_cdata(elem.tail))
943
# Tags that the HTML serializer writes without a closing tag.
# Defined directly as a set for fast membership tests; the former
# tuple-then-set() conversion guarded by NameError was only needed on
# interpreters without a built-in set type.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}
951
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800952def _serialize_html(write, elem, qnames, namespaces, **kwargs):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000953 tag = elem.tag
954 text = elem.text
955 if tag is Comment:
956 write("<!--%s-->" % _escape_cdata(text))
957 elif tag is ProcessingInstruction:
958 write("<?%s?>" % _escape_cdata(text))
959 else:
960 tag = qnames[tag]
961 if tag is None:
962 if text:
963 write(_escape_cdata(text))
964 for e in elem:
965 _serialize_html(write, e, qnames, None)
966 else:
967 write("<" + tag)
968 items = list(elem.items())
969 if items or namespaces:
970 if namespaces:
971 for v, k in sorted(namespaces.items(),
972 key=lambda x: x[1]): # sort on prefix
973 if k:
974 k = ":" + k
975 write(" xmlns%s=\"%s\"" % (
976 k,
977 _escape_attrib(v)
978 ))
Serhiy Storchaka3b05ad72018-10-29 19:31:04 +0200979 for k, v in items:
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000980 if isinstance(k, QName):
981 k = k.text
982 if isinstance(v, QName):
983 v = qnames[v.text]
984 else:
985 v = _escape_attrib_html(v)
986 # FIXME: handle boolean attributes
987 write(" %s=\"%s\"" % (qnames[k], v))
988 write(">")
Christian Heimes54ad7e32013-07-05 01:39:49 +0200989 ltag = tag.lower()
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000990 if text:
Christian Heimes54ad7e32013-07-05 01:39:49 +0200991 if ltag == "script" or ltag == "style":
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000992 write(text)
993 else:
994 write(_escape_cdata(text))
995 for e in elem:
996 _serialize_html(write, e, qnames, None)
Christian Heimes54ad7e32013-07-05 01:39:49 +0200997 if ltag not in HTML_EMPTY:
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000998 write("</" + tag + ">")
999 if elem.tail:
1000 write(_escape_cdata(elem.tail))
1001
1002def _serialize_text(write, elem):
1003 for part in elem.itertext():
1004 write(part)
1005 if elem.tail:
1006 write(elem.tail)
1007
# Dispatch table mapping an output method name to its serializer.
_serialize = dict(
    xml=_serialize_xml,
    html=_serialize_html,
    text=_serialize_text,
    # this optional method is imported at the end of the module
    # c14n=_serialize_c14n,
)
Armin Rigo9ed73062005-12-14 18:10:45 +00001015
Armin Rigo9ed73062005-12-14 18:10:45 +00001016
def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix has the reserved "ns<digits>" form.

    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Collect the conflicting entries first so the registry is not
    # modified while it is being iterated.
    stale = [known_uri
             for known_uri, known_prefix in _namespace_map.items()
             if known_uri == uri or known_prefix == prefix]
    for known_uri in stale:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix
1035
# Global registry mapping namespace URIs to their preferred prefixes,
# pre-populated with "well-known" namespaces.
_namespace_map = {
    # core XML / web vocabularies
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# Expose the registry on the function, for tests and troubleshooting
register_namespace._namespace_map = _namespace_map
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001050
1051def _raise_serialization_error(text):
1052 raise TypeError(
1053 "cannot serialize %r (type %s)" % (text, type(text).__name__)
1054 )
1055
1056def _escape_cdata(text):
1057 # escape character data
1058 try:
1059 # it's worth avoiding do-nothing calls for strings that are
Mike53f7a7c2017-12-14 14:04:53 +03001060 # shorter than 500 characters, or so. assume that's, by far,
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001061 # the most common case in most applications.
1062 if "&" in text:
1063 text = text.replace("&", "&amp;")
1064 if "<" in text:
1065 text = text.replace("<", "&lt;")
1066 if ">" in text:
1067 text = text.replace(">", "&gt;")
1068 return text
1069 except (TypeError, AttributeError):
1070 _raise_serialization_error(text)
1071
1072def _escape_attrib(text):
1073 # escape attribute value
1074 try:
1075 if "&" in text:
1076 text = text.replace("&", "&amp;")
1077 if "<" in text:
1078 text = text.replace("<", "&lt;")
1079 if ">" in text:
1080 text = text.replace(">", "&gt;")
1081 if "\"" in text:
1082 text = text.replace("\"", "&quot;")
Raymond Hettinger076366c2016-09-11 23:18:03 -07001083 # The following business with carriage returns is to satisfy
Raymond Hettinger11fa3ff2016-09-11 23:23:24 -07001084 # Section 2.11 of the XML specification, stating that
Raymond Hettinger076366c2016-09-11 23:18:03 -07001085 # CR or CR LN should be replaced with just LN
1086 # http://www.w3.org/TR/REC-xml/#sec-line-ends
1087 if "\r\n" in text:
1088 text = text.replace("\r\n", "\n")
1089 if "\r" in text:
1090 text = text.replace("\r", "\n")
1091 #The following four lines are issue 17582
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001092 if "\n" in text:
1093 text = text.replace("\n", "&#10;")
Raymond Hettinger076366c2016-09-11 23:18:03 -07001094 if "\t" in text:
1095 text = text.replace("\t", "&#09;")
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001096 return text
1097 except (TypeError, AttributeError):
1098 _raise_serialization_error(text)
1099
1100def _escape_attrib_html(text):
1101 # escape attribute value
1102 try:
1103 if "&" in text:
1104 text = text.replace("&", "&amp;")
1105 if ">" in text:
1106 text = text.replace(">", "&gt;")
1107 if "\"" in text:
1108 text = text.replace("\"", "&quot;")
1109 return text
1110 except (TypeError, AttributeError):
1111 _raise_serialization_error(text)
1112
1113# --------------------------------------------------------------------
1114
def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned. Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding, *method* is an optional output method.  These and the
    keyword-only *xml_declaration*, *default_namespace* and
    *short_empty_elements* are passed on to ElementTree.write(), which
    defines their defaults and meaning.

    Returns an (optionally) encoded string containing the XML data.

    """
    # Text output is collected in a StringIO, encoded output in a BytesIO.
    if encoding == 'unicode':
        buffer = io.StringIO()
    else:
        buffer = io.BytesIO()
    tree = ElementTree(element)
    tree.write(buffer, encoding,
               xml_declaration=xml_declaration,
               default_namespace=default_namespace,
               method=method,
               short_empty_elements=short_empty_elements)
    return buffer.getvalue()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001138
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001139class _ListDataStream(io.BufferedIOBase):
Eli Bendersky84fae782013-03-09 07:12:48 -08001140 """An auxiliary stream accumulating into a list reference."""
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001141 def __init__(self, lst):
1142 self.lst = lst
Eli Benderskyf90fc682012-07-17 15:09:56 +03001143
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001144 def writable(self):
1145 return True
1146
1147 def seekable(self):
1148 return True
1149
1150 def write(self, b):
1151 self.lst.append(b)
1152
1153 def tell(self):
1154 return len(self.lst)
1155
def tostringlist(element, encoding=None, method=None, *,
                 xml_declaration=None, default_namespace=None,
                 short_empty_elements=True):
    """Generate the representation of XML element as a list of chunks.

    Takes the same arguments as tostring() and passes them on to
    ElementTree.write(); every chunk written is collected in a list.

    Returns the list of written chunks.

    """
    chunks = []
    sink = _ListDataStream(chunks)
    tree = ElementTree(element)
    tree.write(sink, encoding,
               xml_declaration=xml_declaration,
               default_namespace=default_namespace,
               method=method,
               short_empty_elements=short_empty_elements)
    return chunks
Armin Rigo9ed73062005-12-14 18:10:45 +00001167
Armin Rigo9ed73062005-12-14 18:10:45 +00001168
def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  The exact output
    format is implementation dependent.  In this version, it's written as an
    ordinary XML file, followed by a newline unless the root element's
    tail already ends with one.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    ends_with_newline = bool(tail) and tail[-1] == "\n"
    if not ends_with_newline:
        sys.stdout.write("\n")
1186
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001187# --------------------------------------------------------------------
1188# parsing
Armin Rigo9ed73062005-12-14 18:10:45 +00001189
Armin Rigo9ed73062005-12-14 18:10:45 +00001190
def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance; both are handed to the
    parse() method of a new ElementTree.

    Return the ElementTree instance.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document
1203
Armin Rigo9ed73062005-12-14 18:10:45 +00001204
def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This function also reports what's going on to the user based on the
    *events* it is called with.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  Its ``root``
    attribute is None until parsing has finished and is then set to the
    root element.  A file opened here from a file name is closed when
    the iteration ends.

    """
    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    puller = XMLPullParser(events=events, _parser=parser)

    def event_stream():
        # `result`, `source` and `owns_source` are bound further down,
        # before the first __next__() call can run this body.
        try:
            yield from puller.read_events()
            while True:
                # load event buffer
                chunk = source.read(16 * 1024)
                if not chunk:
                    break
                puller.feed(chunk)
                yield from puller.read_events()
            root = puller._close_and_return_root()
            yield from puller.read_events()
            result.root = root
        finally:
            if owns_source:
                source.close()

    class IterParseIterator(collections.abc.Iterator):
        __next__ = event_stream().__next__
    result = IterParseIterator()
    result.root = None
    del event_stream, IterParseIterator

    owns_source = False
    if not hasattr(source, "read"):
        source = open(source, "rb")
        owns_source = True

    return result
Armin Rigo9ed73062005-12-14 18:10:45 +00001251
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001252
class XMLPullParser:
    """Parser that is fed data in pieces and queues the resulting events.

    *events* is a sequence of event names to report; when omitted only
    "end" events are reported.  Queued events are retrieved with
    read_events().
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        self._events_queue = collections.deque()
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        reported = ("end",) if events is None else events
        self._parser._setevents(self._events_queue, reported)

    def feed(self, data):
        """Feed encoded data to parser.

        Raises ValueError when called after the parser was closed.  A
        SyntaxError raised by the underlying parser is queued and only
        surfaces from read_events().
        """
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            self._parser.feed(data)
        except SyntaxError as exc:
            self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        parser = self._parser
        root = parser.close()
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.  A queued exception is raised when
        it is reached.
        """
        pending = self._events_queue
        while pending:
            item = pending.popleft()
            if isinstance(item, Exception):
                raise item
            yield item
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001304
1305
def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance; when it is omitted an XMLParser with a
    TreeBuilder target is used.

    Returns the value of the parser's close() call (an Element for the
    standard parser).

    """
    active = parser if parser else XMLParser(target=TreeBuilder())
    active.feed(text)
    return active.close()
1321
Armin Rigo9ed73062005-12-14 18:10:45 +00001322
def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance; when it is omitted an XMLParser with a
    TreeBuilder target is used.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.  Elements whose "id" attribute
    is missing or empty are left out.

    """
    active = parser if parser else XMLParser(target=TreeBuilder())
    active.feed(text)
    tree = active.close()
    ids = {}
    for node in tree.iter():
        node_id = node.get("id")
        if node_id:
            ids[node_id] = node
    return tree, ids
1343
# fromstring() is an alias for XML(): parse an XML document from a
# string constant.
fromstring = XML
Armin Rigo9ed73062005-12-14 18:10:45 +00001346
def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence of fragments, *parser* is an
    optional parser instance; when it is omitted an XMLParser with a
    TreeBuilder target is used.

    Returns the value of the parser's close() call (an Element for the
    standard parser).

    """
    active = parser if parser else XMLParser(target=TreeBuilder())
    feed = active.feed
    for fragment in sequence:
        feed(fragment)
    return active.close()
1361
1362# --------------------------------------------------------------------
Armin Rigo9ed73062005-12-14 18:10:45 +00001363
Armin Rigo9ed73062005-12-14 18:10:45 +00001364
class TreeBuilder:
    """Generic element structure builder.

    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.

    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.

    *comment_factory* is a factory to create comments to be used instead of
    the standard factory.  If *insert_comments* is false (the default),
    comments will not be inserted into the tree.

    *pi_factory* is a factory to create processing instructions to be used
    instead of the standard factory.  If *insert_pis* is false (the default),
    processing instructions will not be inserted into the tree.
    """

    def __init__(self, element_factory=None, *,
                 comment_factory=None, pi_factory=None,
                 insert_comments=False, insert_pis=False):
        self._data = []  # data collector
        self._elem = []  # element stack
        self._last = None  # last element
        self._root = None  # root element
        self._tail = None  # true if we're after an end tag
        self._comment_factory = (
            Comment if comment_factory is None else comment_factory)
        self.insert_comments = insert_comments
        self._pi_factory = (
            ProcessingInstruction if pi_factory is None else pi_factory)
        self.insert_pis = insert_pis
        self._factory = (
            Element if element_factory is None else element_factory)

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._root is not None, "missing toplevel element"
        return self._root

    def _flush(self):
        # Attach the collected character data to the last element, as
        # its tail when an end tag was seen and as its text otherwise.
        if not self._data:
            return
        target = self._last
        if target is not None:
            text = "".join(self._data)
            if self._tail:
                assert target.tail is None, "internal error (tail)"
                target.tail = text
            else:
                assert target.text is None, "internal error (text)"
                target.text = text
        self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        elem = self._factory(tag, attrs)
        self._last = elem
        if self._elem:
            self._elem[-1].append(elem)
        elif self._root is None:
            # the first element opened at top level becomes the root
            self._root = elem
        self._elem.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   closed.tag, tag)
        self._tail = 1
        return closed

    def comment(self, text):
        """Create a comment using the comment_factory.

        *text* is the text of the comment.
        """
        return self._handle_single(
            self._comment_factory, self.insert_comments, text)

    def pi(self, target, text=None):
        """Create a processing instruction using the pi_factory.

        *target* is the target name of the processing instruction.
        *text* is the data of the processing instruction, or ''.
        """
        return self._handle_single(
            self._pi_factory, self.insert_pis, target, text)

    def _handle_single(self, factory, insert, *args):
        # Build a childless node and, when *insert* is true, add it to
        # the currently open element (if any) so that following data
        # becomes its tail.
        node = factory(*args)
        if insert:
            self._flush()
            self._last = node
            if self._elem:
                self._elem[-1].append(node)
            self._tail = 1
        return node
1484
Armin Rigo9ed73062005-12-14 18:10:45 +00001485
Eli Bendersky84fae782013-03-09 07:12:48 -08001486# also see ElementTree and TreeBuilder
class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    """

    def __init__(self, *, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is passed as the namespace separator; _fixname() later turns
        # the resulting "uri}local" names into "{uri}local".
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        # Keep the error class of whichever expat module was imported so
        # that later code never has to import it again.
        self._error = expat.error
        self._names = {}  # name memo cache
        # main callbacks
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'start_ns'):
            parser.StartNamespaceDeclHandler = self._start_ns
        if hasattr(target, 'end_ns'):
            parser.EndNamespaceDeclHandler = self._end_ns
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        parser.specified_attributes = 1
        self._doctype = None
        self.entity = {}
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass  # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor).
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        # Each handler binds its event name and helpers as default arguments
        # so that it does not depend on the loop variable at call time.
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1
                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event_name == "end":
                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                # TreeBuilder does not implement .start_ns()
                if hasattr(self.target, "start_ns"):
                    def handler(prefix, uri, event=event_name, append=append,
                                start_ns=self._start_ns):
                        append((event, start_ns(prefix, uri)))
                else:
                    def handler(prefix, uri, event=event_name, append=append):
                        append((event, (prefix or '', uri or '')))
                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                # TreeBuilder does not implement .end_ns()
                if hasattr(self.target, "end_ns"):
                    def handler(prefix, event=event_name, append=append,
                                end_ns=self._end_ns):
                        append((event, end_ns(prefix)))
                else:
                    def handler(prefix, event=event_name, append=append):
                        append((event, None))
                parser.EndNamespaceDeclHandler = handler
            elif event_name == 'comment':
                def handler(text, event=event_name, append=append, self=self):
                    append((event, self.target.comment(text)))
                parser.CommentHandler = handler
            elif event_name == 'pi':
                def handler(pi_target, data, event=event_name, append=append,
                            self=self):
                    append((event, self.target.pi(pi_target, data)))
                parser.ProcessingInstructionHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise an expat error as ParseError, carrying over its error code
        # and (line, column) position.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # Expand an expat "uri}local" name into "{uri}local"; names without
        # a "}" are returned unchanged.  Results are memoized in self._names.
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    def _start_ns(self, prefix, uri):
        # Missing prefix/uri values are reported to the target as ''.
        return self.target.start_ns(prefix or '', uri or '')

    def _end_ns(self, prefix):
        return self.target.end_ns(prefix or '')

    def _start(self, tag, attr_list):
        # Handler for expat's StartElementHandler. Since ordered_attributes
        # is set, the attributes are reported as a list of alternating
        # attribute name,value.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attr_list:
            for i in range(0, len(attr_list), 2):
                attrib[fixname(attr_list[i])] = attr_list[i+1]
        return self.target.start(tag, attrib)

    def _end(self, tag):
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        # Handler for everything expat has no dedicated callback for here:
        # entity references that need resolving via self.entity, and the
        # pieces of a <!DOCTYPE ...> declaration.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            try:
                # Only the lookup is guarded, so a KeyError raised by the
                # target's data() is not mistaken for an undefined entity.
                replacement = self.entity[text[1:-1]]
            except KeyError:
                # Use the error class saved in __init__ instead of importing
                # xml.parsers.expat again, which would fail when only the
                # pyexpat fallback is importable.
                err = self._error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                     self.parser.ErrorColumnNumber)
                    )
                err.code = 11  # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
            data_handler(replacement)
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = []  # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                    if pubid:
                        pubid = pubid[1:-1]  # drop the surrounding quotes
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif hasattr(self, "doctype"):
                    warnings.warn(
                        "The doctype() method of XMLParser is ignored.  "
                        "Define doctype() method on the TreeBuilder target.",
                        RuntimeWarning)

                self._doctype = None

    def feed(self, data):
        """Feed encoded data to parser."""
        try:
            self.parser.Parse(data, 0)
        except self._error as v:
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure.

        Returns the result of the target's close() method, or None when the
        target has none.  The parser and target references are deleted
        afterwards, so the instance cannot be used again.
        """
        try:
            self.parser.Parse("", 1)  # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001712
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001713
# Import the C accelerators
try:
    # Element is going to be shadowed by the C implementation. We need to keep
    # the Python version of it accessible for some "creative" use by external
    # code (see tests)
    _Element_Py = Element

    # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories
    from _elementtree import *
    from _elementtree import _set_factories
except ImportError:
    # No C accelerator available: keep the pure-Python definitions.
    pass
else:
    _set_factories(Comment, ProcessingInstruction)