blob: 431ecd0dddf1bcf6ba23a1103a5edec0c9edee2a [file] [log] [blame]
"""Lightweight XML support for Python.

 XML is an inherently hierarchical data format, and the most natural way to
 represent it is with a tree.  This module has two classes for this purpose:

    1. ElementTree represents the whole XML document as a tree and

    2. Element represents a single node in this tree.

 Interactions with the whole document (reading and writing to/from files) are
 usually done on the ElementTree level.  Interactions with a single XML element
 and its sub-elements are done on the Element level.

 Element is a flexible container object designed to store hierarchical data
 structures in memory.  It can be described as a cross between a list and a
 dictionary.  Each Element has a number of properties associated with it:

    'tag' - a string containing the element's name.

    'attrib' - a Python dictionary storing the element's attributes.

    'text' - a string containing the element's text content.

    'tail' - an optional string containing text after the element's end tag.

    And a number of child elements stored in a Python sequence.

 To create an element instance, use the Element constructor,
 or the SubElement factory function.

 You can also use the ElementTree class to wrap an element structure
 and convert it to and from XML.

"""
35
Eli Benderskybf05df22013-04-20 05:44:01 -070036#---------------------------------------------------------------------
37# Licensed to PSF under a Contributor Agreement.
38# See http://www.python.org/psf/license for licensing details.
Armin Rigo9ed73062005-12-14 18:10:45 +000039#
40# ElementTree
Florent Xiclunaf15351d2010-03-13 23:24:31 +000041# Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
Armin Rigo9ed73062005-12-14 18:10:45 +000042#
43# fredrik@pythonware.com
44# http://www.pythonware.com
Armin Rigo9ed73062005-12-14 18:10:45 +000045# --------------------------------------------------------------------
46# The ElementTree toolkit is
47#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000048# Copyright (c) 1999-2008 by Fredrik Lundh
Armin Rigo9ed73062005-12-14 18:10:45 +000049#
50# By obtaining, using, and/or copying this software and/or its
51# associated documentation, you agree that you have read, understood,
52# and will comply with the following terms and conditions:
53#
54# Permission to use, copy, modify, and distribute this software and
55# its associated documentation for any purpose and without fee is
56# hereby granted, provided that the above copyright notice appears in
57# all copies, and that both that copyright notice and this permission
58# notice appear in supporting documentation, and that the name of
59# Secret Labs AB or the author not be used in advertising or publicity
60# pertaining to distribution of the software without specific, written
61# prior permission.
62#
63# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
70# OF THIS SOFTWARE.
71# --------------------------------------------------------------------
72
# Names exported by ``from <this module> import *``.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "fromstringlist",
    "indent",
    "iselement",
    "iterparse",
    "parse",
    "ParseError",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLID",
    "XMLParser",
    "XMLPullParser",
    "register_namespace",
    "canonicalize",
    "C14NWriterTarget",
]

# Version string of the ElementTree toolkit.
VERSION = "1.3.0"
94
Florent Xiclunaf15351d2010-03-13 23:24:31 +000095import sys
96import re
97import warnings
Eli Bendersky00f402b2012-07-15 06:02:22 +030098import io
Serhiy Storchaka9ec5e252015-12-07 02:31:11 +020099import collections
Serhiy Storchaka2e576f52017-04-24 09:05:00 +0300100import collections.abc
Eli Bendersky00f402b2012-07-15 06:02:22 +0300101import contextlib
Armin Rigo9ed73062005-12-14 18:10:45 +0000102
Eli Bendersky27cbb192012-06-15 09:03:19 +0300103from . import ElementPath
Armin Rigo9ed73062005-12-14 18:10:45 +0000104
Armin Rigo9ed73062005-12-14 18:10:45 +0000105
class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, code that raises a ParseError is
    expected to attach two extra attributes to the instance:

        'code' - the specific exception code
        'position' - the line and column of the error

    The class itself adds no behaviour on top of SyntaxError.
    """
116
117# --------------------------------------------------------------------
118
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000119
def iselement(element):
    """Return True if *element* appears to be an Element.

    This is a duck-typing check: any object exposing a 'tag' attribute
    qualifies.
    """
    try:
        element.tag
    except AttributeError:
        return False
    return True
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000123
Armin Rigo9ed73062005-12-14 18:10:45 +0000124
class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes.  *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement.  This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        """Create an element named *tag*.

        *attrib* must be a dict (TypeError otherwise); it is copied and
        merged with the *extra* keyword arguments, which win on conflicts.
        """
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        self.tag = tag
        # A fresh dict is built here, so neither the caller's mapping nor the
        # shared default argument is ever aliased by the element.
        self.attrib = {**attrib, **extra}
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy.  Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        """Return the number of direct subelements."""
        return len(self._children)

    def __bool__(self):
        """Return True if the element has subelements (emits FutureWarning)."""
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        """Return the subelement (or list of subelements) at *index*."""
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement(s) at *index*.

        For a slice *index*, *element* is any iterable of elements; for an
        integer index it is a single element.  TypeError is raised, and
        nothing is changed, if a non-element is supplied.
        """
        if isinstance(index, slice):
            # Materialize first: validating a one-shot iterable (such as a
            # generator) would otherwise exhaust it, and the assignment
            # below would then silently store nothing.
            element = list(element)
            for elt in element:
                self._assert_is_element(elt)
        else:
            self._assert_is_element(element)
        self._children[index] = element

    def __delitem__(self, index):
        """Delete the subelement(s) at *index*."""
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is an iterable yielding zero or more elements.  Every item
        is validated before any is appended, so a TypeError leaves this
        element unchanged.

        """
        # Materialize once so that one-shot iterables (generators, iterators)
        # are not consumed by validation before they can be appended.
        elements = list(elements)
        for element in elements:
            self._assert_is_element(element)
        self._children.extend(elements)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*."""
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        """Raise TypeError unless *e* is an instance of the Python Element."""
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order.  The returned object is the
        element's own internal list, not a copy.

        """
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was not
        found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(): the result is whatever the attribute
        dictionary returns (a keys view for a plain dict), not a new list.

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(): the result is whatever the attribute
        dictionary returns (an items view for a plain dict), not a new list.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        the string "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        """(Deprecated) Return list(self.iter(tag)) with a DeprecationWarning."""
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            DeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.  Elements whose tag is neither a
        string nor None contribute no text of their own (their tail is still
        yielded by the parent).

        """
        tag = self.tag
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t
Armin Rigo9ed73062005-12-14 18:10:45 +0000439
Armin Rigo9ed73062005-12-14 18:10:45 +0000440
def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and append it to *parent*.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, and *extra* are
    additional attributes given as keyword arguments (they override entries
    of *attrib* with the same name).

    The element is built by parent.makeelement(), added with parent.append()
    and then returned.

    """
    # Merge into a new dict so the caller's mapping is left untouched.
    child = parent.makeelement(tag, {**attrib, **extra})
    parent.append(child)
    return child
457
Armin Rigo9ed73062005-12-14 18:10:45 +0000458
def Comment(text=None):
    """Comment element factory.

    Create a special element, tagged with this factory function itself,
    which the standard serializer serializes as an XML comment.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node
471
Armin Rigo9ed73062005-12-14 18:10:45 +0000472
def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Create a special element, tagged with this factory function itself,
    which the standard serializer serializes as an XML processing
    instruction.

    *target* is a string containing the processing instruction target, and
    *text* is a string containing the processing instruction contents, if
    any.  The element's text is *target*, followed by a space and *text*
    when *text* is non-empty.

    """
    node = Element(ProcessingInstruction)
    node.text = (target + " " + text) if text else target
    return node


# Short alias for ProcessingInstruction.
PI = ProcessingInstruction
490
Armin Rigo9ed73062005-12-14 18:10:45 +0000491
class QName:
    """Qualified name wrapper.

    This class can be used to wrap a QName attribute value in order to get
    proper namespace handling on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Instances hash and compare by their text; the other operand may be
    another QName or a plain value, which is compared against the text
    directly.

    """

    def __init__(self, text_or_uri, tag=None):
        # With a tag, the first argument is the namespace URI.
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    def __le__(self, other):
        return self.text <= (other.text if isinstance(other, QName) else other)

    def __lt__(self, other):
        return self.text < (other.text if isinstance(other, QName) else other)

    def __ge__(self, other):
        return self.text >= (other.text if isinstance(other, QName) else other)

    def __gt__(self, other):
        return self.text > (other.text if isinstance(other, QName) else other)

    def __eq__(self, other):
        return self.text == (other.text if isinstance(other, QName) else other)
Armin Rigo9ed73062005-12-14 18:10:45 +0000536
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000537# --------------------------------------------------------------------
538
Armin Rigo9ed73062005-12-14 18:10:45 +0000539
540class ElementTree:
Eli Bendersky84fae782013-03-09 07:12:48 -0800541 """An XML element hierarchy.
Armin Rigo9ed73062005-12-14 18:10:45 +0000542
Eli Bendersky84fae782013-03-09 07:12:48 -0800543 This class also provides support for serialization to and from
544 standard XML.
545
546 *element* is an optional root element node,
547 *file* is an optional file handle or file name of an XML file whose
548 contents will be used to initialize the tree with.
549
550 """
def __init__(self, element=None, file=None):
    """Wrap *element* as the root; if *file* is truthy, parse it as well.

    *element* is stored as-is without validation.  When *file* is given,
    self.parse(file) is called after the root has been stored.
    """
    self._root = element  # first node
    if not file:
        return
    self.parse(file)
556
def getroot(self):
    """Return the root element currently held by this tree."""
    root = self._root
    return root
560
def _setroot(self, element):
    """Replace the root element of this tree.

    The previous root is dropped and *element* takes its place; no
    validation is performed on *element*.  Use with care!

    """
    setattr(self, "_root", element)
570
def parse(self, source, parser=None):
    """Load external XML document into element tree.

    *source* is a file name or file object.  Anything without a read()
    method is treated as a file name: it is opened in binary mode and
    closed again before this method returns or raises.  *parser* is an
    optional parser instance that defaults to XMLParser.

    If the parser provides a _parse_whole() method it is handed *source*
    directly; otherwise the data is read in 65536-byte chunks that are
    passed to parser.feed(), and parser.close() supplies the root.

    ParseError is raised if the parser fails to parse the document.

    Returns the root element of the given source document.

    """
    with contextlib.ExitStack() as cleanup:
        if not hasattr(source, "read"):
            # We opened the file ourselves, so we also close it on exit.
            source = cleanup.enter_context(open(source, "rb"))
        if parser is None:
            # If no parser was specified, create a default XMLParser
            parser = XMLParser()
        if hasattr(parser, '_parse_whole'):
            # The default XMLParser, when it comes from an accelerator,
            # can define an internal _parse_whole API for efficiency.
            # It can be used to parse the whole source without feeding
            # it with chunks.
            self._root = parser._parse_whole(source)
        else:
            while chunk := source.read(65536):
                parser.feed(chunk)
            self._root = parser.close()
        return self._root
Armin Rigo9ed73062005-12-14 18:10:45 +0000607
def iter(self, tag=None):
    """Create and return tree iterator for the root element.

    The iterator loops over all elements in this tree, in document order.

    *tag* is a string with the tag name to iterate over
    (default is to return all elements).

    The call is forwarded to the root element's iter(); the tree must
    therefore have a root.

    """
    root = self._root
    return root.iter(tag)
619
620 # compatibility
def getiterator(self, tag=None):
    """(Deprecated) Return the result of self.iter(tag) as a list.

    A DeprecationWarning attributed to the caller is issued on every call.
    """
    notice = (
        "This method will be removed in future versions. "
        "Use 'tree.iter()' or 'list(tree.iter())' instead."
    )
    warnings.warn(notice, DeprecationWarning, stacklevel=2)
    return [*self.iter(tag)]
Armin Rigo9ed73062005-12-14 18:10:45 +0000628
def find(self, path, namespaces=None):
    """Find first matching element by tag name or path.

    Same as getroot().find(path), which is Element.find()

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    A *path* starting with "/" is rewritten to start with "./" and a
    FutureWarning is issued.

    Return the first matching element, or None if no element was found.

    """
    if path[:1] == "/":
        path = "." + path
        notice = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(notice, FutureWarning, stacklevel=2)
    root = self._root
    return root.find(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000650
def findtext(self, path, default=None, namespaces=None):
    """Find text of first matching element by tag name or path.

    Same as getroot().findtext(path), which is Element.findtext()

    *path* is a string having either an element tag or an XPath,
    *default* is the value to return if no element was found,
    *namespaces* is an optional mapping from namespace prefix to full name.

    A *path* starting with "/" is rewritten to start with "./" and a
    FutureWarning is issued.

    Return whatever the root element's findtext() returns: the text of the
    first matching element, or *default* if no element was found.

    """
    if path[:1] == "/":
        path = "." + path
        notice = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(notice, FutureWarning, stacklevel=2)
    root = self._root
    return root.findtext(path, default, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000672
def findall(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Same as getroot().findall(path), which is Element.findall().

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    A *path* starting with "/" is rewritten to start with "./" and a
    FutureWarning is issued.

    Return list containing all matching elements in document order.

    """
    if path[:1] == "/":
        path = "." + path
        notice = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(notice, FutureWarning, stacklevel=2)
    root = self._root
    return root.findall(path, namespaces)
694
def iterfind(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Same as getroot().iterfind(path), which is element.iterfind()

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    A *path* starting with "/" is rewritten to start with "./" and a
    FutureWarning is issued.

    Return an iterable yielding all matching elements in document order.

    """
    if path[:1] == "/":
        path = "." + path
        notice = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(notice, FutureWarning, stacklevel=2)
    root = self._root
    return root.iterfind(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000716
def write(self, file_or_filename,
          encoding=None,
          xml_declaration=None,
          default_namespace=None,
          method=None, *,
          short_empty_elements=True):
    """Write element tree to a file as XML.

    Arguments:
      *file_or_filename* -- file name or a file object opened for writing

      *encoding* -- the output encoding (default: US-ASCII, or UTF-8
                    when *method* is "c14n")

      *xml_declaration* -- bool indicating if an XML declaration should be
                           added to the output.  If None, an XML
                           declaration is added if encoding IS NOT either
                           of: US-ASCII, UTF-8, or Unicode.  A declaration
                           is only ever written for the "xml" method.

      *default_namespace* -- sets the default XML namespace (for "xmlns")

      *method* -- either "xml" (default), "html", "text", or "c14n"

      *short_empty_elements* -- controls the formatting of elements
                                that contain no content.  If True (default)
                                they are emitted as a single self-closed
                                tag, otherwise they are emitted as a pair
                                of start/end tags

    Raises ValueError if *method* is not a known serialization method.

    """
    if not method:
        method = "xml"
    elif method not in _serialize:
        raise ValueError("unknown method %r" % method)
    if not encoding:
        encoding = "utf-8" if method == "c14n" else "us-ascii"
    enc_lower = encoding.lower()
    with _get_writer(file_or_filename, enc_lower) as write:
        # Decide whether a declaration is wanted: an explicit flag wins,
        # None means "only for encodings other than the common ones".
        if xml_declaration is None:
            wants_declaration = enc_lower not in (
                "utf-8", "us-ascii", "unicode")
        else:
            wants_declaration = xml_declaration
        if method == "xml" and wants_declaration:
            if enc_lower == "unicode":
                # Retrieve the default encoding for the xml declaration
                import locale
                declared_encoding = locale.getpreferredencoding()
            else:
                declared_encoding = encoding
            write("<?xml version='1.0' encoding='%s'?>\n" % (
                declared_encoding,))
        if method == "text":
            _serialize_text(write, self._root)
            return
        qnames, namespaces = _namespaces(self._root, default_namespace)
        _serialize[method](write, self._root, qnames, namespaces,
                           short_empty_elements=short_empty_elements)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000774
def write_c14n(self, file):
    """Write the tree to *file* using the "c14n" output method.

    Returns the result of ``self.write(file, method="c14n")``.
    """
    # lxml.etree compatibility.  use output method instead
    return self.write(file, method="c14n")
Armin Rigo9ed73062005-12-14 18:10:45 +0000778
779# --------------------------------------------------------------------
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000780# serialization support
781
@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    """Context manager yielding a text ``write`` callable for the target.

    *file_or_filename* is either an object exposing a ``write`` attribute
    or a value passed to ``open()`` as a file name.  *encoding* is the
    output encoding; the value "unicode" means text is written to the
    target without an explicit encoding step.

    A file opened here is closed when the context exits.  Wrappers built
    around a caller-supplied object are detached on exit.
    """
    # returns text write method and release all resources after using
    try:
        write = file_or_filename.write
    except AttributeError:
        # file_or_filename is a file name
        if encoding == "unicode":
            file = open(file_or_filename, "w")
        else:
            # Characters the encoding cannot represent are written as
            # XML character references.
            file = open(file_or_filename, "w", encoding=encoding,
                        errors="xmlcharrefreplace")
        with file:
            yield file.write
    else:
        # file_or_filename is a file-like object
        # encoding determines if it is a text or binary writer
        if encoding == "unicode":
            # use a text writer as is
            yield write
        else:
            # wrap a binary writer with TextIOWrapper
            with contextlib.ExitStack() as stack:
                if isinstance(file_or_filename, io.BufferedIOBase):
                    file = file_or_filename
                elif isinstance(file_or_filename, io.RawIOBase):
                    file = io.BufferedWriter(file_or_filename)
                    # Keep the original file open when the BufferedWriter is
                    # destroyed
                    stack.callback(file.detach)
                else:
                    # This is to handle passed objects that aren't in the
                    # IOBase hierarchy, but just have a write method
                    file = io.BufferedIOBase()
                    file.writable = lambda: True
                    file.write = write
                    try:
                        # TextIOWrapper uses these methods to determine
                        # if BOM (for UTF-16, etc) should be added
                        file.seekable = file_or_filename.seekable
                        file.tell = file_or_filename.tell
                    except AttributeError:
                        # The target lacks seekable()/tell(); leave the
                        # stand-in object without them.
                        pass
                file = io.TextIOWrapper(file,
                                        encoding=encoding,
                                        errors="xmlcharrefreplace",
                                        newline="\n")
                # Keep the original file open when the TextIOWrapper is
                # destroyed
                stack.callback(file.detach)
                yield file.write
833
def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used in a tree.

    *elem* is the root of the tree to scan, *default_namespace* is an
    optional namespace URI to be serialized without a prefix.

    Returns a ``(qnames, namespaces)`` pair: *qnames* maps each tag,
    attribute name or QName text found in the tree to its serialized
    ``prefix:local`` form (``None`` maps to ``None``), *namespaces* maps
    namespace URIs to the prefixes chosen for them.

    Raises ValueError if an unqualified name is found while
    *default_namespace* is set, and reports unserializable names through
    _raise_serialization_error().
    """
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] != "{":
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                    )
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                # Prefer a registered prefix, else generate "nsN".
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            # An empty prefix denotes the default namespace.
            qnames[qname] = "%s:%s" % (prefix, local) if prefix else local
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = node.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
894
def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Write *elem* and its subtree as XML by calling *write* with strings.

    *qnames* maps tags and attribute names to their serialized names,
    *namespaces* maps namespace URIs to prefixes and is emitted as xmlns
    attributes on this element (children are serialized with None).
    If *short_empty_elements* is true, an element without text or
    children is written as a self-closed tag.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        name = qnames[tag]
        if name is None:
            # No tag of its own: only the content is written.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_xml(write, child, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                # sort on prefix
                declarations = sorted(namespaces.items(),
                                      key=lambda pair: pair[1])
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for key, value in attributes:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib(value)
                write(" %s=\"%s\"" % (qnames[key], value))
            if text or len(elem) or not short_empty_elements:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for child in elem:
                    _serialize_xml(write, child, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + name + ">")
            else:
                write(" />")
    # The tail is written for every kind of node.
    if elem.tail:
        write(_escape_cdata(elem.tail))
944
# Lower-case tag names for which _serialize_html() writes no end tag.
# Built directly as a set; the former tuple-then-set() conversion guarded
# by "except NameError" only mattered on interpreters without a built-in
# set type.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}
952
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Write *elem* and its subtree as HTML by calling *write* with strings.

    *qnames* maps tags and attribute names to their serialized names,
    *namespaces* maps namespace URIs to prefixes and is emitted as xmlns
    attributes on this element (children are serialized with None).
    Text inside "script" and "style" elements is written unescaped, and
    no end tag is written for tags listed in HTML_EMPTY.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        name = qnames[tag]
        if name is None:
            # No tag of its own: only the content is written.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                # sort on prefix
                declarations = sorted(namespaces.items(),
                                      key=lambda pair: pair[1])
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for key, value in attributes:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib_html(value)
                # FIXME: handle boolean attributes
                write(" %s=\"%s\"" % (qnames[key], value))
            write(">")
            lowered = name.lower()
            if text:
                if lowered in ("script", "style"):
                    write(text)
                else:
                    write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
            if lowered not in HTML_EMPTY:
                write("</" + name + ">")
    # The tail is written for every kind of node.
    if elem.tail:
        write(_escape_cdata(elem.tail))
1002
def _serialize_text(write, elem):
    """Write the text content of *elem*, followed by its tail if any.

    Every string produced by ``elem.itertext()`` is passed to *write*
    unchanged.
    """
    for chunk in elem.itertext():
        write(chunk)
    tail = elem.tail
    if tail:
        write(tail)
1008
# Maps an output method name to the function that serializes a tree with
# that method.
_serialize = {
    "xml": _serialize_xml,
    "html": _serialize_html,
    "text": _serialize_text,
# this optional method is imported at the end of the module
#   "c14n": _serialize_c14n,
}
Armin Rigo9ed73062005-12-14 18:10:45 +00001016
Armin Rigo9ed73062005-12-14 18:10:45 +00001017
def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri.  Tags and
    attributes in this namespace will be serialized with prefix if
    possible.

    ValueError is raised if *prefix* has the reserved form "ns" followed
    by digits.

    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Collect the conflicting entries first so the registry is not
    # modified while it is being scanned.
    stale_uris = [
        known_uri
        for known_uri, known_prefix in _namespace_map.items()
        if known_uri == uri or known_prefix == prefix
    ]
    for known_uri in stale_uris:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix
1036
# Global registry mapping namespace URIs to their preferred prefixes.
# register_namespace() modifies it.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting: expose the registry as an attribute of
# the public function.
register_namespace._namespace_map = _namespace_map
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001051
def _raise_serialization_error(text):
    """Raise TypeError reporting that *text* cannot be serialized.

    The message contains the repr of *text* and the name of its type.
    """
    type_name = type(text).__name__
    message = "cannot serialize %r (type %s)" % (text, type_name)
    raise TypeError(message)
1056
def _escape_cdata(text):
    """Return *text* with "&", "<" and ">" replaced by entity references.

    Values that do not support the string operations used here are
    reported through _raise_serialization_error().
    """
    try:
        # "&" must be handled first so the entities produced for the
        # other characters are not escaped a second time.  The membership
        # test avoids do-nothing replace() calls, which is worthwhile for
        # the short strings that are by far the most common case.
        for char, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
            if char in text:
                text = text.replace(char, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1072
def _escape_attrib(text):
    """Return *text* escaped for use inside a double-quoted XML attribute.

    "&", "<", ">" and the double quote are replaced by entity references.
    Carriage return, line feed and tab are replaced by numeric character
    references.

    Values that do not support the string operations used here are
    reported through _raise_serialization_error().
    """
    try:
        # "&" must be handled first so the references produced below are
        # not escaped a second time.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        # Line-end normalization (XML 1.0 section 2.11) and attribute-value
        # normalization (section 3.3.3) are applied by the parser to
        # literal whitespace characters, not to character references.
        # Writing CR, LF and TAB as references therefore round-trips the
        # value exactly; rewriting CR to LF here, as was done before,
        # silently changed the attribute value.
        if "\r" in text:
            text = text.replace("\r", "&#13;")
        if "\n" in text:
            text = text.replace("\n", "&#10;")
        if "\t" in text:
            text = text.replace("\t", "&#09;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1100
def _escape_attrib_html(text):
    """Return *text* escaped for use inside a double-quoted HTML attribute.

    "&", ">" and the double quote are replaced by entity references; "<"
    is left as is.

    Values that do not support the string operations used here are
    reported through _raise_serialization_error().
    """
    try:
        # "&" must be handled first so the entities produced for the
        # other characters are not escaped a second time.
        for char, entity in (("&", "&amp;"), (">", "&gt;"), ("\"", "&quot;")):
            if char in text:
                text = text.replace(char, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1113
1114# --------------------------------------------------------------------
1115
def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned.  Otherwise a bytestring is returned.

    *element* is an Element instance.  *encoding*, *method*,
    *xml_declaration*, *default_namespace* and *short_empty_elements*
    are passed on to ElementTree.write(), which defines their meaning
    and defaults.

    Returns the content accumulated in an in-memory stream: text for
    the "unicode" encoding, bytes otherwise.

    """
    # write() produces text only for the exact encoding name "unicode".
    if encoding == 'unicode':
        stream = io.StringIO()
    else:
        stream = io.BytesIO()
    document = ElementTree(element)
    document.write(stream, encoding,
                   xml_declaration=xml_declaration,
                   default_namespace=default_namespace,
                   method=method,
                   short_empty_elements=short_empty_elements)
    return stream.getvalue()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001139
class _ListDataStream(io.BufferedIOBase):
    """An auxiliary stream accumulating into a list reference.

    Every chunk passed to write() is appended to the list given to the
    constructor; nothing is copied or joined.
    """

    def __init__(self, lst):
        # The caller keeps its own reference to *lst* and reads the
        # accumulated chunks from it.
        self.lst = lst

    def writable(self):
        return True

    def seekable(self):
        return True

    def write(self, b):
        """Append chunk *b* to the list and return its length.

        The length is returned because io.BufferedIOBase.write() is
        documented to return the number of bytes written.
        """
        self.lst.append(b)
        return len(b)

    def tell(self):
        # Number of chunks written so far; zero means nothing has been
        # written yet.
        return len(self.lst)
1156
def tostringlist(element, encoding=None, method=None, *,
                 xml_declaration=None, default_namespace=None,
                 short_empty_elements=True):
    """Generate the representation of XML element as a list of chunks.

    *element* is wrapped in an ElementTree and written to a stream that
    appends every chunk it receives to a list.  All other arguments are
    passed on to ElementTree.write().

    Returns the list of chunks, in the order they were written.

    """
    chunks = []
    document = ElementTree(element)
    document.write(_ListDataStream(chunks), encoding,
                   xml_declaration=xml_declaration,
                   default_namespace=default_namespace,
                   method=method,
                   short_empty_elements=short_empty_elements)
    return chunks
Armin Rigo9ed73062005-12-14 18:10:45 +00001168
Armin Rigo9ed73062005-12-14 18:10:45 +00001169
def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  It is written
    with ElementTree.write() using the "unicode" encoding, and a newline
    is added unless the root element's tail already ends with one.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    ends_with_newline = bool(tail) and tail[-1] == "\n"
    if not ends_with_newline:
        sys.stdout.write("\n")
1187
Stefan Behnelb5d3cee2019-08-23 16:44:25 +02001188
def indent(tree, space=" ", level=0):
    """Indent an XML document by inserting newlines and indentation space
    after elements.

    *tree* is the ElementTree or Element to modify.  The (root) element
    itself will not be changed, but the tail text of all elements in its
    subtree will be adapted.  Text and tails that contain anything other
    than whitespace are left alone.

    *space* is the whitespace to insert for each indentation level.
    NOTE(review): the previous description said "two space characters by
    default" while the default visible here is a single space; whitespace
    in this listing may have been collapsed -- confirm against the
    canonical source.

    *level* is the initial indentation level.  Setting this to a higher
    value than 0 can be used for indenting subtrees that are more deeply
    nested inside of a document.

    Raises ValueError if *level* is negative.
    """
    if isinstance(tree, ElementTree):
        tree = tree.getroot()
    if level < 0:
        raise ValueError(f"Initial indentation level must be >= 0, got {level}")
    if not len(tree):
        return

    # Reduce the memory consumption by reusing indentation strings:
    # pads[d] is the newline plus indentation used at relative depth d.
    pads = ["\n" + level * space]

    def _indent_children(parent, depth):
        # Look up, or create, the indentation for the children's level.
        inner_depth = depth + 1
        if inner_depth < len(pads):
            inner_pad = pads[inner_depth]
        else:
            inner_pad = pads[depth] + space
            pads.append(inner_pad)

        if not parent.text or not parent.text.strip():
            parent.text = inner_pad

        last = None
        for last in parent:
            if len(last):
                _indent_children(last, inner_depth)
            if not last.tail or not last.tail.strip():
                last.tail = inner_pad

        # Dedent after the last child by overwriting the previous
        # indentation.  *parent* always has children here.
        if not last.tail.strip():
            last.tail = pads[depth]

    _indent_children(tree, 0)
1237
1238
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001239# --------------------------------------------------------------------
1240# parsing
Armin Rigo9ed73062005-12-14 18:10:45 +00001241
Armin Rigo9ed73062005-12-14 18:10:45 +00001242
def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance; both are handed to
    ElementTree.parse() on a newly created ElementTree.

    Return the ElementTree instance.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document
1255
Armin Rigo9ed73062005-12-14 18:10:45 +00001256
def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This function also reports what's going on to the user based on the
    *events* it is initialized with.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  Its ``root``
    attribute is None at first and is set to the value returned when the
    parser is closed, after the last event has been yielded.

    """
    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)
    def iterator():
        # Generator driving the pull parser.  *source*, *close_source* and
        # *it* are looked up when the generator runs, so they may be
        # bound after this function is defined (see below).
        try:
            while True:
                # Hand out everything already queued before reading more.
                yield from pullparser.read_events()
                # load event buffer
                data = source.read(16 * 1024)
                if not data:
                    break
                pullparser.feed(data)
            root = pullparser._close_and_return_root()
            yield from pullparser.read_events()
            it.root = root
        finally:
            # Only close a file that was opened by this function.
            if close_source:
                source.close()

    class IterParseIterator(collections.abc.Iterator):
        # Bind __next__ to a single generator instance created here.
        __next__ = iterator().__next__
    it = IterParseIterator()
    it.root = None
    del iterator, IterParseIterator

    close_source = False
    if not hasattr(source, "read"):
        # *source* has no read(): treat it as a file name.
        source = open(source, "rb")
        close_source = True

    return it
Armin Rigo9ed73062005-12-14 18:10:45 +00001303
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001304
class XMLPullParser:
    """Parser front end that queues parse events for later retrieval.

    Data is supplied with feed(); the events collected so far are
    retrieved with read_events().  *events* is the sequence of events to
    report; if omitted, only "end" events are reported.
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        self._events_queue = collections.deque()
        if _parser:
            self._parser = _parser
        else:
            self._parser = XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        requested = ("end",) if events is None else events
        self._parser._setevents(self._events_queue, requested)

    def feed(self, data):
        """Feed encoded data to parser.

        Raises ValueError if the parser has already been closed.  A
        SyntaxError raised by the underlying parser is queued and raised
        later by read_events().
        """
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            self._parser.feed(data)
        except SyntaxError as exc:
            self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        parser = self._parser
        root = parser.close()
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element.  Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.  A queued exception is raised when
        it is reached.
        """
        queue = self._events_queue
        while queue:
            item = queue.popleft()
            if isinstance(item, Exception):
                raise item
            yield item
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001356
1357
def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an optional
    parser instance; when it is omitted an XMLParser with a TreeBuilder
    target is created.

    Returns what the parser's close() returns.

    """
    active_parser = parser if parser else XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    return active_parser.close()
1373
Armin Rigo9ed73062005-12-14 18:10:45 +00001374
def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an optional
    parser instance; when it is omitted an XMLParser with a TreeBuilder
    target is created.

    Returns an (Element, dict) tuple, in which the dict maps each
    non-empty "id" attribute value to the element carrying it.  If an id
    occurs more than once, the last element in iteration order wins.

    """
    active_parser = parser if parser else XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    tree = active_parser.close()
    ids = {}
    for node in tree.iter():
        identifier = node.get("id")
        if identifier:
            ids[identifier] = node
    return tree, ids
1395
# Parse XML document from string constant.  Alias for XML(): both names
# refer to the same function object.
fromstring = XML
Armin Rigo9ed73062005-12-14 18:10:45 +00001398
def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other iterable of fragments, each of which
    is fed to the parser in order.  *parser* is an optional parser
    instance; when it is omitted an XMLParser with a TreeBuilder target
    is created.

    Returns what the parser's close() returns.

    """
    active_parser = parser if parser else XMLParser(target=TreeBuilder())
    for fragment in sequence:
        active_parser.feed(fragment)
    return active_parser.close()
1413
1414# --------------------------------------------------------------------
Armin Rigo9ed73062005-12-14 18:10:45 +00001415
Armin Rigo9ed73062005-12-14 18:10:45 +00001416
1417class TreeBuilder:
Eli Bendersky84fae782013-03-09 07:12:48 -08001418 """Generic element structure builder.
Armin Rigo9ed73062005-12-14 18:10:45 +00001419
Eli Bendersky84fae782013-03-09 07:12:48 -08001420 This builder converts a sequence of start, data, and end method
1421 calls to a well-formed element structure.
1422
1423 You can use this class to build an element structure using a custom XML
1424 parser, or a parser for some other XML-like format.
1425
1426 *element_factory* is an optional element factory which is called
1427 to create new Element instances, as necessary.
1428
Stefan Behnel43851a22019-05-01 21:20:38 +02001429 *comment_factory* is a factory to create comments to be used instead of
1430 the standard factory. If *insert_comments* is false (the default),
1431 comments will not be inserted into the tree.
1432
1433 *pi_factory* is a factory to create processing instructions to be used
1434 instead of the standard factory. If *insert_pis* is false (the default),
1435 processing instructions will not be inserted into the tree.
Eli Bendersky84fae782013-03-09 07:12:48 -08001436 """
def __init__(self, element_factory=None, *,
             comment_factory=None, pi_factory=None,
             insert_comments=False, insert_pis=False):
    # Parsing state.
    self._data = []  # data collector
    self._elem = []  # element stack
    self._last = None  # last element
    self._root = None  # root element
    self._tail = None  # true if we're after an end tag

    # Factories: an argument left as None selects the module default.
    self._comment_factory = (
        Comment if comment_factory is None else comment_factory)
    self.insert_comments = insert_comments
    self._pi_factory = (
        ProcessingInstruction if pi_factory is None else pi_factory)
    self.insert_pis = insert_pis
    self._factory = Element if element_factory is None else element_factory
1456
def close(self):
    """Flush builder buffers and return toplevel document Element.

    Asserts that no element is still open and that a root element was
    seen.
    """
    assert not len(self._elem), "missing end tags"
    root = self._root
    assert root is not None, "missing toplevel element"
    return root
Armin Rigo9ed73062005-12-14 18:10:45 +00001462
def _flush(self):
    # Attach the collected character data to the last element, as its
    # tail when an end tag was seen last and as its text otherwise, then
    # start a new collector.
    if not self._data:
        return
    target = self._last
    if target is not None:
        text = "".join(self._data)
        if self._tail:
            assert target.tail is None, "internal error (tail)"
            target.tail = text
        else:
            assert target.text is None, "internal error (text)"
            target.text = text
    self._data = []
1474
def data(self, data):
    """Buffer a chunk of character data for the current element.

    The chunk is only collected here; it is attached to a node on the
    next flush.
    """
    self._data.append(data)
1478
def start(self, tag, attrs):
    """Open a new element and return it.

    *tag* is the element name and *attrs* a dict with the element's
    attributes; both are handed to the element factory unchanged.
    """
    self._flush()
    node = self._factory(tag, attrs)
    self._last = node
    open_elements = self._elem
    if open_elements:
        # Nested element: becomes a child of the innermost open element.
        open_elements[-1].append(node)
    elif self._root is None:
        # First toplevel element seen: remember it as the document root.
        self._root = node
    open_elements.append(node)
    self._tail = 0
    return node
1495
def end(self, tag):
    """Close the innermost open element and return it.

    *tag* is the element name; it is asserted to match the tag of the
    element being closed.
    """
    self._flush()
    closed = self._last = self._elem.pop()
    assert closed.tag == tag, (
        "end tag mismatch (expected %s, got %s)" % (closed.tag, tag))
    self._tail = 1
    return closed
1509
def comment(self, text):
    """Create a comment node through the configured comment factory.

    *text* is the text of the comment.  Whether the node is also added
    to the tree depends on ``self.insert_comments``.
    """
    factory = self._comment_factory
    insert = self.insert_comments
    return self._handle_single(factory, insert, text)
1517
def pi(self, target, text=None):
    """Create a processing instruction through the configured pi factory.

    *target* is the target name of the processing instruction.
    *text* is the data of the processing instruction, or ''.
    Whether the node is also added to the tree depends on
    ``self.insert_pis``.
    """
    factory = self._pi_factory
    insert = self.insert_pis
    return self._handle_single(factory, insert, target, text)
1526
def _handle_single(self, factory, insert, *args):
    """Build a stand-alone node with *factory* and optionally insert it.

    The node is always created and returned.  Only when *insert* is true
    is pending text flushed, the node appended to the innermost open
    element (if any) and recorded as the last, already closed, node.
    """
    node = factory(*args)
    if not insert:
        return node
    self._flush()
    self._last = node
    if self._elem:
        self._elem[-1].append(node)
    self._tail = 1
    return node
1536
Armin Rigo9ed73062005-12-14 18:10:45 +00001537
Eli Bendersky84fae782013-03-09 07:12:48 -08001538# also see ElementTree and TreeBuilder
class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    Only the callbacks that the target actually implements (start, end,
    start_ns, end_ns, data, comment, pi) are registered with expat.

    """

    def __init__(self, *, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is the namespace separator: expat reports qualified names
        # as "uri}local", which _fixname() turns into "{uri}local".
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {}  # name memo cache
        # main callbacks
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'start_ns'):
            parser.StartNamespaceDeclHandler = self._start_ns
        if hasattr(target, 'end_ns'):
            parser.EndNamespaceDeclHandler = self._end_ns
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        parser.specified_attributes = 1
        self._doctype = None  # list of doctype tokens while inside a DOCTYPE
        self.entity = {}      # user-supplied replacements for undefined entities
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass  # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor).
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                parser.specified_attributes = 1
                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event_name == "end":
                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                # TreeBuilder does not implement .start_ns()
                if hasattr(self.target, "start_ns"):
                    def handler(prefix, uri, event=event_name, append=append,
                                start_ns=self._start_ns):
                        append((event, start_ns(prefix, uri)))
                else:
                    def handler(prefix, uri, event=event_name, append=append):
                        append((event, (prefix or '', uri or '')))
                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                # TreeBuilder does not implement .end_ns()
                if hasattr(self.target, "end_ns"):
                    def handler(prefix, event=event_name, append=append,
                                end_ns=self._end_ns):
                        append((event, end_ns(prefix)))
                else:
                    def handler(prefix, event=event_name, append=append):
                        append((event, None))
                parser.EndNamespaceDeclHandler = handler
            elif event_name == 'comment':
                def handler(text, event=event_name, append=append, self=self):
                    append((event, self.target.comment(text)))
                parser.CommentHandler = handler
            elif event_name == 'pi':
                def handler(pi_target, data, event=event_name, append=append,
                            self=self):
                    append((event, self.target.pi(pi_target, data)))
                parser.ProcessingInstructionHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise an expat error as ParseError, carrying over the error
        # code and the (line, column) position.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    def _start_ns(self, prefix, uri):
        # A missing prefix/uri is passed on to the target as ''.
        return self.target.start_ns(prefix or '', uri or '')

    def _end_ns(self, prefix):
        return self.target.end_ns(prefix or '')

    def _start(self, tag, attr_list):
        # Handler for expat's StartElementHandler. Since ordered_attributes
        # is set, the attributes are reported as a list of alternating
        # attribute name,value.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attr_list:
            for i in range(0, len(attr_list), 2):
                attrib[fixname(attr_list[i])] = attr_list[i+1]
        return self.target.start(tag, attrib)

    def _end(self, tag):
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        # Default handler: deals with undefined entity references and
        # collects the tokens of a DOCTYPE declaration.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            try:
                # Only the lookup is guarded, so that a KeyError raised by
                # the target's own data() is not mistaken for an undefined
                # entity.
                replacement = self.entity[text[1:-1]]
            except KeyError:
                # Use the error class resolved in __init__; it is also
                # available when only the pyexpat fallback could be imported.
                err = self._error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                     self.parser.ErrorColumnNumber)
                    )
                err.code = 11  # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
            data_handler(replacement)
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = []  # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                type = self._doctype[1]
                if type == "PUBLIC" and n == 4:
                    name, type, pubid, system = self._doctype
                    if pubid:
                        pubid = pubid[1:-1]  # strip the surrounding quotes
                elif type == "SYSTEM" and n == 3:
                    name, type, system = self._doctype
                    pubid = None
                else:
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif hasattr(self, "doctype"):
                    warnings.warn(
                        "The doctype() method of XMLParser is ignored.  "
                        "Define doctype() method on the TreeBuilder target.",
                        RuntimeWarning)

                self._doctype = None

    def feed(self, data):
        """Feed encoded data to parser."""
        try:
            self.parser.Parse(data, 0)
        except self._error as v:
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure."""
        try:
            self.parser.Parse("", 1)  # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001764
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001765
Stefan Behnele1d5dd62019-05-01 22:34:13 +02001766# --------------------------------------------------------------------
1767# C14N 2.0
1768
def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Convert XML to its C14N 2.0 serialised form.

    Input is taken from *xml_data* (an XML string) or, when that is not
    given, from *from_file* (a file path or file-like object); at least
    one of them must be provided.

    If *out* is provided, it must be a file or file-like object that
    receives the serialised canonical XML output (text, not bytes) through
    its ``.write()`` method, and None is returned.  To write to a file,
    open it in text mode with encoding "utf-8".  If *out* is not provided,
    the output is returned as a text string.

    The configuration options are the same as for the ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")

    # Collect into an in-memory buffer only when the caller gave no sink.
    buffer = None
    if out is None:
        buffer = out = io.StringIO()

    writer_target = C14NWriterTarget(out.write, **options)
    parser = XMLParser(target=writer_target)

    if xml_data is None:
        # The check above guarantees that from_file is set in this branch.
        parse(from_file, parser=parser)
    else:
        parser.feed(xml_data)
        parser.close()

    if buffer is None:
        return None
    return buffer.getvalue()
1797
1798
# Matcher for text of the form "prefix:name": two runs of word characters
# separated by exactly one colon.
_prefix_name_pattern = re.compile(r'^\w+:\w+$', re.UNICODE)
_looks_like_prefix_name = _prefix_name_pattern.match
1800
1801
class C14NWriterTarget:
    """
    Canonicalization writer target for the XMLParser.

    Serialises parse events to XML C14N 2.0.

    The *write* function is used for writing out the resulting data stream
    as text (not bytes).  To write to a file, open it in text mode with encoding
    "utf-8" and pass its ``.write`` method.

    Configuration options:

    - *with_comments*: set to true to include comments
    - *strip_text*: set to true to strip whitespace before and after text content
    - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"
    - *qname_aware_tags*: a set of qname aware tag names in which prefixes
                          should be replaced in text content
    - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes
                           should be replaced in text content
    - *exclude_attrs*: a set of attribute names that should not be serialised
    - *exclude_tags*: a set of tag names that should not be serialised
    """
    def __init__(self, write, *,
                 with_comments=False, strip_text=False, rewrite_prefixes=False,
                 qname_aware_tags=None, qname_aware_attrs=None,
                 exclude_attrs=None, exclude_tags=None):
        self._write = write
        self._data = []
        self._with_comments = with_comments
        self._strip_text = strip_text
        self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None
        self._exclude_tags = set(exclude_tags) if exclude_tags else None

        self._rewrite_prefixes = rewrite_prefixes
        if qname_aware_tags:
            self._qname_aware_tags = set(qname_aware_tags)
        else:
            self._qname_aware_tags = None
        if qname_aware_attrs:
            self._find_qname_aware_attrs = set(qname_aware_attrs).intersection
        else:
            self._find_qname_aware_attrs = None

        # Stack with globally and newly declared namespaces as (uri, prefix) pairs.
        self._declared_ns_stack = [[
            ("http://www.w3.org/XML/1998/namespace", "xml"),
        ]]
        # Stack with user declared namespace prefixes as (uri, prefix) pairs.
        self._ns_stack = []
        if not rewrite_prefixes:
            self._ns_stack.append(list(_namespace_map.items()))
        self._ns_stack.append([])
        self._prefix_map = {}
        self._preserve_space = [False]
        # (tag, attrs, new_namespaces) of a qname-aware element whose start
        # tag is held back until its text content is known, else None.
        self._pending_start = None
        self._root_seen = False
        self._root_done = False
        self._ignored_depth = 0

    def _iter_namespaces(self, ns_stack, _reversed=reversed):
        # Yield (uri, prefix) pairs from the innermost scope outwards.
        for namespaces in _reversed(ns_stack):
            if namespaces:  # almost no element declares new namespaces
                yield from namespaces

    def _resolve_prefix_name(self, prefixed_name):
        # Turn "prefix:name" into "{uri}name" using the user declared
        # prefixes; raises ValueError for an undeclared prefix.
        prefix, name = prefixed_name.split(':', 1)
        for uri, p in self._iter_namespaces(self._ns_stack):
            if p == prefix:
                return f'{{{uri}}}{name}'
        raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')

    def _qname(self, qname, uri=None):
        # Return (serialised name, local name, uri) for *qname*, adding a
        # namespace declaration to the current element when necessary.
        if uri is None:
            uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname)
        else:
            tag = qname

        prefixes_seen = set()
        for u, prefix in self._iter_namespaces(self._declared_ns_stack):
            if u == uri and prefix not in prefixes_seen:
                return f'{prefix}:{tag}' if prefix else tag, tag, uri
            prefixes_seen.add(prefix)

        # Not declared yet => add new declaration.
        if self._rewrite_prefixes:
            if uri in self._prefix_map:
                prefix = self._prefix_map[uri]
            else:
                prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}'
            self._declared_ns_stack[-1].append((uri, prefix))
            return f'{prefix}:{tag}', tag, uri

        if not uri and '' not in prefixes_seen:
            # No default namespace declared => no prefix needed.
            return tag, tag, uri

        for u, prefix in self._iter_namespaces(self._ns_stack):
            if u == uri:
                self._declared_ns_stack[-1].append((uri, prefix))
                return f'{prefix}:{tag}' if prefix else tag, tag, uri

        raise ValueError(f'Namespace "{uri}" is not declared in scope')

    def data(self, data):
        """Collect character data unless inside an excluded element."""
        if not self._ignored_depth:
            self._data.append(data)

    def _flush(self, _join_text=''.join):
        # Write out a held-back start tag (if any) followed by the text
        # collected so far.  Text that was resolved as a QName is written
        # by _start() itself and must not be written a second time.
        data = _join_text(self._data)
        del self._data[:]
        if self._strip_text and not self._preserve_space[-1]:
            data = data.strip()
        if self._pending_start is not None:
            args, self._pending_start = self._pending_start, None
            qname_text = data if data and _looks_like_prefix_name(data) else None
            self._start(*args, qname_text)
            if qname_text is not None:
                return
        if data and self._root_seen:
            self._write(_escape_cdata_c14n(data))

    def start_ns(self, prefix, uri):
        """Record a user declared namespace prefix for the next element."""
        if self._ignored_depth:
            return
        # we may have to resolve qnames in text content
        # A held-back start tag must be written first even without text,
        # because _start() opens the namespace scope appended to below.
        if self._data or self._pending_start is not None:
            self._flush()
        self._ns_stack[-1].append((uri, prefix))

    def start(self, tag, attrs):
        """Handle an element start; qname-aware tags are held back."""
        if self._exclude_tags is not None and (
                self._ignored_depth or tag in self._exclude_tags):
            self._ignored_depth += 1
            return
        # Also flush a held-back parent start tag that has no text.
        if self._data or self._pending_start is not None:
            self._flush()

        new_namespaces = []
        self._declared_ns_stack.append(new_namespaces)

        if self._qname_aware_tags is not None and tag in self._qname_aware_tags:
            # Need to parse text first to see if it requires a prefix declaration.
            self._pending_start = (tag, attrs, new_namespaces)
            return
        self._start(tag, attrs, new_namespaces)

    def _start(self, tag, attrs, new_namespaces, qname_text=None):
        # Serialise the start tag: namespace declarations first, then the
        # attributes, then (for qname-aware tags) the resolved text content.
        if self._exclude_attrs is not None and attrs:
            attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs}

        qnames = {tag, *attrs}
        resolved_names = {}

        # Resolve prefixes in attribute and tag text.
        if qname_text is not None:
            qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text)
            qnames.add(qname)
        if self._find_qname_aware_attrs is not None and attrs:
            qattrs = self._find_qname_aware_attrs(attrs)
            if qattrs:
                for attr_name in qattrs:
                    value = attrs[attr_name]
                    if _looks_like_prefix_name(value):
                        qname = resolved_names[value] = self._resolve_prefix_name(value)
                        qnames.add(qname)
            else:
                qattrs = None
        else:
            qattrs = None

        # Assign prefixes in lexicographical order of used URIs.
        parse_qname = self._qname
        parsed_qnames = {n: parse_qname(n) for n in sorted(
            qnames, key=lambda n: n.split('}', 1))}

        # Write namespace declarations in prefix order ...
        if new_namespaces:
            attr_list = [
                ('xmlns:' + prefix if prefix else 'xmlns', uri)
                for uri, prefix in new_namespaces
            ]
            attr_list.sort()
        else:
            # almost always empty
            attr_list = []

        # ... followed by attributes in URI+name order
        if attrs:
            for k, v in sorted(attrs.items()):
                if qattrs is not None and k in qattrs and v in resolved_names:
                    v = parsed_qnames[resolved_names[v]][0]
                attr_qname, attr_name, uri = parsed_qnames[k]
                # No prefix for attributes in default ('') namespace.
                attr_list.append((attr_qname if uri else attr_name, v))

        # Honour xml:space attributes.
        space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space')
        self._preserve_space.append(
            space_behaviour == 'preserve' if space_behaviour
            else self._preserve_space[-1])

        # Write the tag.
        write = self._write
        write('<' + parsed_qnames[tag][0])
        if attr_list:
            write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list]))
        write('>')

        # Write the resolved qname text content.
        if qname_text is not None:
            write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0]))

        self._root_seen = True
        self._ns_stack.append([])

    def end(self, tag):
        """Handle an element end and close its namespace/space scopes."""
        if self._ignored_depth:
            self._ignored_depth -= 1
            return
        # An empty qname-aware element still has its start tag held back;
        # it has to be written (and its scopes pushed) before closing.
        if self._data or self._pending_start is not None:
            self._flush()
        self._write(f'</{self._qname(tag)[0]}>')
        self._preserve_space.pop()
        self._root_done = len(self._preserve_space) == 1
        self._declared_ns_stack.pop()
        self._ns_stack.pop()

    def comment(self, text):
        """Write a comment if *with_comments* was requested."""
        if not self._with_comments:
            return
        if self._ignored_depth:
            return
        if self._root_done:
            self._write('\n')
        elif self._pending_start is not None or (
                self._root_seen and self._data):
            # Keep document order: held-back start tag and text come first.
            self._flush()
        self._write(f'<!--{_escape_cdata_c14n(text)}-->')
        if not self._root_seen:
            self._write('\n')

    def pi(self, target, data):
        """Write a processing instruction."""
        if self._ignored_depth:
            return
        if self._root_done:
            self._write('\n')
        elif self._pending_start is not None or (
                self._root_seen and self._data):
            # Keep document order: held-back start tag and text come first.
            self._flush()
        self._write(
            f'<?{target} {_escape_cdata_c14n(data)}?>' if data else f'<?{target}?>')
        if not self._root_seen:
            self._write('\n')
2053
2054
def _escape_cdata_c14n(text):
    """Escape character data for C14N output.

    Replaces '&', '<', '>' and carriage return, in that order, so that the
    ampersands introduced by later replacements are not escaped again.
    Input that does not behave like a string is reported through
    _raise_serialization_error().
    """
    replacements = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('\r', '&#xD;'),
    )
    try:
        # The membership test avoids do-nothing replace() calls, which is
        # worthwhile for the short strings that are by far the common case.
        for char, entity in replacements:
            if char in text:
                text = text.replace(char, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
2072
2073
def _escape_attrib_c14n(text):
    """Escape an attribute value for C14N output.

    Replaces '&', '<', '"', tab, newline and carriage return, in that
    order, so that the ampersands introduced by later replacements are not
    escaped again.  Input that does not behave like a string is reported
    through _raise_serialization_error().
    """
    replacements = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('"', '&quot;'),
        ('\t', '&#x9;'),
        ('\n', '&#xA;'),
        ('\r', '&#xD;'),
    )
    try:
        for char, entity in replacements:
            if char in text:
                text = text.replace(char, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
2092
2093
2094# --------------------------------------------------------------------
2095
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01002096# Import the C accelerators
2097try:
    # Element is going to be shadowed by the C implementation. We need to keep
    # the Python version of it accessible for some "creative" use by external
    # code (see tests)
2101 _Element_Py = Element
2102
Stefan Behnel43851a22019-05-01 21:20:38 +02002103 # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01002104 from _elementtree import *
Stefan Behnel43851a22019-05-01 21:20:38 +02002105 from _elementtree import _set_factories
Eli Benderskyc4e98a62013-05-19 09:24:43 -07002106except ImportError:
2107 pass
Stefan Behnel43851a22019-05-01 21:20:38 +02002108else:
2109 _set_factories(Comment, ProcessingInstruction)