blob: c1cf483cf56bb26a94c26d7f4f05e5874f24f1a6 [file] [log] [blame]
Eli Bendersky84fae782013-03-09 07:12:48 -08001"""Lightweight XML support for Python.
2
3 XML is an inherently hierarchical data format, and the most natural way to
4 represent it is with a tree. This module has two classes for this purpose:
5
6 1. ElementTree represents the whole XML document as a tree and
7
8 2. Element represents a single node in this tree.
9
10 Interactions with the whole document (reading and writing to/from files) are
11 usually done on the ElementTree level. Interactions with a single XML element
12 and its sub-elements are done on the Element level.
13
14 Element is a flexible container object designed to store hierarchical data
15 structures in memory. It can be described as a cross between a list and a
16 dictionary. Each Element has a number of properties associated with it:
17
18 'tag' - a string containing the element's name.
19
 20 'attrib' - a Python dictionary storing the element's attributes.
21
22 'text' - a string containing the element's text content.
23
24 'tail' - an optional string containing text after the element's end tag.
25
26 And a number of child elements stored in a Python sequence.
27
28 To create an element instance, use the Element constructor,
29 or the SubElement factory function.
30
31 You can also use the ElementTree class to wrap an element structure
32 and convert it to and from XML.
33
34"""
35
Eli Benderskybf05df22013-04-20 05:44:01 -070036#---------------------------------------------------------------------
37# Licensed to PSF under a Contributor Agreement.
38# See http://www.python.org/psf/license for licensing details.
Armin Rigo9ed73062005-12-14 18:10:45 +000039#
40# ElementTree
Florent Xiclunaf15351d2010-03-13 23:24:31 +000041# Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
Armin Rigo9ed73062005-12-14 18:10:45 +000042#
43# fredrik@pythonware.com
44# http://www.pythonware.com
Armin Rigo9ed73062005-12-14 18:10:45 +000045# --------------------------------------------------------------------
46# The ElementTree toolkit is
47#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000048# Copyright (c) 1999-2008 by Fredrik Lundh
Armin Rigo9ed73062005-12-14 18:10:45 +000049#
50# By obtaining, using, and/or copying this software and/or its
51# associated documentation, you agree that you have read, understood,
52# and will comply with the following terms and conditions:
53#
54# Permission to use, copy, modify, and distribute this software and
55# its associated documentation for any purpose and without fee is
56# hereby granted, provided that the above copyright notice appears in
57# all copies, and that both that copyright notice and this permission
58# notice appear in supporting documentation, and that the name of
59# Secret Labs AB or the author not be used in advertising or publicity
60# pertaining to distribution of the software without specific, written
61# prior permission.
62#
63# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
70# OF THIS SOFTWARE.
71# --------------------------------------------------------------------
72
# Names exported by ``from <module> import *``; one public symbol per line.
__all__ = [
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "fromstringlist",
    "iselement",
    "iterparse",
    "parse",
    "ParseError",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLID",
    "XMLParser",
    "XMLPullParser",
    "register_namespace",
]

# Version string of the ElementTree toolkit implemented by this module.
VERSION = "1.3.0"
93
Florent Xiclunaf15351d2010-03-13 23:24:31 +000094import sys
95import re
96import warnings
Eli Bendersky00f402b2012-07-15 06:02:22 +030097import io
Serhiy Storchaka9ec5e252015-12-07 02:31:11 +020098import collections
Serhiy Storchaka2e576f52017-04-24 09:05:00 +030099import collections.abc
Eli Bendersky00f402b2012-07-15 06:02:22 +0300100import contextlib
Armin Rigo9ed73062005-12-14 18:10:45 +0000101
Eli Bendersky27cbb192012-06-15 09:03:19 +0300102from . import ElementPath
Armin Rigo9ed73062005-12-14 18:10:45 +0000103
Armin Rigo9ed73062005-12-14 18:10:45 +0000104
class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, instances are expected to carry
    two extra attributes set by whoever raises the error:
        'code' - the specific exception code
        'position' - the line and column of the error

    """
115
116# --------------------------------------------------------------------
117
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000118
def iselement(element):
    """Return True if *element* looks like an Element (it has a 'tag')."""
    # Duck-typed check: any object exposing a 'tag' attribute qualifies.
    try:
        element.tag
    except AttributeError:
        return False
    return True
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000122
Armin Rigo9ed73062005-12-14 18:10:45 +0000123
class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements. That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name. *attrib* is an optional dictionary containing
    element attributes. *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    # The element's name.
    tag = None

    # Dictionary of the element's attributes.
    attrib = None

    # Text before the first subelement. This is either a string or None.
    # If there is no text, this may be either None or the empty string,
    # depending on the parser.
    text = None

    # Text after this element's end tag, but before the next sibling
    # element's start tag. This is either a string or None. If there was
    # no text, this may be either None or an empty string, depending on
    # the parser.
    tail = None

    def __init__(self, tag, attrib={}, **extra):
        """Create an element named *tag*.

        *attrib* must be a dict (TypeError otherwise); it is copied, so the
        caller's dictionary (and the shared default) is never modified.
        *extra* keyword arguments are merged on top of the copy.

        """
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy. Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        return len(self._children)

    def __bool__(self):
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the child (or slice of children) at *index*.

        For a slice, *element* may be any iterable of elements; every item
        is validated before anything is assigned. TypeError is raised if a
        non-element is supplied.

        """
        if isinstance(index, slice):
            # Materialize first: a generator or other one-shot iterator
            # would otherwise be exhausted by the validation loop and the
            # assignment below would silently store nothing.
            element = list(element)
            for elt in element:
                self._assert_is_element(elt)
        else:
            self._assert_is_element(element)
        self._children[index] = element

    def __delitem__(self, index):
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        TypeError is raised if *subelement* is not an element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is an iterable with zero or more elements. All items are
        validated before any is appended, so a TypeError leaves this element
        unchanged.

        """
        # Materialize first so one-shot iterators survive the validation
        # pass instead of being consumed by it.
        elements = list(elements)
        for element in elements:
            self._assert_is_element(element)
        self._children.extend(elements)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*."""
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        """Raise TypeError unless *e* is an instance of the Python Element."""
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Removal is delegated to list.remove() on the child list, so tag
        value and contents are not inspected. To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order. Emits a DeprecationWarning.

        """
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found. Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently. *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was
        not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently. *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(); names come in the same order as the
        underlying dictionary yields them.

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(); pairs come in the same order as the
        underlying dictionary yields them.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included. To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        """(Deprecated) Return list(self.iter(tag)) after warning."""
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            DeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None (for example the
        # factory functions used as tags) contribute no text.
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t
Armin Rigo9ed73062005-12-14 18:10:45 +0000440
Armin Rigo9ed73062005-12-14 18:10:45 +0000441
def SubElement(parent, tag, attrib={}, **extra):
    """Create a subelement and append it to an existing parent.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments.

    The new element is built through parent.makeelement() and returned
    after being appended to *parent*.

    """
    # Work on a copy so neither the caller's dict nor the shared default
    # is modified.
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
459
Armin Rigo9ed73062005-12-14 18:10:45 +0000460
def Comment(text=None):
    """Comment element factory.

    Build a special element that the standard serializer writes out as an
    XML comment.

    *text* is a string containing the comment string.

    """
    # The factory function itself serves as the tag of the new element.
    node = Element(Comment)
    node.text = text
    return node
473
Armin Rigo9ed73062005-12-14 18:10:45 +0000474
def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    This function creates a special element which the standard serializer
    serializes as an XML processing instruction.

    *target* is a string containing the processing instruction target,
    *text* is a string containing the processing instruction contents,
    if any.

    The returned element's text is *target*, followed by a space and
    *text* when *text* is non-empty.

    """
    # The factory function itself serves as the tag of the new element.
    element = Element(ProcessingInstruction)
    element.text = target
    if text:
        element.text = element.text + " " + text
    return element

PI = ProcessingInstruction
492
Armin Rigo9ed73062005-12-14 18:10:45 +0000493
class QName:
    """Qualified name wrapper.

    Wrap a QName attribute value so that namespaces are handled properly
    on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Instances hash and compare by their text, both against other QName
    objects and against plain values.

    """

    def __init__(self, text_or_uri, tag=None):
        # With a local name given, the first argument is the namespace URI.
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    # Each comparison unwraps another QName to its text and compares any
    # other operand as-is.
    def __le__(self, other):
        return self.text <= (other.text if isinstance(other, QName) else other)

    def __lt__(self, other):
        return self.text < (other.text if isinstance(other, QName) else other)

    def __ge__(self, other):
        return self.text >= (other.text if isinstance(other, QName) else other)

    def __gt__(self, other):
        return self.text > (other.text if isinstance(other, QName) else other)

    def __eq__(self, other):
        return self.text == (other.text if isinstance(other, QName) else other)
Armin Rigo9ed73062005-12-14 18:10:45 +0000538
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000539# --------------------------------------------------------------------
540
Armin Rigo9ed73062005-12-14 18:10:45 +0000541
542class ElementTree:
Eli Bendersky84fae782013-03-09 07:12:48 -0800543 """An XML element hierarchy.
Armin Rigo9ed73062005-12-14 18:10:45 +0000544
Eli Bendersky84fae782013-03-09 07:12:48 -0800545 This class also provides support for serialization to and from
546 standard XML.
547
548 *element* is an optional root element node,
549 *file* is an optional file handle or file name of an XML file whose
550 contents will be used to initialize the tree with.
551
552 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000553 def __init__(self, element=None, file=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000554 # assert element is None or iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000555 self._root = element # first node
556 if file:
557 self.parse(file)
558
Armin Rigo9ed73062005-12-14 18:10:45 +0000559 def getroot(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800560 """Return root element of this tree."""
Armin Rigo9ed73062005-12-14 18:10:45 +0000561 return self._root
562
Armin Rigo9ed73062005-12-14 18:10:45 +0000563 def _setroot(self, element):
Eli Bendersky84fae782013-03-09 07:12:48 -0800564 """Replace root element of this tree.
565
566 This will discard the current contents of the tree and replace it
567 with the given element. Use with care!
568
569 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000570 # assert iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000571 self._root = element
572
Armin Rigo9ed73062005-12-14 18:10:45 +0000573 def parse(self, source, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800574 """Load external XML document into element tree.
575
576 *source* is a file name or file object, *parser* is an optional parser
577 instance that defaults to XMLParser.
578
579 ParseError is raised if the parser fails to parse the document.
580
581 Returns the root element of the given source document.
582
583 """
Antoine Pitroue033e062010-10-29 10:38:18 +0000584 close_source = False
Armin Rigo9ed73062005-12-14 18:10:45 +0000585 if not hasattr(source, "read"):
586 source = open(source, "rb")
Antoine Pitroue033e062010-10-29 10:38:18 +0000587 close_source = True
588 try:
Eli Benderskya3699232013-05-19 18:47:23 -0700589 if parser is None:
590 # If no parser was specified, create a default XMLParser
591 parser = XMLParser()
592 if hasattr(parser, '_parse_whole'):
593 # The default XMLParser, when it comes from an accelerator,
594 # can define an internal _parse_whole API for efficiency.
595 # It can be used to parse the whole source without feeding
596 # it with chunks.
597 self._root = parser._parse_whole(source)
598 return self._root
599 while True:
Antoine Pitroue033e062010-10-29 10:38:18 +0000600 data = source.read(65536)
601 if not data:
602 break
603 parser.feed(data)
604 self._root = parser.close()
605 return self._root
606 finally:
607 if close_source:
608 source.close()
Armin Rigo9ed73062005-12-14 18:10:45 +0000609
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000610 def iter(self, tag=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800611 """Create and return tree iterator for the root element.
612
613 The iterator loops over all elements in this tree, in document order.
614
615 *tag* is a string with the tag name to iterate over
616 (default is to return all elements).
617
618 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000619 # assert self._root is not None
620 return self._root.iter(tag)
621
622 # compatibility
Armin Rigo9ed73062005-12-14 18:10:45 +0000623 def getiterator(self, tag=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000624 warnings.warn(
625 "This method will be removed in future versions. "
626 "Use 'tree.iter()' or 'list(tree.iter())' instead.",
Serhiy Storchaka02ec92f2018-07-24 12:03:34 +0300627 DeprecationWarning, stacklevel=2
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000628 )
629 return list(self.iter(tag))
Armin Rigo9ed73062005-12-14 18:10:45 +0000630
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000631 def find(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800632 """Find first matching element by tag name or path.
633
634 Same as getroot().find(path), which is Element.find()
635
636 *path* is a string having either an element tag or an XPath,
637 *namespaces* is an optional mapping from namespace prefix to full name.
638
639 Return the first matching element, or None if no element was found.
640
641 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000642 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000643 if path[:1] == "/":
644 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000645 warnings.warn(
646 "This search is broken in 1.3 and earlier, and will be "
647 "fixed in a future version. If you rely on the current "
648 "behaviour, change it to %r" % path,
649 FutureWarning, stacklevel=2
650 )
651 return self._root.find(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000652
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000653 def findtext(self, path, default=None, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800654 """Find first matching element by tag name or path.
655
656 Same as getroot().findtext(path), which is Element.findtext()
657
658 *path* is a string having either an element tag or an XPath,
659 *namespaces* is an optional mapping from namespace prefix to full name.
660
661 Return the first matching element, or None if no element was found.
662
663 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000664 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000665 if path[:1] == "/":
666 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000667 warnings.warn(
668 "This search is broken in 1.3 and earlier, and will be "
669 "fixed in a future version. If you rely on the current "
670 "behaviour, change it to %r" % path,
671 FutureWarning, stacklevel=2
672 )
673 return self._root.findtext(path, default, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000674
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000675 def findall(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800676 """Find all matching subelements by tag name or path.
677
678 Same as getroot().findall(path), which is Element.findall().
679
680 *path* is a string having either an element tag or an XPath,
681 *namespaces* is an optional mapping from namespace prefix to full name.
682
683 Return list containing all matching elements in document order.
684
685 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000686 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000687 if path[:1] == "/":
688 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000689 warnings.warn(
690 "This search is broken in 1.3 and earlier, and will be "
691 "fixed in a future version. If you rely on the current "
692 "behaviour, change it to %r" % path,
693 FutureWarning, stacklevel=2
694 )
695 return self._root.findall(path, namespaces)
696
def iterfind(self, path, namespaces=None):
    """Iterate over all subelements matching *path*.

    Equivalent to getroot().iterfind(path), i.e. Element.iterfind() applied
    to the root element.

    *path* is a string holding either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    A *path* beginning with "/" is rewritten to begin with "./" and a
    FutureWarning naming the rewritten path is issued.

    Return an iterable yielding all matching elements in document order.

    """
    # NOTE: self._root is used unchecked; it is assumed to be set already.
    if path[:1] == "/":
        relative = "." + path
        # stacklevel=2 attributes the warning to the caller of iterfind().
        warnings.warn(
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % relative,
            FutureWarning, stacklevel=2
        )
        path = relative
    return self._root.iterfind(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000718
def write(self, file_or_filename,
          encoding=None,
          xml_declaration=None,
          default_namespace=None,
          method=None, *,
          short_empty_elements=True):
    """Write element tree to a file as XML.

    Arguments:
      *file_or_filename* -- file name or a file object opened for writing

      *encoding* -- the output encoding (default: US-ASCII; UTF-8 when
                    *method* is "c14n")

      *xml_declaration* -- bool indicating if an XML declaration should be
                           added to the output. If None, an XML declaration
                           is added if encoding IS NOT either of:
                           US-ASCII, UTF-8, or Unicode

      *default_namespace* -- sets the default XML namespace (for "xmlns")

      *method* -- either "xml" (default), "html", "text", or "c14n"

      *short_empty_elements* -- controls the formatting of elements
                                that contain no content. If True (default)
                                they are emitted as a single self-closed
                                tag, otherwise they are emitted as a pair
                                of start/end tags

    Raises ValueError if *method* is not a known serialization method.

    """
    if not method:
        method = "xml"
    elif method not in _serialize:
        raise ValueError("unknown method %r" % method)
    if not encoding:
        encoding = "utf-8" if method == "c14n" else "us-ascii"
    enc_lower = encoding.lower()
    with _get_writer(file_or_filename, enc_lower) as emit:
        if method == "xml":
            # An explicit xml_declaration wins; None means "decide from
            # the encoding".
            if xml_declaration is None:
                wants_declaration = enc_lower not in (
                    "utf-8", "us-ascii", "unicode")
            else:
                wants_declaration = xml_declaration
            if wants_declaration:
                if enc_lower == "unicode":
                    # Retrieve the default encoding for the xml declaration
                    import locale
                    declared_encoding = locale.getpreferredencoding()
                else:
                    declared_encoding = encoding
                emit("<?xml version='1.0' encoding='%s'?>\n" % (
                    declared_encoding,))
        if method == "text":
            _serialize_text(emit, self._root)
        else:
            qnames, namespaces = _namespaces(self._root, default_namespace)
            serializer = _serialize[method]
            serializer(emit, self._root, qnames, namespaces,
                       short_empty_elements=short_empty_elements)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000776
def write_c14n(self, file):
    """Write the tree to *file* using the "c14n" output method.

    Kept for lxml.etree compatibility; calling write() with
    method="c14n" is the preferred spelling.

    """
    result = self.write(file, method="c14n")
    return result
Armin Rigo9ed73062005-12-14 18:10:45 +0000780
781# --------------------------------------------------------------------
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000782# serialization support
783
@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    """Yield a text ``write`` callable and release resources afterwards.

    *file_or_filename* is either an object with a ``write`` attribute or
    something accepted by open() as a file name.  *encoding* is the output
    encoding; the special value "unicode" means that text is written
    through unchanged instead of being encoded.

    A file opened here is closed on exit.  A file object passed in by the
    caller is left open: any wrapper built around it is only detached.

    """
    try:
        write = file_or_filename.write
    except AttributeError:
        # No write attribute: treat the argument as a file name.
        open_options = {}
        if encoding != "unicode":
            open_options = {"encoding": encoding,
                            "errors": "xmlcharrefreplace"}
        opened = open(file_or_filename, "w", **open_options)
        with opened:
            yield opened.write
    else:
        # A file-like object; the encoding tells us whether it expects
        # text or bytes.
        if encoding == "unicode":
            # Text writer: hand its write method out untouched.
            yield write
        else:
            # Binary writer: put a TextIOWrapper in front of it.
            with contextlib.ExitStack() as cleanup:
                if isinstance(file_or_filename, io.BufferedIOBase):
                    binary = file_or_filename
                elif isinstance(file_or_filename, io.RawIOBase):
                    binary = io.BufferedWriter(file_or_filename)
                    # Detach so that dropping the BufferedWriter does not
                    # close the caller's file.
                    cleanup.callback(binary.detach)
                else:
                    # Objects outside the IOBase hierarchy that merely
                    # provide write(): graft it onto a bare buffered base.
                    binary = io.BufferedIOBase()
                    binary.writable = lambda: True
                    binary.write = write
                    try:
                        # TextIOWrapper consults these to decide whether a
                        # BOM (for UTF-16, etc) should be emitted.
                        binary.seekable = file_or_filename.seekable
                        binary.tell = file_or_filename.tell
                    except AttributeError:
                        pass
                text_stream = io.TextIOWrapper(binary,
                                               encoding=encoding,
                                               errors="xmlcharrefreplace",
                                               newline="\n")
                # Detach so that dropping the TextIOWrapper does not close
                # the caller's file.
                cleanup.callback(text_stream.detach)
                yield text_stream.write
835
def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used below *elem*.

    Walks *elem* and all its descendants and returns a ``(qnames,
    namespaces)`` tuple: *qnames* maps every tag, attribute name and QName
    value found to its serialized "prefix:local" form, *namespaces* maps
    namespace URIs to the prefixes chosen for them.

    *default_namespace*, if given, is mapped to the empty prefix.  In that
    case a non-qualified name raises ValueError.  Tags that are neither
    strings, QNames, None, Comment nor PI are reported through
    _raise_serialization_error().

    """
    # serialized form of every name seen so far
    qnames = {None: None}

    # uri -> prefix
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # Work out and record the serialized representation of *qname*.
        try:
            if qname[:1] != "{":
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    prefix = "ns%d" % len(namespaces)
                # the "xml" prefix never gets an xmlns declaration
                if prefix != "xml":
                    namespaces[uri] = prefix
            if prefix:
                qnames[qname] = "%s:%s" % (prefix, local)
            else:
                qnames[qname] = local  # default namespace
        except TypeError:
            _raise_serialization_error(qname)

    # Fill both tables from tags, attribute names/values and QName text.
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                add_qname(tag)
        elif not (tag is None or tag is Comment or tag is PI):
            _raise_serialization_error(tag)
        for attr_name, attr_value in node.items():
            if isinstance(attr_name, QName):
                attr_name = attr_name.text
            if attr_name not in qnames:
                add_qname(attr_name)
            if isinstance(attr_value, QName) and attr_value.text not in qnames:
                add_qname(attr_value.text)
        content = node.text
        if isinstance(content, QName) and content.text not in qnames:
            add_qname(content.text)
    return qnames, namespaces
896
def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Serialize *elem* and its subtree as XML through *write*.

    *write* is called with each text fragment.  *qnames* maps tags and
    attribute names to their serialized form; a tag mapped to None is not
    emitted itself, only its text and children.  *namespaces* maps URIs to
    prefixes and is emitted as xmlns declarations on this element only
    (children are serialized with None).  *short_empty_elements* selects
    "<tag />" over "<tag></tag>" for elements without text or children.
    The element's tail, if any, is written last.

    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        name = qnames[tag]
        if name is None:
            # Transparent element: only its content is written.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_xml(write, child, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                # declarations are emitted sorted on prefix
                declarations = sorted(namespaces.items(),
                                      key=lambda item: item[1])
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(' xmlns%s="%s"' % (prefix, _escape_attrib(uri)))
            for key, value in attributes:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib(value)
                write(' %s="%s"' % (qnames[key], value))
            if not text and not len(elem) and short_empty_elements:
                write(" />")
            else:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for child in elem:
                    _serialize_xml(write, child, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + name + ">")
    tail = elem.tail
    if tail:
        write(_escape_cdata(tail))
946
# Lower-case tags of HTML "void" elements.  The "html" output method never
# writes an end tag for them.  Alongside the HTML 4 set this lists the void
# elements added by HTML5 (embed, source, track, wbr); without them the
# serializer would emit end tags such as "</wbr>", which HTML forbids.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "embed", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param", "source",
              "track", "wbr"}
954
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Serialize *elem* and its subtree as HTML through *write*.

    *write* is called with each text fragment.  *qnames* maps tags and
    attribute names to their serialized form; a tag mapped to None is not
    emitted itself, only its text and children.  *namespaces* maps URIs to
    prefixes and is emitted as xmlns declarations on this element only.

    Differences from the XML serializer visible here: the text of "script"
    and "style" elements is written unescaped, start tags are never
    self-closed, and elements listed in HTML_EMPTY get no end tag.

    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        name = qnames[tag]
        if name is None:
            # Transparent element: only its content is written.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                # declarations are emitted sorted on prefix
                declarations = sorted(namespaces.items(),
                                      key=lambda item: item[1])
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(' xmlns%s="%s"' % (prefix, _escape_attrib(uri)))
            for key, value in attributes:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib_html(value)
                # FIXME: handle boolean attributes
                write(' %s="%s"' % (qnames[key], value))
            write(">")
            lowered = name.lower()
            if text:
                if lowered in ("script", "style"):
                    # raw text elements: content goes out verbatim
                    write(text)
                else:
                    write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
            if lowered not in HTML_EMPTY:
                write("</" + name + ">")
    tail = elem.tail
    if tail:
        write(_escape_cdata(tail))
1004
def _serialize_text(write, elem):
    """Write the text content of *elem*'s subtree, then *elem*'s tail.

    Every fragment produced by elem.itertext() is passed to *write* in
    order; no markup is emitted and nothing is escaped.

    """
    for fragment in elem.itertext():
        write(fragment)
    tail = elem.tail
    if tail:
        write(tail)
1010
# Output method name -> serializer function, consulted by
# ElementTree.write().
_serialize = dict(
    xml=_serialize_xml,
    html=_serialize_html,
    text=_serialize_text,
    # this optional method is imported at the end of the module
    # c14n=_serialize_c14n,
)
Armin Rigo9ed73062005-12-14 18:10:45 +00001018
Armin Rigo9ed73062005-12-14 18:10:45 +00001019
def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix is reserved or is invalid.

    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Collect the conflicting entries first so the dict is not mutated
    # while it is being iterated.
    stale = [known_uri
             for known_uri, known_prefix in _namespace_map.items()
             if known_uri == uri or known_prefix == prefix]
    for known_uri in stale:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix


# Global registry: namespace URI -> preferred prefix.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# Expose the registry on the function, for tests and troubleshooting.
register_namespace._namespace_map = _namespace_map
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001053
def _raise_serialization_error(text):
    """Raise TypeError reporting that *text* cannot be serialized.

    The message contains the repr of *text* and the name of its type.

    """
    type_name = type(text).__name__
    message = "cannot serialize %r (type %s)" % (text, type_name)
    raise TypeError(message)
1058
def _escape_cdata(text):
    """Return *text* with "&", "<" and ">" replaced by entity references.

    Values that do not support the string operations used here are
    reported through _raise_serialization_error().

    """
    try:
        # Test before replacing: most strings need no escaping at all, and
        # skipping the do-nothing replace() calls is worthwhile for the
        # short strings that dominate typical documents.
        # "&" must be handled first so the entities added below survive.
        for char, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
            if char in text:
                text = text.replace(char, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1074
def _escape_attrib(text):
    """Return *text* escaped for use inside a double-quoted XML attribute.

    "&", "<", ">" and the double quote are replaced by entity references;
    carriage return, line feed and tab are replaced by the character
    references "&#13;", "&#10;" and "&#09;".

    Values that do not support the string operations used here are
    reported through _raise_serialization_error().

    """
    try:
        # "&" must be handled first so the references added below survive.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        # Section 2.11 of the XML specification (replace CR or CR LF with
        # a single LF) only covers literal line ends in the document
        # entity.  Whitespace written as a character reference is exempt
        # from that and from attribute-value normalization, so every
        # character is escaped individually and nothing is folded; folding
        # CR into LF here would silently lose data on a round trip.
        # http://www.w3.org/TR/REC-xml/#sec-line-ends
        if "\r" in text:
            text = text.replace("\r", "&#13;")
        if "\n" in text:
            text = text.replace("\n", "&#10;")
        if "\t" in text:
            text = text.replace("\t", "&#09;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1102
def _escape_attrib_html(text):
    """Return *text* escaped for use inside a double-quoted HTML attribute.

    Only "&", ">" and the double quote are replaced; "<" and whitespace
    are left as they are.

    Values that do not support the string operations used here are
    reported through _raise_serialization_error().

    """
    try:
        # "&" must be handled first so the entities added below survive.
        for char, entity in (("&", "&amp;"), (">", "&gt;"),
                             ("\"", "&quot;")):
            if char in text:
                text = text.replace(char, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1115
1116# --------------------------------------------------------------------
1117
def tostring(element, encoding=None, method=None, *,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned. Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which can
    be one of "xml" (default), "html", "text" or "c14n".
    *short_empty_elements* is passed through to ElementTree.write().

    Returns an (optionally) encoded string containing the XML data.

    """
    # The in-memory stream type has to match what write() will produce.
    if encoding == 'unicode':
        stream = io.StringIO()
    else:
        stream = io.BytesIO()
    tree = ElementTree(element)
    tree.write(stream, encoding, method=method,
               short_empty_elements=short_empty_elements)
    return stream.getvalue()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001136
class _ListDataStream(io.BufferedIOBase):
    """An auxiliary stream accumulating into a list reference."""

    def __init__(self, lst):
        # Keep the caller's list itself, not a copy: the written chunks
        # are read back from it.
        self.lst = lst

    def write(self, b):
        """Append the chunk *b* to the list."""
        self.lst.append(b)

    def tell(self):
        """Return the number of chunks written so far."""
        return len(self.lst)

    def writable(self):
        """Report the stream as writable."""
        return True

    def seekable(self):
        """Report the stream as seekable."""
        return True
1153
def tostringlist(element, encoding=None, method=None, *,
                 short_empty_elements=True):
    """Generate string representation of XML element as a list of chunks.

    Takes the same arguments as tostring(), but the tree is written to a
    _ListDataStream and the list of chunks that stream collected is
    returned instead of a single string.

    """
    chunks = []
    tree = ElementTree(element)
    tree.write(_ListDataStream(chunks), encoding, method=method,
               short_empty_elements=short_empty_elements)
    return chunks
Armin Rigo9ed73062005-12-14 18:10:45 +00001161
Armin Rigo9ed73062005-12-14 18:10:45 +00001162
def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  The exact output
    format is implementation dependent.  In this version, it's written as an
    ordinary XML file.  A newline is appended unless the root element's
    tail already ends with one.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")
1180
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001181# --------------------------------------------------------------------
1182# parsing
Armin Rigo9ed73062005-12-14 18:10:45 +00001183
Armin Rigo9ed73062005-12-14 18:10:45 +00001184
def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.

    A fresh ElementTree is created and its parse() method does the work.

    Return an ElementTree instance.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document
1197
Armin Rigo9ed73062005-12-14 18:10:45 +00001198
def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This function also reports what's going on to the user based on the
    *events* it is called with.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  Its ``root``
    attribute is None until the whole document has been consumed and is
    then set to the root element.  A file opened here from a filename is
    closed when iteration finishes or fails.

    """
    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)

    def iterator():
        # source, close_source and it are bound further down; they are
        # only looked up once iteration actually starts.
        try:
            while True:
                yield from pullparser.read_events()
                # refill the event buffer from the next 16 KiB of input
                chunk = source.read(16 * 1024)
                if not chunk:
                    break
                pullparser.feed(chunk)
            tree_root = pullparser._close_and_return_root()
            yield from pullparser.read_events()
            it.root = tree_root
        finally:
            if close_source:
                source.close()

    class IterParseIterator(collections.abc.Iterator):
        __next__ = iterator().__next__
    it = IterParseIterator()
    it.root = None
    del iterator, IterParseIterator

    # Only a file opened here (from a filename) is ours to close.
    close_source = not hasattr(source, "read")
    if close_source:
        source = open(source, "rb")

    return it
Armin Rigo9ed73062005-12-14 18:10:45 +00001245
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001246
class XMLPullParser:
    """Non-blocking parser: feed it data, then pull the resulting events.

    *events* is a sequence of event names to report; when omitted only
    "end" events are reported.

    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        self._events_queue = collections.deque()
        if not _parser:
            _parser = XMLParser(target=TreeBuilder())
        self._parser = _parser
        # wire up the parser for event reporting
        requested = ("end",) if events is None else events
        self._parser._setevents(self._events_queue, requested)

    def feed(self, data):
        """Feed encoded data to parser."""
        parser = self._parser
        if parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            parser.feed(data)
        except SyntaxError as exc:
            # Queue the error so that read_events() raises it in order.
            self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        # A None parser marks the end of the stream for feed().
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.
        """
        queue = self._events_queue
        while queue:
            item = queue.popleft()
            if isinstance(item, Exception):
                raise item
            yield item
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001298
1299
def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns whatever the parser's close() returns; for the default parser
    this is an Element instance.

    """
    active = parser if parser else XMLParser(target=TreeBuilder())
    active.feed(text)
    return active.close()
1315
Armin Rigo9ed73062005-12-14 18:10:45 +00001316
def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.  Elements whose "id" attribute is
    missing or empty are left out; when an id occurs more than once the
    last element in document order wins.

    """
    active = parser if parser else XMLParser(target=TreeBuilder())
    active.feed(text)
    tree = active.close()
    ids = {}
    for node in tree.iter():
        identifier = node.get("id")
        if identifier:
            ids[identifier] = node
    return tree, ids
1337
# fromstring() is an alias for XML(): parse an XML document from a string
# constant.
fromstring = XML
Armin Rigo9ed73062005-12-14 18:10:45 +00001340
def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence of fragments, fed to the parser
    one by one in order; *parser* is an optional parser instance,
    defaulting to the standard XMLParser.

    Returns whatever the parser's close() returns; for the default parser
    this is an Element instance.

    """
    active = parser if parser else XMLParser(target=TreeBuilder())
    feed = active.feed
    for fragment in sequence:
        feed(fragment)
    return active.close()
1355
1356# --------------------------------------------------------------------
Armin Rigo9ed73062005-12-14 18:10:45 +00001357
Armin Rigo9ed73062005-12-14 18:10:45 +00001358
class TreeBuilder:
    """Generic element structure builder.

    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.

    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.

    """
    def __init__(self, element_factory=None):
        self._data = []  # text fragments not yet attached to an element
        self._elem = []  # stack of currently open elements
        self._last = None  # most recently opened or closed element
        self._tail = None  # true once the last event was an end tag
        self._factory = (Element if element_factory is None
                         else element_factory)

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # Attach buffered character data to the last element: as its tail
        # when that element was already closed, as its text otherwise.
        if not self._data:
            return
        last = self._last
        if last is not None:
            text = "".join(self._data)
            if self._tail:
                assert last.tail is None, "internal error (tail)"
                last.tail = text
            else:
                assert last.text is None, "internal error (text)"
                last.text = text
        self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        elem = self._factory(tag, attrs)
        self._last = elem
        open_elements = self._elem
        if open_elements:
            # nest the new element inside the innermost open one
            open_elements[-1].append(elem)
        open_elements.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   closed.tag, tag)
        self._tail = 1
        return closed
1431
Armin Rigo9ed73062005-12-14 18:10:45 +00001432
Eli Bendersky84fae782013-03-09 07:12:48 -08001433# also see ElementTree and TreeBuilder
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001434class XMLParser:
Eli Bendersky84fae782013-03-09 07:12:48 -08001435 """Element structure builder for XML source data based on the expat parser.
1436
Eli Bendersky84fae782013-03-09 07:12:48 -08001437 *target* is an optional target object which defaults to an instance of the
1438 standard TreeBuilder class, *encoding* is an optional encoding string
1439 which if given, overrides the encoding specified in the XML file:
1440 http://www.iana.org/assignments/character-sets
1441
1442 """
Armin Rigo9ed73062005-12-14 18:10:45 +00001443
Serhiy Storchaka02ec92f2018-07-24 12:03:34 +03001444 def __init__(self, *, target=None, encoding=None):
Armin Rigo9ed73062005-12-14 18:10:45 +00001445 try:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001446 from xml.parsers import expat
Brett Cannoncd171c82013-07-04 17:43:24 -04001447 except ImportError:
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001448 try:
1449 import pyexpat as expat
Brett Cannoncd171c82013-07-04 17:43:24 -04001450 except ImportError:
1451 raise ImportError(
1452 "No module named expat; use SimpleXMLTreeBuilder instead"
1453 )
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001454 parser = expat.ParserCreate(encoding, "}")
Armin Rigo9ed73062005-12-14 18:10:45 +00001455 if target is None:
1456 target = TreeBuilder()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001457 # underscored names are provided for compatibility only
1458 self.parser = self._parser = parser
1459 self.target = self._target = target
1460 self._error = expat.error
Armin Rigo9ed73062005-12-14 18:10:45 +00001461 self._names = {} # name memo cache
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001462 # main callbacks
Armin Rigo9ed73062005-12-14 18:10:45 +00001463 parser.DefaultHandlerExpand = self._default
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001464 if hasattr(target, 'start'):
1465 parser.StartElementHandler = self._start
1466 if hasattr(target, 'end'):
1467 parser.EndElementHandler = self._end
1468 if hasattr(target, 'data'):
1469 parser.CharacterDataHandler = target.data
1470 # miscellaneous callbacks
1471 if hasattr(target, 'comment'):
1472 parser.CommentHandler = target.comment
1473 if hasattr(target, 'pi'):
1474 parser.ProcessingInstructionHandler = target.pi
Eli Bendersky6206a7e2013-08-25 18:58:18 -07001475 # Configure pyexpat: buffering, new-style attribute handling.
1476 parser.buffer_text = 1
1477 parser.ordered_attributes = 1
1478 parser.specified_attributes = 1
Armin Rigo9ed73062005-12-14 18:10:45 +00001479 self._doctype = None
1480 self.entity = {}
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001481 try:
1482 self.version = "Expat %d.%d.%d" % expat.version_info
1483 except AttributeError:
1484 pass # unknown
1485
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001486 def _setevents(self, events_queue, events_to_report):
Eli Benderskyb5869342013-08-30 05:51:20 -07001487 # Internal API for XMLPullParser
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001488 # events_to_report: a list of events to report during parsing (same as
Eli Benderskyb5869342013-08-30 05:51:20 -07001489 # the *events* of XMLPullParser's constructor.
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001490 # events_queue: a list of actual parsing events that will be populated
1491 # by the underlying parser.
1492 #
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001493 parser = self._parser
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001494 append = events_queue.append
1495 for event_name in events_to_report:
1496 if event_name == "start":
Eli Benderskyc9f5ca22013-04-20 09:11:37 -07001497 parser.ordered_attributes = 1
1498 parser.specified_attributes = 1
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001499 def handler(tag, attrib_in, event=event_name, append=append,
Eli Bendersky6206a7e2013-08-25 18:58:18 -07001500 start=self._start):
Eli Benderskyc9f5ca22013-04-20 09:11:37 -07001501 append((event, start(tag, attrib_in)))
1502 parser.StartElementHandler = handler
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001503 elif event_name == "end":
1504 def handler(tag, event=event_name, append=append,
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001505 end=self._end):
1506 append((event, end(tag)))
1507 parser.EndElementHandler = handler
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001508 elif event_name == "start-ns":
1509 def handler(prefix, uri, event=event_name, append=append):
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001510 append((event, (prefix or "", uri or "")))
1511 parser.StartNamespaceDeclHandler = handler
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001512 elif event_name == "end-ns":
1513 def handler(prefix, event=event_name, append=append):
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001514 append((event, None))
1515 parser.EndNamespaceDeclHandler = handler
1516 else:
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001517 raise ValueError("unknown event %r" % event_name)
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001518
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001519 def _raiseerror(self, value):
1520 err = ParseError(value)
1521 err.code = value.code
1522 err.position = value.lineno, value.offset
1523 raise err
Armin Rigo9ed73062005-12-14 18:10:45 +00001524
Armin Rigo9ed73062005-12-14 18:10:45 +00001525 def _fixname(self, key):
1526 # expand qname, and convert name string to ascii, if possible
1527 try:
1528 name = self._names[key]
1529 except KeyError:
1530 name = key
1531 if "}" in name:
1532 name = "{" + name
Martin v. Löwisf30bb0e2007-07-28 11:40:46 +00001533 self._names[key] = name
Armin Rigo9ed73062005-12-14 18:10:45 +00001534 return name
1535
Eli Bendersky6206a7e2013-08-25 18:58:18 -07001536 def _start(self, tag, attr_list):
1537 # Handler for expat's StartElementHandler. Since ordered_attributes
1538 # is set, the attributes are reported as a list of alternating
1539 # attribute name,value.
Armin Rigo9ed73062005-12-14 18:10:45 +00001540 fixname = self._fixname
1541 tag = fixname(tag)
1542 attrib = {}
Eli Bendersky6206a7e2013-08-25 18:58:18 -07001543 if attr_list:
1544 for i in range(0, len(attr_list), 2):
1545 attrib[fixname(attr_list[i])] = attr_list[i+1]
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001546 return self.target.start(tag, attrib)
Armin Rigo9ed73062005-12-14 18:10:45 +00001547
Armin Rigo9ed73062005-12-14 18:10:45 +00001548 def _end(self, tag):
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001549 return self.target.end(self._fixname(tag))
1550
Armin Rigo9ed73062005-12-14 18:10:45 +00001551 def _default(self, text):
1552 prefix = text[:1]
1553 if prefix == "&":
1554 # deal with undefined entities
1555 try:
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001556 data_handler = self.target.data
1557 except AttributeError:
1558 return
1559 try:
1560 data_handler(self.entity[text[1:-1]])
Armin Rigo9ed73062005-12-14 18:10:45 +00001561 except KeyError:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001562 from xml.parsers import expat
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001563 err = expat.error(
Armin Rigo9ed73062005-12-14 18:10:45 +00001564 "undefined entity %s: line %d, column %d" %
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001565 (text, self.parser.ErrorLineNumber,
1566 self.parser.ErrorColumnNumber)
Armin Rigo9ed73062005-12-14 18:10:45 +00001567 )
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001568 err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001569 err.lineno = self.parser.ErrorLineNumber
1570 err.offset = self.parser.ErrorColumnNumber
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001571 raise err
Armin Rigo9ed73062005-12-14 18:10:45 +00001572 elif prefix == "<" and text[:9] == "<!DOCTYPE":
1573 self._doctype = [] # inside a doctype declaration
1574 elif self._doctype is not None:
1575 # parse doctype contents
1576 if prefix == ">":
1577 self._doctype = None
1578 return
Neal Norwitz9d72bb42007-04-17 08:48:32 +00001579 text = text.strip()
Armin Rigo9ed73062005-12-14 18:10:45 +00001580 if not text:
1581 return
1582 self._doctype.append(text)
1583 n = len(self._doctype)
1584 if n > 2:
1585 type = self._doctype[1]
1586 if type == "PUBLIC" and n == 4:
1587 name, type, pubid, system = self._doctype
Florent Xiclunaa1c974a2012-07-07 13:16:44 +02001588 if pubid:
1589 pubid = pubid[1:-1]
Armin Rigo9ed73062005-12-14 18:10:45 +00001590 elif type == "SYSTEM" and n == 3:
1591 name, type, system = self._doctype
1592 pubid = None
1593 else:
1594 return
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001595 if hasattr(self.target, "doctype"):
1596 self.target.doctype(name, pubid, system[1:-1])
Serhiy Storchaka02ec92f2018-07-24 12:03:34 +03001597 elif hasattr(self, "doctype"):
1598 warnings.warn(
1599 "The doctype() method of XMLParser is ignored. "
1600 "Define doctype() method on the TreeBuilder target.",
1601 RuntimeWarning)
1602
Armin Rigo9ed73062005-12-14 18:10:45 +00001603 self._doctype = None
1604
def feed(self, data):
    """Feed encoded data to parser.

    Errors of the expat error class stored in self._error are handed to
    _raiseerror().
    """
    parse = self.parser.Parse
    try:
        parse(data, False)  # more data may follow
    except self._error as exc:
        self._raiseerror(exc)
Armin Rigo9ed73062005-12-14 18:10:45 +00001611
def close(self):
    """Finish feeding data to parser and return element structure.

    The return value is whatever the target's close() method returns, or
    None when the target has no close() method.  Afterwards the parser
    and target attributes are removed from this object.
    """
    try:
        self.parser.Parse("", True)  # end of data
    except self._error as exc:
        self._raiseerror(exc)
    try:
        sink = self.target
        if hasattr(sink, "close"):
            return sink.close()
        return None
    finally:
        # get rid of circular references
        del self.parser, self._parser
        del self.target, self._target
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001628
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001629
# Import the C accelerators.
#
# Element is going to be shadowed by the C implementation.  We need to keep
# the Python version of it accessible for some "creative" uses by external
# code (see tests).
_Element_Py = Element

try:
    # Element, SubElement, ParseError, TreeBuilder, XMLParser
    from _elementtree import *
except ImportError:
    # No accelerator module: the pure-Python definitions stay in effect.
    pass