blob: 7caef553efb33c67afe3e5e47fbee9c73e6a3106 [file] [log] [blame]
Eli Bendersky84fae782013-03-09 07:12:48 -08001"""Lightweight XML support for Python.
2
3 XML is an inherently hierarchical data format, and the most natural way to
4 represent it is with a tree. This module has two classes for this purpose:
5
6 1. ElementTree represents the whole XML document as a tree and
7
8 2. Element represents a single node in this tree.
9
10 Interactions with the whole document (reading and writing to/from files) are
11 usually done on the ElementTree level. Interactions with a single XML element
12 and its sub-elements are done on the Element level.
13
14 Element is a flexible container object designed to store hierarchical data
15 structures in memory. It can be described as a cross between a list and a
16 dictionary. Each Element has a number of properties associated with it:
17
18 'tag' - a string containing the element's name.
19
20 'attributes' - a Python dictionary storing the element's attributes.
21
22 'text' - a string containing the element's text content.
23
24 'tail' - an optional string containing text after the element's end tag.
25
26 And a number of child elements stored in a Python sequence.
27
28 To create an element instance, use the Element constructor,
29 or the SubElement factory function.
30
31 You can also use the ElementTree class to wrap an element structure
32 and convert it to and from XML.
33
34"""
35
Eli Benderskybf05df22013-04-20 05:44:01 -070036#---------------------------------------------------------------------
37# Licensed to PSF under a Contributor Agreement.
38# See http://www.python.org/psf/license for licensing details.
Armin Rigo9ed73062005-12-14 18:10:45 +000039#
40# ElementTree
Florent Xiclunaf15351d2010-03-13 23:24:31 +000041# Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
Armin Rigo9ed73062005-12-14 18:10:45 +000042#
43# fredrik@pythonware.com
44# http://www.pythonware.com
Armin Rigo9ed73062005-12-14 18:10:45 +000045# --------------------------------------------------------------------
46# The ElementTree toolkit is
47#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000048# Copyright (c) 1999-2008 by Fredrik Lundh
Armin Rigo9ed73062005-12-14 18:10:45 +000049#
50# By obtaining, using, and/or copying this software and/or its
51# associated documentation, you agree that you have read, understood,
52# and will comply with the following terms and conditions:
53#
54# Permission to use, copy, modify, and distribute this software and
55# its associated documentation for any purpose and without fee is
56# hereby granted, provided that the above copyright notice appears in
57# all copies, and that both that copyright notice and this permission
58# notice appear in supporting documentation, and that the name of
59# Secret Labs AB or the author not be used in advertising or publicity
60# pertaining to distribution of the software without specific, written
61# prior permission.
62#
63# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
70# OF THIS SOFTWARE.
71# --------------------------------------------------------------------
72
# Names exported by ``from <this module> import *``.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "fromstringlist",
    "iselement",
    "iterparse",
    "parse",
    "ParseError",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLID",
    "XMLParser",
    "XMLPullParser",
    "register_namespace",
]

# Version string of the ElementTree toolkit implemented by this module.
VERSION = "1.3.0"
93
Florent Xiclunaf15351d2010-03-13 23:24:31 +000094import sys
95import re
96import warnings
Eli Bendersky00f402b2012-07-15 06:02:22 +030097import io
Serhiy Storchaka9ec5e252015-12-07 02:31:11 +020098import collections
Serhiy Storchaka2e576f52017-04-24 09:05:00 +030099import collections.abc
Eli Bendersky00f402b2012-07-15 06:02:22 +0300100import contextlib
Armin Rigo9ed73062005-12-14 18:10:45 +0000101
Eli Bendersky27cbb192012-06-15 09:03:19 +0300102from . import ElementPath
Armin Rigo9ed73062005-12-14 18:10:45 +0000103
Armin Rigo9ed73062005-12-14 18:10:45 +0000104
class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, code that raises a ParseError is
    expected to attach two extra attributes to the instance:
        'code' - the specific exception code
        'position' - the line and column of the error

    """
115
116# --------------------------------------------------------------------
117
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000118
def iselement(element):
    """Return True if *element* appears to be an Element.

    The check is duck-typed: any object exposing a 'tag' attribute
    qualifies.

    """
    looks_like_element = hasattr(element, 'tag')
    return looks_like_element
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000122
Armin Rigo9ed73062005-12-14 18:10:45 +0000123
class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes.  *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement.  This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        """Create an element named *tag* with no children.

        *attrib* must be a dict (TypeError otherwise); it is copied, so the
        caller's mapping -- and the shared default -- is never mutated.
        *extra* keyword arguments are merged into the copy and win on
        conflicts.

        """
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy.  Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        """Return the number of direct subelements."""
        return len(self._children)

    def __bool__(self):
        """Return True if the element has subelements (emits FutureWarning)."""
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        """Return the subelement (or list of subelements) at *index*."""
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement(s) at *index* (an int or a slice).

        No element type check is performed on the assigned value(s).

        """
        self._children[index] = element

    def __delitem__(self, index):
        """Delete the subelement(s) at *index* (an int or a slice)."""
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is an iterable with zero or more elements.

        Every item is validated before any is added, so a TypeError for a
        non-Element item leaves this element unchanged.

        """
        # Materialize first: validating and then extending from the same
        # one-shot iterator (e.g. a generator) would exhaust it during
        # validation and silently append nothing.  A snapshot also keeps
        # ``elem.extend(elem)`` finite.
        elements = list(elements)
        for element in elements:
            self._assert_is_element(element)
        self._children.extend(elements)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        """Raise TypeError unless *e* is an instance of the Python Element."""
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order.  The returned list is the
        element's internal child list, not a copy.

        """
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was not
        found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(); the result is whatever that call
        returns, not a fresh list.

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(); the result is whatever that call
        returns, not a fresh list.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        """(Pending deprecation) Return list(self.iter(tag))."""
        # Change for a DeprecationWarning in 1.4
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            PendingDeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.  Elements whose tag is neither a
        string nor None contribute nothing, including their subelements'
        text.

        """
        tag = self.tag
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t
Armin Rigo9ed73062005-12-14 18:10:45 +0000441
Armin Rigo9ed73062005-12-14 18:10:45 +0000442
def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and append it to *parent*.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments.

    The new element is built with parent.makeelement(), added with
    parent.append(), and returned.

    """
    # Work on a copy so neither the caller's mapping nor the shared default
    # is modified.
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
460
Armin Rigo9ed73062005-12-14 18:10:45 +0000461
def Comment(text=None):
    """Comment element factory.

    Return a special Element whose tag is this Comment function itself and
    whose text is *text*; the tag marks the element as an XML comment.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node
474
Armin Rigo9ed73062005-12-14 18:10:45 +0000475
def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Return a special Element whose tag is this ProcessingInstruction
    function itself.  Its text is *target*, followed by a single space and
    *text* when *text* is non-empty.

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.

    """
    node = Element(ProcessingInstruction)
    node.text = target
    if not text:
        return node
    node.text = node.text + " " + text
    return node

# Short alias for ProcessingInstruction.
PI = ProcessingInstruction
493
Armin Rigo9ed73062005-12-14 18:10:45 +0000494
class QName:
    """Qualified name wrapper.

    This class can be used to wrap a QName attribute value in order to get
    proper namespace handling on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Comparisons and hashing delegate to the wrapped text; the other operand
    may be a QName or any value comparable with that text.

    """

    def __init__(self, text_or_uri, tag=None):
        # With a tag, the first argument is the namespace URI.
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    def __le__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text <= rhs

    def __lt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text < rhs

    def __ge__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text >= rhs

    def __gt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text > rhs

    def __eq__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text == rhs
Armin Rigo9ed73062005-12-14 18:10:45 +0000539
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000540# --------------------------------------------------------------------
541
Armin Rigo9ed73062005-12-14 18:10:45 +0000542
543class ElementTree:
Eli Bendersky84fae782013-03-09 07:12:48 -0800544 """An XML element hierarchy.
Armin Rigo9ed73062005-12-14 18:10:45 +0000545
Eli Bendersky84fae782013-03-09 07:12:48 -0800546 This class also provides support for serialization to and from
547 standard XML.
548
549 *element* is an optional root element node,
550 *file* is an optional file handle or file name of an XML file whose
551 contents will be used to initialize the tree with.
552
553 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000554 def __init__(self, element=None, file=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000555 # assert element is None or iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000556 self._root = element # first node
557 if file:
558 self.parse(file)
559
Armin Rigo9ed73062005-12-14 18:10:45 +0000560 def getroot(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800561 """Return root element of this tree."""
Armin Rigo9ed73062005-12-14 18:10:45 +0000562 return self._root
563
Armin Rigo9ed73062005-12-14 18:10:45 +0000564 def _setroot(self, element):
Eli Bendersky84fae782013-03-09 07:12:48 -0800565 """Replace root element of this tree.
566
567 This will discard the current contents of the tree and replace it
568 with the given element. Use with care!
569
570 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000571 # assert iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000572 self._root = element
573
Armin Rigo9ed73062005-12-14 18:10:45 +0000574 def parse(self, source, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800575 """Load external XML document into element tree.
576
577 *source* is a file name or file object, *parser* is an optional parser
578 instance that defaults to XMLParser.
579
580 ParseError is raised if the parser fails to parse the document.
581
582 Returns the root element of the given source document.
583
584 """
Antoine Pitroue033e062010-10-29 10:38:18 +0000585 close_source = False
Armin Rigo9ed73062005-12-14 18:10:45 +0000586 if not hasattr(source, "read"):
587 source = open(source, "rb")
Antoine Pitroue033e062010-10-29 10:38:18 +0000588 close_source = True
589 try:
Eli Benderskya3699232013-05-19 18:47:23 -0700590 if parser is None:
591 # If no parser was specified, create a default XMLParser
592 parser = XMLParser()
593 if hasattr(parser, '_parse_whole'):
594 # The default XMLParser, when it comes from an accelerator,
595 # can define an internal _parse_whole API for efficiency.
596 # It can be used to parse the whole source without feeding
597 # it with chunks.
598 self._root = parser._parse_whole(source)
599 return self._root
600 while True:
Antoine Pitroue033e062010-10-29 10:38:18 +0000601 data = source.read(65536)
602 if not data:
603 break
604 parser.feed(data)
605 self._root = parser.close()
606 return self._root
607 finally:
608 if close_source:
609 source.close()
Armin Rigo9ed73062005-12-14 18:10:45 +0000610
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000611 def iter(self, tag=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800612 """Create and return tree iterator for the root element.
613
614 The iterator loops over all elements in this tree, in document order.
615
616 *tag* is a string with the tag name to iterate over
617 (default is to return all elements).
618
619 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000620 # assert self._root is not None
621 return self._root.iter(tag)
622
623 # compatibility
Armin Rigo9ed73062005-12-14 18:10:45 +0000624 def getiterator(self, tag=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000625 # Change for a DeprecationWarning in 1.4
626 warnings.warn(
627 "This method will be removed in future versions. "
628 "Use 'tree.iter()' or 'list(tree.iter())' instead.",
629 PendingDeprecationWarning, stacklevel=2
630 )
631 return list(self.iter(tag))
Armin Rigo9ed73062005-12-14 18:10:45 +0000632
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000633 def find(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800634 """Find first matching element by tag name or path.
635
636 Same as getroot().find(path), which is Element.find()
637
638 *path* is a string having either an element tag or an XPath,
639 *namespaces* is an optional mapping from namespace prefix to full name.
640
641 Return the first matching element, or None if no element was found.
642
643 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000644 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000645 if path[:1] == "/":
646 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000647 warnings.warn(
648 "This search is broken in 1.3 and earlier, and will be "
649 "fixed in a future version. If you rely on the current "
650 "behaviour, change it to %r" % path,
651 FutureWarning, stacklevel=2
652 )
653 return self._root.find(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000654
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000655 def findtext(self, path, default=None, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800656 """Find first matching element by tag name or path.
657
658 Same as getroot().findtext(path), which is Element.findtext()
659
660 *path* is a string having either an element tag or an XPath,
661 *namespaces* is an optional mapping from namespace prefix to full name.
662
663 Return the first matching element, or None if no element was found.
664
665 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000666 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000667 if path[:1] == "/":
668 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000669 warnings.warn(
670 "This search is broken in 1.3 and earlier, and will be "
671 "fixed in a future version. If you rely on the current "
672 "behaviour, change it to %r" % path,
673 FutureWarning, stacklevel=2
674 )
675 return self._root.findtext(path, default, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000676
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000677 def findall(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800678 """Find all matching subelements by tag name or path.
679
680 Same as getroot().findall(path), which is Element.findall().
681
682 *path* is a string having either an element tag or an XPath,
683 *namespaces* is an optional mapping from namespace prefix to full name.
684
685 Return list containing all matching elements in document order.
686
687 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000688 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000689 if path[:1] == "/":
690 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000691 warnings.warn(
692 "This search is broken in 1.3 and earlier, and will be "
693 "fixed in a future version. If you rely on the current "
694 "behaviour, change it to %r" % path,
695 FutureWarning, stacklevel=2
696 )
697 return self._root.findall(path, namespaces)
698
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000699 def iterfind(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800700 """Find all matching subelements by tag name or path.
701
702 Same as getroot().iterfind(path), which is element.iterfind()
703
704 *path* is a string having either an element tag or an XPath,
705 *namespaces* is an optional mapping from namespace prefix to full name.
706
707 Return an iterable yielding all matching elements in document order.
708
709 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000710 # assert self._root is not None
711 if path[:1] == "/":
712 path = "." + path
713 warnings.warn(
714 "This search is broken in 1.3 and earlier, and will be "
715 "fixed in a future version. If you rely on the current "
716 "behaviour, change it to %r" % path,
717 FutureWarning, stacklevel=2
718 )
719 return self._root.iterfind(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000720
def write(self, file_or_filename,
          encoding=None,
          xml_declaration=None,
          default_namespace=None,
          method=None, *,
          short_empty_elements=True):
    """Serialize the element tree to a file.

    Arguments:
      *file_or_filename* -- file name or a file object opened for writing

      *encoding* -- the output encoding; when omitted it is UTF-8 for the
                    "c14n" method and US-ASCII otherwise

      *xml_declaration* -- bool indicating if an XML declaration should be
                           added to the output (only for the "xml" method).
                           If None, a declaration is added when the
                           encoding is none of US-ASCII, UTF-8 or Unicode

      *default_namespace* -- sets the default XML namespace (for "xmlns")

      *method* -- either "xml" (default), "html", "text", or "c14n"

      *short_empty_elements* -- controls the formatting of elements
                                that contain no content.  If True (default)
                                they are emitted as a single self-closed
                                tag, otherwise they are emitted as a pair
                                of start/end tags

    Raises ValueError if *method* is not a known serialization method.

    """
    if not method:
        method = "xml"
    elif method not in _serialize:
        raise ValueError("unknown method %r" % method)
    if not encoding:
        encoding = "utf-8" if method == "c14n" else "us-ascii"
    enc_lower = encoding.lower()
    with _get_writer(file_or_filename, enc_lower) as emit:
        # An explicit xml_declaration wins; None means "decide from the
        # encoding".
        if xml_declaration is None:
            want_declaration = enc_lower not in (
                "utf-8", "us-ascii", "unicode")
        else:
            want_declaration = xml_declaration
        if method == "xml" and want_declaration:
            if enc_lower == "unicode":
                # Retrieve the default encoding for the xml declaration
                import locale
                declared_encoding = locale.getpreferredencoding()
            else:
                declared_encoding = encoding
            emit("<?xml version='1.0' encoding='%s'?>\n" % (
                declared_encoding,))
        if method == "text":
            _serialize_text(emit, self._root)
        else:
            qnames, namespaces = _namespaces(self._root, default_namespace)
            _serialize[method](emit, self._root, qnames, namespaces,
                               short_empty_elements=short_empty_elements)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000778
def write_c14n(self, file):
    """Write the tree to *file* using the "c14n" output method.

    Provided for lxml.etree compatibility; prefer calling write() with
    method="c14n" directly.

    """
    outcome = self.write(file, method="c14n")
    return outcome
Armin Rigo9ed73062005-12-14 18:10:45 +0000782
783# --------------------------------------------------------------------
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000784# serialization support
785
Eli Bendersky00f402b2012-07-15 06:02:22 +0300786@contextlib.contextmanager
787def _get_writer(file_or_filename, encoding):
Ezio Melottib5bc3532013-08-17 16:11:40 +0300788 # returns text write method and release all resources after using
Eli Bendersky00f402b2012-07-15 06:02:22 +0300789 try:
790 write = file_or_filename.write
791 except AttributeError:
792 # file_or_filename is a file name
793 if encoding == "unicode":
794 file = open(file_or_filename, "w")
795 else:
796 file = open(file_or_filename, "w", encoding=encoding,
797 errors="xmlcharrefreplace")
798 with file:
799 yield file.write
800 else:
801 # file_or_filename is a file-like object
802 # encoding determines if it is a text or binary writer
803 if encoding == "unicode":
804 # use a text writer as is
805 yield write
806 else:
807 # wrap a binary writer with TextIOWrapper
808 with contextlib.ExitStack() as stack:
809 if isinstance(file_or_filename, io.BufferedIOBase):
810 file = file_or_filename
811 elif isinstance(file_or_filename, io.RawIOBase):
812 file = io.BufferedWriter(file_or_filename)
813 # Keep the original file open when the BufferedWriter is
814 # destroyed
815 stack.callback(file.detach)
816 else:
817 # This is to handle passed objects that aren't in the
818 # IOBase hierarchy, but just have a write method
819 file = io.BufferedIOBase()
820 file.writable = lambda: True
821 file.write = write
822 try:
823 # TextIOWrapper uses this methods to determine
824 # if BOM (for UTF-16, etc) should be added
825 file.seekable = file_or_filename.seekable
826 file.tell = file_or_filename.tell
827 except AttributeError:
828 pass
829 file = io.TextIOWrapper(file,
830 encoding=encoding,
831 errors="xmlcharrefreplace",
832 newline="\n")
833 # Keep the original file open when the TextIOWrapper is
834 # destroyed
835 stack.callback(file.detach)
836 yield file.write
837
def _namespaces(elem, default_namespace=None):
    """Work out the serialized names and namespaces used by a tree.

    Walks *elem* and everything below it, looking at tags, attribute
    names, and any attribute values or text given as QName objects.

    Returns a ``(qnames, namespaces)`` tuple: *qnames* maps each
    encountered name to its ``prefix:local`` (or plain) form, and
    *namespaces* maps namespace URIs to the prefixes chosen for them.

    Raises ValueError if *default_namespace* is set and a non-qualified
    name is found; names of unsupported types are reported through
    _raise_serialization_error().

    """
    # name -> serialized "prefix:local" name
    qnames = {None: None}

    # uri -> prefix
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def register(qname):
        # compute and store the serialized form of one name
        try:
            if qname[:1] == "{":
                uri, local = qname[1:].rsplit("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = "%s:%s" % (prefix, local)
                else:
                    qnames[qname] = local  # default element
            else:
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
        except TypeError:
            _raise_serialization_error(qname)

    def note(name):
        # register a name only the first time it is seen
        if name not in qnames:
            register(name)

    # fill in the qname and namespace tables
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            note(tag.text)
        elif isinstance(tag, str):
            note(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            note(key)
            if isinstance(value, QName):
                note(value.text)
        text = node.text
        if isinstance(text, QName):
            note(text.text)
    return qnames, namespaces
898
def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Write *elem* and its descendants as XML through *write*.

    *qnames* maps tags and attribute names to their serialized form;
    *namespaces* maps URIs to prefixes and is only passed for the element
    that should carry the xmlns declarations (children receive None).
    *short_empty_elements* selects ``<tag />`` over ``<tag></tag>`` for
    elements without text or children.

    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        qualified = qnames[tag]
        if qualified is None:
            # No tag of its own: only the content is emitted.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_xml(write, child, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + qualified)
            attributes = list(elem.items())
            if namespaces:
                # namespace declarations, ordered by prefix
                declarations = sorted(namespaces.items(),
                                      key=lambda pair: pair[1])
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for name, value in sorted(attributes):  # lexical order
                if isinstance(name, QName):
                    name = name.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib(value)
                write(" %s=\"%s\"" % (qnames[name], value))
            if text or len(elem) or not short_empty_elements:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for child in elem:
                    _serialize_xml(write, child, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + qualified + ">")
            else:
                write(" />")
    tail = elem.tail
    if tail:
        write(_escape_cdata(tail))
948
# Lower-case names of HTML elements that are written without an end tag
# by the "html" serializer.  Built directly as a set: the former
# tuple-then-set() conversion guarded by NameError only mattered on
# interpreters without a built-in set type.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}
956
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Write *elem* and its descendants as HTML through *write*.

    *qnames* maps tags and attribute names to their serialized form;
    *namespaces* maps URIs to prefixes and is only passed for the element
    that should carry the xmlns declarations (children receive None).

    Text inside ``script`` and ``style`` elements is written unescaped,
    and elements listed in HTML_EMPTY get no end tag.

    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        qualified = qnames[tag]
        if qualified is None:
            # No tag of its own: only the content is emitted.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
        else:
            write("<" + qualified)
            attributes = list(elem.items())
            if namespaces:
                # namespace declarations, ordered by prefix
                declarations = sorted(namespaces.items(),
                                      key=lambda pair: pair[1])
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for name, value in sorted(attributes):  # lexical order
                if isinstance(name, QName):
                    name = name.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib_html(value)
                # FIXME: handle boolean attributes
                write(" %s=\"%s\"" % (qnames[name], value))
            write(">")
            lowered = qualified.lower()
            if text:
                if lowered in ("script", "style"):
                    # raw text elements: content must not be escaped
                    write(text)
                else:
                    write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
            if lowered not in HTML_EMPTY:
                write("</" + qualified + ">")
    tail = elem.tail
    if tail:
        write(_escape_cdata(tail))
1006
1007def _serialize_text(write, elem):
1008 for part in elem.itertext():
1009 write(part)
1010 if elem.tail:
1011 write(elem.tail)
1012
# Dispatch table from output method name to serializer function.
# ElementTree.write() rejects any method that is not a key here and
# looks the serializer up by name for every method except "text".
_serialize = {
    "xml": _serialize_xml,
    "html": _serialize_html,
    "text": _serialize_text,
# this optional method is imported at the end of the module
#   "c14n": _serialize_c14n,
}
Armin Rigo9ed73062005-12-14 18:10:45 +00001020
Armin Rigo9ed73062005-12-14 18:10:45 +00001021
def register_namespace(prefix, uri):
    """Associate a namespace prefix with a namespace URI.

    The registry is global.  Any entry already registered for the given
    URI, or already using the given prefix, is dropped before the new
    mapping is stored.

    *prefix* is the namespace prefix, *uri* is a namespace uri.  Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if *prefix* has the reserved ``ns<digits>`` form.

    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Gather conflicting entries first, then remove them, so the registry
    # is never mutated while it is being iterated.
    stale = [known_uri
             for known_uri, known_prefix in _namespace_map.items()
             if known_uri == uri or known_prefix == prefix]
    for known_uri in stale:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix
1040
# Global registry mapping namespace URIs to their preferred prefixes.
# _namespaces() consults it when a URI has no prefix yet, and
# register_namespace() updates it.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting: expose the registry as an attribute of
# the public function.
register_namespace._namespace_map = _namespace_map
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001055
1056def _raise_serialization_error(text):
1057 raise TypeError(
1058 "cannot serialize %r (type %s)" % (text, type(text).__name__)
1059 )
1060
1061def _escape_cdata(text):
1062 # escape character data
1063 try:
1064 # it's worth avoiding do-nothing calls for strings that are
1065 # shorter than 500 character, or so. assume that's, by far,
1066 # the most common case in most applications.
1067 if "&" in text:
1068 text = text.replace("&", "&amp;")
1069 if "<" in text:
1070 text = text.replace("<", "&lt;")
1071 if ">" in text:
1072 text = text.replace(">", "&gt;")
1073 return text
1074 except (TypeError, AttributeError):
1075 _raise_serialization_error(text)
1076
1077def _escape_attrib(text):
1078 # escape attribute value
1079 try:
1080 if "&" in text:
1081 text = text.replace("&", "&amp;")
1082 if "<" in text:
1083 text = text.replace("<", "&lt;")
1084 if ">" in text:
1085 text = text.replace(">", "&gt;")
1086 if "\"" in text:
1087 text = text.replace("\"", "&quot;")
Raymond Hettinger076366c2016-09-11 23:18:03 -07001088 # The following business with carriage returns is to satisfy
Raymond Hettinger11fa3ff2016-09-11 23:23:24 -07001089 # Section 2.11 of the XML specification, stating that
Raymond Hettinger076366c2016-09-11 23:18:03 -07001090 # CR or CR LN should be replaced with just LN
1091 # http://www.w3.org/TR/REC-xml/#sec-line-ends
1092 if "\r\n" in text:
1093 text = text.replace("\r\n", "\n")
1094 if "\r" in text:
1095 text = text.replace("\r", "\n")
1096 #The following four lines are issue 17582
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001097 if "\n" in text:
1098 text = text.replace("\n", "&#10;")
Raymond Hettinger076366c2016-09-11 23:18:03 -07001099 if "\t" in text:
1100 text = text.replace("\t", "&#09;")
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001101 return text
1102 except (TypeError, AttributeError):
1103 _raise_serialization_error(text)
1104
1105def _escape_attrib_html(text):
1106 # escape attribute value
1107 try:
1108 if "&" in text:
1109 text = text.replace("&", "&amp;")
1110 if ">" in text:
1111 text = text.replace(">", "&gt;")
1112 if "\"" in text:
1113 text = text.replace("\"", "&quot;")
1114 return text
1115 except (TypeError, AttributeError):
1116 _raise_serialization_error(text)
1117
1118# --------------------------------------------------------------------
1119
def tostring(element, encoding=None, method=None, *,
             short_empty_elements=True):
    """Serialize an XML element, including all subelements.

    *element* is an Element instance, *encoding* is an optional output
    encoding, *method* is an optional output method which can be one of
    "xml" (default), "html", "text" or "c14n", and *short_empty_elements*
    is passed through to ElementTree.write().

    Returns a string when *encoding* is "unicode", otherwise a bytestring
    in the requested encoding.

    """
    if encoding == 'unicode':
        stream = io.StringIO()
    else:
        stream = io.BytesIO()
    tree = ElementTree(element)
    tree.write(stream, encoding, method=method,
               short_empty_elements=short_empty_elements)
    return stream.getvalue()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001138
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001139class _ListDataStream(io.BufferedIOBase):
Eli Bendersky84fae782013-03-09 07:12:48 -08001140 """An auxiliary stream accumulating into a list reference."""
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001141 def __init__(self, lst):
1142 self.lst = lst
Eli Benderskyf90fc682012-07-17 15:09:56 +03001143
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001144 def writable(self):
1145 return True
1146
1147 def seekable(self):
1148 return True
1149
1150 def write(self, b):
1151 self.lst.append(b)
1152
1153 def tell(self):
1154 return len(self.lst)
1155
def tostringlist(element, encoding=None, method=None, *,
                 short_empty_elements=True):
    """Serialize an XML element into a list of output chunks.

    Takes the same arguments as tostring(); the serialized output is
    collected chunk by chunk, as written, and the resulting list is
    returned.

    """
    chunks = []
    sink = _ListDataStream(chunks)
    tree = ElementTree(element)
    tree.write(sink, encoding, method=method,
               short_empty_elements=short_empty_elements)
    return chunks
Armin Rigo9ed73062005-12-14 18:10:45 +00001163
Armin Rigo9ed73062005-12-14 18:10:45 +00001164
def dump(elem):
    """Write an element tree or element structure to sys.stdout.

    Intended for debugging only.

    *elem* is either an ElementTree or a single Element, which is wrapped
    in an ElementTree first.  The output is written with the "unicode"
    encoding, and a newline is added unless the root's tail already ends
    with one.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")
1182
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001183# --------------------------------------------------------------------
1184# parsing
Armin Rigo9ed73062005-12-14 18:10:45 +00001185
Armin Rigo9ed73062005-12-14 18:10:45 +00001186
def parse(source, parser=None):
    """Load an XML document into a new element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance; both are handed to
    ElementTree.parse().

    Return the populated ElementTree instance.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document
1199
Armin Rigo9ed73062005-12-14 18:10:45 +00001200
def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This function also reports what's going on to the user based on the
    *events* it is called with.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  Its ``root``
    attribute is None until the input has been fully consumed, and is then
    set to the root returned by the parser.

    """
    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)
    def iterator():
        # Generator body: nothing here runs until the first __next__()
        # call, so ``it`` and ``close_source`` (bound further down) are
        # already defined by the time they are read.
        try:
            while True:
                # hand out whatever events are already queued
                yield from pullparser.read_events()
                # load event buffer
                data = source.read(16 * 1024)
                if not data:
                    break
                pullparser.feed(data)
            root = pullparser._close_and_return_root()
            # closing the parser may have queued further events
            yield from pullparser.read_events()
            it.root = root
        finally:
            # only close a file that this function opened itself
            if close_source:
                source.close()

    class IterParseIterator(collections.abc.Iterator):
        # a single generator instance drives the iterator
        __next__ = iterator().__next__
    it = IterParseIterator()
    it.root = None
    del iterator, IterParseIterator

    close_source = False
    if not hasattr(source, "read"):
        # no read attribute: treat source as a file name
        source = open(source, "rb")
        close_source = True

    return it
Armin Rigo9ed73062005-12-14 18:10:45 +00001247
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001248
class XMLPullParser:
    """Parser that is fed data in pieces and queues events for pulling.

    *events* is a sequence of events to report; if omitted, only "end"
    events are reported.

    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        self._events_queue = collections.deque()
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # connect the parser to the queue so events get reported
        requested = ("end",) if events is None else events
        self._parser._setevents(self._events_queue, requested)

    def feed(self, data):
        """Feed encoded data to parser."""
        parser = self._parser
        if parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            parser.feed(data)
        except SyntaxError as exc:
            # Queue the error so it is raised from read_events(), after
            # the events that precede it.
            self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element.  Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.  A queued exception is raised when
        it is reached.
        """
        pending = self._events_queue
        while pending:
            item = pending.popleft()
            if isinstance(item, Exception):
                raise item
            yield item
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001300
1301
def XML(text, parser=None):
    """Parse an XML document held in a string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to an XMLParser that builds
    into a TreeBuilder.

    Returns the value of the parser's close() call.

    """
    parser = parser or XMLParser(target=TreeBuilder())
    parser.feed(text)
    return parser.close()
1317
Armin Rigo9ed73062005-12-14 18:10:45 +00001318
def XMLID(text, parser=None):
    """Parse an XML document from a string constant and index its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to an XMLParser that builds
    into a TreeBuilder.

    Returns an (Element, dict) tuple, in which the dict maps each
    non-empty "id" attribute value to its element; when a value occurs
    more than once, the last element seen wins.

    """
    parser = parser or XMLParser(target=TreeBuilder())
    parser.feed(text)
    tree = parser.close()
    ids = {}
    for elem in tree.iter():
        elem_id = elem.get("id")
        if elem_id:
            ids[elem_id] = elem
    return tree, ids
1339
# Parse XML document from string constant.  fromstring is an alias for
# XML(): both names refer to the same function object.
fromstring = XML
Armin Rigo9ed73062005-12-14 18:10:45 +00001342
def fromstringlist(sequence, parser=None):
    """Parse an XML document supplied as a sequence of string fragments.

    *sequence* is a list or other sequence of fragments, fed to the parser
    one after another; *parser* is an optional parser instance, defaulting
    to an XMLParser that builds into a TreeBuilder.

    Returns the value of the parser's close() call.

    """
    parser = parser or XMLParser(target=TreeBuilder())
    for fragment in sequence:
        parser.feed(fragment)
    return parser.close()
1357
1358# --------------------------------------------------------------------
Armin Rigo9ed73062005-12-14 18:10:45 +00001359
Armin Rigo9ed73062005-12-14 18:10:45 +00001360
class TreeBuilder:
    """Build an element structure from start, data, and end calls.

    The builder turns a sequence of start(), data() and end() method
    calls into a tree of elements.  It can be driven by a custom XML
    parser, or by a parser for some other XML-like format.

    *element_factory* is an optional callable invoked as
    ``element_factory(tag, attrs)`` to create each new element; it
    defaults to Element.

    """
    def __init__(self, element_factory=None):
        self._data = []    # text fragments collected since the last flush
        self._elem = []    # stack of currently open elements
        self._last = None  # most recently opened or closed element
        self._tail = None  # true once the last event was an end tag
        self._factory = (Element if element_factory is None
                         else element_factory)

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # Attach the collected text to the last element: as its tail if
        # that element has been closed, otherwise as its text.
        if not self._data:
            return
        last = self._last
        if last is not None:
            text = "".join(self._data)
            if self._tail:
                assert last.tail is None, "internal error (tail)"
                last.tail = text
            else:
                assert last.text is None, "internal error (text)"
                last.text = text
        self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        elem = self._factory(tag, attrs)
        self._last = elem
        open_elements = self._elem
        if open_elements:
            # nest under the innermost element that is still open
            open_elements[-1].append(elem)
        open_elements.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   closed.tag, tag)
        self._tail = 1
        return closed
1433
# Unique default for XMLParser's *html* argument: the constructor compares
# against it by identity, so any explicitly passed value (even None) is
# detected and triggers the deprecation warning.
_sentinel = ['sentinel']
Armin Rigo9ed73062005-12-14 18:10:45 +00001435
Eli Bendersky84fae782013-03-09 07:12:48 -08001436# also see ElementTree and TreeBuilder
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001437class XMLParser:
Eli Bendersky84fae782013-03-09 07:12:48 -08001438 """Element structure builder for XML source data based on the expat parser.
1439
Martin Panter29ce0822016-06-04 07:12:51 +00001440 *html* are predefined HTML entities (deprecated and not supported),
Eli Bendersky84fae782013-03-09 07:12:48 -08001441 *target* is an optional target object which defaults to an instance of the
1442 standard TreeBuilder class, *encoding* is an optional encoding string
1443 which if given, overrides the encoding specified in the XML file:
1444 http://www.iana.org/assignments/character-sets
1445
1446 """
Armin Rigo9ed73062005-12-14 18:10:45 +00001447
def __init__(self, html=_sentinel, target=None, encoding=None):
    """Create an expat parser and connect it to *target*.

    *html* is deprecated; passing any value only emits a
    DeprecationWarning.  *target* receives the parse events; when it is
    None a new TreeBuilder is used.  *encoding* is passed unchanged as the
    first argument of expat's ParserCreate().

    Raises ImportError if neither xml.parsers.expat nor pyexpat can be
    imported.
    """
    if html is not _sentinel:
        warnings.warn(
            "The html argument of XMLParser() is deprecated",
            DeprecationWarning, stacklevel=2)
    try:
        from xml.parsers import expat
    except ImportError:
        try:
            import pyexpat as expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
    # "}" is passed as ParserCreate()'s second argument; _fixname() later
    # prepends "{" to every name that contains it.
    expat_parser = expat.ParserCreate(encoding, "}")
    if target is None:
        target = TreeBuilder()
    # underscored names are provided for compatibility only
    self.parser = expat_parser
    self._parser = expat_parser
    self.target = target
    self._target = target
    self._error = expat.error
    self._names = {}  # name memo cache

    # main callbacks
    expat_parser.DefaultHandlerExpand = self._default
    # start/end go through wrappers on this object ...
    wrapped_hooks = (
        ("start", "StartElementHandler", self._start),
        ("end", "EndElementHandler", self._end),
    )
    for method_name, expat_slot, callback in wrapped_hooks:
        if hasattr(target, method_name):
            setattr(expat_parser, expat_slot, callback)
    # ... while data and the miscellaneous callbacks are wired straight
    # to the target's own methods.
    direct_hooks = (
        ("data", "CharacterDataHandler"),
        ("comment", "CommentHandler"),
        ("pi", "ProcessingInstructionHandler"),
    )
    for method_name, expat_slot in direct_hooks:
        if hasattr(target, method_name):
            setattr(expat_parser, expat_slot, getattr(target, method_name))

    # Configure pyexpat: buffering, new-style attribute handling.
    expat_parser.buffer_text = 1
    expat_parser.ordered_attributes = 1
    expat_parser.specified_attributes = 1
    self._doctype = None
    self.entity = {}
    try:
        self.version = "Expat %d.%d.%d" % expat.version_info
    except AttributeError:
        pass  # unknown
1493
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001494 def _setevents(self, events_queue, events_to_report):
Eli Benderskyb5869342013-08-30 05:51:20 -07001495 # Internal API for XMLPullParser
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001496 # events_to_report: a list of events to report during parsing (same as
Eli Benderskyb5869342013-08-30 05:51:20 -07001497 # the *events* of XMLPullParser's constructor.
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001498 # events_queue: a list of actual parsing events that will be populated
1499 # by the underlying parser.
1500 #
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001501 parser = self._parser
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001502 append = events_queue.append
1503 for event_name in events_to_report:
1504 if event_name == "start":
Eli Benderskyc9f5ca22013-04-20 09:11:37 -07001505 parser.ordered_attributes = 1
1506 parser.specified_attributes = 1
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001507 def handler(tag, attrib_in, event=event_name, append=append,
Eli Bendersky6206a7e2013-08-25 18:58:18 -07001508 start=self._start):
Eli Benderskyc9f5ca22013-04-20 09:11:37 -07001509 append((event, start(tag, attrib_in)))
1510 parser.StartElementHandler = handler
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001511 elif event_name == "end":
1512 def handler(tag, event=event_name, append=append,
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001513 end=self._end):
1514 append((event, end(tag)))
1515 parser.EndElementHandler = handler
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001516 elif event_name == "start-ns":
1517 def handler(prefix, uri, event=event_name, append=append):
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001518 append((event, (prefix or "", uri or "")))
1519 parser.StartNamespaceDeclHandler = handler
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001520 elif event_name == "end-ns":
1521 def handler(prefix, event=event_name, append=append):
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001522 append((event, None))
1523 parser.EndNamespaceDeclHandler = handler
1524 else:
Eli Bendersky3a4fbd82013-05-19 09:01:49 -07001525 raise ValueError("unknown event %r" % event_name)
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001526
def _raiseerror(self, value):
    """Raise a ParseError built from the expat error *value*.

    The ParseError carries ``value.code`` as ``code`` and the tuple
    ``(value.lineno, value.offset)`` as ``position``.
    """
    parse_error = ParseError(value)
    parse_error.code = value.code
    parse_error.position = (value.lineno, value.offset)
    raise parse_error
Armin Rigo9ed73062005-12-14 18:10:45 +00001532
Armin Rigo9ed73062005-12-14 18:10:45 +00001533 def _fixname(self, key):
1534 # expand qname, and convert name string to ascii, if possible
1535 try:
1536 name = self._names[key]
1537 except KeyError:
1538 name = key
1539 if "}" in name:
1540 name = "{" + name
Martin v. Löwisf30bb0e2007-07-28 11:40:46 +00001541 self._names[key] = name
Armin Rigo9ed73062005-12-14 18:10:45 +00001542 return name
1543
Eli Bendersky6206a7e2013-08-25 18:58:18 -07001544 def _start(self, tag, attr_list):
1545 # Handler for expat's StartElementHandler. Since ordered_attributes
1546 # is set, the attributes are reported as a list of alternating
1547 # attribute name,value.
Armin Rigo9ed73062005-12-14 18:10:45 +00001548 fixname = self._fixname
1549 tag = fixname(tag)
1550 attrib = {}
Eli Bendersky6206a7e2013-08-25 18:58:18 -07001551 if attr_list:
1552 for i in range(0, len(attr_list), 2):
1553 attrib[fixname(attr_list[i])] = attr_list[i+1]
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001554 return self.target.start(tag, attrib)
Armin Rigo9ed73062005-12-14 18:10:45 +00001555
Armin Rigo9ed73062005-12-14 18:10:45 +00001556 def _end(self, tag):
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001557 return self.target.end(self._fixname(tag))
1558
Armin Rigo9ed73062005-12-14 18:10:45 +00001559 def _default(self, text):
1560 prefix = text[:1]
1561 if prefix == "&":
1562 # deal with undefined entities
1563 try:
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001564 data_handler = self.target.data
1565 except AttributeError:
1566 return
1567 try:
1568 data_handler(self.entity[text[1:-1]])
Armin Rigo9ed73062005-12-14 18:10:45 +00001569 except KeyError:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001570 from xml.parsers import expat
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001571 err = expat.error(
Armin Rigo9ed73062005-12-14 18:10:45 +00001572 "undefined entity %s: line %d, column %d" %
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001573 (text, self.parser.ErrorLineNumber,
1574 self.parser.ErrorColumnNumber)
Armin Rigo9ed73062005-12-14 18:10:45 +00001575 )
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001576 err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001577 err.lineno = self.parser.ErrorLineNumber
1578 err.offset = self.parser.ErrorColumnNumber
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001579 raise err
Armin Rigo9ed73062005-12-14 18:10:45 +00001580 elif prefix == "<" and text[:9] == "<!DOCTYPE":
1581 self._doctype = [] # inside a doctype declaration
1582 elif self._doctype is not None:
1583 # parse doctype contents
1584 if prefix == ">":
1585 self._doctype = None
1586 return
Neal Norwitz9d72bb42007-04-17 08:48:32 +00001587 text = text.strip()
Armin Rigo9ed73062005-12-14 18:10:45 +00001588 if not text:
1589 return
1590 self._doctype.append(text)
1591 n = len(self._doctype)
1592 if n > 2:
1593 type = self._doctype[1]
1594 if type == "PUBLIC" and n == 4:
1595 name, type, pubid, system = self._doctype
Florent Xiclunaa1c974a2012-07-07 13:16:44 +02001596 if pubid:
1597 pubid = pubid[1:-1]
Armin Rigo9ed73062005-12-14 18:10:45 +00001598 elif type == "SYSTEM" and n == 3:
1599 name, type, system = self._doctype
1600 pubid = None
1601 else:
1602 return
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001603 if hasattr(self.target, "doctype"):
1604 self.target.doctype(name, pubid, system[1:-1])
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001605 elif self.doctype != self._XMLParser__doctype:
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001606 # warn about deprecated call
1607 self._XMLParser__doctype(name, pubid, system[1:-1])
1608 self.doctype(name, pubid, system[1:-1])
Armin Rigo9ed73062005-12-14 18:10:45 +00001609 self._doctype = None
1610
def doctype(self, name, pubid, system):
    """(Deprecated) Handle a doctype declaration.

    *name* is the Doctype name, *pubid* is the public identifier,
    and *system* is the system identifier.

    This implementation only emits a DeprecationWarning.
    """
    message = (
        "This method of XMLParser is deprecated. Define doctype() "
        "method on the TreeBuilder target."
    )
    warnings.warn(message, DeprecationWarning)

# sentinel, if doctype is redefined in a subclass
__doctype = doctype
Armin Rigo9ed73062005-12-14 18:10:45 +00001626
def feed(self, data):
    """Feed encoded data to parser.

    *data* is handed to the expat parser as a non-final chunk.  An expat
    error is passed on to _raiseerror().
    """
    try:
        self.parser.Parse(data, False)
    except self._error as exc:
        self._raiseerror(exc)
Armin Rigo9ed73062005-12-14 18:10:45 +00001633
def close(self):
    """Finish feeding data to parser and return element structure.

    Signals end of data to the expat parser (an expat error is passed on
    to _raiseerror()), then returns the result of the target's close()
    method, or None when the target has no such method.  The references
    to the parser and target are deleted afterwards.
    """
    try:
        self.parser.Parse("", True)  # end of data
    except self._error as exc:
        self._raiseerror(exc)
    try:
        try:
            finish = self.target.close
        except AttributeError:
            return None
        return finish()
    finally:
        # get rid of circular references
        del self.parser, self._parser
        del self.target, self._target
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001650
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001651
1652# Import the C accelerators
# Element is going to be shadowed by the C implementation. We need to keep
# the Python version of it accessible for some "creative" use by external
# code (see tests)
_Element_Py = Element

try:
    # Element, SubElement, ParseError, TreeBuilder, XMLParser
    from _elementtree import *
except ImportError:
    # The C accelerator could not be imported: keep the definitions above.
    pass