blob: 6c1345a5ad6ac640d5f7a09b8bf3f402c5cca28a [file] [log] [blame]
Eli Bendersky84fae782013-03-09 07:12:48 -08001"""Lightweight XML support for Python.
2
3 XML is an inherently hierarchical data format, and the most natural way to
4 represent it is with a tree. This module has two classes for this purpose:
5
6 1. ElementTree represents the whole XML document as a tree and
7
8 2. Element represents a single node in this tree.
9
10 Interactions with the whole document (reading and writing to/from files) are
11 usually done on the ElementTree level. Interactions with a single XML element
12 and its sub-elements are done on the Element level.
13
14 Element is a flexible container object designed to store hierarchical data
15 structures in memory. It can be described as a cross between a list and a
16 dictionary. Each Element has a number of properties associated with it:
17
18 'tag' - a string containing the element's name.
19
20 'attributes' - a Python dictionary storing the element's attributes.
21
22 'text' - a string containing the element's text content.
23
24 'tail' - an optional string containing text after the element's end tag.
25
26 And a number of child elements stored in a Python sequence.
27
28 To create an element instance, use the Element constructor,
29 or the SubElement factory function.
30
31 You can also use the ElementTree class to wrap an element structure
32 and convert it to and from XML.
33
34"""
35
Eli Benderskybf05df22013-04-20 05:44:01 -070036#---------------------------------------------------------------------
37# Licensed to PSF under a Contributor Agreement.
38# See http://www.python.org/psf/license for licensing details.
Armin Rigo9ed73062005-12-14 18:10:45 +000039#
40# ElementTree
Florent Xiclunaf15351d2010-03-13 23:24:31 +000041# Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
Armin Rigo9ed73062005-12-14 18:10:45 +000042#
43# fredrik@pythonware.com
44# http://www.pythonware.com
Armin Rigo9ed73062005-12-14 18:10:45 +000045# --------------------------------------------------------------------
46# The ElementTree toolkit is
47#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000048# Copyright (c) 1999-2008 by Fredrik Lundh
Armin Rigo9ed73062005-12-14 18:10:45 +000049#
50# By obtaining, using, and/or copying this software and/or its
51# associated documentation, you agree that you have read, understood,
52# and will comply with the following terms and conditions:
53#
54# Permission to use, copy, modify, and distribute this software and
55# its associated documentation for any purpose and without fee is
56# hereby granted, provided that the above copyright notice appears in
57# all copies, and that both that copyright notice and this permission
58# notice appear in supporting documentation, and that the name of
59# Secret Labs AB or the author not be used in advertising or publicity
60# pertaining to distribution of the software without specific, written
61# prior permission.
62#
63# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
70# OF THIS SOFTWARE.
71# --------------------------------------------------------------------
72
# Names exported by a star-import of this module.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring", "fromstringlist",
    "iselement", "iterparse",
    "parse", "ParseError",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring", "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML", "XMLID",
    "XMLParser",
    "register_namespace",
    ]

# Version string; part of the public API (it is listed in __all__ above).
VERSION = "1.3.0"
93
Florent Xiclunaf15351d2010-03-13 23:24:31 +000094import sys
95import re
96import warnings
Eli Bendersky00f402b2012-07-15 06:02:22 +030097import io
98import contextlib
Armin Rigo9ed73062005-12-14 18:10:45 +000099
Eli Bendersky27cbb192012-06-15 09:03:19 +0300100from . import ElementPath
Armin Rigo9ed73062005-12-14 18:10:45 +0000101
Armin Rigo9ed73062005-12-14 18:10:45 +0000102
class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, a ParseError carries two extra
    attributes:
        'code'     - the specific exception code
        'position' - the line and column of the error

    """
113
114# --------------------------------------------------------------------
115
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000116
def iselement(element):
    """Return True if *element* looks like an Element (has a 'tag' attribute)."""
    missing = object()
    # A private sentinel distinguishes "no such attribute" from any real
    # value, including None.
    return getattr(element, 'tag', missing) is not missing
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000120
Armin Rigo9ed73062005-12-14 18:10:45 +0000121
class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes.  *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement.  This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        # The shared default dict is never mutated: it is copied below
        # before the keyword attributes are merged in.
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy.  Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        return len(self._children)

    def __bool__(self):
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        return self._children[index]

    def __setitem__(self, index, element):
        # Unlike append(), insert() and extend(), item and slice assignment
        # do not type-check the value(s) being stored.
        self._children[index] = element

    def __delitem__(self, index):
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is an iterable with zero or more elements.  It is
        consumed exactly once, so one-shot iterators and generators work.

        Raises TypeError if any item is not an Element; in that case no
        item is added.

        """
        # Snapshot first: validating and then extending from the same
        # one-shot iterator would find it already exhausted, and extending
        # from this very element would iterate a list that is growing.
        elements = list(elements)
        for element in elements:
            self._assert_is_element(element)
        self._children.extend(elements)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order.

        """
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was
        not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Names are returned in an arbitrary order, just like an ordinary
        Python dict.  Equivalent to attrib.keys()

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        The attributes are returned in arbitrary order.  Equivalent to
        attrib.items().

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        # Change for a DeprecationWarning in 1.4
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            PendingDeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None yield no text.
        if not isinstance(tag, str) and tag is not None:
            return
        if self.text:
            yield self.text
        for e in self:
            yield from e.itertext()
            if e.tail:
                yield e.tail
Armin Rigo9ed73062005-12-14 18:10:45 +0000437
Armin Rigo9ed73062005-12-14 18:10:45 +0000438
def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and append it to an existing parent.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments.

    Return the newly created element, as built by parent.makeelement().

    """
    # Work on a copy so neither the caller's dict nor the shared default
    # is modified when the keyword attributes are merged in.
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
456
Armin Rigo9ed73062005-12-14 18:10:45 +0000457
def Comment(text=None):
    """Comment element factory.

    Create a special element which the standard serializer serializes as an
    XML comment.  The element's tag is this factory function itself.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node
470
Armin Rigo9ed73062005-12-14 18:10:45 +0000471
def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Create a special element which the standard serializer serializes as an
    XML processing instruction.  The element's tag is this factory function
    itself.

    *target* is a string containing the processing instruction target,
    *text* is a string containing the processing instruction contents, if
    any.  When *text* is given and non-empty, the element's text is the
    target and the contents joined by a single space.

    """
    node = Element(ProcessingInstruction)
    node.text = target
    if not text:
        return node
    node.text = node.text + " " + text
    return node

PI = ProcessingInstruction
489
Armin Rigo9ed73062005-12-14 18:10:45 +0000490
class QName:
    """Qualified name wrapper.

    This class can be used to wrap a QName attribute value in order to get
    proper namespace handling on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Comparisons and hashing are delegated to the wrapped text; the other
    operand may be another QName or a plain value.

    """
    def __init__(self, text_or_uri, tag=None):
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    def __le__(self, other):
        return self.text <= (other.text if isinstance(other, QName) else other)

    def __lt__(self, other):
        return self.text < (other.text if isinstance(other, QName) else other)

    def __ge__(self, other):
        return self.text >= (other.text if isinstance(other, QName) else other)

    def __gt__(self, other):
        return self.text > (other.text if isinstance(other, QName) else other)

    def __eq__(self, other):
        return self.text == (other.text if isinstance(other, QName) else other)

    def __ne__(self, other):
        return self.text != (other.text if isinstance(other, QName) else other)
Armin Rigo9ed73062005-12-14 18:10:45 +0000539
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000540# --------------------------------------------------------------------
541
Armin Rigo9ed73062005-12-14 18:10:45 +0000542
543class ElementTree:
Eli Bendersky84fae782013-03-09 07:12:48 -0800544 """An XML element hierarchy.
Armin Rigo9ed73062005-12-14 18:10:45 +0000545
Eli Bendersky84fae782013-03-09 07:12:48 -0800546 This class also provides support for serialization to and from
547 standard XML.
548
549 *element* is an optional root element node,
550 *file* is an optional file handle or file name of an XML file whose
551 contents will be used to initialize the tree with.
552
553 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000554 def __init__(self, element=None, file=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000555 # assert element is None or iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000556 self._root = element # first node
557 if file:
558 self.parse(file)
559
Armin Rigo9ed73062005-12-14 18:10:45 +0000560 def getroot(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800561 """Return root element of this tree."""
Armin Rigo9ed73062005-12-14 18:10:45 +0000562 return self._root
563
Armin Rigo9ed73062005-12-14 18:10:45 +0000564 def _setroot(self, element):
Eli Bendersky84fae782013-03-09 07:12:48 -0800565 """Replace root element of this tree.
566
567 This will discard the current contents of the tree and replace it
568 with the given element. Use with care!
569
570 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000571 # assert iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000572 self._root = element
573
Armin Rigo9ed73062005-12-14 18:10:45 +0000574 def parse(self, source, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800575 """Load external XML document into element tree.
576
577 *source* is a file name or file object, *parser* is an optional parser
578 instance that defaults to XMLParser.
579
580 ParseError is raised if the parser fails to parse the document.
581
582 Returns the root element of the given source document.
583
584 """
Antoine Pitroue033e062010-10-29 10:38:18 +0000585 close_source = False
Armin Rigo9ed73062005-12-14 18:10:45 +0000586 if not hasattr(source, "read"):
587 source = open(source, "rb")
Antoine Pitroue033e062010-10-29 10:38:18 +0000588 close_source = True
589 try:
Eli Benderskya3699232013-05-19 18:47:23 -0700590 if parser is None:
591 # If no parser was specified, create a default XMLParser
592 parser = XMLParser()
593 if hasattr(parser, '_parse_whole'):
594 # The default XMLParser, when it comes from an accelerator,
595 # can define an internal _parse_whole API for efficiency.
596 # It can be used to parse the whole source without feeding
597 # it with chunks.
598 self._root = parser._parse_whole(source)
599 return self._root
600 while True:
Antoine Pitroue033e062010-10-29 10:38:18 +0000601 data = source.read(65536)
602 if not data:
603 break
604 parser.feed(data)
605 self._root = parser.close()
606 return self._root
607 finally:
608 if close_source:
609 source.close()
Armin Rigo9ed73062005-12-14 18:10:45 +0000610
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000611 def iter(self, tag=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800612 """Create and return tree iterator for the root element.
613
614 The iterator loops over all elements in this tree, in document order.
615
616 *tag* is a string with the tag name to iterate over
617 (default is to return all elements).
618
619 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000620 # assert self._root is not None
621 return self._root.iter(tag)
622
623 # compatibility
Armin Rigo9ed73062005-12-14 18:10:45 +0000624 def getiterator(self, tag=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000625 # Change for a DeprecationWarning in 1.4
626 warnings.warn(
627 "This method will be removed in future versions. "
628 "Use 'tree.iter()' or 'list(tree.iter())' instead.",
629 PendingDeprecationWarning, stacklevel=2
630 )
631 return list(self.iter(tag))
Armin Rigo9ed73062005-12-14 18:10:45 +0000632
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000633 def find(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800634 """Find first matching element by tag name or path.
635
636 Same as getroot().find(path), which is Element.find()
637
638 *path* is a string having either an element tag or an XPath,
639 *namespaces* is an optional mapping from namespace prefix to full name.
640
641 Return the first matching element, or None if no element was found.
642
643 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000644 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000645 if path[:1] == "/":
646 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000647 warnings.warn(
648 "This search is broken in 1.3 and earlier, and will be "
649 "fixed in a future version. If you rely on the current "
650 "behaviour, change it to %r" % path,
651 FutureWarning, stacklevel=2
652 )
653 return self._root.find(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000654
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000655 def findtext(self, path, default=None, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800656 """Find first matching element by tag name or path.
657
658 Same as getroot().findtext(path), which is Element.findtext()
659
660 *path* is a string having either an element tag or an XPath,
661 *namespaces* is an optional mapping from namespace prefix to full name.
662
663 Return the first matching element, or None if no element was found.
664
665 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000666 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000667 if path[:1] == "/":
668 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000669 warnings.warn(
670 "This search is broken in 1.3 and earlier, and will be "
671 "fixed in a future version. If you rely on the current "
672 "behaviour, change it to %r" % path,
673 FutureWarning, stacklevel=2
674 )
675 return self._root.findtext(path, default, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000676
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000677 def findall(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800678 """Find all matching subelements by tag name or path.
679
680 Same as getroot().findall(path), which is Element.findall().
681
682 *path* is a string having either an element tag or an XPath,
683 *namespaces* is an optional mapping from namespace prefix to full name.
684
685 Return list containing all matching elements in document order.
686
687 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000688 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000689 if path[:1] == "/":
690 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000691 warnings.warn(
692 "This search is broken in 1.3 and earlier, and will be "
693 "fixed in a future version. If you rely on the current "
694 "behaviour, change it to %r" % path,
695 FutureWarning, stacklevel=2
696 )
697 return self._root.findall(path, namespaces)
698
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000699 def iterfind(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800700 """Find all matching subelements by tag name or path.
701
702 Same as getroot().iterfind(path), which is element.iterfind()
703
704 *path* is a string having either an element tag or an XPath,
705 *namespaces* is an optional mapping from namespace prefix to full name.
706
707 Return an iterable yielding all matching elements in document order.
708
709 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000710 # assert self._root is not None
711 if path[:1] == "/":
712 path = "." + path
713 warnings.warn(
714 "This search is broken in 1.3 and earlier, and will be "
715 "fixed in a future version. If you rely on the current "
716 "behaviour, change it to %r" % path,
717 FutureWarning, stacklevel=2
718 )
719 return self._root.iterfind(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000720
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000721 def write(self, file_or_filename,
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000722 encoding=None,
723 xml_declaration=None,
724 default_namespace=None,
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800725 method=None, *,
726 short_empty_elements=True):
Eli Bendersky84fae782013-03-09 07:12:48 -0800727 """Write element tree to a file as XML.
728
729 Arguments:
730 *file_or_filename* -- file name or a file object opened for writing
731
732 *encoding* -- the output encoding (default: US-ASCII)
733
734 *xml_declaration* -- bool indicating if an XML declaration should be
735 added to the output. If None, an XML declaration
736 is added if encoding IS NOT either of:
737 US-ASCII, UTF-8, or Unicode
738
739 *default_namespace* -- sets the default XML namespace (for "xmlns")
740
741 *method* -- either "xml" (default), "html, "text", or "c14n"
742
743 *short_empty_elements* -- controls the formatting of elements
744 that contain no content. If True (default)
745 they are emitted as a single self-closed
746 tag, otherwise they are emitted as a pair
747 of start/end tags
Eli Benderskye9af8272013-01-13 06:27:51 -0800748
749 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000750 if not method:
751 method = "xml"
752 elif method not in _serialize:
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000753 raise ValueError("unknown method %r" % method)
Florent Xiclunac17f1722010-08-08 19:48:29 +0000754 if not encoding:
755 if method == "c14n":
756 encoding = "utf-8"
757 else:
758 encoding = "us-ascii"
Florent Xiclunac17f1722010-08-08 19:48:29 +0000759 else:
760 encoding = encoding.lower()
Eli Bendersky00f402b2012-07-15 06:02:22 +0300761 with _get_writer(file_or_filename, encoding) as write:
762 if method == "xml" and (xml_declaration or
763 (xml_declaration is None and
764 encoding not in ("utf-8", "us-ascii", "unicode"))):
765 declared_encoding = encoding
766 if encoding == "unicode":
767 # Retrieve the default encoding for the xml declaration
768 import locale
769 declared_encoding = locale.getpreferredencoding()
770 write("<?xml version='1.0' encoding='%s'?>\n" % (
771 declared_encoding,))
772 if method == "text":
773 _serialize_text(write, self._root)
Armin Rigo9ed73062005-12-14 18:10:45 +0000774 else:
Eli Bendersky00f402b2012-07-15 06:02:22 +0300775 qnames, namespaces = _namespaces(self._root, default_namespace)
776 serialize = _serialize[method]
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800777 serialize(write, self._root, qnames, namespaces,
778 short_empty_elements=short_empty_elements)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000779
780 def write_c14n(self, file):
781 # lxml.etree compatibility. use output method instead
782 return self.write(file, method="c14n")
Armin Rigo9ed73062005-12-14 18:10:45 +0000783
784# --------------------------------------------------------------------
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000785# serialization support
786
Eli Bendersky00f402b2012-07-15 06:02:22 +0300787@contextlib.contextmanager
788def _get_writer(file_or_filename, encoding):
Ezio Melottib5bc3532013-08-17 16:11:40 +0300789 # returns text write method and release all resources after using
Eli Bendersky00f402b2012-07-15 06:02:22 +0300790 try:
791 write = file_or_filename.write
792 except AttributeError:
793 # file_or_filename is a file name
794 if encoding == "unicode":
795 file = open(file_or_filename, "w")
796 else:
797 file = open(file_or_filename, "w", encoding=encoding,
798 errors="xmlcharrefreplace")
799 with file:
800 yield file.write
801 else:
802 # file_or_filename is a file-like object
803 # encoding determines if it is a text or binary writer
804 if encoding == "unicode":
805 # use a text writer as is
806 yield write
807 else:
808 # wrap a binary writer with TextIOWrapper
809 with contextlib.ExitStack() as stack:
810 if isinstance(file_or_filename, io.BufferedIOBase):
811 file = file_or_filename
812 elif isinstance(file_or_filename, io.RawIOBase):
813 file = io.BufferedWriter(file_or_filename)
814 # Keep the original file open when the BufferedWriter is
815 # destroyed
816 stack.callback(file.detach)
817 else:
818 # This is to handle passed objects that aren't in the
819 # IOBase hierarchy, but just have a write method
820 file = io.BufferedIOBase()
821 file.writable = lambda: True
822 file.write = write
823 try:
824 # TextIOWrapper uses this methods to determine
825 # if BOM (for UTF-16, etc) should be added
826 file.seekable = file_or_filename.seekable
827 file.tell = file_or_filename.tell
828 except AttributeError:
829 pass
830 file = io.TextIOWrapper(file,
831 encoding=encoding,
832 errors="xmlcharrefreplace",
833 newline="\n")
834 # Keep the original file open when the TextIOWrapper is
835 # destroyed
836 stack.callback(file.detach)
837 yield file.write
838
def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used in a tree.

    Walks *elem* and all its descendants, looking at tags, attribute
    names, and QName instances used as attribute values or as text.

    Returns a (qnames, namespaces) tuple: *qnames* maps each name to its
    serialized "prefix:local" form, *namespaces* maps namespace URIs to
    prefixes (the empty prefix stands for *default_namespace*).

    ValueError is raised if *default_namespace* is given and a
    non-qualified name is found.
    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] != "{":
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    # Unknown URI: generate an "nsN" prefix.
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            if prefix:
                qnames[qname] = "%s:%s" % (prefix, local)
            else:
                qnames[qname] = local  # default element
        except TypeError:
            _raise_serialization_error(qname)

    def ensure(name):
        # register a name unless it is already known
        if name not in qnames:
            add_qname(name)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            ensure(tag.text)
        elif isinstance(tag, str):
            ensure(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            ensure(key)
            if isinstance(value, QName):
                ensure(value.text)
        text = node.text
        if isinstance(text, QName):
            ensure(text.text)
    return qnames, namespaces
899
def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Write *elem* and its subtree as XML through *write*.

    *qnames* maps tags and attribute names to their serialized form and
    *namespaces* maps URIs to the prefixes declared on this element;
    recursive calls for children pass None so the declarations are
    written only once.  *short_empty_elements* selects "<tag />" over
    "<tag></tag>" for elements without text or children.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        tag = qnames[tag]
        if tag is None:
            # No tag to write: emit only the content.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_xml(write, child, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + tag)
            attributes = list(elem.items())
            if namespaces:
                by_prefix = sorted(namespaces.items(),
                                   key=lambda pair: pair[1])  # sort on prefix
                for uri, prefix in by_prefix:
                    if prefix:
                        prefix = ":" + prefix
                    write(' xmlns%s="%s"' % (prefix, _escape_attrib(uri)))
            for name, value in sorted(attributes):  # lexical order
                if isinstance(name, QName):
                    name = name.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib(value)
                write(' %s="%s"' % (qnames[name], value))
            if not (text or len(elem) or not short_empty_elements):
                write(" />")
            else:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for child in elem:
                    _serialize_xml(write, child, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
949
# Tag names for which the HTML serializer writes no end tag.  Built
# directly as a set: the old "try set() / except NameError" fallback can
# never trigger on Python 3, where set is always a builtin.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}
957
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Write *elem* and its subtree as HTML through *write*.

    *qnames* maps tags and attribute names to their serialized form and
    *namespaces* maps URIs to the prefixes declared on this element;
    recursive calls for children pass None.  Text of "script" and
    "style" elements is written unescaped, and no end tag is written for
    tags listed in HTML_EMPTY.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            # No tag to write: emit only the content.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
        else:
            write("<" + tag)
            attributes = list(elem.items())
            if namespaces:
                by_prefix = sorted(namespaces.items(),
                                   key=lambda pair: pair[1])  # sort on prefix
                for uri, prefix in by_prefix:
                    if prefix:
                        prefix = ":" + prefix
                    write(' xmlns%s="%s"' % (prefix, _escape_attrib(uri)))
            for name, value in sorted(attributes):  # lexical order
                if isinstance(name, QName):
                    name = name.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib_html(value)
                # FIXME: handle boolean attributes
                write(' %s="%s"' % (qnames[name], value))
            write(">")
            ltag = tag.lower()
            if text:
                if ltag in ("script", "style"):
                    write(text)
                else:
                    write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
            if ltag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
1007
1008def _serialize_text(write, elem):
1009 for part in elem.itertext():
1010 write(part)
1011 if elem.tail:
1012 write(elem.tail)
1013
# Dispatch table: output method name -> serializer function.
# The optional "c14n" entry (_serialize_c14n) is not listed here; it is
# imported at the end of the module.
_serialize = dict(
    xml=_serialize_xml,
    html=_serialize_html,
    text=_serialize_text,
)
Armin Rigo9ed73062005-12-14 18:10:45 +00001021
Armin Rigo9ed73062005-12-14 18:10:45 +00001022
def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix is reserved or is invalid.

    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Iterate over a snapshot, entries are deleted while scanning.
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix
1041
# Global registry mapping namespace URIs to the prefixes used when
# serializing; register_namespace() updates it in place.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",

    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",

    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}

# Expose the registry on the function, for tests and troubleshooting.
register_namespace._namespace_map = _namespace_map
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001056
1057def _raise_serialization_error(text):
1058 raise TypeError(
1059 "cannot serialize %r (type %s)" % (text, type(text).__name__)
1060 )
1061
1062def _escape_cdata(text):
1063 # escape character data
1064 try:
1065 # it's worth avoiding do-nothing calls for strings that are
1066 # shorter than 500 character, or so. assume that's, by far,
1067 # the most common case in most applications.
1068 if "&" in text:
1069 text = text.replace("&", "&amp;")
1070 if "<" in text:
1071 text = text.replace("<", "&lt;")
1072 if ">" in text:
1073 text = text.replace(">", "&gt;")
1074 return text
1075 except (TypeError, AttributeError):
1076 _raise_serialization_error(text)
1077
1078def _escape_attrib(text):
1079 # escape attribute value
1080 try:
1081 if "&" in text:
1082 text = text.replace("&", "&amp;")
1083 if "<" in text:
1084 text = text.replace("<", "&lt;")
1085 if ">" in text:
1086 text = text.replace(">", "&gt;")
1087 if "\"" in text:
1088 text = text.replace("\"", "&quot;")
1089 if "\n" in text:
1090 text = text.replace("\n", "&#10;")
1091 return text
1092 except (TypeError, AttributeError):
1093 _raise_serialization_error(text)
1094
1095def _escape_attrib_html(text):
1096 # escape attribute value
1097 try:
1098 if "&" in text:
1099 text = text.replace("&", "&amp;")
1100 if ">" in text:
1101 text = text.replace(">", "&gt;")
1102 if "\"" in text:
1103 text = text.replace("\"", "&quot;")
1104 return text
1105 except (TypeError, AttributeError):
1106 _raise_serialization_error(text)
1107
1108# --------------------------------------------------------------------
1109
def tostring(element, encoding=None, method=None, *,
             short_empty_elements=True):
    """Serialize an element, including all subelements, to a string.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output method
    which can be one of "xml" (default), "html", "text" or "c14n", and
    *short_empty_elements* is passed on to ElementTree.write().

    Returns a string if *encoding* is "unicode", otherwise a bytestring
    holding the encoded data.

    """
    if encoding == 'unicode':
        stream = io.StringIO()
    else:
        stream = io.BytesIO()
    tree = ElementTree(element)
    tree.write(stream, encoding, method=method,
               short_empty_elements=short_empty_elements)
    return stream.getvalue()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001128
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001129class _ListDataStream(io.BufferedIOBase):
Eli Bendersky84fae782013-03-09 07:12:48 -08001130 """An auxiliary stream accumulating into a list reference."""
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001131 def __init__(self, lst):
1132 self.lst = lst
Eli Benderskyf90fc682012-07-17 15:09:56 +03001133
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001134 def writable(self):
1135 return True
1136
1137 def seekable(self):
1138 return True
1139
1140 def write(self, b):
1141 self.lst.append(b)
1142
1143 def tell(self):
1144 return len(self.lst)
1145
def tostringlist(element, encoding=None, method=None, *,
                 short_empty_elements=True):
    """Serialize an element to a list of chunks.

    Takes the same arguments as tostring().  The tree is written to a
    stream that appends each chunk it receives to a list, and that list
    is returned.

    """
    chunks = []
    tree = ElementTree(element)
    tree.write(_ListDataStream(chunks), encoding, method=method,
               short_empty_elements=short_empty_elements)
    return chunks
Armin Rigo9ed73062005-12-14 18:10:45 +00001153
Armin Rigo9ed73062005-12-14 18:10:45 +00001154
def dump(elem):
    """Write an element tree or element structure to sys.stdout.

    Intended for debugging only.

    *elem* is either an ElementTree, or a single Element. The exact output
    format is implementation dependent. In this version, it's written as an
    ordinary XML file, followed by a newline unless the root element's
    tail already ends with one.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    if not tail or tail[-1] != "\n":
        sys.stdout.write("\n")
1172
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001173# --------------------------------------------------------------------
1174# parsing
Armin Rigo9ed73062005-12-14 18:10:45 +00001175
Armin Rigo9ed73062005-12-14 18:10:45 +00001176
def parse(source, parser=None):
    """Parse an XML document into an element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.

    Return the ElementTree instance that was populated from *source*.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document
1189
Armin Rigo9ed73062005-12-14 18:10:45 +00001190
def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This function also reports what's going on to the user based on the
    *events* it is called with. The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information). If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.

    """
    close_source = False
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True
    try:
        return _IterParseIterator(source, events, parser, close_source)
    except BaseException:
        # Setting up the iterator failed: don't leak a file opened here.
        # A file object supplied by the caller is left untouched.
        if close_source:
            source.close()
        raise
Armin Rigo9ed73062005-12-14 18:10:45 +00001211
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001212
class XMLPullParser:
    """Parser that queues parsing events for later retrieval.

    Data is supplied through feed(); the resulting events are collected
    in an internal queue and handed out by read_events().

    *events* is a sequence of events to report back; if omitted, only
    "end" events are reported.

    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        # _elementtree.c expects a list, not a deque
        self._events_queue = []
        self._index = 0
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        requested = ("end",) if events is None else events
        self._parser._setevents(self._events_queue, requested)

    def feed(self, data):
        """Feed encoded data to parser.

        A SyntaxError raised by the underlying parser is not propagated
        here; it is queued and raised later by read_events().  ValueError
        is raised if the parser has already been closed.
        """
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            self._parser.feed(data)
        except SyntaxError as exc:
            self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element.  Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.  A queued exception is raised when
        its turn comes.
        """
        queue = self._events_queue
        while True:
            position = self._index
            if position >= len(queue):
                break
            event = queue[position]
            # Avoid retaining references to past events
            queue[position] = None
            consumed = position + 1
            # Compact the list in a O(1) amortized fashion
            # As noted above, _elementree.c needs a list, not a deque
            if consumed * 2 >= len(queue):
                del queue[:consumed]
                self._index = 0
            else:
                self._index = consumed
            if isinstance(event, Exception):
                raise event
            yield event
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001280
1281
Antoine Pitrou0acbcb52013-08-23 23:04:30 +02001282class _IterParseIterator:
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001283
1284 def __init__(self, source, events, parser, close_source=False):
Eli Benderskyb5869342013-08-30 05:51:20 -07001285 # Use the internal, undocumented _parser argument for now; When the
1286 # parser argument of iterparse is removed, this can be killed.
1287 self._parser = XMLPullParser(events=events, _parser=parser)
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001288 self._file = source
1289 self._close_file = close_source
Nick Coghlan4cc2afa2013-09-28 23:50:35 +10001290 self.root = self._root = None
Armin Rigo9ed73062005-12-14 18:10:45 +00001291
Georg Brandla18af4e2007-04-21 15:47:16 +00001292 def __next__(self):
Armin Rigo9ed73062005-12-14 18:10:45 +00001293 while 1:
Eli Benderskyb5869342013-08-30 05:51:20 -07001294 for event in self._parser.read_events():
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001295 return event
Antoine Pitrou0acbcb52013-08-23 23:04:30 +02001296 if self._parser._parser is None:
Nick Coghlan4cc2afa2013-09-28 23:50:35 +10001297 self.root = self._root
Florent Xicluna91d51932011-11-01 23:31:09 +01001298 if self._close_file:
1299 self._file.close()
1300 raise StopIteration
1301 # load event buffer
Eli Benderskyb5869342013-08-30 05:51:20 -07001302 data = self._file.read(16 * 1024)
Florent Xicluna91d51932011-11-01 23:31:09 +01001303 if data:
Eli Benderskyb5869342013-08-30 05:51:20 -07001304 self._parser.feed(data)
Florent Xicluna91d51932011-11-01 23:31:09 +01001305 else:
Nick Coghlan4cc2afa2013-09-28 23:50:35 +10001306 self._root = self._parser._close_and_return_root()
Armin Rigo9ed73062005-12-14 18:10:45 +00001307
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001308 def __iter__(self):
1309 return self
Armin Rigo9ed73062005-12-14 18:10:45 +00001310
Armin Rigo9ed73062005-12-14 18:10:45 +00001311
def XML(text, parser=None):
    """Parse an XML document from a string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns whatever the parser's close() returns: an Element instance
    for the standard parser.

    """
    active = parser or XMLParser(target=TreeBuilder())
    active.feed(text)
    return active.close()
1327
Armin Rigo9ed73062005-12-14 18:10:45 +00001328
def XMLID(text, parser=None):
    """Parse an XML document from a string constant and index its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.  Elements whose "id" attribute
    is missing or empty are left out of the dict.

    """
    active = parser or XMLParser(target=TreeBuilder())
    active.feed(text)
    tree = active.close()
    ids = {}
    for node in tree.iter():
        identifier = node.get("id")
        if identifier:
            ids[identifier] = node
    return tree, ids
1349
Victor Stinner765531d2013-03-26 01:11:54 +01001350# Parse XML document from string constant. Alias for XML().
Armin Rigo9ed73062005-12-14 18:10:45 +00001351fromstring = XML
Armin Rigo9ed73062005-12-14 18:10:45 +00001352
def fromstringlist(sequence, parser=None):
    """Parse an XML document from a sequence of string fragments.

    *sequence* is a list or other sequence of fragments, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    The fragments are fed to the parser one by one, in order.  Returns
    whatever the parser's close() returns: an Element instance for the
    standard parser.

    """
    active = parser or XMLParser(target=TreeBuilder())
    for fragment in sequence:
        active.feed(fragment)
    return active.close()
1367
1368# --------------------------------------------------------------------
Armin Rigo9ed73062005-12-14 18:10:45 +00001369
Armin Rigo9ed73062005-12-14 18:10:45 +00001370
class TreeBuilder:
    """Generic element structure builder.

    Turns a sequence of start(), data() and end() calls into a
    well-formed element structure.

    This class can be used to build an element structure using a custom
    XML parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.

    """
    def __init__(self, element_factory=None):
        self._data = []  # data collector
        self._elem = []  # element stack
        self._last = None  # last element
        self._tail = None  # true if we're after an end tag
        self._factory = (Element if element_factory is None
                         else element_factory)

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # Attach the collected character data to the last element: as its
        # tail when an end tag was seen last, as its text otherwise.
        if not self._data:
            return
        last = self._last
        if last is not None:
            text = "".join(self._data)
            if self._tail:
                assert last.tail is None, "internal error (tail)"
                last.tail = text
            else:
                assert last.text is None, "internal error (text)"
                last.text = text
        self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        elem = self._factory(tag, attrs)
        self._last = elem
        if self._elem:
            # Nested element: attach it to the innermost open element.
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   closed.tag, tag)
        self._tail = 1
        return closed
1443
Armin Rigo9ed73062005-12-14 18:10:45 +00001444
Eli Bendersky84fae782013-03-09 07:12:48 -08001445# also see ElementTree and TreeBuilder
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001446class XMLParser:
Eli Bendersky84fae782013-03-09 07:12:48 -08001447 """Element structure builder for XML source data based on the expat parser.
1448
1449 *html* are predefined HTML entities (not supported currently),
1450 *target* is an optional target object which defaults to an instance of the
1451 standard TreeBuilder class, *encoding* is an optional encoding string
1452 which if given, overrides the encoding specified in the XML file:
1453 http://www.iana.org/assignments/character-sets
1454
1455 """
Armin Rigo9ed73062005-12-14 18:10:45 +00001456
def __init__(self, html=0, target=None, encoding=None):
    """Create the underlying expat parser and hook it up to *target*.

    *html* is accepted but not referenced by this constructor.
    *target* receives the parse events; when it is None a fresh
    TreeBuilder() is used.  *encoding* is passed through as the first
    argument of expat.ParserCreate().

    Raises ImportError when neither xml.parsers.expat nor pyexpat can
    be imported.
    """
    try:
        from xml.parsers import expat
    except ImportError:
        try:
            import pyexpat as expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
    # "}" is the separator expat puts between a namespace URI and the
    # local name; _fixname() relies on it.
    xml_parser = expat.ParserCreate(encoding, "}")
    if target is None:
        target = TreeBuilder()

    # underscored names are provided for compatibility only
    self.parser = xml_parser
    self._parser = xml_parser
    self.target = target
    self._target = target
    self._error = expat.error
    self._names = {}  # name memo cache

    # main callbacks: start/end go through this object so that names
    # can be fixed up before they reach the target
    xml_parser.DefaultHandlerExpand = self._default
    if hasattr(target, 'start'):
        xml_parser.StartElementHandler = self._start
    if hasattr(target, 'end'):
        xml_parser.EndElementHandler = self._end

    # callbacks that are handed to the target unchanged, installed only
    # when the target actually provides the method
    passthrough = (
        ('data', 'CharacterDataHandler'),
        ('comment', 'CommentHandler'),
        ('pi', 'ProcessingInstructionHandler'),
    )
    for method_name, hook_name in passthrough:
        if hasattr(target, method_name):
            setattr(xml_parser, hook_name, getattr(target, method_name))

    # Configure pyexpat: buffering, new-style attribute handling.
    xml_parser.buffer_text = 1
    xml_parser.ordered_attributes = 1
    xml_parser.specified_attributes = 1

    self._doctype = None
    self.entity = {}
    try:
        self.version = "Expat %d.%d.%d" % expat.version_info
    except AttributeError:
        pass  # unknown
1498
def _setevents(self, events_queue, events_to_report):
    """Internal API for XMLPullParser: divert parse events into a queue.

    *events_to_report* lists the events to report during parsing (same
    as the *events* argument of XMLPullParser's constructor).  Supported
    names are "start", "end", "start-ns" and "end-ns"; any other name
    raises ValueError.

    *events_queue* is a list; the installed expat handlers append
    ``(event, payload)`` tuples to it while parsing.
    """
    expat_parser = self._parser
    enqueue = events_queue.append

    def element_start(event, start):
        def handler(tag, attrib_in):
            enqueue((event, start(tag, attrib_in)))
        return handler

    def element_end(event, end):
        def handler(tag):
            enqueue((event, end(tag)))
        return handler

    def namespace_start(event):
        def handler(prefix, uri):
            # missing prefix/uri values are reported as empty strings
            enqueue((event, (prefix or "", uri or "")))
        return handler

    def namespace_end(event):
        def handler(prefix):
            enqueue((event, None))
        return handler

    # Handlers are installed one event at a time, so an unknown name only
    # aborts after the preceding names have already been wired up.
    for event_name in events_to_report:
        if event_name == "start":
            expat_parser.ordered_attributes = 1
            expat_parser.specified_attributes = 1
            expat_parser.StartElementHandler = element_start(
                event_name, self._start)
        elif event_name == "end":
            expat_parser.EndElementHandler = element_end(
                event_name, self._end)
        elif event_name == "start-ns":
            expat_parser.StartNamespaceDeclHandler = namespace_start(
                event_name)
        elif event_name == "end-ns":
            expat_parser.EndNamespaceDeclHandler = namespace_end(
                event_name)
        else:
            raise ValueError("unknown event %r" % event_name)
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001531
def _raiseerror(self, value):
    """Raise a ParseError built from the expat error *value*.

    The new exception gets the same ``code`` as *value* and a
    ``position`` tuple of ``(lineno, offset)`` taken from it.
    """
    parse_error = ParseError(value)
    parse_error.code = value.code
    parse_error.position = (value.lineno, value.offset)
    raise parse_error
Armin Rigo9ed73062005-12-14 18:10:45 +00001537
def _fixname(self, key):
    """Return the expanded form of the name *key*, memoising the result.

    A name containing "}" gets a leading "{" so that it reads
    ``{uri}local``; any other name is returned unchanged.  Results are
    cached in ``self._names``.
    """
    memo = self._names
    if key in memo:
        return memo[key]
    expanded = "{" + key if "}" in key else key
    memo[key] = expanded
    return expanded
1548
def _start(self, tag, attr_list):
    """Handler for expat's StartElementHandler.

    *attr_list* is a flat sequence of alternating attribute names and
    values (the shape expat uses when ordered_attributes is set).  The
    tag and every attribute name are passed through _fixname(), and the
    result of ``self.target.start(tag, attrib)`` is returned.
    """
    expand = self._fixname
    element_tag = expand(tag)
    if attr_list:
        # even positions hold names, the following odd position the value
        attributes = {
            expand(attr_list[pos]): attr_list[pos + 1]
            for pos in range(0, len(attr_list), 2)
        }
    else:
        attributes = {}
    return self.target.start(element_tag, attributes)
Armin Rigo9ed73062005-12-14 18:10:45 +00001560
def _end(self, tag):
    """Handler for end tags: forward the fixed-up tag to the target.

    Returns whatever ``self.target.end()`` returns.
    """
    element_tag = self._fixname(tag)
    return self.target.end(element_tag)
1563
def _default(self, text):
    """Handle text that expat hands to the default handler.

    Two kinds of input are acted upon:

    * Text starting with "&" is treated as an entity reference.  Its name
      is looked up in ``self.entity`` and the replacement is sent to
      ``self.target.data``; a KeyError is turned into an expat error
      with code 11.  Nothing happens when the target has no ``data``
      attribute.
    * Text starting with "<!DOCTYPE" begins collecting the pieces of a
      doctype declaration in ``self._doctype``.  Once a complete PUBLIC
      or SYSTEM declaration has been seen, it is reported to
      ``target.doctype`` or, failing that, to an overridden
      ``self.doctype``.

    Anything else is ignored.
    """
    lead = text[:1]

    if lead == "&":
        # deal with undefined entities
        try:
            emit = self.target.data
        except AttributeError:
            return
        try:
            # the handler call stays inside the try, so a KeyError raised
            # by it is treated exactly like a missing entity
            emit(self.entity[text[1:-1]])
        except KeyError:
            from xml.parsers import expat
            line = self.parser.ErrorLineNumber
            column = self.parser.ErrorColumnNumber
            failure = expat.error(
                "undefined entity %s: line %d, column %d" %
                (text, line, column)
                )
            failure.code = 11  # XML_ERROR_UNDEFINED_ENTITY
            failure.lineno = line
            failure.offset = column
            raise failure
        return

    if lead == "<" and text[:9] == "<!DOCTYPE":
        self._doctype = []  # inside a doctype declaration
        return

    if self._doctype is None:
        return

    # parse doctype contents
    if lead == ">":
        self._doctype = None
        return
    token = text.strip()
    if not token:
        return
    self._doctype.append(token)

    pieces = self._doctype
    count = len(pieces)
    if count <= 2:
        return
    keyword = pieces[1]
    if keyword == "PUBLIC" and count == 4:
        name, _, pubid, system = pieces
        if pubid:
            pubid = pubid[1:-1]  # drop the surrounding quote characters
    elif keyword == "SYSTEM" and count == 3:
        name, _, system = pieces
        pubid = None
    else:
        # declaration not complete yet (or not a recognised form)
        return

    system_id = system[1:-1]
    if hasattr(self.target, "doctype"):
        self.target.doctype(name, pubid, system_id)
    elif self.doctype != self._XMLParser__doctype:
        # doctype() was redefined: warn about the deprecated call, then
        # invoke the override
        self._XMLParser__doctype(name, pubid, system_id)
        self.doctype(name, pubid, system_id)
    self._doctype = None
1615
def doctype(self, name, pubid, system):
    """(Deprecated) Handle doctype declaration

    *name* is the Doctype name, *pubid* is the public identifier,
    and *system* is the system identifier.

    This implementation ignores its arguments and only issues a
    DeprecationWarning.

    """
    message = (
        "This method of XMLParser is deprecated. Define doctype() "
        "method on the TreeBuilder target."
    )
    warnings.warn(message, DeprecationWarning)

# sentinel: lets _default() detect whether doctype() has been redefined
# in a subclass
__doctype = doctype
Armin Rigo9ed73062005-12-14 18:10:45 +00001631
def feed(self, data):
    """Feed encoded data to parser.

    *data* is passed to the expat parser as a non-final chunk; an expat
    error is handed to _raiseerror().
    """
    try:
        self.parser.Parse(data, 0)
    except self._error as expat_error:
        self._raiseerror(expat_error)
Armin Rigo9ed73062005-12-14 18:10:45 +00001638
def close(self):
    """Finish feeding data to parser and return element structure.

    Returns the result of ``self.target.close()``, or None when the
    target has no ``close`` attribute.  An expat error raised by the
    final parse call is handed to _raiseerror().
    """
    try:
        self.parser.Parse("", 1)  # end of data
    except self._error as expat_error:
        self._raiseerror(expat_error)

    try:
        try:
            finish = self.target.close
        except AttributeError:
            return None
        # called outside the inner try so that an AttributeError raised
        # by the target's own close() is not swallowed
        return finish()
    finally:
        # get rid of circular references
        del self.parser
        del self._parser
        del self.target
        del self._target
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001655
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001656
# Import the C accelerators
try:
    # Element is going to be shadowed by the C implementation. We need to keep
    # the Python version of it accessible for some "creative" use by external
    # code (see tests)
    _Element_Py = Element

    # Element, SubElement, ParseError, TreeBuilder, XMLParser
    from _elementtree import *
except ImportError:
    # The _elementtree module could not be imported; the definitions made
    # earlier in this module stay in effect.
    pass