blob: b4e110d5dea7945ebc036b46f8ff54709c7f8d66 [file] [log] [blame]
Eli Bendersky84fae782013-03-09 07:12:48 -08001"""Lightweight XML support for Python.
2
3 XML is an inherently hierarchical data format, and the most natural way to
4 represent it is with a tree. This module has two classes for this purpose:
5
6 1. ElementTree represents the whole XML document as a tree and
7
8 2. Element represents a single node in this tree.
9
10 Interactions with the whole document (reading and writing to/from files) are
11 usually done on the ElementTree level. Interactions with a single XML element
12 and its sub-elements are done on the Element level.
13
14 Element is a flexible container object designed to store hierarchical data
15 structures in memory. It can be described as a cross between a list and a
16 dictionary. Each Element has a number of properties associated with it:
17
18 'tag' - a string containing the element's name.
19
20 'attributes' - a Python dictionary storing the element's attributes.
21
22 'text' - a string containing the element's text content.
23
24 'tail' - an optional string containing text after the element's end tag.
25
26 And a number of child elements stored in a Python sequence.
27
28 To create an element instance, use the Element constructor,
29 or the SubElement factory function.
30
31 You can also use the ElementTree class to wrap an element structure
32 and convert it to and from XML.
33
34"""
35
Eli Benderskybf05df22013-04-20 05:44:01 -070036#---------------------------------------------------------------------
37# Licensed to PSF under a Contributor Agreement.
38# See http://www.python.org/psf/license for licensing details.
Armin Rigo9ed73062005-12-14 18:10:45 +000039#
40# ElementTree
Florent Xiclunaf15351d2010-03-13 23:24:31 +000041# Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
Armin Rigo9ed73062005-12-14 18:10:45 +000042#
43# fredrik@pythonware.com
44# http://www.pythonware.com
Armin Rigo9ed73062005-12-14 18:10:45 +000045# --------------------------------------------------------------------
46# The ElementTree toolkit is
47#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000048# Copyright (c) 1999-2008 by Fredrik Lundh
Armin Rigo9ed73062005-12-14 18:10:45 +000049#
50# By obtaining, using, and/or copying this software and/or its
51# associated documentation, you agree that you have read, understood,
52# and will comply with the following terms and conditions:
53#
54# Permission to use, copy, modify, and distribute this software and
55# its associated documentation for any purpose and without fee is
56# hereby granted, provided that the above copyright notice appears in
57# all copies, and that both that copyright notice and this permission
58# notice appear in supporting documentation, and that the name of
59# Secret Labs AB or the author not be used in advertising or publicity
60# pertaining to distribution of the software without specific, written
61# prior permission.
62#
63# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
70# OF THIS SOFTWARE.
71# --------------------------------------------------------------------
72
# Names exported by ``from <this module> import *``.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring", "fromstringlist",
    "iselement", "iterparse",
    "parse", "ParseError",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring", "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML", "XMLID",
    "XMLParser",
    "register_namespace",
    ]

# Version string of this module; exported publicly through __all__.
VERSION = "1.3.0"
93
Florent Xiclunaf15351d2010-03-13 23:24:31 +000094import sys
95import re
96import warnings
Eli Bendersky00f402b2012-07-15 06:02:22 +030097import io
98import contextlib
Armin Rigo9ed73062005-12-14 18:10:45 +000099
Eli Bendersky27cbb192012-06-15 09:03:19 +0300100from . import ElementPath
Armin Rigo9ed73062005-12-14 18:10:45 +0000101
Armin Rigo9ed73062005-12-14 18:10:45 +0000102
class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the regular exception value, code that raises a ParseError
    attaches two extra attributes to the instance:
        'code' - the specific exception code
        'position' - the line and column of the error

    """
113
114# --------------------------------------------------------------------
115
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000116
def iselement(element):
    """Return True if *element* looks like an Element.

    The check is duck-typed: any object exposing a 'tag' attribute passes.

    """
    try:
        element.tag
    except AttributeError:
        return False
    return True
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000120
Armin Rigo9ed73062005-12-14 18:10:45 +0000121
class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes.  *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement.  This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        """Create an element named *tag*.

        *attrib* must be a dict (TypeError otherwise); it is copied, so
        neither the caller's dictionary nor the shared default is ever
        mutated.  *extra* keyword arguments are merged into the copy.

        """
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy.  Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        return len(self._children)

    def __bool__(self):
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        return self._children[index]

    def __setitem__(self, index, element):
        # No type validation is performed here; *index* may be an integer
        # or a slice, exactly as for the underlying list of children.
        self._children[index] = element

    def __delitem__(self, index):
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is any iterable with zero or more elements; one-shot
        iterators and generators are supported.

        Raises TypeError if any item is not an Element, in which case no
        item is added.

        """
        # Materialise first: validating by iterating *elements* directly
        # would exhaust a one-shot iterator and leave nothing to add.
        pending = list(elements)
        for element in pending:
            self._assert_is_element(element)
        self._children.extend(pending)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order.  Emits a
        DeprecationWarning; the returned list is the element's internal
        list of children, not a copy.

        """
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was
        not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(); the result is whatever the attribute
        dictionary's keys() method returns.

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(); the result is whatever the attribute
        dictionary's items() method returns.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        the value "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        """Return list(self.iter(tag)) after a PendingDeprecationWarning."""
        # Change for a DeprecationWarning in 1.4
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            PendingDeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.  Empty or None text/tail values are
        skipped, and an element whose tag is neither a string nor None
        contributes nothing.

        """
        tag = self.tag
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t
Armin Rigo9ed73062005-12-14 18:10:45 +0000439
Armin Rigo9ed73062005-12-14 18:10:45 +0000440
def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and append it to an existing parent.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments.

    The new element is built with parent.makeelement() and returned after
    being appended to *parent*.

    """
    # Work on a copy so that neither the caller's dictionary nor the shared
    # default argument is modified.
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
458
Armin Rigo9ed73062005-12-14 18:10:45 +0000459
def Comment(text=None):
    """Comment element factory.

    Build a special element whose tag is this factory function itself; the
    standard serializer serializes such an element as an XML comment.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node
472
Armin Rigo9ed73062005-12-14 18:10:45 +0000473
def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Build a special element whose tag is this factory function itself, so
    that the standard serializer can emit it as an XML processing
    instruction.

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.  When
    *text* is non-empty the element's text is "<target> <text>", otherwise
    it is just *target*.

    """
    node = Element(ProcessingInstruction)
    if text:
        node.text = target + " " + text
    else:
        node.text = target
    return node

# Short alias for ProcessingInstruction.
PI = ProcessingInstruction
491
Armin Rigo9ed73062005-12-14 18:10:45 +0000492
class QName:
    """Qualified name wrapper.

    This class can be used to wrap a QName attribute value in order to get
    proper namespace handling on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Comparisons, equality and hashing all operate on the wrapped text; the
    other operand may be another QName or a plain value.

    """
    def __init__(self, text_or_uri, tag=None):
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    def __le__(self, other):
        return self.text <= (other.text if isinstance(other, QName) else other)

    def __lt__(self, other):
        return self.text < (other.text if isinstance(other, QName) else other)

    def __ge__(self, other):
        return self.text >= (other.text if isinstance(other, QName) else other)

    def __gt__(self, other):
        return self.text > (other.text if isinstance(other, QName) else other)

    def __eq__(self, other):
        return self.text == (other.text if isinstance(other, QName) else other)
Armin Rigo9ed73062005-12-14 18:10:45 +0000537
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000538# --------------------------------------------------------------------
539
Armin Rigo9ed73062005-12-14 18:10:45 +0000540
541class ElementTree:
Eli Bendersky84fae782013-03-09 07:12:48 -0800542 """An XML element hierarchy.
Armin Rigo9ed73062005-12-14 18:10:45 +0000543
Eli Bendersky84fae782013-03-09 07:12:48 -0800544 This class also provides support for serialization to and from
545 standard XML.
546
547 *element* is an optional root element node,
548 *file* is an optional file handle or file name of an XML file whose
549 contents will be used to initialize the tree with.
550
551 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000552 def __init__(self, element=None, file=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000553 # assert element is None or iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000554 self._root = element # first node
555 if file:
556 self.parse(file)
557
Armin Rigo9ed73062005-12-14 18:10:45 +0000558 def getroot(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800559 """Return root element of this tree."""
Armin Rigo9ed73062005-12-14 18:10:45 +0000560 return self._root
561
Armin Rigo9ed73062005-12-14 18:10:45 +0000562 def _setroot(self, element):
Eli Bendersky84fae782013-03-09 07:12:48 -0800563 """Replace root element of this tree.
564
565 This will discard the current contents of the tree and replace it
566 with the given element. Use with care!
567
568 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000569 # assert iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000570 self._root = element
571
Armin Rigo9ed73062005-12-14 18:10:45 +0000572 def parse(self, source, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800573 """Load external XML document into element tree.
574
575 *source* is a file name or file object, *parser* is an optional parser
576 instance that defaults to XMLParser.
577
578 ParseError is raised if the parser fails to parse the document.
579
580 Returns the root element of the given source document.
581
582 """
Antoine Pitroue033e062010-10-29 10:38:18 +0000583 close_source = False
Armin Rigo9ed73062005-12-14 18:10:45 +0000584 if not hasattr(source, "read"):
585 source = open(source, "rb")
Antoine Pitroue033e062010-10-29 10:38:18 +0000586 close_source = True
587 try:
Eli Benderskya3699232013-05-19 18:47:23 -0700588 if parser is None:
589 # If no parser was specified, create a default XMLParser
590 parser = XMLParser()
591 if hasattr(parser, '_parse_whole'):
592 # The default XMLParser, when it comes from an accelerator,
593 # can define an internal _parse_whole API for efficiency.
594 # It can be used to parse the whole source without feeding
595 # it with chunks.
596 self._root = parser._parse_whole(source)
597 return self._root
598 while True:
Antoine Pitroue033e062010-10-29 10:38:18 +0000599 data = source.read(65536)
600 if not data:
601 break
602 parser.feed(data)
603 self._root = parser.close()
604 return self._root
605 finally:
606 if close_source:
607 source.close()
Armin Rigo9ed73062005-12-14 18:10:45 +0000608
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000609 def iter(self, tag=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800610 """Create and return tree iterator for the root element.
611
612 The iterator loops over all elements in this tree, in document order.
613
614 *tag* is a string with the tag name to iterate over
615 (default is to return all elements).
616
617 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000618 # assert self._root is not None
619 return self._root.iter(tag)
620
621 # compatibility
Armin Rigo9ed73062005-12-14 18:10:45 +0000622 def getiterator(self, tag=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000623 # Change for a DeprecationWarning in 1.4
624 warnings.warn(
625 "This method will be removed in future versions. "
626 "Use 'tree.iter()' or 'list(tree.iter())' instead.",
627 PendingDeprecationWarning, stacklevel=2
628 )
629 return list(self.iter(tag))
Armin Rigo9ed73062005-12-14 18:10:45 +0000630
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000631 def find(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800632 """Find first matching element by tag name or path.
633
634 Same as getroot().find(path), which is Element.find()
635
636 *path* is a string having either an element tag or an XPath,
637 *namespaces* is an optional mapping from namespace prefix to full name.
638
639 Return the first matching element, or None if no element was found.
640
641 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000642 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000643 if path[:1] == "/":
644 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000645 warnings.warn(
646 "This search is broken in 1.3 and earlier, and will be "
647 "fixed in a future version. If you rely on the current "
648 "behaviour, change it to %r" % path,
649 FutureWarning, stacklevel=2
650 )
651 return self._root.find(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000652
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000653 def findtext(self, path, default=None, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800654 """Find first matching element by tag name or path.
655
656 Same as getroot().findtext(path), which is Element.findtext()
657
658 *path* is a string having either an element tag or an XPath,
659 *namespaces* is an optional mapping from namespace prefix to full name.
660
661 Return the first matching element, or None if no element was found.
662
663 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000664 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000665 if path[:1] == "/":
666 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000667 warnings.warn(
668 "This search is broken in 1.3 and earlier, and will be "
669 "fixed in a future version. If you rely on the current "
670 "behaviour, change it to %r" % path,
671 FutureWarning, stacklevel=2
672 )
673 return self._root.findtext(path, default, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000674
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000675 def findall(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800676 """Find all matching subelements by tag name or path.
677
678 Same as getroot().findall(path), which is Element.findall().
679
680 *path* is a string having either an element tag or an XPath,
681 *namespaces* is an optional mapping from namespace prefix to full name.
682
683 Return list containing all matching elements in document order.
684
685 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000686 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000687 if path[:1] == "/":
688 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000689 warnings.warn(
690 "This search is broken in 1.3 and earlier, and will be "
691 "fixed in a future version. If you rely on the current "
692 "behaviour, change it to %r" % path,
693 FutureWarning, stacklevel=2
694 )
695 return self._root.findall(path, namespaces)
696
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000697 def iterfind(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800698 """Find all matching subelements by tag name or path.
699
700 Same as getroot().iterfind(path), which is element.iterfind()
701
702 *path* is a string having either an element tag or an XPath,
703 *namespaces* is an optional mapping from namespace prefix to full name.
704
705 Return an iterable yielding all matching elements in document order.
706
707 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000708 # assert self._root is not None
709 if path[:1] == "/":
710 path = "." + path
711 warnings.warn(
712 "This search is broken in 1.3 and earlier, and will be "
713 "fixed in a future version. If you rely on the current "
714 "behaviour, change it to %r" % path,
715 FutureWarning, stacklevel=2
716 )
717 return self._root.iterfind(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000718
def write(self, file_or_filename,
          encoding=None,
          xml_declaration=None,
          default_namespace=None,
          method=None, *,
          short_empty_elements=True):
    """Serialize the element tree to a file.

    Arguments:
      *file_or_filename* -- file name or a file object opened for writing

      *encoding* -- the output encoding; when omitted, "utf-8" is used
                    for the "c14n" method and "us-ascii" otherwise

      *xml_declaration* -- bool indicating if an XML declaration should be
                           added to the output. If None, a declaration is
                           added only for the "xml" method and only when
                           the encoding is not one of US-ASCII, UTF-8 or
                           Unicode

      *default_namespace* -- sets the default XML namespace (for "xmlns")

      *method* -- either "xml" (default), "html", "text", or "c14n"

      *short_empty_elements* -- controls the formatting of elements
                                that contain no content. If True (default)
                                they are emitted as a single self-closed
                                tag, otherwise they are emitted as a pair
                                of start/end tags

    Raises ValueError if *method* is not a known serialization method.

    """
    if not method:
        method = "xml"
    elif method not in _serialize:
        raise ValueError("unknown method %r" % method)

    if not encoding:
        encoding = "utf-8" if method == "c14n" else "us-ascii"
    enc_lower = encoding.lower()

    with _get_writer(file_or_filename, enc_lower) as emit:
        if method == "xml":
            if xml_declaration is None:
                # implicit mode: declare only "unusual" encodings
                wants_declaration = enc_lower not in (
                    "utf-8", "us-ascii", "unicode")
            else:
                wants_declaration = xml_declaration
            if wants_declaration:
                if enc_lower == "unicode":
                    # Retrieve the default encoding for the xml declaration
                    import locale
                    declared_encoding = locale.getpreferredencoding()
                else:
                    declared_encoding = encoding
                emit("<?xml version='1.0' encoding='%s'?>\n" % (
                    declared_encoding,))

        root = self._root
        if method == "text":
            _serialize_text(emit, root)
            return
        qnames, namespaces = _namespaces(root, default_namespace)
        _serialize[method](emit, root, qnames, namespaces,
                           short_empty_elements=short_empty_elements)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000776
def write_c14n(self, file):
    """Write the tree to *file* using the "c14n" serialization method.

    Kept for lxml.etree compatibility; calling write() with
    method="c14n" is the preferred spelling.
    """
    result = self.write(file, method="c14n")
    return result
Armin Rigo9ed73062005-12-14 18:10:45 +0000780
781# --------------------------------------------------------------------
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000782# serialization support
783
Eli Bendersky00f402b2012-07-15 06:02:22 +0300784@contextlib.contextmanager
785def _get_writer(file_or_filename, encoding):
Ezio Melottib5bc3532013-08-17 16:11:40 +0300786 # returns text write method and release all resources after using
Eli Bendersky00f402b2012-07-15 06:02:22 +0300787 try:
788 write = file_or_filename.write
789 except AttributeError:
790 # file_or_filename is a file name
791 if encoding == "unicode":
792 file = open(file_or_filename, "w")
793 else:
794 file = open(file_or_filename, "w", encoding=encoding,
795 errors="xmlcharrefreplace")
796 with file:
797 yield file.write
798 else:
799 # file_or_filename is a file-like object
800 # encoding determines if it is a text or binary writer
801 if encoding == "unicode":
802 # use a text writer as is
803 yield write
804 else:
805 # wrap a binary writer with TextIOWrapper
806 with contextlib.ExitStack() as stack:
807 if isinstance(file_or_filename, io.BufferedIOBase):
808 file = file_or_filename
809 elif isinstance(file_or_filename, io.RawIOBase):
810 file = io.BufferedWriter(file_or_filename)
811 # Keep the original file open when the BufferedWriter is
812 # destroyed
813 stack.callback(file.detach)
814 else:
815 # This is to handle passed objects that aren't in the
816 # IOBase hierarchy, but just have a write method
817 file = io.BufferedIOBase()
818 file.writable = lambda: True
819 file.write = write
820 try:
821 # TextIOWrapper uses this methods to determine
822 # if BOM (for UTF-16, etc) should be added
823 file.seekable = file_or_filename.seekable
824 file.tell = file_or_filename.tell
825 except AttributeError:
826 pass
827 file = io.TextIOWrapper(file,
828 encoding=encoding,
829 errors="xmlcharrefreplace",
830 newline="\n")
831 # Keep the original file open when the TextIOWrapper is
832 # destroyed
833 stack.callback(file.detach)
834 yield file.write
835
def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used in a tree.

    Walks *elem* and all of its descendants and returns a
    ``(qnames, namespaces)`` pair: *qnames* maps every tag, attribute
    name and QName value to its serialized ``prefix:local`` form, and
    *namespaces* maps each namespace URI to the prefix chosen for it.

    *default_namespace*, if given, is mapped to the empty prefix; in
    that case unqualified names raise ValueError.
    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] != "{":
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    # invent a prefix for an unknown namespace
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            if prefix:
                qnames[qname] = "%s:%s" % (prefix, local)
            else:
                qnames[qname] = local  # default element
        except TypeError:
            _raise_serialization_error(qname)

    def ensure(name):
        # register a name only the first time it is seen
        if name not in qnames:
            add_qname(name)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            ensure(tag.text)
        elif isinstance(tag, str):
            ensure(tag)
        elif not (tag is None or tag is Comment or tag is PI):
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            ensure(key)
            if isinstance(value, QName):
                ensure(value.text)
        text = node.text
        if isinstance(text, QName):
            ensure(text.text)
    return qnames, namespaces
896
def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Serialize *elem* and its subtree as XML through *write*.

    *write* is a callable accepting text chunks, *qnames* maps tags and
    attribute names to their serialized form, and *namespaces* maps
    namespace URIs to prefixes (declared on this element only; children
    are serialized with ``None``).  *short_empty_elements* selects
    between ``<tag />`` and ``<tag></tag>`` for content-free elements.
    """
    raw_tag = elem.tag
    content = elem.text
    if raw_tag is Comment:
        write("<!--%s-->" % content)
    elif raw_tag is ProcessingInstruction:
        write("<?%s?>" % content)
    else:
        name = qnames[raw_tag]
        if name is None:
            # tag-less element: emit only its text and children
            if content:
                write(_escape_cdata(content))
            for child in elem:
                _serialize_xml(write, child, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                declarations = sorted(namespaces.items(),
                                      key=lambda pair: pair[1])  # sort on prefix
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(' xmlns%s="%s"' % (prefix, _escape_attrib(uri)))
            for key, value in sorted(attributes):  # lexical order
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib(value)
                write(' %s="%s"' % (qnames[key], value))
            if content or len(elem) or not short_empty_elements:
                write(">")
                if content:
                    write(_escape_cdata(content))
                for child in elem:
                    _serialize_xml(write, child, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + name + ">")
            else:
                write(" />")
    trailing = elem.tail
    if trailing:
        write(_escape_cdata(trailing))
946
# Lower-case names of HTML elements that are written without an end tag.
HTML_EMPTY = {
    "area", "base", "basefont", "br", "col", "frame", "hr",
    "img", "input", "isindex", "link", "meta", "param",
}
954
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Serialize *elem* and its subtree as HTML through *write*.

    Differs from the XML serializer in that ``script`` and ``style``
    text is written unescaped, elements listed in HTML_EMPTY get no end
    tag, and every other element always gets an explicit end tag.
    """
    raw_tag = elem.tag
    content = elem.text
    if raw_tag is Comment:
        write("<!--%s-->" % _escape_cdata(content))
    elif raw_tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(content))
    else:
        name = qnames[raw_tag]
        if name is None:
            # tag-less element: emit only its text and children
            if content:
                write(_escape_cdata(content))
            for child in elem:
                _serialize_html(write, child, qnames, None)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                declarations = sorted(namespaces.items(),
                                      key=lambda pair: pair[1])  # sort on prefix
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(' xmlns%s="%s"' % (prefix, _escape_attrib(uri)))
            for key, value in sorted(attributes):  # lexical order
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib_html(value)
                # FIXME: handle boolean attributes
                write(' %s="%s"' % (qnames[key], value))
            write(">")
            lowered = name.lower()
            if content:
                if lowered in ("script", "style"):
                    # raw text elements: content goes out verbatim
                    write(content)
                else:
                    write(_escape_cdata(content))
            for child in elem:
                _serialize_html(write, child, qnames, None)
            if lowered not in HTML_EMPTY:
                write("</" + name + ">")
    trailing = elem.tail
    if trailing:
        write(_escape_cdata(trailing))
1004
1005def _serialize_text(write, elem):
1006 for part in elem.itertext():
1007 write(part)
1008 if elem.tail:
1009 write(elem.tail)
1010
# Dispatch table: serialization method name -> serializer function.
# The optional "c14n" entry (_serialize_c14n) is not listed here; it is
# imported at the end of the module.
_serialize = dict(
    xml=_serialize_xml,
    html=_serialize_html,
    text=_serialize_text,
)
Armin Rigo9ed73062005-12-14 18:10:45 +00001018
Armin Rigo9ed73062005-12-14 18:10:45 +00001019
def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix is reserved or is invalid.

    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Iterate over a snapshot because entries are deleted along the way.
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix
1038
# Global registry mapping namespace URIs to their preferred prefixes.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# Expose the registry on the function object, for tests and
# troubleshooting.
setattr(register_namespace, "_namespace_map", _namespace_map)
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001053
1054def _raise_serialization_error(text):
1055 raise TypeError(
1056 "cannot serialize %r (type %s)" % (text, type(text).__name__)
1057 )
1058
1059def _escape_cdata(text):
1060 # escape character data
1061 try:
1062 # it's worth avoiding do-nothing calls for strings that are
1063 # shorter than 500 character, or so. assume that's, by far,
1064 # the most common case in most applications.
1065 if "&" in text:
1066 text = text.replace("&", "&amp;")
1067 if "<" in text:
1068 text = text.replace("<", "&lt;")
1069 if ">" in text:
1070 text = text.replace(">", "&gt;")
1071 return text
1072 except (TypeError, AttributeError):
1073 _raise_serialization_error(text)
1074
1075def _escape_attrib(text):
1076 # escape attribute value
1077 try:
1078 if "&" in text:
1079 text = text.replace("&", "&amp;")
1080 if "<" in text:
1081 text = text.replace("<", "&lt;")
1082 if ">" in text:
1083 text = text.replace(">", "&gt;")
1084 if "\"" in text:
1085 text = text.replace("\"", "&quot;")
1086 if "\n" in text:
1087 text = text.replace("\n", "&#10;")
1088 return text
1089 except (TypeError, AttributeError):
1090 _raise_serialization_error(text)
1091
1092def _escape_attrib_html(text):
1093 # escape attribute value
1094 try:
1095 if "&" in text:
1096 text = text.replace("&", "&amp;")
1097 if ">" in text:
1098 text = text.replace(">", "&gt;")
1099 if "\"" in text:
1100 text = text.replace("\"", "&quot;")
1101 return text
1102 except (TypeError, AttributeError):
1103 _raise_serialization_error(text)
1104
1105# --------------------------------------------------------------------
1106
def tostring(element, encoding=None, method=None, *,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode" (compared
    case-insensitively), a string is returned.  Otherwise a bytestring
    is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which can
    be one of "xml" (default), "html", "text" or "c14n",
    *short_empty_elements* controls whether content-free elements are
    written as a single self-closed tag (default) or as a start/end pair.

    Returns an (optionally) encoded string containing the XML data.

    """
    # ElementTree.write() lower-cases the encoding before deciding
    # between text and bytes output; use the same rule here so the
    # stream type always matches what write() will produce.
    wants_text = isinstance(encoding, str) and encoding.lower() == "unicode"
    stream = io.StringIO() if wants_text else io.BytesIO()
    ElementTree(element).write(stream, encoding, method=method,
                               short_empty_elements=short_empty_elements)
    return stream.getvalue()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001125
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001126class _ListDataStream(io.BufferedIOBase):
Eli Bendersky84fae782013-03-09 07:12:48 -08001127 """An auxiliary stream accumulating into a list reference."""
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001128 def __init__(self, lst):
1129 self.lst = lst
Eli Benderskyf90fc682012-07-17 15:09:56 +03001130
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001131 def writable(self):
1132 return True
1133
1134 def seekable(self):
1135 return True
1136
1137 def write(self, b):
1138 self.lst.append(b)
1139
1140 def tell(self):
1141 return len(self.lst)
1142
def tostringlist(element, encoding=None, method=None, *,
                 short_empty_elements=True):
    """Generate the serialized form of *element* as a list of chunks.

    Takes the same arguments as tostring(); the chunks written during
    serialization are collected into a list, which is returned.
    """
    chunks = []
    tree = ElementTree(element)
    tree.write(_ListDataStream(chunks), encoding, method=method,
               short_empty_elements=short_empty_elements)
    return chunks
Armin Rigo9ed73062005-12-14 18:10:45 +00001150
Armin Rigo9ed73062005-12-14 18:10:45 +00001151
def dump(elem):
    """Write an element tree or element structure to sys.stdout.

    Intended for debugging only.

    *elem* is either an ElementTree or a single Element; a bare element
    is wrapped in an ElementTree first.  The output is ordinary XML,
    followed by a newline unless the root's tail already ends with one.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    ends_with_newline = bool(tail) and tail[-1] == "\n"
    if not ends_with_newline:
        sys.stdout.write("\n")
1169
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001170# --------------------------------------------------------------------
1171# parsing
Armin Rigo9ed73062005-12-14 18:10:45 +00001172
Armin Rigo9ed73062005-12-14 18:10:45 +00001173
def parse(source, parser=None):
    """Parse an XML document into an element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance that is handed on to
    ElementTree.parse().

    Returns the populated ElementTree instance.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document
1186
Armin Rigo9ed73062005-12-14 18:10:45 +00001187
def iterparse(source, events=None, parser=None):
    """Incrementally parse an XML document, reporting events on the way.

    The supported events are the strings "start", "end", "start-ns" and
    "end-ns" (the "ns" events are used to get detailed namespace
    information).  If *events* is omitted, only "end" events are
    reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.

    """
    # Anything without a read() method is taken to be a file name; a
    # file opened here is ours to close again.
    opened_here = not hasattr(source, "read")
    if opened_here:
        source = open(source, "rb")
    try:
        return _IterParseIterator(source, events, parser, opened_here)
    except BaseException:
        # Do not leak the file if the iterator cannot be created.
        if opened_here:
            source.close()
        raise
Armin Rigo9ed73062005-12-14 18:10:45 +00001213
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001214
class XMLPullParser:
    """Non-blocking parser: feed() data in, pull events via read_events()."""

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        # _elementtree.c expects a list, not a deque
        self._events_queue = []
        self._index = 0
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        wanted = ("end",) if events is None else events
        self._parser._setevents(self._events_queue, wanted)

    def feed(self, data):
        """Feed encoded data to parser."""
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            self._parser.feed(data)
        except SyntaxError as exc:
            # Queue the error so that it is raised from read_events().
            self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        parser = self._parser
        root = parser.close()
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element.  Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.  A queued exception is raised
        instead of being yielded.
        """
        queue = self._events_queue
        while True:
            position = self._index
            try:
                item = queue[position]
                # Avoid retaining references to past events
                queue[position] = None
            except IndexError:
                return
            consumed = position + 1
            # Compact the list in a O(1) amortized fashion
            # As noted above, _elementree.c needs a list, not a deque
            if 2 * consumed >= len(queue):
                del queue[:consumed]
                self._index = 0
            else:
                self._index = consumed
            if isinstance(item, Exception):
                raise item
            yield item
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001282
1283
class _IterParseIterator:
    """Iterator behind iterparse(): reads a file and yields parse events."""

    def __init__(self, source, events, parser, close_source=False):
        # Use the internal, undocumented _parser argument for now; When the
        # parser argument of iterparse is removed, this can be killed.
        self._parser = XMLPullParser(events=events, _parser=parser)
        self._file = source
        self._close_file = close_source
        # root stays None until the whole document has been parsed
        self.root = None
        self._root = None

    def __next__(self):
        pull_parser = self._parser
        try:
            while True:
                # hand out the next queued event, if there is one
                for event in pull_parser.read_events():
                    return event
                if pull_parser._parser is None:
                    # parser already closed: nothing more will arrive
                    break
                # load event buffer
                chunk = self._file.read(16 * 1024)
                if chunk:
                    pull_parser.feed(chunk)
                else:
                    self._root = pull_parser._close_and_return_root()
            self.root = self._root
        except BaseException:
            if self._close_file:
                self._file.close()
            raise
        if self._close_file:
            self._file.close()
        raise StopIteration

    def __iter__(self):
        return self
Armin Rigo9ed73062005-12-14 18:10:45 +00001318
Armin Rigo9ed73062005-12-14 18:10:45 +00001319
def XML(text, parser=None):
    """Parse an XML document from a string constant.

    Handy for embedding "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an optional
    parser instance; when omitted, an XMLParser feeding a TreeBuilder
    is created.

    Returns whatever the parser's close() returns (an Element for the
    standard parser).

    """
    active = parser if parser else XMLParser(target=TreeBuilder())
    active.feed(text)
    return active.close()
1335
Armin Rigo9ed73062005-12-14 18:10:45 +00001336
def XMLID(text, parser=None):
    """Parse an XML document from a string constant for its IDs.

    *text* is a string containing XML data, *parser* is an optional
    parser instance; when omitted, an XMLParser feeding a TreeBuilder
    is created.

    Returns an (Element, dict) tuple, in which the dict maps each
    non-empty "id" attribute value to its element.

    """
    active = parser if parser else XMLParser(target=TreeBuilder())
    active.feed(text)
    root = active.close()
    # look up each element's id once; a later duplicate replaces an
    # earlier one
    labelled = ((node.get("id"), node) for node in root.iter())
    ids = {ident: node for ident, node in labelled if ident}
    return root, ids
1357
# fromstring() is an alias for XML(): parse an XML document from a
# string constant.
fromstring = XML
Armin Rigo9ed73062005-12-14 18:10:45 +00001360
def fromstringlist(sequence, parser=None):
    """Parse an XML document from a sequence of string fragments.

    *sequence* is a list or other iterable of fragments that are fed to
    the parser in order, *parser* is an optional parser instance; when
    omitted, an XMLParser feeding a TreeBuilder is created.

    Returns whatever the parser's close() returns (an Element for the
    standard parser).

    """
    active = parser if parser else XMLParser(target=TreeBuilder())
    feed = active.feed
    for fragment in sequence:
        feed(fragment)
    return active.close()
1375
1376# --------------------------------------------------------------------
Armin Rigo9ed73062005-12-14 18:10:45 +00001377
Armin Rigo9ed73062005-12-14 18:10:45 +00001378
class TreeBuilder:
    """Generic element structure builder.

    Turns a sequence of start(), data() and end() calls into a
    well-formed element structure.  Useful for building trees from a
    custom XML parser, or from a parser for some other XML-like format.

    *element_factory* is an optional callable taking ``(tag, attrs)``
    that creates new element instances; it defaults to Element.

    """
    def __init__(self, element_factory=None):
        self._data = []  # data collector
        self._elem = []  # element stack
        self._last = None  # last element
        self._tail = None  # true if we're after an end tag
        self._factory = (Element if element_factory is None
                         else element_factory)

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # Attach the collected character data to the last element: as
        # its tail if we are after an end tag, as its text otherwise.
        pending = self._data
        if not pending:
            return
        target = self._last
        if target is not None:
            joined = "".join(pending)
            if self._tail:
                assert target.tail is None, "internal error (tail)"
                target.tail = joined
            else:
                assert target.text is None, "internal error (text)"
                target.text = joined
        self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        node = self._factory(tag, attrs)
        self._last = node
        stack = self._elem
        if stack:
            # nest under the innermost element that is still open
            stack[-1].append(node)
        stack.append(node)
        self._tail = 0
        return node

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   closed.tag, tag)
        self._tail = 1
        return closed
1451
Armin Rigo9ed73062005-12-14 18:10:45 +00001452
Eli Bendersky84fae782013-03-09 07:12:48 -08001453# also see ElementTree and TreeBuilder
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001454class XMLParser:
Eli Bendersky84fae782013-03-09 07:12:48 -08001455 """Element structure builder for XML source data based on the expat parser.
1456
1457 *html* are predefined HTML entities (not supported currently),
1458 *target* is an optional target object which defaults to an instance of the
1459 standard TreeBuilder class, *encoding* is an optional encoding string
1460 which if given, overrides the encoding specified in the XML file:
1461 http://www.iana.org/assignments/character-sets
1462
1463 """
Armin Rigo9ed73062005-12-14 18:10:45 +00001464
def __init__(self, html=0, target=None, encoding=None):
    """Create the underlying expat parser and connect it to *target*.

    *html* is accepted but never read by this constructor.  *target* is
    the object that receives the parse callbacks; when it is None a new
    TreeBuilder is created.  *encoding* is handed to expat's
    ParserCreate() together with "}" as the namespace separator.

    Raises ImportError when neither xml.parsers.expat nor pyexpat can be
    imported.

    """
    try:
        from xml.parsers import expat
    except ImportError:
        try:
            import pyexpat as expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
    parser = expat.ParserCreate(encoding, "}")
    if target is None:
        target = TreeBuilder()
    # underscored names are provided for compatibility only
    self.parser = self._parser = parser
    self.target = self._target = target
    self._error = expat.error
    self._names = {}  # memo of already-expanded names, see _fixname()
    parser.DefaultHandlerExpand = self._default
    # (target method, expat handler attribute, XMLParser wrapper or None).
    # A handler is installed only when the target offers the method; the
    # first two go through our own wrappers, the others are wired straight
    # to the target.
    hooks = (
        ("start", "StartElementHandler", "_start"),
        ("end", "EndElementHandler", "_end"),
        ("data", "CharacterDataHandler", None),
        ("comment", "CommentHandler", None),
        ("pi", "ProcessingInstructionHandler", None),
    )
    for method_name, handler_name, wrapper_name in hooks:
        if hasattr(target, method_name):
            if wrapper_name is None:
                callback = getattr(target, method_name)
            else:
                callback = getattr(self, wrapper_name)
            setattr(parser, handler_name, callback)
    # Configure pyexpat: buffering, new-style attribute handling.
    parser.buffer_text = 1
    parser.ordered_attributes = 1
    parser.specified_attributes = 1
    self._doctype = None
    self.entity = {}
    try:
        self.version = "Expat %d.%d.%d" % expat.version_info
    except AttributeError:
        pass  # expat build does not expose version_info
1506
def _setevents(self, events_queue, events_to_report):
    """Install expat handlers that record parse events (XMLPullParser API).

    *events_to_report* is an iterable of event names; the recognised
    names are "start", "end", "start-ns" and "end-ns".  *events_queue*
    is a list that the installed handlers append (event, payload) tuples
    to while the underlying parser runs.

    Raises ValueError for an unrecognised event name; handlers installed
    for earlier names in the iterable stay in place.

    """
    expat_parser = self._parser
    push = events_queue.append
    # Every handler receives its state through default arguments so that
    # the values are bound at definition time, not looked up after the
    # loop has moved on to another event name.
    for event_name in events_to_report:
        if event_name == "start":
            expat_parser.ordered_attributes = 1
            expat_parser.specified_attributes = 1

            def on_start(tag, attr_list, event=event_name, push=push,
                         build=self._start):
                push((event, build(tag, attr_list)))

            expat_parser.StartElementHandler = on_start
        elif event_name == "end":

            def on_end(tag, event=event_name, push=push, finish=self._end):
                push((event, finish(tag)))

            expat_parser.EndElementHandler = on_end
        elif event_name == "start-ns":

            def on_start_ns(prefix, uri, event=event_name, push=push):
                push((event, (prefix or "", uri or "")))

            expat_parser.StartNamespaceDeclHandler = on_start_ns
        elif event_name == "end-ns":

            def on_end_ns(prefix, event=event_name, push=push):
                push((event, None))

            expat_parser.EndNamespaceDeclHandler = on_end_ns
        else:
            raise ValueError("unknown event %r" % event_name)
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001539
def _raiseerror(self, value):
    """Raise a ParseError built from the parser error *value*.

    The new exception copies *value*.code and stores
    (*value*.lineno, *value*.offset) as its ``position``.

    """
    parse_error = ParseError(value)
    parse_error.code = value.code
    parse_error.position = (value.lineno, value.offset)
    raise parse_error
Armin Rigo9ed73062005-12-14 18:10:45 +00001545
def _fixname(self, key):
    """Return the expanded form of the expat name *key*, memoised.

    A name containing "}" gets a leading "{" prepended; any other name
    is returned unchanged.  Results are cached in self._names.

    """
    memo = self._names
    if key in memo:
        return memo[key]
    expanded = "{" + key if "}" in key else key
    memo[key] = expanded
    return expanded
1556
def _start(self, tag, attr_list):
    """Forward an expat start-element callback to the target.

    *attr_list* is a flat sequence alternating attribute name and value
    (ordered_attributes is set on the parser).  The tag and each
    attribute name go through _fixname(); the result of
    target.start(tag, attrib) is returned.

    """
    expand = self._fixname
    element_tag = expand(tag)
    attributes = {}
    if attr_list:
        # Names sit at even positions, their values right after them.
        for name_pos in range(0, len(attr_list), 2):
            attr_value = attr_list[name_pos + 1]
            attributes[expand(attr_list[name_pos])] = attr_value
    return self.target.start(element_tag, attributes)
Armin Rigo9ed73062005-12-14 18:10:45 +00001568
def _end(self, tag):
    """Forward an expat end-element callback to the target.

    The tag is expanded with _fixname() and the result of
    target.end() is returned.

    """
    element_tag = self._fixname(tag)
    return self.target.end(element_tag)
1571
def _default(self, text):
    """Handle text that expat passes to the default handler.

    Three situations are recognised:

    * *text* starts with "&": it is treated as an entity reference.  If
      the target has no data() method nothing happens.  Otherwise the
      name between "&" and ";" is looked up in self.entity and the
      replacement is passed to target.data(); an unknown name raises the
      parser's error class (self._error) with ``code`` 11 and
      ``lineno``/``offset`` taken from the parser.
    * *text* starts with "<!DOCTYPE": doctype collection begins.
    * doctype collection is active: non-blank pieces are accumulated
      until a PUBLIC (4 pieces) or SYSTEM (3 pieces) declaration is
      complete, then reported through target.doctype() or, failing
      that, through an overridden self.doctype().  A piece starting
      with ">" ends the collection.

    """
    prefix = text[:1]
    if prefix == "&":
        # deal with undefined entities
        try:
            data_handler = self.target.data
        except AttributeError:
            return
        # Only the lookup is guarded: a KeyError raised inside the
        # target's data() must not be mistaken for an undefined entity.
        try:
            replacement = self.entity[text[1:-1]]
        except KeyError:
            parser = self.parser
            # Use the error class stored by __init__ (the one feed() and
            # close() catch) instead of importing expat again.
            err = self._error(
                "undefined entity %s: line %d, column %d" %
                (text, parser.ErrorLineNumber, parser.ErrorColumnNumber)
                )
            err.code = 11  # XML_ERROR_UNDEFINED_ENTITY
            err.lineno = parser.ErrorLineNumber
            err.offset = parser.ErrorColumnNumber
            raise err
        data_handler(replacement)
    elif prefix == "<" and text[:9] == "<!DOCTYPE":
        self._doctype = []  # inside a doctype declaration
    elif self._doctype is not None:
        # parse doctype contents
        if prefix == ">":
            self._doctype = None
            return
        text = text.strip()
        if not text:
            return
        self._doctype.append(text)
        n = len(self._doctype)
        if n > 2:
            kind = self._doctype[1]
            if kind == "PUBLIC" and n == 4:
                name, kind, pubid, system = self._doctype
                if pubid:
                    pubid = pubid[1:-1]  # drop the surrounding quotes
            elif kind == "SYSTEM" and n == 3:
                name, kind, system = self._doctype
                pubid = None
            else:
                return
            system = system[1:-1]  # drop the surrounding quotes
            if hasattr(self.target, "doctype"):
                self.target.doctype(name, pubid, system)
            elif self.doctype != self._XMLParser__doctype:
                # doctype() was overridden: warn about the deprecated
                # call, then invoke the override
                self._XMLParser__doctype(name, pubid, system)
                self.doctype(name, pubid, system)
            self._doctype = None
1623
def doctype(self, name, pubid, system):
    """(Deprecated) Handle doctype declaration

    *name* is the Doctype name, *pubid* is the public identifier,
    and *system* is the system identifier.

    This implementation only emits a DeprecationWarning.

    """
    message = (
        "This method of XMLParser is deprecated. Define doctype() "
        "method on the TreeBuilder target."
    )
    warnings.warn(message, DeprecationWarning)

# sentinel, if doctype is redefined in a subclass
__doctype = doctype
Armin Rigo9ed73062005-12-14 18:10:45 +00001639
def feed(self, data):
    """Feed encoded data to parser.

    Errors of the parser's error class are re-raised through
    _raiseerror().

    """
    try:
        self.parser.Parse(data, False)  # more data may follow
    except self._error as parse_failure:
        self._raiseerror(parse_failure)
Armin Rigo9ed73062005-12-14 18:10:45 +00001646
def close(self):
    """Finish feeding data to parser and return element structure.

    Returns whatever target.close() returns, or None when the target has
    no close() method.  Parser errors raised while finishing are
    re-raised through _raiseerror().

    """
    try:
        self.parser.Parse("", True)  # end of data
    except self._error as parse_failure:
        self._raiseerror(parse_failure)
    try:
        try:
            finish = self.target.close
        except AttributeError:
            return None
        # Called outside the inner try so that an AttributeError raised
        # by the target's close() itself is not swallowed.
        return finish()
    finally:
        # get rid of circular references
        del self.parser, self._parser
        del self.target, self._target
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001663
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001664
# Import the C accelerators
try:
    # Element is going to be shadowed by the C implementation. We need to keep
    # the Python version of it accessible for some "creative" use by external
    # code (see tests)
    _Element_Py = Element

    # Element, SubElement, ParseError, TreeBuilder, XMLParser
    from _elementtree import *
except ImportError:
    # The _elementtree module could not be imported: keep the definitions
    # made above in this file.
    pass